patina-cli 3.11.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. package/.patina.default.yaml +29 -29
  2. package/CHANGELOG.md +53 -0
  3. package/NOTICE +21 -0
  4. package/README.md +117 -224
  5. package/README_JA.md +134 -77
  6. package/README_KR.md +132 -74
  7. package/README_ZH.md +137 -80
  8. package/SKILL.md +11 -20
  9. package/artifacts/rebaseline-2025/README.md +147 -0
  10. package/artifacts/rebaseline-2025/human-controls.public.jsonl +250 -0
  11. package/artifacts/rebaseline-2025/intake.example.jsonl +2 -0
  12. package/artifacts/rebaseline-2025/intake.local.example.jsonl +25 -0
  13. package/artifacts/rebaseline-2025/prompts.template.jsonl +7 -0
  14. package/artifacts/rebaseline-2025/sources.ko-public.jsonl +39 -0
  15. package/assets/brand/patina-badge.svg +18 -0
  16. package/assets/brand/patina-mark.svg +8 -0
  17. package/assets/demo/README.md +79 -0
  18. package/core/scoring.md +12 -12
  19. package/core/standalone-prompt.md +3 -1
  20. package/core/stylometry.md +93 -22
  21. package/docs/API.md +1554 -0
  22. package/docs/AUTHENTICATION.md +50 -26
  23. package/docs/AUTHENTICATION_KR.md +54 -29
  24. package/docs/BRANDING.md +9 -8
  25. package/docs/CLI.md +55 -14
  26. package/docs/COOKBOOK.md +8 -21
  27. package/docs/DEMO.md +32 -5
  28. package/docs/EXIT-CODES.md +2 -3
  29. package/docs/FALSE-POSITIVES.md +63 -0
  30. package/docs/FAQ.md +9 -1
  31. package/docs/FAQ_KR.md +3 -1
  32. package/docs/FLAG-PARITY.md +33 -47
  33. package/docs/ISSUE-WAVES.md +57 -0
  34. package/docs/PATTERNS-EN.md +67 -3
  35. package/docs/PATTERNS-JA.md +68 -2
  36. package/docs/PATTERNS-KO.md +70 -7
  37. package/docs/PATTERNS-ZH.md +67 -3
  38. package/docs/PATTERNS.md +5 -5
  39. package/docs/RESEARCH-DOCS-PLATFORM.md +54 -0
  40. package/docs/ROADMAP.md +46 -66
  41. package/docs/TRANSLATIONESE-KO.md +51 -0
  42. package/docs/audits/2026-05-deep-research.md +3 -1
  43. package/docs/benchmarks/README.md +51 -0
  44. package/docs/benchmarks/detector-comparison.json +69 -9
  45. package/docs/benchmarks/detector-comparison.md +10 -5
  46. package/docs/benchmarks/katfish-ko-latest.json +657 -0
  47. package/docs/benchmarks/katfish-ko-latest.md +77 -0
  48. package/docs/benchmarks/latest.json +1183 -108
  49. package/docs/benchmarks/latest.md +84 -60
  50. package/docs/benchmarks/lexicon-freshness-en-2026-05-22.json +1121 -0
  51. package/docs/benchmarks/lexicon-freshness-en-2026-05-22.md +136 -0
  52. package/docs/benchmarks/rebaseline-latest.json +381 -0
  53. package/docs/benchmarks/rebaseline-latest.md +121 -0
  54. package/docs/benchmarks/register-stratified-latest.json +164 -0
  55. package/docs/benchmarks/register-stratified-latest.md +99 -0
  56. package/docs/benchmarks/register-stratified.md +43 -0
  57. package/docs/integrations/github-action.md +44 -11
  58. package/docs/integrations/playground.md +58 -0
  59. package/docs/integrations/pre-commit.md +5 -5
  60. package/docs/integrations/release.md +5 -3
  61. package/docs/integrations/static-sites.md +83 -0
  62. package/docs/research/2025-rebaseline-plan.md +71 -2
  63. package/docs/research/2026-rebaseline.md +102 -0
  64. package/docs/research/adversarial-mps.md +41 -0
  65. package/docs/research/ai-human-metrics.md +35 -23
  66. package/docs/research/human-eval-panel.md +42 -0
  67. package/docs/research/judge-agreement.md +24 -0
  68. package/docs/research/ko-2025-corpus-sources.md +135 -0
  69. package/docs/research/lexicon-freshness-audit.md +64 -0
  70. package/docs/research/zh-ja-lexicon-calibration.md +60 -0
  71. package/docs/social/patina-launch-copy.md +173 -100
  72. package/docs/social/patina-launch-execution.md +94 -0
  73. package/docs/social/patina-launch-korean-first.md +83 -0
  74. package/docs/social/signs-of-ai-writing.md +26 -0
  75. package/docs/social/signs-of-ai-writing_KR.md +26 -0
  76. package/lexicon/ai-en.md +21 -24
  77. package/lexicon/ai-ja.md +158 -0
  78. package/lexicon/ai-ko.md +9 -9
  79. package/lexicon/ai-zh.md +158 -0
  80. package/lexicon/provenance/ai-en.json +970 -0
  81. package/lexicon/provenance/ai-ja.json +542 -0
  82. package/lexicon/provenance/ai-ko.json +866 -0
  83. package/lexicon/provenance/ai-zh.json +542 -0
  84. package/package.json +49 -8
  85. package/patterns/en-communication.md +5 -0
  86. package/patterns/en-content.md +5 -0
  87. package/patterns/en-filler.md +5 -0
  88. package/patterns/en-language.md +29 -1
  89. package/patterns/en-structure.md +5 -0
  90. package/patterns/en-style.md +5 -0
  91. package/patterns/en-viral-hook.md +42 -2
  92. package/patterns/ja-communication.md +5 -0
  93. package/patterns/ja-content.md +5 -0
  94. package/patterns/ja-filler.md +5 -0
  95. package/patterns/ja-language.md +33 -1
  96. package/patterns/ja-structure.md +12 -0
  97. package/patterns/ja-style.md +5 -0
  98. package/patterns/ja-viral-hook.md +41 -2
  99. package/patterns/ko-communication.md +5 -0
  100. package/patterns/ko-content.md +5 -0
  101. package/patterns/ko-filler.md +5 -0
  102. package/patterns/ko-language.md +33 -1
  103. package/patterns/ko-structure.md +25 -6
  104. package/patterns/ko-style.md +5 -0
  105. package/patterns/ko-viral-hook.md +38 -2
  106. package/patterns/zh-communication.md +5 -0
  107. package/patterns/zh-content.md +5 -0
  108. package/patterns/zh-filler.md +5 -0
  109. package/patterns/zh-language.md +37 -1
  110. package/patterns/zh-structure.md +12 -0
  111. package/patterns/zh-style.md +5 -0
  112. package/patterns/zh-viral-hook.md +38 -2
  113. package/playground/README.md +55 -0
  114. package/playground/analytics.js +4 -0
  115. package/playground/analyzer.js +883 -0
  116. package/playground/app.js +157 -0
  117. package/playground/data/lexicons.js +343 -0
  118. package/playground/index.html +138 -0
  119. package/playground/styles.css +267 -0
  120. package/profiles/namuwiki.md +111 -0
  121. package/scripts/adversarial-mps-report.mjs +201 -0
  122. package/scripts/badge-json.mjs +79 -0
  123. package/scripts/benchmark-report.mjs +56 -9
  124. package/scripts/check-release-metadata.mjs +0 -2
  125. package/scripts/detector-comparison.mjs +7 -7
  126. package/scripts/generate-playground-data.mjs +77 -0
  127. package/scripts/katfish-calibration.mjs +464 -0
  128. package/scripts/lexicon-freshness.mjs +485 -0
  129. package/scripts/lint.mjs +1 -1
  130. package/scripts/precommit-score.mjs +4 -3
  131. package/scripts/prose-score.mjs +81 -5
  132. package/scripts/rebaseline-intake.mjs +242 -0
  133. package/scripts/rebaseline-score.mjs +268 -0
  134. package/scripts/rebaseline-summary.mjs +773 -0
  135. package/scripts/rebaseline-web-collect.mjs +410 -0
  136. package/scripts/update-benchmark-ranges.mjs +1 -0
  137. package/src/api.js +69 -105
  138. package/src/auth.js +50 -2
  139. package/src/backends/claude-cli.js +19 -4
  140. package/src/backends/codex-cli.js +19 -3
  141. package/src/backends/contract.js +230 -1
  142. package/src/backends/gemini-cli.js +18 -5
  143. package/src/backends/index.js +87 -12
  144. package/src/backends/kimi-cli.js +161 -0
  145. package/src/cli.js +577 -567
  146. package/src/commands/doctor.js +2 -2
  147. package/src/config.js +29 -0
  148. package/src/errors.js +53 -1
  149. package/src/features/discourse-tells.js +68 -0
  150. package/src/features/index.js +82 -8
  151. package/src/features/lexicon.js +40 -6
  152. package/src/features/markup-leakage.js +69 -0
  153. package/src/features/segment.js +41 -0
  154. package/src/features/signal-strength.js +81 -0
  155. package/src/features/stylometry.js +231 -1
  156. package/src/features/translationese.js +127 -0
  157. package/src/loader.js +76 -0
  158. package/src/logger.js +22 -23
  159. package/src/model-defaults.js +55 -0
  160. package/src/ouroboros.js +31 -0
  161. package/src/output.js +102 -90
  162. package/src/prompt-builder.js +103 -68
  163. package/src/providers.js +51 -4
  164. package/src/scoring.js +210 -2
  165. package/src/security.js +75 -0
  166. package/tests/fixtures/live-quality/en/public-docs-01.md +26 -0
  167. package/tests/fixtures/live-quality/ko/public-docs-01.md +26 -0
  168. package/tests/fixtures/suspect-zones/expected-ranges.json +207 -16
  169. package/tests/fixtures/suspect-zones/ja/ai/ja-ai-04-lexicon.md +11 -0
  170. package/tests/fixtures/suspect-zones/ja/natural/ja-nat-04-lexicon-cold.md +11 -0
  171. package/tests/fixtures/suspect-zones/ko/ai/ko-ai-02.md +4 -5
  172. package/tests/fixtures/suspect-zones/ko/ai/ko-ai-07-ko-diagnostic.md +11 -0
  173. package/tests/fixtures/suspect-zones/zh/ai/zh-ai-04-lexicon.md +11 -0
  174. package/tests/fixtures/suspect-zones/zh/natural/zh-nat-04-lexicon-cold.md +11 -0
  175. package/tests/quality/README.md +188 -11
  176. package/tests/quality/adversarial-mps/fixtures.jsonl +10 -0
  177. package/tests/quality/benchmark.mjs +39 -1
  178. package/tests/quality/dogfood.mjs +5 -3
  179. package/tests/quality/live-fixtures.jsonl +2 -0
  180. package/tests/quality/live-quality.mjs +596 -0
  181. package/tests/quality/ranking-metrics.mjs +136 -0
  182. package/tests/quality/rebaseline-manifest.example.jsonl +5 -0
  183. package/vercel.json +53 -0
  184. package/SKILL-MAX.md +0 -455
  185. package/docs/internal/HARNESS.md +0 -14
  186. package/docs/internal/README.md +0 -14
  187. package/docs/internal/WARP.md +0 -23
  188. package/patina-max/SKILL.md +0 -523
  189. package/patina-max/composite.py +0 -457
  190. package/src/cache.js +0 -106
  191. package/src/commands/init.js +0 -208
  192. package/src/manifest.js +0 -162
  193. package/src/max-mode.js +0 -207
@@ -54,8 +54,8 @@ export function buildDoctorReport({ version } = {}) {
54
54
  status: tmux.ok ? 'ok' : 'warning',
55
55
  summary: tmux.ok ? tmux.stdout.trim() : 'tmux not found',
56
56
  detail: tmux.ok
57
- ? 'available for MAX omc dispatch workflows'
58
- : 'only needed for tmux-based MAX dispatch; direct/API modes still work',
57
+ ? 'available when you want tmux-based parallel workflows outside patina itself'
58
+ : 'optional; patina no longer requires tmux for any built-in mode',
59
59
  });
60
60
 
61
61
  const apiKeySource = inspectHttpApiKeySource();
package/src/config.js CHANGED
@@ -7,6 +7,15 @@ import yaml from 'js-yaml';
7
7
  const __dirname = dirname(fileURLToPath(import.meta.url));
8
8
  const REPO_ROOT = resolve(__dirname, '..');
9
9
 
10
+ /**
11
+ * Load default config and merge global/project .patina.yaml overrides.
12
+ *
13
+ * @param {string} [path] Base YAML config path.
14
+ * @returns {object} Merged patina configuration object.
15
+ * @throws {Error} When a config file is missing, invalid YAML, or not a mapping.
16
+ * @example
17
+ * const config = loadConfig();
18
+ */
10
19
  export function loadConfig(path = resolve(REPO_ROOT, '.patina.default.yaml')) {
11
20
  const raw = readFileSync(path, 'utf8');
12
21
  const parsed = yaml.load(raw);
@@ -54,6 +63,14 @@ function deepMerge(target, source) {
54
63
  }
55
64
  }
56
65
 
66
+ /**
67
+ * Return the repository root inferred from this source file location.
68
+ *
69
+ * @returns {string} Absolute repository root path.
70
+ * Never throws: it only returns the module-level REPO_ROOT constant computed at import time.
71
+ * @example
72
+ * const root = getRepoRoot();
73
+ */
57
74
  export function getRepoRoot() {
58
75
  return REPO_ROOT;
59
76
  }
@@ -63,6 +80,18 @@ const VALID_TONES = ['casual', 'professional', 'academic', 'narrative', 'marketi
63
80
  // Resolve the effective tone from CLI flag and config (v3.10).
64
81
  // Priority: cliTone > configTone > unset. zh/ja + explicit tone → fallback path.
65
82
  // Returns: { tone, tone_source, tone_evidence, tone_confidence, warning? }
83
+ /**
84
+ * Resolve CLI/config tone settings into prompt-ready tone metadata.
85
+ *
86
+ * @param {object} options Tone inputs.
87
+ * @param {string|null} [options.cliTone] CLI tone override.
88
+ * @param {string|null} [options.configTone] Configured tone value.
89
+ * @param {string} [options.lang] Active language code.
90
+ * @returns {Object} Tone metadata.
91
+ * @throws {Error} When cliTone or configTone is not supported.
92
+ * @example
93
+ * const tone = resolveTone({ cliTone: 'casual', lang: 'ko' });
94
+ */
66
95
  export function resolveTone({ cliTone, configTone, lang }) {
67
96
  if (cliTone !== undefined && cliTone !== null) {
68
97
  if (!VALID_TONES.includes(cliTone)) {
package/src/errors.js CHANGED
@@ -1,3 +1,14 @@
1
+ /**
2
+ * Structured CLI error with separate what/why/action fields and exit code.
3
+ *
4
+ * @param {object} options Error fields.
5
+ * @param {string} options.what Short failure headline.
6
+ * @param {string} options.why Explanation of the failure.
7
+ * @param {string} options.action Suggested user action.
8
+ * @param {number} [options.exitCode=1] Process exit code.
9
+ * @example
10
+ * throw new PatinaCliError({ what: 'missing input', why: 'No file was provided', action: 'Pass a file path.' });
11
+ */
1
12
  export class PatinaCliError extends Error {
2
13
  constructor({ what, why, action, exitCode = 1 }) {
3
14
  super([what, why, action].filter(Boolean).join('\n'));
@@ -9,14 +20,45 @@ export class PatinaCliError extends Error {
9
20
  }
10
21
  }
11
22
 
23
+ /**
24
+ * Create a user-input error that should exit with code 2.
25
+ *
26
+ * @param {string} what Short failure headline.
27
+ * @param {string} why Explanation of the invalid input.
28
+ * @param {string} action Suggested user action.
29
+ * @returns {PatinaCliError} Structured input error constructed with exitCode 2.
30
+ * @throws {Error} Does not intentionally throw; returns an Error instance for callers to throw.
31
+ * @example
32
+ * throw inputError('missing input', 'No file was provided.', 'Pass a file path.');
33
+ */
12
34
  export function inputError(what, why, action) {
13
35
  return new PatinaCliError({ what, why, action, exitCode: 2 });
14
36
  }
15
37
 
38
+ /**
39
+ * Create a runtime error that should exit with code 1.
40
+ *
41
+ * @param {string} what Short failure headline.
42
+ * @param {string} why Explanation of the runtime failure.
43
+ * @param {string} action Suggested user action.
44
+ * @returns {PatinaCliError} Structured runtime error constructed with exitCode 1.
45
+ * @throws {Error} Does not intentionally throw; returns an Error instance for callers to throw.
46
+ * @example
47
+ * throw runtimeError('provider failed', 'The API timed out.', 'Retry later.');
48
+ */
16
49
  export function runtimeError(what, why, action) {
17
50
  return new PatinaCliError({ what, why, action, exitCode: 1 });
18
51
  }
19
52
 
53
+ /**
54
+ * Render any thrown value into the patina CLI error format.
55
+ *
56
+ * @param {unknown} err Error-like value to render.
57
+ * @returns {string} Multi-line user-facing error text.
58
+ * @throws {Error} Propagates validation, filesystem, network, or dependency failures when the underlying operation cannot complete.
59
+ * @example
60
+ * const message = renderCliError(inputError('bad flag', 'Unknown flag.', 'Run --help.'));
61
+ */
20
62
  export function renderCliError(err) {
21
63
  const normalized = normalizeError(err);
22
64
  return [
@@ -26,8 +68,18 @@ export function renderCliError(err) {
26
68
  ].join('\n');
27
69
  }
28
70
 
71
/**
 * Extract a safe process exit code from an error-like value.
 *
 * Only a number, or a non-blank numeric string, is accepted as `exitCode`.
 * Other values (null, '', booleans, arrays, objects) fall back instead of
 * being coerced: `Number(null)`, `Number('')` and `Number(false)` are all 0,
 * which would otherwise turn a failure into a "success" exit status.
 *
 * @param {unknown} err Error-like value.
 * @param {number} [fallback=1] Exit code used when err.exitCode is absent or invalid.
 * @returns {number} Non-negative integer exit code, or `fallback`.
 * @example
 * const code = getExitCode(inputError('bad', 'why', 'fix')); // 2
 * getExitCode({ exitCode: null }); // 1 (fallback), not 0
 */
export function getExitCode(err, fallback = 1) {
  const raw = err ? /** @type {any} */ (err).exitCode : undefined;

  // Reject everything that is not a number or a non-blank string before
  // coercing, so falsy/odd values cannot collapse to 0 or 1.
  const isNumericCandidate =
    typeof raw === 'number' || (typeof raw === 'string' && raw.trim() !== '');
  if (!isNumericCandidate) return fallback;

  const n = Number(raw);
  return Number.isInteger(n) && n >= 0 ? n : fallback;
}
33
85
 
@@ -0,0 +1,68 @@
1
+ // Density-gated discourse tells (issue #334). Unlike markup-leakage (a single
2
+ // near-proof-grade hit), these are constructions humans also use, so each fires
3
+ // only past a density threshold to keep false positives low.
4
+ //
5
+ // 1. Fake-candor / manufactured-intimacy openers (English): AI overuses
6
+ // intimacy-signaling openers ("here's the thing", "let's be honest") that
7
+ // real writers use sparingly. Fires at >= 2 per document.
8
+ // 2. Decorative thematic breaks: AI sprinkles `---` / `***` / `___` dividers,
9
+ // often before every heading. Fires at >= 3 per document.
10
+
11
// Fake-candor opener patterns (case-insensitive, global so String#match
// returns every occurrence). The apostrophe is optional and may be either the
// straight ASCII quote (') or the typographic right single quote (U+2019):
// word processors and chat UIs routinely "smarten" quotes, and matching only
// the ASCII form would miss "Here’s the thing" entirely.
const FAKE_CANDOR_RULES = [
  /\bhere['\u2019]?s the thing\b/gi,
  /\bhere['\u2019]?s the kicker\b/gi,
  /\blet['\u2019]?s be honest\b/gi,
  /\blet['\u2019]?s be real\b/gi,
  /\bthe truth is\b/gi,
  /\bi['\u2019]?ll be honest(?: with you)?\b/gi,
  /\breal talk\b/gi,
];
20
+
21
+ const FAKE_CANDOR_MIN = 2;
22
+ const THEMATIC_BREAK_MIN = 3;
23
+
24
+ // A markdown thematic break: a line that is only ---, ***, or ___ (3+), optionally spaced.
25
+ const THEMATIC_BREAK_LINE = /^[ \t]*(?:-[ \t]*){3,}$|^[ \t]*(?:\*[ \t]*){3,}$|^[ \t]*(?:_[ \t]*){3,}$/;
26
+ const HEADING_LINE = /^[ \t]*#{1,6}[ \t]+\S/;
27
+
28
/**
 * Count fake-candor opener phrases in a document.
 *
 * Every rule in FAKE_CANDOR_RULES is matched against the text; all occurrences
 * add to `count`, while `hits` keeps at most five distinct matched phrases
 * (trimmed, lower-cased, in first-seen order).
 *
 * @param {unknown} text Document text; non-string input is treated as empty.
 * @returns {{ count: number, hits: string[], hot: boolean, threshold: number }}
 *   `hot` is true once `count` reaches FAKE_CANDOR_MIN.
 */
export function detectFakeCandor(text) {
  const source = typeof text === 'string' ? text : '';
  const distinctPhrases = new Set();
  let count = 0;

  for (const rule of FAKE_CANDOR_RULES) {
    const found = source.match(rule);
    if (!found || found.length === 0) continue;
    count += found.length;
    for (const phrase of found) {
      distinctPhrases.add(phrase.trim().toLowerCase());
    }
  }

  return {
    count,
    hits: [...distinctPhrases].slice(0, 5),
    hot: count >= FAKE_CANDOR_MIN,
    threshold: FAKE_CANDOR_MIN,
  };
}
41
+
42
/**
 * Count markdown thematic-break lines in a document.
 *
 * A line counts when it matches THEMATIC_BREAK_LINE. A break is additionally
 * tallied as "adjacent to a heading" when the next non-blank line after it
 * matches HEADING_LINE.
 *
 * @param {unknown} text Document text; non-string input is treated as empty.
 * @returns {{ count: number, adjacentToHeading: number, hot: boolean, threshold: number }}
 *   `hot` is true once `count` reaches THEMATIC_BREAK_MIN.
 */
export function detectThematicBreaks(text) {
  const source = typeof text === 'string' ? text : '';
  const lines = source.split(/\r?\n/);
  let count = 0;
  let adjacentToHeading = 0;

  lines.forEach((line, index) => {
    if (!THEMATIC_BREAK_LINE.test(line)) return;
    count += 1;

    // Skip blank lines to find the first line with content after the break.
    let next = index + 1;
    while (next < lines.length && lines[next].trim() === '') next += 1;
    if (next < lines.length && HEADING_LINE.test(lines[next])) {
      adjacentToHeading += 1;
    }
  });

  return {
    count,
    adjacentToHeading,
    hot: count >= THEMATIC_BREAK_MIN,
    threshold: THEMATIC_BREAK_MIN,
  };
}
58
+
59
/**
 * Run both density-gated discourse detectors over the same text and combine
 * them: the document is `hot` when either detector reports hot.
 *
 * @param {unknown} text Document text, passed unchanged to both detectors.
 * @returns {{ fakeCandor: object, thematicBreaks: object, hot: boolean }}
 */
export function detectDiscourseTells(text) {
  const signals = {
    fakeCandor: detectFakeCandor(text),
    thematicBreaks: detectThematicBreaks(text),
  };
  return {
    ...signals,
    hot: signals.fakeCandor.hot || signals.thematicBreaks.hot,
  };
}
67
+
68
+ export { FAKE_CANDOR_MIN, THEMATIC_BREAK_MIN };
@@ -3,26 +3,45 @@
3
3
  // is the in-tree port of the algorithm previously delegated to the LLM via
4
4
  // SKILL.md Step 4.6/4.7. It does not call any LLM.
5
5
 
6
- import { splitParagraphs, splitSentences, tokenize } from './segment.js';
6
+ import { splitParagraphs, splitSentences, splitProseSentences, tokenize } from './segment.js';
7
7
  import {
8
8
  burstinessCV,
9
9
  mattr,
10
10
  classifyBurstiness,
11
11
  classifyMattr,
12
+ classifyKoreanDiagnostics,
13
+ commaDensity,
14
+ koreanPosDiversityProxy,
15
+ koreanSpacingFeatures,
12
16
  DEFAULT_BURSTINESS_BANDS,
17
+ DEFAULT_KO_DIAGNOSTIC_BANDS,
13
18
  DEFAULT_MATTR_BANDS,
14
19
  DEFAULT_MATTR_WINDOW,
20
+ DEFAULT_MIN_BURSTINESS_SENTENCES,
15
21
  } from './stylometry.js';
16
- import { loadLexicon, computeDensity, DEFAULT_LEXICON_DENSITY_THRESHOLD } from './lexicon.js';
22
+ import {
23
+ classifyLexiconHot,
24
+ loadLexicon,
25
+ computeDensity,
26
+ DEFAULT_LEXICON_DENSITY_THRESHOLD,
27
+ DEFAULT_LEXICON_MIN_HOT_MATCHES,
28
+ } from './lexicon.js';
29
+ import { detectMarkupLeakage } from './markup-leakage.js';
30
+ import { detectDiscourseTells } from './discourse-tells.js';
31
+ import { detectTranslationese } from './translationese.js';
17
32
 
18
33
  export function analyzeText(text, opts = {}) {
19
34
  const {
20
35
  lang = 'en',
21
36
  repoRoot,
22
37
  burstinessBands = DEFAULT_BURSTINESS_BANDS,
38
+ minBurstinessSentences = DEFAULT_MIN_BURSTINESS_SENTENCES,
23
39
  mattrBands = DEFAULT_MATTR_BANDS,
24
40
  mattrWindow = DEFAULT_MATTR_WINDOW,
41
+ koDiagnosticsEnabled = true,
42
+ koDiagnosticBands = DEFAULT_KO_DIAGNOSTIC_BANDS,
25
43
  lexiconDensityThreshold = DEFAULT_LEXICON_DENSITY_THRESHOLD,
44
+ lexiconMinHotMatches = DEFAULT_LEXICON_MIN_HOT_MATCHES,
26
45
  lexicon: providedLexicon,
27
46
  } = opts;
28
47
 
@@ -31,6 +50,18 @@ export function analyzeText(text, opts = {}) {
31
50
  // vs decomposed) would otherwise yield different MATTR/lexicon hits.
32
51
  const normalized = text ? text.normalize('NFC') : '';
33
52
  const paragraphs = splitParagraphs(normalized);
53
+
54
+ // Document-level leakage scan (issue #332). Near-proof-grade: a single hit is
55
+ // strong evidence of pasted model output, so it forces the document hot
56
+ // regardless of the per-paragraph stylometry/lexicon signals.
57
+ const markupLeakage = detectMarkupLeakage(normalized);
58
+ // Density-gated discourse tells (issue #334): fake-candor openers (>=2) and
59
+ // decorative thematic breaks (>=3). Document-level, weaker than leakage.
60
+ const discourseTells = detectDiscourseTells(normalized);
61
+ // ko translationese (번역투/calque) — lexical, NOT structural. Advisory signal:
62
+ // surfaced for callers/SKILL but deliberately NOT folded into `hot` (these
63
+ // constructions appear in good Korean too; gating hot would regress FP).
64
+ const translationese = detectTranslationese(normalized, { lang });
34
65
  const lexicon =
35
66
  providedLexicon ??
36
67
  (repoRoot ? loadLexicon(lang, repoRoot) : { strict: [], phrases: [] });
@@ -39,7 +70,7 @@ export function analyzeText(text, opts = {}) {
39
70
  // can suppress meta-block emission, but the benchmark wants raw signals on
40
71
  // single-paragraph fixtures so we compute them unconditionally.
41
72
  const totalSentences = paragraphs.reduce(
42
- (n, p) => n + splitSentences(p).length,
73
+ (n, p) => n + splitProseSentences(p).length,
43
74
  0
44
75
  );
45
76
  const skipReason =
@@ -48,20 +79,33 @@ export function analyzeText(text, opts = {}) {
48
79
  null;
49
80
 
50
81
  const analyzed = paragraphs.map((paragraph, idx) => {
51
- const sentences = splitSentences(paragraph);
82
+ const sentences = splitProseSentences(paragraph);
52
83
  const sentenceTokens = sentences.map((sentence) => tokenize(sentence, { lang }));
53
84
  const sentenceTokenCounts = sentenceTokens.map((t) => t.length);
54
85
  const allTokens = sentenceTokens.flat();
55
86
 
56
87
  const cv = burstinessCV(sentenceTokenCounts);
57
- const cvBand = classifyBurstiness(cv, burstinessBands);
88
+ const cvBand =
89
+ sentences.length >= minBurstinessSentences
90
+ ? classifyBurstiness(cv, burstinessBands)
91
+ : null;
58
92
  const mattrValue = mattr(allTokens, mattrWindow);
59
93
  const mattrBand = classifyMattr(mattrValue, mattrBands);
60
94
  const lex = computeDensity(paragraph, allTokens, lexicon);
95
+ const koSignals = lang === 'ko'
96
+ ? buildKoreanSignals(paragraph, sentences.length, {
97
+ enabled: koDiagnosticsEnabled,
98
+ bands: koDiagnosticBands,
99
+ })
100
+ : {};
61
101
 
62
- const lexiconHot = lex.density > lexiconDensityThreshold;
102
+ const lexiconHot = classifyLexiconHot(lex, {
103
+ lang,
104
+ densityThreshold: lexiconDensityThreshold,
105
+ minHotMatches: lexiconMinHotMatches,
106
+ });
63
107
  const hot =
64
- cvBand === 'low' || mattrBand === 'low' || lexiconHot;
108
+ cvBand === 'low' || mattrBand === 'low' || lexiconHot || Boolean(koSignals.koDiagnostics?.hot);
65
109
 
66
110
  return {
67
111
  id: `P${idx + 1}`,
@@ -70,6 +114,7 @@ export function analyzeText(text, opts = {}) {
70
114
  burstiness: { cv, band: cvBand },
71
115
  mattr: { value: mattrValue, band: mattrBand },
72
116
  lexicon: { ...lex, hot: lexiconHot },
117
+ ...koSignals,
73
118
  hot,
74
119
  };
75
120
  });
@@ -79,18 +124,47 @@ export function analyzeText(text, opts = {}) {
79
124
  skipped: Boolean(skipReason),
80
125
  skipReason,
81
126
  paragraphs: analyzed,
82
- hot: analyzed.some((p) => p.hot),
127
+ markupLeakage,
128
+ discourseTells,
129
+ translationese,
130
+ hot: markupLeakage.leaked || discourseTells.hot || analyzed.some((p) => p.hot),
83
131
  };
84
132
  }
85
133
 
86
134
  export {
87
135
  splitParagraphs,
88
136
  splitSentences,
137
+ splitProseSentences,
89
138
  tokenize,
90
139
  burstinessCV,
91
140
  mattr,
92
141
  classifyBurstiness,
93
142
  classifyMattr,
143
+ classifyKoreanDiagnostics,
144
+ commaDensity,
145
+ koreanPosDiversityProxy,
146
+ koreanSpacingFeatures,
94
147
  loadLexicon,
95
148
  computeDensity,
96
149
  };
150
+
151
/**
 * Gather the Korean-only paragraph signals attached to each analyzed paragraph.
 *
 * The three raw features are always computed. The classification step runs
 * only when `enabled` is true; otherwise a neutral, never-hot diagnostics
 * record is returned that still echoes the configured bands.
 *
 * @param {string} paragraph Paragraph text.
 * @param {number} sentenceCount Number of sentences found in the paragraph.
 * @param {{ enabled: boolean, bands: object }} options Diagnostics switch and thresholds.
 * @returns {{ spacing: *, comma: *, posDiversity: *, koDiagnostics: object }}
 */
function buildKoreanSignals(paragraph, sentenceCount, { enabled, bands }) {
  const spacing = koreanSpacingFeatures(paragraph);
  const comma = commaDensity(paragraph, sentenceCount);
  const posDiversity = koreanPosDiversityProxy(paragraph);

  let koDiagnostics;
  if (enabled) {
    const features = { sentenceCount, spacing, comma, posDiversity };
    koDiagnostics = classifyKoreanDiagnostics(features, bands);
  } else {
    koDiagnostics = { hot: false, strength: 0, reasons: [], thresholds: bands };
  }

  return { spacing, comma, posDiversity, koDiagnostics };
}
@@ -7,6 +7,12 @@ import { readFileSync, existsSync } from 'node:fs';
7
7
  import { resolve } from 'node:path';
8
8
 
9
9
  export const DEFAULT_LEXICON_DENSITY_THRESHOLD = 2.0;
10
+ export const DEFAULT_LEXICON_MIN_HOT_MATCHES = {
11
+ default: 1,
12
+ ko: 2,
13
+ zh: 2,
14
+ ja: 2,
15
+ };
10
16
 
11
17
  // Parses the two well-known sections out of a lexicon markdown file.
12
18
  // Returns { strict: string[], phrases: string[] }.
@@ -65,11 +71,12 @@ export function computeDensity(paragraphText, tokens, lexicon) {
65
71
  const hits = [];
66
72
  const tokenSet = new Set(tokens.map((t) => t.toLowerCase()));
67
73
 
68
- // §16: English strict entries match whole-word; Korean strict entries are
69
- // approximated by substring (어절 inflection means `자리매김` should also
70
- // hit `자리매김했다`, `자리매김으로`, etc.). Punctuated entries always need
71
- // substring fallback because tokenization strips edge punct.
72
- const koSubstring = lexicon.lang === 'ko';
74
+ // §16: English strict entries match whole-word; CJK strict entries are
75
+ // approximated by substring. Korean inflection and zh/ja character fallback
76
+ // mean `자리매김`, `可以说`, or `まとめると` may not survive as whole tokens.
77
+ // Punctuated entries always need substring fallback because tokenization
78
+ // strips edge punct.
79
+ const cjkSubstring = ['ko', 'zh', 'ja'].includes(lexicon.lang);
73
80
  for (const entry of lexicon.strict) {
74
81
  const lowerEntry = entry.toLowerCase();
75
82
  if (tokenSet.has(lowerEntry)) {
@@ -77,7 +84,7 @@ export function computeDensity(paragraphText, tokens, lexicon) {
77
84
  continue;
78
85
  }
79
86
  const hasInternalPunct = /[^\p{L}\p{N}]/u.test(lowerEntry);
80
- if ((koSubstring || hasInternalPunct) && lowerText.includes(lowerEntry)) {
87
+ if ((cjkSubstring || hasInternalPunct) && lowerText.includes(lowerEntry)) {
81
88
  hits.push(entry);
82
89
  }
83
90
  }
@@ -88,3 +95,30 @@ export function computeDensity(paragraphText, tokens, lexicon) {
88
95
  const density = tokens.length > 0 ? (hits.length / tokens.length) * 1000 : 0;
89
96
  return { matches: hits.length, density, hits };
90
97
  }
98
+
99
/**
 * Decide whether a paragraph's lexicon statistics count as "hot".
 *
 * Two conditions must both hold: the match count reaches the minimum resolved
 * for the language, and the density is strictly above the density threshold.
 * Missing stats are read as 0.
 *
 * @param {{ matches?: number, density?: number }} [lexiconStats]
 * @param {{ lang?: string, densityThreshold?: number, minHotMatches?: (number|Record<string, number>) }} [options]
 * @returns {boolean} True when both gates pass.
 */
export function classifyLexiconHot(lexiconStats, options = {}) {
  const {
    lang,
    densityThreshold = DEFAULT_LEXICON_DENSITY_THRESHOLD,
    minHotMatches = DEFAULT_LEXICON_MIN_HOT_MATCHES,
  } = options;

  const matchCount = lexiconStats?.matches ?? 0;
  const observedDensity = lexiconStats?.density ?? 0;
  const requiredMatches = resolveMinHotMatches(lang, minHotMatches);

  // Written as a negated >= so a NaN match count still fails the gate.
  if (!(matchCount >= requiredMatches)) return false;
  return observedDensity > densityThreshold;
}
116
+
117
/**
 * Resolve the minimum number of lexicon matches needed before a paragraph may
 * be flagged hot.
 *
 * `minHotMatches` is either a single finite number applied to every language,
 * or a table keyed by lower-case language code with an optional `default`
 * entry. The result is never below 1.
 *
 * The table's `default` is used whenever the per-language entry is missing OR
 * is not a finite number. Without that check a junk entry, or an inherited
 * property such as `constructor` when `lang` is an unexpected string, would
 * shadow `default` and silently return 1.
 *
 * @param {unknown} lang Language code; non-strings resolve via `default`.
 * @param {number|Record<string, number>|null|undefined} minHotMatches Scalar or per-language table.
 * @returns {number} Minimum match count (>= 1).
 */
function resolveMinHotMatches(lang, minHotMatches) {
  const isFiniteNumber = (value) => typeof value === 'number' && Number.isFinite(value);

  if (isFiniteNumber(minHotMatches)) {
    return Math.max(1, minHotMatches);
  }

  const key = typeof lang === 'string' ? lang.toLowerCase() : 'default';
  const perLanguage = minHotMatches?.[key];
  const value = isFiniteNumber(perLanguage) ? perLanguage : minHotMatches?.default;
  return isFiniteNumber(value) ? Math.max(1, value) : 1;
}
@@ -0,0 +1,69 @@
1
+ // Deterministic detection of model-output *leakage* artifacts: tokens that LLM
2
+ // web-search / tooling inject and that essentially never appear in human-written
3
+ // prose. Unlike the stylometry/lexicon signals (which are probabilistic and
4
+ // fire on clusters), a single hit here is near-proof-grade evidence of pasted
5
+ // model output, so it fires hard at the document level. See issue #332.
6
+ //
7
+ // Language-agnostic literal token set — applies to ko/en/zh/ja alike.
8
+ //
9
+ // Self-scan caveat: patina's own docs, fixtures, and issues that *discuss* these
10
+ // tokens will match. That is correct behavior (the text genuinely contains
11
+ // them); callers scanning the repo's own meta-content should expect hits.
12
+
13
// U+FFFC OBJECT REPLACEMENT CHARACTER, written as an escape on purpose: the
// raw character is invisible and easily stripped by editors, diff viewers and
// copy/paste. If it is lost, the constant becomes '' and `new RegExp('', 'g')`
// matches at every position, flagging every non-empty document as leaked.
const OBJECT_REPLACEMENT_CHAR = '\uFFFC';

// Each entry: { id, label, build() => fresh RegExp }. We build a fresh regex per
// scan so the shared module is reentrant (no leaking lastIndex across calls).
const MARKUP_RULES = [
  {
    id: 'oai-citation-markup',
    label: 'OpenAI citation markup',
    build: () => /:contentReference|oaicite|oai_citation/gi,
  },
  {
    id: 'model-tool-token',
    label: 'Model tool token',
    build: () => /\bturn\d+(?:search|view|news|image|forecast|finance|fetch)\d*\b|\bnavlist\b|\bgrok_card\b/gi,
  },
  {
    id: 'object-replacement-char',
    // The parentheses hold the U+FFFC character itself, escaped for the same reason.
    label: 'Object-replacement character (\uFFFC)',
    build: () => new RegExp(OBJECT_REPLACEMENT_CHAR, 'g'),
  },
  {
    id: 'ai-tracking-param',
    label: 'AI-tool tracking parameter in URL',
    build: () => /utm_source=(?:chatgpt\.com|openai\.com|perplexity\.ai|claude\.ai|gemini\.google\.com)|[?&](?:ref|utm_source)=chatgpt/gi,
  },
  {
    id: 'explicit-self-identification',
    label: 'Explicit AI self-identification',
    build: () => /\bas an? (?:AI|artificial intelligence) language model\b|\bas a large language model\b|\bas a language model\b|\bas an AI assistant\b|\bI am an AI\b|\bI'?m an AI\b/gi,
  },
];
44
+
45
/**
 * Scan raw text for model-output leakage artifacts.
 *
 * Each rule in MARKUP_RULES contributes at most one hit record: the total
 * number of matches plus up to three distinct, trimmed, non-empty samples.
 * Non-string or empty input yields no hits.
 *
 * @param {string} text
 * @returns {{ leaked: boolean, hits: Array<{id:string,label:string,count:number,samples:string[]}> }}
 */
export function detectMarkupLeakage(text) {
  if (typeof text !== 'string' || text === '') {
    return { leaked: false, hits: [] };
  }

  const hits = MARKUP_RULES.flatMap((rule) => {
    // build() hands back a fresh RegExp, so no lastIndex state is shared.
    const found = text.match(rule.build());
    if (!found || found.length === 0) return [];

    const distinctSamples = new Set(found.map((m) => m.trim()).filter(Boolean));
    return [
      {
        id: rule.id,
        label: rule.label,
        count: found.length,
        samples: [...distinctSamples].slice(0, 3),
      },
    ];
  });

  return { leaked: hits.length > 0, hits };
}
68
+
69
+ export { MARKUP_RULES, OBJECT_REPLACEMENT_CHAR };
@@ -9,6 +9,7 @@
9
9
 
10
10
  // Sentence boundary: a run of . ! ? followed by whitespace, a zero-width break after 。 ! ? …, or a newline run.
  const SENTENCE_SPLIT_RE = /[.!?]+\s+|(?<=[。!?…])|\n+/u;
11
11
  // Paragraph boundary: two newlines separated only by optional whitespace.
  const PARAGRAPH_SPLIT_RE = /\n\s*\n/;
12
  // List item line: optional indent, then a -, * or + bullet, or digits followed by . or ), then whitespace.
+ const LIST_LINE_RE = /^\s*(?:[-*+]\s+|\d+[.)]\s+)/u;
12
13
  // Edge-trim pattern (roughly \W in Unicode-aware mode): matches runs of non-letter, non-number
13
14
  // characters at the start or end only, so internal hyphens / apostrophes (e.g. "don't", "좋은-도구") stay in one token.
14
15
  const EDGE_PUNCT_RE = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;
@@ -22,6 +23,42 @@ export function splitParagraphs(text) {
22
23
  .filter((p) => p.length > 0);
23
24
  }
24
25
 
26
/**
 * Remove list content from a paragraph, keeping only prose lines.
 *
 * Lines matching LIST_LINE_RE are dropped. After a prose line ending in ':',
 * the plain lines reported by countFollowingPlainListLines are treated as
 * unmarked list items and dropped too. Blank lines are kept and reset that
 * pending count.
 *
 * @param {string} paragraph Paragraph text; nullish input is treated as ''.
 * @returns {string} Remaining lines joined with '\n'.
 */
function stripListBlocks(paragraph) {
  const sourceLines = String(paragraph ?? '').split(/\r?\n/);
  const kept = [];
  let pendingColonItems = 0;

  sourceLines.forEach((line, index) => {
    const content = line.trim();

    if (content.length === 0) {
      pendingColonItems = 0;
      kept.push(line);
      return;
    }

    // Marked list items are dropped without consuming the pending count.
    if (LIST_LINE_RE.test(line)) return;

    if (pendingColonItems > 0) {
      pendingColonItems -= 1;
      return;
    }

    if (content.endsWith(':')) {
      pendingColonItems = countFollowingPlainListLines(sourceLines, index + 1);
    }
    kept.push(line);
  });

  return kept.join('\n');
}
50
+
51
/**
 * Count the plain (non-LIST_LINE_RE) lines from `start` up to the next blank
 * line or the end of `lines`. Lines matching LIST_LINE_RE are passed over
 * without being counted.
 *
 * @param {string[]} lines All lines of the paragraph.
 * @param {number} start Index of the first line to inspect.
 * @returns {number} The count when it is at least 2, otherwise 0.
 */
function countFollowingPlainListLines(lines, start) {
  let plain = 0;
  let cursor = start;
  while (cursor < lines.length && lines[cursor].trim() !== '') {
    if (!LIST_LINE_RE.test(lines[cursor])) plain += 1;
    cursor += 1;
  }
  // A single trailing line is not treated as a list.
  return plain < 2 ? 0 : plain;
}
61
+
25
62
  export function splitSentences(paragraph) {
26
63
  if (!paragraph) return [];
27
64
  return paragraph
@@ -30,6 +67,10 @@ export function splitSentences(paragraph) {
30
67
  .filter((s) => s.length > 0);
31
68
  }
32
69
 
70
/**
 * Split a paragraph into sentences after list content has been removed by
 * stripListBlocks.
 *
 * @param {string} paragraph Paragraph text.
 * @returns {string[]} Result of splitSentences on the list-free text.
 */
export function splitProseSentences(paragraph) {
  const proseOnly = stripListBlocks(paragraph);
  return splitSentences(proseOnly);
}
73
+
33
74
  function tokenizeCjk(text) {
34
75
  const tokens = [];
35
76
  for (const match of text.matchAll(CJK_TOKEN_RE)) {
@@ -0,0 +1,81 @@
1
+ import { DEFAULT_LEXICON_DENSITY_THRESHOLD } from './lexicon.js';
2
+ import { DEFAULT_BURSTINESS_BANDS, DEFAULT_MATTR_BANDS } from './stylometry.js';
3
+
4
/**
 * Mean of each paragraph's strongest deterministic signal.
 *
 * Diagnostic-only: it intentionally does not replace the existing
 * hot-paragraph ratio used by gates and reconciliation.
 *
 * @param {object[]} [paragraphs] Analyzer paragraph payloads.
 * @param {object} [options] Thresholds used by the analyzer.
 * @returns {number} 0..100 average signal strength; 0 for a missing, non-array or empty list.
 */
export function summarizeSignalStrength(paragraphs = [], options = {}) {
  const usable = Array.isArray(paragraphs) && paragraphs.length > 0;
  if (!usable) return 0;

  let runningTotal = 0;
  for (const entry of paragraphs) {
    runningTotal += paragraphSignalStrength(entry, options);
  }
  return runningTotal / paragraphs.length;
}
22
+
23
/**
 * Score how deep a paragraph is inside its strongest deterministic trigger.
 *
 * Four signals are evaluated and the largest one is returned:
 *   - burstiness: how far `burstiness.cv` sits below the low band threshold,
 *   - MATTR: how far `mattr.value` sits below the low band threshold,
 *   - lexicon: how far `lexicon.density` sits above the density threshold,
 *   - koDiagnostics: the payload's own `strength`, used only when `hot` is set.
 *
 * Every signal, including the koDiagnostics strength, is clamped to 0..100 so
 * the documented return range holds regardless of what the payload carries.
 *
 * @param {object} [paragraph] Analyzer paragraph payload; null/undefined scores 0.
 * @param {object} [options] Thresholds used by the analyzer; null/undefined uses defaults.
 * @returns {number} 0..100 paragraph signal strength.
 */
export function paragraphSignalStrength(paragraph = {}, options = {}) {
  // Default parameters only cover `undefined`; treat an explicit `null` the same way.
  const payload = paragraph ?? {};
  const thresholds = options ?? {};

  const burstiness = lowBandStrength(
    payload.burstiness?.cv,
    resolveLowThreshold(thresholds.burstinessBands, DEFAULT_BURSTINESS_BANDS.low),
    payload.burstiness?.band
  );
  const mattr = lowBandStrength(
    payload.mattr?.value,
    resolveLowThreshold(thresholds.mattrBands, DEFAULT_MATTR_BANDS.low),
    payload.mattr?.band
  );
  const lexicon = highThresholdStrength(
    payload.lexicon?.density,
    resolveThreshold(
      thresholds.lexiconDensityThreshold,
      DEFAULT_LEXICON_DENSITY_THRESHOLD
    ),
    payload.lexicon?.hot
  );

  // clampPercent maps non-finite numbers to 0 and bounds the rest to 0..100,
  // keeping this signal on the same scale as the three computed above.
  const koStrength = payload.koDiagnostics?.strength;
  const koDiagnostics =
    payload.koDiagnostics?.hot && typeof koStrength === 'number'
      ? clampPercent(koStrength)
      : 0;

  return Math.max(burstiness, mattr, lexicon, koDiagnostics);
}
57
+
58
/**
 * Pick the `low` entry of a bands object, falling back when it is unusable.
 *
 * @param {object} [bands] Bands object that may carry a numeric `low`.
 * @param {number} fallback Value used when `bands.low` is not a finite number.
 * @returns {number} The resolved low threshold.
 */
function resolveLowThreshold(bands, fallback) {
  const lowCandidate = bands?.low;
  return resolveThreshold(lowCandidate, fallback);
}
61
+
62
/**
 * Return `value` when it is a finite number, otherwise `fallback`.
 *
 * @param {*} value Candidate threshold.
 * @param {*} fallback Value returned for non-number, NaN or infinite candidates.
 * @returns {*} `value` or `fallback`.
 */
function resolveThreshold(value, fallback) {
  // Number.isFinite never coerces, so it already rejects non-number inputs.
  if (Number.isFinite(value)) {
    return value;
  }
  return fallback;
}
65
+
66
/**
 * Strength of a "lower is hotter" signal: the relative shortfall of `value`
 * below `threshold`, as a clamped percentage.
 *
 * @param {number} value Measured value.
 * @param {number} threshold Low-band threshold; must be positive to score.
 * @param {string} band Band label; only 'low' scores.
 * @returns {number} 0..100 strength, 0 when the inputs do not qualify.
 */
function lowBandStrength(value, threshold, band) {
  const hasUsableValue = typeof value === 'number' && Number.isFinite(value);
  if (band !== 'low' || !hasUsableValue) return 0;
  if (!threshold) return 0;
  if (threshold <= 0) return 0;

  const shortfall = 1 - value / threshold;
  return clampPercent(shortfall * 100);
}
71
+
72
/**
 * Strength of a "higher is hotter" signal: the relative excess of `value`
 * above `threshold`, as a clamped percentage.
 *
 * @param {number} value Measured value.
 * @param {number} threshold Trigger threshold; must be positive to score.
 * @param {*} isHot Truthy when the analyzer flagged the signal as hot.
 * @returns {number} 0..100 strength, 0 when the inputs do not qualify.
 */
function highThresholdStrength(value, threshold, isHot) {
  const hasUsableValue = typeof value === 'number' && Number.isFinite(value);
  if (!isHot || !hasUsableValue) return 0;
  if (!threshold) return 0;
  if (threshold <= 0) return 0;

  const excess = (value - threshold) / threshold;
  return clampPercent(excess * 100);
}
77
+
78
/**
 * Bound a number to the 0..100 range.
 *
 * @param {*} value Candidate percentage.
 * @returns {number} 0 for anything that is not a finite number, otherwise the value limited to 0..100.
 */
function clampPercent(value) {
  if (!Number.isFinite(value)) return 0;
  // `<=` also normalizes -0 to +0, as Math.max(0, -0) does.
  if (value <= 0) return 0;
  if (value >= 100) return 100;
  return value;
}