dravoice 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/package.json +1 -1
- package/src/index.js +106 -13
- package/src/v2/analyzers/discourse.js +7 -1
- package/src/v2/analyzers/evidence.js +3 -3
- package/src/v2/analyzers/register.js +28 -4
- package/src/v2/analyzers/rhetorical-shape.js +7 -1
- package/src/v2/analyzers/structure.js +18 -1
- package/src/v2/benchmark.js +83 -0
- package/src/v2/doctor.js +308 -0
- package/src/v2/document-model.js +77 -6
- package/src/v2/inspect.js +2 -2
- package/src/v2/profile.js +126 -11
- package/src/v2/review.js +142 -16
- package/src/v2/revise-plan.js +111 -8
- package/src/v2/stylometry.js +11 -7
package/src/v2/doctor.js
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import {
|
|
4
|
+
DEFAULT_MAX_FILE_BYTES,
|
|
5
|
+
DEFAULT_MAX_FILES,
|
|
6
|
+
DEFAULT_MAX_TOTAL_BYTES,
|
|
7
|
+
VOICE_EXTENSIONS,
|
|
8
|
+
VOICE_SKIP_DIRS,
|
|
9
|
+
parseDocument,
|
|
10
|
+
} from "./document-model.js";
|
|
11
|
+
import { buildVoiceProfileV2 } from "./profile.js";
|
|
12
|
+
|
|
13
|
+
/**
 * Inspect an examples directory and report whether it can seed a voice profile.
 *
 * Every supported file is checked against the corpus limits (file count,
 * per-file bytes, total bytes), rejected if it contains a NUL character, and
 * otherwise parsed. Per-file failures are recorded on that file's record
 * instead of aborting the whole run.
 *
 * @param {object} [options]
 * @param {string} [options.examplesDir] Directory to inspect; defaults to "./articles".
 * @param {string} [options.cwd] Base used to resolve relative paths; defaults to process.cwd().
 * @returns {object} Result with `schemaVersion`, `generatedBy`, `examples`,
 *   `status`, `exitCode` (0 only when status is "ready"), `summary`, `files`,
 *   `unsupportedFiles` and `recommendations`.
 */
export function diagnoseVoiceCorpusV2({ examplesDir, cwd = process.cwd() } = {}) {
  // The `= {}` default lets a bare call fall back to ./articles in the current
  // working directory instead of throwing while destructuring `undefined`.
  const requestedExamples = examplesDir ?? "./articles";
  const resolvedExamples = resolvePath(cwd, requestedExamples);
  // Absolute inputs are shown relative to cwd when possible; relative inputs
  // are echoed back as typed (with separators normalized).
  const displayedExamples = path.isAbsolute(requestedExamples)
    ? displayPath(resolvedExamples, cwd)
    : displayPathForCommand(requestedExamples);
  const baseResult = {
    schemaVersion: 2,
    generatedBy: "dravoice-v2-doctor",
    examples: displayedExamples,
    status: "needs-work",
    exitCode: 1,
    summary: emptySummary(),
    files: [],
    unsupportedFiles: [],
    recommendations: [],
  };

  if (!fs.existsSync(resolvedExamples)) {
    return {
      ...baseResult,
      recommendations: [
        "Examples directory is missing.",
        "Run mkdir -p articles, then copy at least 3 representative .md, .mdx, or .txt pieces into it.",
        `If your writing lives elsewhere, run drav doctor --examples ${displayPathForCommand(requestedExamples) === "./articles" ? "~/path/to/writing" : displayPathForCommand(requestedExamples)}.`,
      ],
    };
  }

  if (!fs.statSync(resolvedExamples).isDirectory()) {
    return {
      ...baseResult,
      recommendations: [
        "Examples path is not a directory.",
        "Point --examples at a folder containing representative .md, .mdx, or .txt pieces.",
      ],
    };
  }

  const discovered = discoverCorpusFiles(resolvedExamples);
  const validDocuments = [];
  const files = [];
  // Running total of the sizes of files that passed the per-file limit.
  let totalBytes = 0;

  discovered.supportedFiles.forEach((filePath, index) => {
    const relative = displayPath(filePath, resolvedExamples);
    const extension = path.extname(filePath).toLowerCase();
    const record = { path: relative, extension, wordCount: 0, sentenceCount: 0 };

    try {
      if (index >= DEFAULT_MAX_FILES) {
        throw new Error(`Corpus contains more than ${DEFAULT_MAX_FILES} voice file(s) allowed.`);
      }
      const stats = fs.statSync(filePath);
      if (stats.size > DEFAULT_MAX_FILE_BYTES) {
        throw new Error(`Voice file ${relative} exceeds the ${DEFAULT_MAX_FILE_BYTES} byte limit.`);
      }
      totalBytes += stats.size;
      if (totalBytes > DEFAULT_MAX_TOTAL_BYTES) {
        throw new Error(`Voice corpus exceeds the ${DEFAULT_MAX_TOTAL_BYTES} byte total limit.`);
      }
      const contents = fs.readFileSync(filePath, "utf8");
      // A NUL character is treated as the marker for binary content.
      if (contents.includes("\0")) {
        throw new Error(`Voice file ${relative} looks like binary-looking text and cannot be analyzed.`);
      }
      const document = parseDocument({ filePath, rootDir: resolvedExamples, contents });
      record.wordCount = document.wordCount;
      record.sentenceCount = document.sentences.length;
      validDocuments.push(document);
    } catch (error) {
      // Keep going: the failure becomes a per-file issue in the report.
      record.issue = error.message;
    }

    files.push(record);
  });

  const summary = summaryFor(validDocuments);
  const unsupportedFiles = discovered.unsupportedFiles.map((filePath) => ({
    path: displayPath(filePath, resolvedExamples),
    extension: path.extname(filePath).toLowerCase() || null,
  }));
  const status = statusFor({ summary, files, supportedCount: discovered.supportedFiles.length });

  return {
    ...baseResult,
    status,
    exitCode: status === "ready" ? 0 : 1,
    summary,
    files,
    unsupportedFiles,
    recommendations: recommendationsFor({
      status,
      summary,
      files,
      unsupportedFiles,
      examples: baseResult.examples,
    }),
  };
}
|
|
112
|
+
|
|
113
|
+
/**
 * Render a corpus doctor result as a plain-text report.
 *
 * The report has a header block, optional "File issues", "Unsupported files"
 * and "Corpus quality warnings" sections, the recommendations, and a closing
 * "Next:" hint. The two file lists show at most eight entries, followed by a
 * count of the remainder.
 *
 * @param {object} result Result object shaped like the one diagnoseVoiceCorpusV2 returns.
 * @returns {string} Newline-joined report text, ending with a newline.
 */
export function renderCorpusDoctorV2(result) {
  const previewLimit = 8;
  const { summary } = result;
  const lines = [
    "# Corpus Doctor",
    "",
    `Examples: ${result.examples}`,
    `Status: ${result.status}`,
    `Documents: ${summary.documentCount}`,
    `Words: ${summary.wordCount}`,
    `Sentences: ${summary.sentenceCount}`,
    `Confidence: ${summary.confidence.band} - ${summary.confidence.message}`,
    `Supported files: ${result.files.length}`,
    `Unsupported files: ${result.unsupportedFiles.length}`,
    "",
  ];

  // Emits a heading, a capped preview of the entries, and a trailing blank line.
  const appendCappedSection = (heading, entries, describe) => {
    if (entries.length === 0) {
      return;
    }
    lines.push(heading);
    for (const entry of entries.slice(0, previewLimit)) {
      lines.push(describe(entry));
    }
    if (entries.length > previewLimit) {
      lines.push(`- ...and ${entries.length - previewLimit} more`);
    }
    lines.push("");
  };

  appendCappedSection(
    "File issues:",
    result.files.filter((file) => file.issue),
    (file) => `- ${file.path}: ${file.issue}`,
  );
  appendCappedSection(
    "Unsupported files:",
    result.unsupportedFiles,
    (file) => `- ${file.path}`,
  );

  // Quality warnings are listed in full, without a cap.
  if (result.summary.quality?.warnings?.length) {
    lines.push("Corpus quality warnings:");
    for (const warning of result.summary.quality.warnings) {
      lines.push(`- ${warning}`);
    }
    lines.push("");
  }

  lines.push("Recommendations:");
  for (const item of result.recommendations) {
    lines.push(`- ${item}`);
  }
  lines.push("", `Next: ${nextStepFor(result)}`, "");

  return lines.join("\n");
}
|
|
168
|
+
|
|
169
|
+
/**
 * Walk a directory tree and sort its regular files into supported and
 * unsupported voice files.
 *
 * Directories named in VOICE_SKIP_DIRS are not descended into. Entries that
 * are neither directories nor regular files are ignored.
 *
 * @param {string} rootDir Directory to start from.
 * @returns {{ supportedFiles: string[], unsupportedFiles: string[] }} Full
 *   paths, each list sorted with localeCompare.
 */
function discoverCorpusFiles(rootDir) {
  const byEntryName = (left, right) => left.name.localeCompare(right.name);
  const byPath = (left, right) => left.localeCompare(right);
  const supported = [];
  const unsupported = [];
  const pending = [rootDir];

  while (pending.length > 0) {
    const directory = pending.pop();
    const children = fs.readdirSync(directory, { withFileTypes: true }).sort(byEntryName);
    const subdirectories = [];

    for (const child of children) {
      const childPath = path.join(directory, child.name);
      if (child.isDirectory()) {
        if (!VOICE_SKIP_DIRS.has(child.name)) {
          subdirectories.push(childPath);
        }
        continue;
      }
      if (!child.isFile()) {
        continue;
      }
      const isSupported = VOICE_EXTENSIONS.has(path.extname(child.name).toLowerCase());
      (isSupported ? supported : unsupported).push(childPath);
    }

    // Push in reverse so pop() visits the subdirectories in ascending name order.
    for (let position = subdirectories.length - 1; position >= 0; position -= 1) {
      pending.push(subdirectories[position]);
    }
  }

  supported.sort(byPath);
  unsupported.sort(byPath);
  return { supportedFiles: supported, unsupportedFiles: unsupported };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Summarize the parsed documents using the `source` section of a freshly
 * built voice profile.
 *
 * @param {object[]} documents Successfully parsed documents.
 * @returns {object} Counts, confidence and quality; the empty summary when
 *   there are no documents.
 */
function summaryFor(documents) {
  if (documents.length === 0) {
    return emptySummary();
  }
  const {
    documentCount,
    wordCount,
    sentenceCount,
    confidence,
    quality,
  } = buildVoiceProfileV2({ documents }).source;
  return { documentCount, wordCount, sentenceCount, confidence, quality };
}
|
|
214
|
+
|
|
215
|
+
/**
 * Build the summary used when no usable document is available.
 *
 * @returns {object} A fresh object on every call: zeroed counts, a "weak"
 *   confidence band and an empty quality section.
 */
function emptySummary() {
  const confidence = {
    band: "weak",
    message: "No usable source documents were found.",
  };
  const lengthSpread = { minWords: 0, maxWords: 0, minSentences: 0, maxSentences: 0 };
  const quality = { lengthSpread, duplicateGroups: 0, warnings: [] };
  return { documentCount: 0, wordCount: 0, sentenceCount: 0, confidence, quality };
}
|
|
231
|
+
|
|
232
|
+
/**
 * Classify the corpus as "needs-work", "weak" or "ready".
 *
 * "needs-work" covers a corpus with no supported files, no parsed documents,
 * or any file carrying an issue. "ready" requires at least 3 documents and
 * 10 sentences; everything else is "weak".
 *
 * @param {object} args
 * @param {object} args.summary Corpus summary (documentCount, sentenceCount).
 * @param {object[]} args.files Per-file records; `issue` marks a failure.
 * @param {number} args.supportedCount Number of supported files discovered.
 * @returns {"needs-work"|"weak"|"ready"}
 */
function statusFor({ summary, files, supportedCount }) {
  const hasFileIssues = files.some((file) => file.issue);
  const hasNothingUsable = supportedCount === 0 || summary.documentCount === 0;
  if (hasNothingUsable || hasFileIssues) {
    return "needs-work";
  }
  const meetsMinimums = summary.documentCount >= 3 && summary.sentenceCount >= 10;
  return meetsMinimums ? "ready" : "weak";
}
|
|
245
|
+
|
|
246
|
+
/**
 * Produce the human-readable recommendations for a diagnosed corpus.
 *
 * A "ready" corpus gets a short confirmation (plus a reminder when quality
 * warnings exist) and the init command. Any other status gets one line per
 * unmet condition, or a generic fallback when none applies.
 *
 * @param {object} args
 * @param {string} args.status Corpus status.
 * @param {object} args.summary Corpus summary.
 * @param {object[]} args.files Per-file records; `issue` marks a failure.
 * @param {object[]} args.unsupportedFiles Files with unsupported extensions.
 * @param {string} args.examples Examples path as it should appear in commands.
 * @returns {string[]} Recommendations in display order.
 */
function recommendationsFor({ status, summary, files, unsupportedFiles, examples }) {
  const hasFileIssues = files.some((file) => file.issue);

  if (status === "ready") {
    const reviewNote = summary.quality?.warnings?.length
      ? ["Review corpus quality warnings before trusting strict calibration."]
      : [];
    return [
      "Corpus is ready for cautious profile learning.",
      ...reviewNote,
      `Run drav init --examples ${examples}.`,
    ];
  }

  // Each pair is [condition, message]; the order here is the display order.
  const candidates = [
    [files.length === 0, "Convert or copy your writing into Markdown, MDX, or plain text."],
    [
      summary.documentCount < 3,
      `Add at least ${3 - summary.documentCount} more representative long-form piece(s).`,
    ],
    [
      summary.sentenceCount < 10,
      "Use longer source pieces; Dravoice needs at least 10 total sentences for a ready corpus.",
    ],
    [hasFileIssues, "Fix or remove supported files with issues, then rerun drav doctor."],
    [unsupportedFiles.length > 0, "Supported extensions: .md, .mdx, .txt."],
  ];
  const applicable = candidates
    .filter(([applies]) => applies)
    .map(([, message]) => message);

  return applicable.length > 0
    ? applicable
    : ["Add more representative long-form writing, then rerun drav doctor."];
}
|
|
286
|
+
|
|
287
|
+
/**
 * Pick the closing "Next:" hint for a report.
 *
 * @param {object} result Doctor result; only `status` and `examples` are read.
 * @returns {string} The init command for a ready corpus, otherwise a prompt
 *   to add more writing.
 */
function nextStepFor(result) {
  const isReady = result.status === "ready";
  return isReady
    ? `drav init --examples ${result.examples}`
    : "add more writing, then run drav doctor";
}
|
|
293
|
+
|
|
294
|
+
/**
 * Anchor a possibly relative path to a base directory.
 *
 * @param {string} cwd Base directory for relative values.
 * @param {string} value Path to resolve.
 * @returns {string} `value` unchanged when it is absolute, otherwise `value`
 *   joined onto `cwd`.
 */
function resolvePath(cwd, value) {
  if (path.isAbsolute(value)) {
    return value;
  }
  return path.join(cwd, value);
}
|
|
297
|
+
|
|
298
|
+
/**
 * Format a path for display, using forward slashes.
 *
 * @param {string} filePath Path to display.
 * @param {string} rootDir Directory the path should be shown relative to.
 * @returns {string} The path relative to `rootDir` when it lies strictly
 *   inside it; otherwise `filePath` itself. Separators are "/" either way.
 */
function displayPath(filePath, rootDir) {
  const relative = path.relative(rootDir, filePath);
  // Only a leading ".." *segment* means the path leaves rootDir. A plain
  // startsWith("..") would also reject entries such as "..notes.md" that
  // merely begin with two dots but live inside the root.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  // An empty result means filePath is rootDir itself; an absolute result
  // (different drive on Windows) cannot be shown relative to the root.
  if (relative && !escapesRoot && !path.isAbsolute(relative)) {
    return relative.split(path.sep).join("/");
  }
  return filePath.split(path.sep).join("/");
}
|
|
305
|
+
|
|
306
|
+
/**
 * Normalize a user-supplied path for display inside a suggested command.
 *
 * @param {*} value Path as typed; coerced to a string.
 * @returns {string} The text with every run of "/" or "\" characters
 *   replaced by a single "/".
 */
function displayPathForCommand(value) {
  const separatorRun = /[\\/]+/;
  return String(value).split(separatorRun).join("/");
}
|
package/src/v2/document-model.js
CHANGED
|
@@ -3,9 +3,10 @@ import path from "node:path";
|
|
|
3
3
|
import { normalizeText, splitSentences, tokenizeWords } from "./text-utils.js";
|
|
4
4
|
|
|
5
5
|
export const VOICE_EXTENSIONS = new Set([".md", ".mdx", ".txt"]);
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
6
|
+
export const VOICE_SKIP_DIRS = new Set([".git", "node_modules", "dist", "build", "__pycache__", "prompts", "voice-pack", "dravoice-voice"]);
|
|
7
|
+
export const DEFAULT_MAX_FILES = 500;
|
|
8
|
+
export const DEFAULT_MAX_FILE_BYTES = 1024 * 1024;
|
|
9
|
+
export const DEFAULT_MAX_TOTAL_BYTES = 20 * 1024 * 1024;
|
|
9
10
|
|
|
10
11
|
export function loadDocuments({
|
|
11
12
|
examplesDir,
|
|
@@ -56,6 +57,8 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
56
57
|
let inFrontmatter = lines[0]?.trim() === "---";
|
|
57
58
|
let inHtmlComment = false;
|
|
58
59
|
let jsxBlockTag = null;
|
|
60
|
+
let mdxScaffoldDepth = 0;
|
|
61
|
+
let mdxExpressionDepth = 0;
|
|
59
62
|
|
|
60
63
|
const flushParagraph = () => {
|
|
61
64
|
if (currentParagraph?.lines.length) {
|
|
@@ -85,6 +88,16 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
85
88
|
return;
|
|
86
89
|
}
|
|
87
90
|
|
|
91
|
+
if (mdxScaffoldDepth > 0) {
|
|
92
|
+
mdxScaffoldDepth = Math.max(0, mdxScaffoldDepth + syntaxDepthDelta(trimmed));
|
|
93
|
+
return;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if (mdxExpressionDepth > 0) {
|
|
97
|
+
mdxExpressionDepth = Math.max(0, mdxExpressionDepth + curlyDepthDelta(trimmed));
|
|
98
|
+
return;
|
|
99
|
+
}
|
|
100
|
+
|
|
88
101
|
if (inHtmlComment) {
|
|
89
102
|
if (trimmed.includes("-->")) {
|
|
90
103
|
inHtmlComment = false;
|
|
@@ -114,7 +127,19 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
114
127
|
return;
|
|
115
128
|
}
|
|
116
129
|
|
|
117
|
-
if (
|
|
130
|
+
if (isMdxScaffold(trimmed)) {
|
|
131
|
+
flushParagraph();
|
|
132
|
+
mdxScaffoldDepth = Math.max(0, syntaxDepthDelta(trimmed));
|
|
133
|
+
return;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
if (mdxExpressionStart(trimmed)) {
|
|
137
|
+
flushParagraph();
|
|
138
|
+
mdxExpressionDepth = Math.max(0, curlyDepthDelta(trimmed));
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
if (!trimmed || isTableLine(trimmed) || isJsxLike(trimmed)) {
|
|
118
143
|
if (!trimmed) {
|
|
119
144
|
flushParagraph();
|
|
120
145
|
}
|
|
@@ -277,6 +302,53 @@ function isMdxScaffold(trimmed) {
|
|
|
277
302
|
return /^(?:import|export)\s/.test(trimmed);
|
|
278
303
|
}
|
|
279
304
|
|
|
305
|
+
/**
 * Decide whether a trimmed line opens an MDX expression block.
 *
 * @param {string} trimmed Line with surrounding whitespace removed.
 * @returns {boolean} True when the line begins with "{" and contains an
 *   identifier followed (after optional whitespace) by "." or "(".
 */
function mdxExpressionStart(trimmed) {
  if (!trimmed.startsWith("{")) {
    return false;
  }
  const memberAccessOrCall = /[A-Za-z_$][\w$]*\s*[.(]/;
  return memberAccessOrCall.test(trimmed);
}
|
|
308
|
+
|
|
309
|
+
/**
 * Combined nesting change of a line across curly braces, square brackets
 * and parentheses.
 *
 * @param {string} text Line to measure.
 * @returns {number} Sum of the three per-delimiter deltas.
 */
function syntaxDepthDelta(text) {
  const measures = [curlyDepthDelta, bracketDepthDelta, parenDepthDelta];
  return measures.reduce((total, measure) => total + measure(text), 0);
}
|
|
312
|
+
|
|
313
|
+
/**
 * Nesting change of a line counted over "{" and "}".
 *
 * @param {string} text Line to measure.
 * @returns {number} Result of depthDelta for the curly-brace pair.
 */
function curlyDepthDelta(text) {
  const [opener, closer] = ["{", "}"];
  return depthDelta(text, opener, closer);
}
|
|
316
|
+
|
|
317
|
+
/**
 * Nesting change of a line counted over "[" and "]".
 *
 * @param {string} text Line to measure.
 * @returns {number} Result of depthDelta for the square-bracket pair.
 */
function bracketDepthDelta(text) {
  const [opener, closer] = ["[", "]"];
  return depthDelta(text, opener, closer);
}
|
|
320
|
+
|
|
321
|
+
/**
 * Nesting change of a line counted over "(" and ")".
 *
 * @param {string} text Line to measure.
 * @returns {number} Result of depthDelta for the parenthesis pair.
 */
function parenDepthDelta(text) {
  const [opener, closer] = ["(", ")"];
  return depthDelta(text, opener, closer);
}
|
|
324
|
+
|
|
325
|
+
/**
 * Net nesting change contributed by one line for a single delimiter pair.
 *
 * Delimiters inside single- or double-quoted text are ignored. A quote is
 * treated as escaped only when it is preceded by an odd number of
 * backslashes, so `"a\\"` (an escaped backslash, then the closing quote)
 * ends the string as it should.
 *
 * @param {string} text Line to scan.
 * @param {string} open Opening delimiter (one character).
 * @param {string} close Closing delimiter (one character).
 * @returns {number} Count of `open` minus count of `close` outside quotes;
 *   negative when the line closes more than it opens.
 */
function depthDelta(text, open, close) {
  let depth = 0;
  let inSingle = false;
  let inDouble = false;
  // Length of the backslash run immediately before the current character.
  let precedingBackslashes = 0;
  for (let index = 0; index < text.length; index += 1) {
    const char = text[index];
    // Checking only the previous character would misread `\\"`: there the
    // backslash itself is escaped and the quote is real.
    const escaped = precedingBackslashes % 2 === 1;
    precedingBackslashes = char === "\\" ? precedingBackslashes + 1 : 0;
    if (char === "'" && !inDouble && !escaped) {
      inSingle = !inSingle;
      continue;
    }
    if (char === "\"" && !inSingle && !escaped) {
      inDouble = !inDouble;
      continue;
    }
    if (inSingle || inDouble) {
      continue;
    }
    if (char === open) {
      depth += 1;
    } else if (char === close) {
      depth -= 1;
    }
  }
  return depth;
}
|
|
351
|
+
|
|
280
352
|
/**
 * Detect a table row.
 *
 * @param {string} trimmed Line with surrounding whitespace removed.
 * @returns {boolean} True when the line starts and ends with separate "|"
 *   characters.
 */
function isTableLine(trimmed) {
  const pipeDelimitedRow = /^\|.*\|$/;
  return pipeDelimitedRow.test(trimmed);
}
|
|
@@ -291,7 +363,6 @@ function walkVoiceFiles(rootDir, { maxFiles, excludePaths }) {
|
|
|
291
363
|
return result;
|
|
292
364
|
}
|
|
293
365
|
const stack = [rootDir];
|
|
294
|
-
const skipDirs = new Set([".git", "node_modules", "dist", "build", "__pycache__", "prompts", "voice-pack", "dravoice-voice"]);
|
|
295
366
|
while (stack.length) {
|
|
296
367
|
const current = stack.pop();
|
|
297
368
|
if (isExcludedPath(current, excludePaths)) {
|
|
@@ -305,7 +376,7 @@ function walkVoiceFiles(rootDir, { maxFiles, excludePaths }) {
|
|
|
305
376
|
continue;
|
|
306
377
|
}
|
|
307
378
|
if (entry.isDirectory()) {
|
|
308
|
-
if (!
|
|
379
|
+
if (!VOICE_SKIP_DIRS.has(entry.name)) {
|
|
309
380
|
directories.push(fullPath);
|
|
310
381
|
}
|
|
311
382
|
} else if (entry.isFile() && VOICE_EXTENSIONS.has(path.extname(entry.name).toLowerCase())) {
|
package/src/v2/inspect.js
CHANGED
|
@@ -49,7 +49,7 @@ function featureSummary(name, features) {
|
|
|
49
49
|
return `wordCount=${features.wordCount}; contentTypeTokenRatio=${features.vocabularyRichness.contentTypeTokenRatio}; wordLength.median=${features.wordLength.median}; maskedCharacterFourgrams=${features.maskedCharacterFourgrams?.length ?? 0}; functionWordBigrams=${features.functionWordBigrams?.length ?? 0}`;
|
|
50
50
|
}
|
|
51
51
|
if (name === "register") {
|
|
52
|
-
return `primary=${features.primary.value} (${features.primary.score}); alternates=${features.scores.slice(1, 4).map((score) => `${score.value}:${score.score}`).join(", ")}`;
|
|
52
|
+
return `primary=${features.primary.value} (${features.primary.score}); mixedRegister=${features.mixedRegister ? "yes" : "no"}; markerSets=${features.markerSets?.length ?? 0}; alternates=${features.scores.slice(1, 4).map((score) => `${score.value}:${score.score}`).join(", ")}`;
|
|
53
53
|
}
|
|
54
54
|
if (name === "discourse") {
|
|
55
55
|
return `transitionRates=${Object.entries(features.transitionRates).map(([key, value]) => `${key}:${value}`).join(", ")}; sentenceCallbacks=${features.sentenceCallbacks}`;
|
|
@@ -61,7 +61,7 @@ function featureSummary(name, features) {
|
|
|
61
61
|
return `evidenceSentenceRate=${features.evidenceSentenceRate}; claimSentenceRate=${features.claimSentenceRate}; supportedClaimRate=${features.supportedClaimRate}; unsupportedClaimRate=${features.unsupportedClaimRate}; evidenceTypes=${features.evidenceTypes.map((item) => `${item.value}:${item.count}`).join(", ") || "none"}`;
|
|
62
62
|
}
|
|
63
63
|
if (name === "structure") {
|
|
64
|
-
return `sectionWords.median=${features.sectionWords.median}; headingCount.median=${features.headingCount.median}; listDocumentRate=${features.listDocumentRate}; quoteDocumentRate=${features.quoteDocumentRate}`;
|
|
64
|
+
return `sectionWords.median=${features.sectionWords.median}; headingCount.median=${features.headingCount.median}; maxHeadingDepth.median=${features.maxHeadingDepth?.median ?? 0}; sectionOrderPatterns=${features.sectionOrderPatterns?.length ?? 0}; listDocumentRate=${features.listDocumentRate}; quoteDocumentRate=${features.quoteDocumentRate}`;
|
|
65
65
|
}
|
|
66
66
|
return JSON.stringify(features);
|
|
67
67
|
}
|
package/src/v2/profile.js
CHANGED
|
@@ -17,9 +17,9 @@ import {
|
|
|
17
17
|
stabilityFromDistances,
|
|
18
18
|
} from "./stylometry.js";
|
|
19
19
|
|
|
20
|
-
export function learnVoicePackV2({ examplesDir, outDir, excludePaths = [] }) {
|
|
20
|
+
export function learnVoicePackV2({ examplesDir, outDir, excludePaths = [], registerMarkers }) {
|
|
21
21
|
const documents = loadDocuments({ examplesDir, excludePaths });
|
|
22
|
-
const profile = buildVoiceProfileV2({ documents });
|
|
22
|
+
const profile = buildVoiceProfileV2({ documents, registerMarkers });
|
|
23
23
|
if (outDir) {
|
|
24
24
|
writeVoicePackV2(outDir, profile);
|
|
25
25
|
}
|
|
@@ -39,9 +39,10 @@ export function loadVoicePackV2(voiceDir) {
|
|
|
39
39
|
return profile;
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
-
export function buildVoiceProfileV2({ documents }) {
|
|
42
|
+
export function buildVoiceProfileV2({ documents, registerMarkers }) {
|
|
43
43
|
const source = sourceSummary(documents);
|
|
44
|
-
const families = analyzeFeatureFamilies(documents);
|
|
44
|
+
const families = analyzeFeatureFamilies(documents, { registerMarkers });
|
|
45
|
+
const styleThresholds = styleThresholdsFor(documents, families, { registerMarkers });
|
|
45
46
|
|
|
46
47
|
return {
|
|
47
48
|
schemaVersion: 2,
|
|
@@ -56,7 +57,8 @@ export function buildVoiceProfileV2({ documents }) {
|
|
|
56
57
|
rhythmMedianWords: toleranceFor(source.confidence.band, 5, 8, 12),
|
|
57
58
|
evidenceRate: toleranceFor(source.confidence.band, 0.12, 0.18, 0.25),
|
|
58
59
|
},
|
|
59
|
-
styleThresholds
|
|
60
|
+
styleThresholds,
|
|
61
|
+
familyDiagnostics: familyCalibrationDiagnostics({ source, families, styleThresholds }),
|
|
60
62
|
minimumDraftSize: {
|
|
61
63
|
words: source.confidence.band === "weak" ? 25 : 35,
|
|
62
64
|
sentences: source.confidence.band === "weak" ? 3 : 4,
|
|
@@ -65,11 +67,11 @@ export function buildVoiceProfileV2({ documents }) {
|
|
|
65
67
|
};
|
|
66
68
|
}
|
|
67
69
|
|
|
68
|
-
function analyzeFeatureFamilies(documents) {
|
|
70
|
+
function analyzeFeatureFamilies(documents, { registerMarkers } = {}) {
|
|
69
71
|
return {
|
|
70
72
|
rhythm: analyzeRhythm(documents),
|
|
71
73
|
lexical: analyzeLexical(documents),
|
|
72
|
-
register: analyzeRegister(documents),
|
|
74
|
+
register: analyzeRegister(documents, registerMarkers ? { markers: registerMarkers } : undefined),
|
|
73
75
|
discourse: analyzeDiscourse(documents),
|
|
74
76
|
rhetoricalShape: analyzeRhetoricalShape(documents),
|
|
75
77
|
evidence: analyzeEvidence(documents),
|
|
@@ -77,15 +79,32 @@ function analyzeFeatureFamilies(documents) {
|
|
|
77
79
|
};
|
|
78
80
|
}
|
|
79
81
|
|
|
80
|
-
function styleThresholdsFor(documents, fallbackFamilies) {
|
|
82
|
+
function styleThresholdsFor(documents, fallbackFamilies, { registerMarkers } = {}) {
|
|
81
83
|
const fallbackThresholds = defaultStyleThresholds();
|
|
82
84
|
const distancesByFamily = Object.fromEntries(Object.keys(fallbackFamilies).map((family) => [family, []]));
|
|
83
85
|
|
|
84
86
|
if (documents.length >= 2) {
|
|
85
87
|
for (let index = 0; index < documents.length; index += 1) {
|
|
86
88
|
const referenceDocuments = documents.filter((_, candidateIndex) => candidateIndex !== index);
|
|
87
|
-
const referenceFamilies = analyzeFeatureFamilies(referenceDocuments);
|
|
88
|
-
const heldoutFamilies = analyzeFeatureFamilies([documents[index]]);
|
|
89
|
+
const referenceFamilies = analyzeFeatureFamilies(referenceDocuments, { registerMarkers });
|
|
90
|
+
const heldoutFamilies = analyzeFeatureFamilies([documents[index]], { registerMarkers });
|
|
91
|
+
for (const family of Object.keys(fallbackFamilies)) {
|
|
92
|
+
distancesByFamily[family].push(distanceByFamily(
|
|
93
|
+
family,
|
|
94
|
+
referenceFamilies[family].features,
|
|
95
|
+
heldoutFamilies[family].features,
|
|
96
|
+
));
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
if (documents.length >= 5) {
|
|
102
|
+
for (let start = 0; start < documents.length; start += 1) {
|
|
103
|
+
const heldoutIndexes = new Set([start, (start + 1) % documents.length]);
|
|
104
|
+
const referenceDocuments = documents.filter((_, index) => !heldoutIndexes.has(index));
|
|
105
|
+
const heldoutDocuments = documents.filter((_, index) => heldoutIndexes.has(index));
|
|
106
|
+
const referenceFamilies = analyzeFeatureFamilies(referenceDocuments, { registerMarkers });
|
|
107
|
+
const heldoutFamilies = analyzeFeatureFamilies(heldoutDocuments, { registerMarkers });
|
|
89
108
|
for (const family of Object.keys(fallbackFamilies)) {
|
|
90
109
|
distancesByFamily[family].push(distanceByFamily(
|
|
91
110
|
family,
|
|
@@ -109,7 +128,9 @@ function styleThresholdsFor(documents, fallbackFamilies) {
|
|
|
109
128
|
}
|
|
110
129
|
|
|
111
130
|
return {
|
|
112
|
-
method:
|
|
131
|
+
method: documents.length >= 5
|
|
132
|
+
? "leave-one-out-and-rotating-holdout-cosine-delta"
|
|
133
|
+
: "leave-one-out-cosine-delta",
|
|
113
134
|
references: STYLOMETRIC_REFERENCES,
|
|
114
135
|
families,
|
|
115
136
|
};
|
|
@@ -168,6 +189,100 @@ function sourceSummary(documents) {
|
|
|
168
189
|
sentenceCount,
|
|
169
190
|
genres: [],
|
|
170
191
|
confidence,
|
|
192
|
+
quality: sourceQualityFor(documents),
|
|
193
|
+
};
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/**
 * Assess the corpus for duplicate, unbalanced, or very short sources.
 *
 * @param {object[]} documents Parsed documents (wordCount, sentences, text).
 * @returns {{ lengthSpread: object, duplicateGroups: number, warnings: string[] }}
 *   Min/max word and sentence counts, the number of fingerprints shared by
 *   more than one document, and the warnings raised.
 */
function sourceQualityFor(documents) {
  const wordTotals = documents.map((document) => document.wordCount);
  const sentenceTotals = documents.map((document) => document.sentences.length);

  // Count documents per fingerprint; documents with an empty fingerprint are left out.
  const occurrences = new Map();
  documents.forEach((document) => {
    const key = documentFingerprint(document);
    if (key) {
      occurrences.set(key, (occurrences.get(key) ?? 0) + 1);
    }
  });
  let duplicateGroups = 0;
  for (const count of occurrences.values()) {
    if (count > 1) {
      duplicateGroups += 1;
    }
  }

  const minWords = minValue(wordTotals);
  const maxWords = maxValue(wordTotals);
  const hasLengthImbalance = documents.length >= 3 && minWords > 0 && maxWords / minWords >= 5;
  const hasVeryShortSource = documents.some((document) => document.wordCount < 80);

  const warnings = [];
  if (duplicateGroups > 0) {
    warnings.push(`${duplicateGroups} duplicate-looking source group(s) detected; remove repeated drafts before trusting calibration.`);
  }
  if (hasLengthImbalance) {
    warnings.push(`Document length imbalance detected (${minWords}-${maxWords} words); long pieces may dominate the learned profile.`);
  }
  if (hasVeryShortSource) {
    warnings.push("One or more source files are very short; prefer representative long-form pieces.");
  }

  const lengthSpread = {
    minWords,
    maxWords,
    minSentences: minValue(sentenceTotals),
    maxSentences: maxValue(sentenceTotals),
  };
  return { lengthSpread, duplicateGroups, warnings };
}
|
|
231
|
+
|
|
232
|
+
/**
 * Smallest number in a list.
 *
 * @param {number[]} values Numbers to compare.
 * @returns {number} The minimum, or 0 for an empty list.
 */
function minValue(values) {
  if (values.length === 0) {
    return 0;
  }
  return Math.min(...values);
}
|
|
235
|
+
|
|
236
|
+
/**
 * Largest number in a list.
 *
 * @param {number[]} values Numbers to compare.
 * @returns {number} The maximum, or 0 for an empty list.
 */
function maxValue(values) {
  if (values.length === 0) {
    return 0;
  }
  return Math.max(...values);
}
|
|
239
|
+
|
|
240
|
+
/**
 * Reduce a document's text to a normalized key for spotting repeated sources.
 *
 * The text is lower-cased, stripped of everything that is not a letter,
 * combining mark, digit or whitespace, and then has its whitespace collapsed.
 * Letters and digits are matched with Unicode property escapes, so text in
 * any script keeps its content; an ASCII-only filter would reduce non-Latin
 * documents to their few ASCII tokens and make unrelated documents collide.
 * Whitespace is collapsed after stripping so that removed punctuation does
 * not leave double spaces behind ("a - b" and "a b" produce the same key).
 *
 * @param {object} document Parsed document; only `text` is read.
 * @returns {string} Normalized fingerprint, or "" when no letters or digits remain.
 */
function documentFingerprint(document) {
  return String(document.text ?? "")
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]+/gu, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
247
|
+
|
|
248
|
+
/**
 * Build per-family calibration diagnostics for a profile.
 *
 * @param {object} args
 * @param {object} args.source Source summary (counts and confidence band).
 * @param {object} args.families Analyzed feature families, keyed by name.
 * @param {object} args.styleThresholds Thresholds with a `families` map.
 * @returns {object} One entry per family: confidence, threshold,
 *   observations, stability, minimum-evidence flags and `usableForFindings`.
 */
function familyCalibrationDiagnostics({ source, families, styleThresholds }) {
  const entries = Object.entries(families).map(([family, familyData]) => {
    const minimumEvidence = minimumEvidenceFor(family, source, familyData);
    const calibrated = styleThresholds.families[family] ?? {};
    // A family without a recorded stability is treated as 0.45.
    const stability = calibrated.stability ?? 0.45;
    // Usable only with a non-weak corpus, all minimums met, and stability >= 0.35.
    const usableForFindings = source.confidence.band !== "weak" &&
      minimumEvidence.documentsMet &&
      minimumEvidence.sentencesMet &&
      minimumEvidence.wordsMet &&
      stability >= 0.35;
    const diagnostics = {
      confidence: familyData.confidence,
      threshold: calibrated.threshold ?? 0,
      observations: calibrated.observations ?? 0,
      stability,
      minimumEvidence,
      usableForFindings,
    };
    return [family, diagnostics];
  });
  return Object.fromEntries(entries);
}
|
|
266
|
+
|
|
267
|
+
/**
 * Check whether a feature family has enough material behind it.
 *
 * Word and sentence counts come from the family's own features when present
 * and fall back to the corpus-wide source counts. Families without a listed
 * requirement need at least one document, sentence and word.
 *
 * @param {string} family Feature family name.
 * @param {object} source Source summary (documentCount, wordCount, sentenceCount).
 * @param {object} familyData Analyzed family; `features` is optional.
 * @returns {object} Required minimums plus documentsMet / sentencesMet / wordsMet flags.
 */
function minimumEvidenceFor(family, source, familyData) {
  const perFamilyMinimums = {
    rhythm: { documents: 1, sentences: 8, words: 80 },
    lexical: { documents: 1, sentences: 4, words: 120 },
    register: { documents: 3, sentences: 8, words: 120 },
    discourse: { documents: 1, sentences: 12, words: 120 },
    rhetoricalShape: { documents: 1, sentences: 12, words: 120 },
    evidence: { documents: 1, sentences: 12, words: 120 },
    structure: { documents: 3, sentences: 8, words: 120 },
  };
  const fallbackMinimums = { documents: 1, sentences: 1, words: 1 };
  const {
    documents: requiredDocuments,
    sentences: requiredSentences,
    words: requiredWords,
  } = perFamilyMinimums[family] ?? fallbackMinimums;

  const observedWords = familyData.features?.wordCount ?? source.wordCount;
  const observedSentences = familyData.features?.sentenceCount ?? source.sentenceCount;

  return {
    requiredDocuments,
    requiredSentences,
    requiredWords,
    documentsMet: source.documentCount >= requiredDocuments,
    sentencesMet: observedSentences >= requiredSentences,
    wordsMet: observedWords >= requiredWords,
  };
}
|
|
173
288
|
|