dravoice 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -9
- package/package.json +1 -1
- package/src/index.js +113 -13
- package/src/v2/analyzers/discourse.js +7 -1
- package/src/v2/analyzers/evidence.js +3 -3
- package/src/v2/analyzers/register.js +28 -4
- package/src/v2/analyzers/rhetorical-shape.js +7 -1
- package/src/v2/analyzers/structure.js +109 -1
- package/src/v2/benchmark.js +83 -0
- package/src/v2/brief.js +41 -7
- package/src/v2/doctor.js +308 -0
- package/src/v2/document-model.js +78 -6
- package/src/v2/inspect.js +2 -2
- package/src/v2/profile.js +238 -19
- package/src/v2/prompt.js +10 -3
- package/src/v2/review.js +142 -16
- package/src/v2/revise-plan.js +111 -8
- package/src/v2/stylometry.js +11 -7
- package/src/v2/text-utils.js +5 -2
package/src/v2/benchmark.js
CHANGED
|
@@ -81,6 +81,40 @@ export function prepareVoiceBenchmark({ examplesDir, topic, outDir, seed = 1, cw
|
|
|
81
81
|
return benchmark;
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
+
/**
 * Prepares several benchmark runs under one output directory and writes a
 * `benchmark-runs.json` manifest describing them.
 *
 * Each run is prepared by `prepareVoiceBenchmark` in its own `run-NNN`
 * sub-directory, using the normalized seed plus the zero-based run offset
 * (wrapped to an unsigned 32-bit value).
 *
 * @param {object} options
 * @param {string} options.examplesDir - Forwarded to `prepareVoiceBenchmark`.
 * @param {string} options.topic - Forwarded to each run and recorded in the manifest.
 * @param {string} options.outDir - Root output directory, resolved against `cwd`.
 * @param {number|string} [options.seed=1] - Base seed, validated by `normalizeSeed`.
 * @param {number|string} [options.runs=3] - Number of runs, validated by `normalizeRunCount`.
 * @param {string} [options.cwd=process.cwd()] - Base directory for relative paths.
 * @returns {object} The manifest that was written to disk.
 */
export function prepareVoiceBenchmarkRuns({ examplesDir, topic, outDir, seed = 1, runs = 3, cwd = process.cwd() }) {
  // Validate both inputs before touching the file system.
  const totalRuns = normalizeRunCount(runs);
  const baseSeed = normalizeSeed(seed);
  const rootDir = path.resolve(resolvePath(cwd, outDir));

  const preparedRuns = Array.from({ length: totalRuns }, (_, offset) => {
    const ordinal = offset + 1;
    const name = `run-${String(ordinal).padStart(3, "0")}`;
    const runSeed = (baseSeed + offset) >>> 0;
    const { corpus } = prepareVoiceBenchmark({
      examplesDir,
      topic,
      outDir: path.join(rootDir, name),
      seed: runSeed,
      cwd,
    });
    // `path` is recorded relative to the manifest's directory.
    return {
      index: ordinal,
      name,
      seed: runSeed,
      path: name,
      corpusFileCount: corpus.fileCount,
    };
  });

  const manifest = {
    schemaVersion: BENCHMARK_SCHEMA_VERSION,
    generatedBy: `${GENERATED_BY}-runs`,
    topic,
    runs: preparedRuns,
    minimumRunsRecommended: Math.max(3, totalRuns),
  };
  const manifestPath = path.join(rootDir, "benchmark-runs.json");
  writeUtf8FileSafely(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
  return manifest;
}
|
|
117
|
+
|
|
84
118
|
export function scoreVoiceBenchmark({ runDir, judgePath, judgeFile, judge, cwd = process.cwd() }) {
|
|
85
119
|
const root = path.resolve(resolvePath(cwd, runDir));
|
|
86
120
|
const resolvedJudgePath = judgePath ?? judgeFile ?? judge;
|
|
@@ -145,6 +179,15 @@ export function renderBenchmarkReport(scores) {
|
|
|
145
179
|
lines.push(`Deterministic provisional leader: ${scores.deterministicWinner.draft} (${scores.deterministicWinner.label})`);
|
|
146
180
|
}
|
|
147
181
|
lines.push("Single benchmark run is directional, not proof; compare repeated runs and family diagnostics before deciding.");
|
|
182
|
+
if (scores.comparison) {
|
|
183
|
+
lines.push("");
|
|
184
|
+
lines.push(`Deterministic comparison: ${scores.comparison.deterministicLeader} leads by ${scores.comparison.deterministicMargin} point(s).`);
|
|
185
|
+
lines.push(`Repeated runs recommended: ${scores.comparison.repeatedRunsRecommended ? "yes" : "no"}.`);
|
|
186
|
+
}
|
|
187
|
+
if (scores.repeatSummary) {
|
|
188
|
+
lines.push(`Minimum repeat runs recommended: ${scores.repeatSummary.minimumRunsRecommended}.`);
|
|
189
|
+
lines.push(`Suggested next seeds: ${scores.repeatSummary.nextSeeds.join(", ")}.`);
|
|
190
|
+
}
|
|
148
191
|
|
|
149
192
|
for (const key of ["baseline", "voiceAssisted"]) {
|
|
150
193
|
const draft = scores.drafts[key];
|
|
@@ -217,12 +260,52 @@ function benchmarkScores({ benchmark, baselineReview, voiceReview, judge, judgeP
|
|
|
217
260
|
baseline,
|
|
218
261
|
voiceAssisted,
|
|
219
262
|
},
|
|
263
|
+
comparison: benchmarkComparison({ baseline, voiceAssisted }),
|
|
264
|
+
repeatSummary: repeatSummaryFor(benchmark.seed),
|
|
220
265
|
deterministicWinner,
|
|
221
266
|
winner,
|
|
222
267
|
exitCode: 0,
|
|
223
268
|
};
|
|
224
269
|
}
|
|
225
270
|
|
|
271
|
+
/**
 * Builds the "repeat this benchmark" hint attached to a score report.
 *
 * @param {number|string} seed - Seed of the current run; validated by `normalizeSeed`.
 * @returns {{minimumRunsRecommended: number, nextSeeds: number[], reason: string}}
 *   The three seeds following the current one (wrapped to unsigned 32-bit),
 *   plus a fixed recommendation of three runs and an explanatory reason.
 */
function repeatSummaryFor(seed) {
  const baseSeed = normalizeSeed(seed);
  const nextSeeds = [];
  for (let step = 1; step <= 3; step += 1) {
    nextSeeds.push((baseSeed + step) >>> 0);
  }
  return {
    minimumRunsRecommended: 3,
    nextSeeds,
    reason: "Compare multiple topics, blind mappings, and draft pairs before treating benchmark results as product evidence.",
  };
}
|
|
279
|
+
|
|
280
|
+
/**
 * Validates a requested number of benchmark runs.
 *
 * @param {number|string} runs - Value whose string form must consist only of
 *   decimal digits.
 * @returns {number} The parsed count, guaranteed to be an integer in 1..50.
 * @throws {Error} If the string form contains anything other than digits, or
 *   the parsed value falls outside 1..50.
 */
function normalizeRunCount(runs) {
  const text = String(runs);
  const digitsOnly = /^\d+$/.test(text);
  if (!digitsOnly) {
    throw new Error(`Invalid runs value: ${runs}`);
  }

  const count = Number(text);
  const withinRange = Number.isSafeInteger(count) && count >= 1 && count <= 50;
  if (withinRange) {
    return count;
  }
  throw new Error("Benchmark runs must be an integer between 1 and 50.");
}
|
|
291
|
+
|
|
292
|
+
/**
 * Compares the deterministic voice-fit scores of the two benchmark drafts.
 *
 * @param {object} drafts
 * @param {object} drafts.baseline - Draft score exposing `deterministic.voiceFit`.
 * @param {object} drafts.voiceAssisted - Draft score exposing `deterministic.voiceFit`.
 * @returns {{deterministicLeader: string, deterministicMargin: number, repeatedRunsRecommended: boolean, caution: string}}
 *   `deterministicLeader` is "tie", "voice-assisted", or "baseline";
 *   `deterministicMargin` is the absolute score gap passed through
 *   `roundHalfUp(..., 2)`. Repeated runs are always recommended.
 */
function benchmarkComparison({ baseline, voiceAssisted }) {
  const assistedFit = voiceAssisted.deterministic.voiceFit;
  const baselineFit = baseline.deterministic.voiceFit;
  const deterministicMargin = roundHalfUp(Math.abs(assistedFit - baselineFit), 2);

  // The leader is decided on the raw scores, not on the rounded margin.
  let deterministicLeader = "baseline";
  if (assistedFit === baselineFit) {
    deterministicLeader = "tie";
  } else if (assistedFit > baselineFit) {
    deterministicLeader = "voice-assisted";
  }

  return {
    deterministicLeader,
    deterministicMargin,
    repeatedRunsRecommended: true,
    caution: "Single benchmark runs are directional; repeat with more topics and draft pairs before making product claims.",
  };
}
|
|
308
|
+
|
|
226
309
|
function draftScore({ key, name, label, review, judge }) {
|
|
227
310
|
const deterministic = deterministicScore(review);
|
|
228
311
|
const judgeDraft = judge ? normalizeJudgeDraft(judge.drafts?.[label], label) : null;
|
package/src/v2/brief.js
CHANGED
|
@@ -27,7 +27,7 @@ export function voiceArticleBriefV2({ voice, topic, evidence, cwd = process.cwd(
|
|
|
27
27
|
workingThesis: `Draft a grounded article about ${topic}. Let the supplied evidence set the size of each claim before broadening the lesson.`,
|
|
28
28
|
evidence: evidenceResult,
|
|
29
29
|
missingEvidence: missingEvidenceFor({ topic, evidenceAnchors: evidenceResult.anchors }),
|
|
30
|
-
outline: outlineFor(profile),
|
|
30
|
+
outline: outlineFor(profile, topic),
|
|
31
31
|
voiceCautions: [
|
|
32
32
|
...profile.guidance.avoid,
|
|
33
33
|
"Mark unsupported claims as [specific evidence needed] instead of inventing proof.",
|
|
@@ -116,19 +116,53 @@ function missingEvidenceFor({ topic, evidenceAnchors }) {
|
|
|
116
116
|
return items;
|
|
117
117
|
}
|
|
118
118
|
|
|
119
|
-
function outlineFor(profile) {
|
|
120
|
-
const
|
|
121
|
-
const
|
|
119
|
+
/**
 * Produces the five-step outline guidance for an article brief.
 *
 * The opening pattern and section pattern are picked from the profile's
 * learned patterns using a seed derived from the topic (`topicSeed`), so the
 * same topic always yields the same suggestions.
 *
 * @param {object} profile - Voice profile; reads `families.rhythm`,
 *   `families.rhetoricalShape`, and `families.structure` features.
 * @param {string} [topic=""] - Topic text used to seed the pattern choice.
 * @returns {string[]} Outline instructions, in reading order.
 */
function outlineFor(profile, topic = "") {
  const { families } = profile;
  const sentenceWords = families.rhythm.features.sentenceWords;
  const seed = topicSeed(topic);

  // Keeps only entries that carry a truthy `value`.
  const patternValues = (entries) => (entries || []).map((entry) => entry.value).filter(Boolean);

  const opening = pickSeeded(patternValues(families.rhetoricalShape.features.openingMovePatterns), seed);
  const structureFeatures = families.structure.features;
  // Offset the seed so the section choice is not locked to the opening choice.
  const sectionShape = pickSeeded(patternValues(structureFeatures.sectionOrderPatterns), seed + 1);
  const headingCount = structureFeatures.headingCount || {};

  let rhythmRange = "a varied range of sentence lengths";
  if (sentenceWords.count > 0) {
    rhythmRange = `${sentenceWords.p25}-${sentenceWords.p75} words (median ${sentenceWords.median}, variation ~${sentenceWords.stdev})`;
  }

  const steps = [];
  if (opening) {
    steps.push(`Start from a concrete artifact or observation; for this piece try the opening shape "${opening}", but do not reuse one opening across articles.`);
  } else {
    steps.push("Start from a concrete artifact or observation before making the larger claim, varying the opening across pieces.");
  }
  steps.push("Name the pressure, question, or practical stakes that make the evidence matter.");
  if (sectionShape) {
    steps.push(`Shape the body using a section pattern from the corpus such as "${sectionShape}" (heading count usually ${headingCount.min ?? 0}-${headingCount.max ?? 0}); pick what fits this topic rather than a fixed skeleton.`);
  } else {
    steps.push("Shape the body to fit this topic; vary sectioning across pieces instead of reusing one skeleton.");
  }
  steps.push(`Develop the article in the learned register, mixing short and long sentences across ${rhythmRange} rather than holding a constant cadence.`);
  steps.push("Close by returning to the evidence and leaving the reader with a practical handle, not a generic conclusion.");
  return steps;
}
|
|
131
150
|
|
|
151
|
+
/**
 * Derives a deterministic unsigned 32-bit seed from a topic string.
 *
 * Uses a base-31 polynomial rolling hash over the topic's Unicode code
 * points. `null`/`undefined` hash like the empty string (seed 0).
 *
 * @param {*} topic - Topic text; non-strings are converted with `String`.
 * @returns {number} Integer in the range 0..2^32-1.
 */
function topicSeed(topic) {
  let hash = 0;
  // `for...of` walks code points, so read the full code point as well:
  // `charCodeAt(0)` would see only the high surrogate of astral characters
  // and make topics that differ only in such characters collide.
  for (const char of String(topic ?? "")) {
    hash = (hash * 31 + char.codePointAt(0)) >>> 0;
  }
  return hash;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Picks one entry from a list using a seed as a wrapping index.
 *
 * @param {Array} values - Candidate values.
 * @param {number} seed - Index seed; reduced modulo `values.length`.
 * @returns {*} The selected entry, or "" when the list is empty.
 */
function pickSeeded(values, seed) {
  const size = values.length;
  return size ? values[seed % size] : "";
}
|
|
165
|
+
|
|
132
166
|
/**
 * Resolves a possibly-relative path against a base directory.
 *
 * @param {string} cwd - Base directory for relative values.
 * @param {string} value - Path to resolve.
 * @returns {string} `value` unchanged when absolute, otherwise `cwd` joined with `value`.
 */
function resolvePath(cwd, value) {
  if (path.isAbsolute(value)) {
    return value;
  }
  return path.join(cwd, value);
}
|
package/src/v2/doctor.js
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import {
|
|
4
|
+
DEFAULT_MAX_FILE_BYTES,
|
|
5
|
+
DEFAULT_MAX_FILES,
|
|
6
|
+
DEFAULT_MAX_TOTAL_BYTES,
|
|
7
|
+
VOICE_EXTENSIONS,
|
|
8
|
+
VOICE_SKIP_DIRS,
|
|
9
|
+
parseDocument,
|
|
10
|
+
} from "./document-model.js";
|
|
11
|
+
import { buildVoiceProfileV2 } from "./profile.js";
|
|
12
|
+
|
|
13
|
+
/**
 * Checks whether an examples directory is usable as a voice corpus.
 *
 * Walks the directory, parses every supported file within the configured
 * count and size limits, and summarizes the parsed documents. Per-file
 * failures are recorded on that file's entry (`issue`) instead of aborting.
 *
 * @param {object} options
 * @param {string} [options.examplesDir] - Corpus directory; defaults to "./articles".
 * @param {string} [options.cwd=process.cwd()] - Base directory for relative paths.
 * @returns {object} Report with `status` ("ready", "weak", or "needs-work"),
 *   `exitCode` (0 only when ready), `summary`, per-file `files`,
 *   `unsupportedFiles`, and `recommendations`.
 */
export function diagnoseVoiceCorpusV2({ examplesDir, cwd = process.cwd() }) {
  const requestedExamples = examplesDir ?? "./articles";
  const resolvedExamples = resolvePath(cwd, requestedExamples);

  let shownExamples;
  if (path.isAbsolute(requestedExamples)) {
    shownExamples = displayPath(resolvedExamples, cwd);
  } else {
    shownExamples = displayPathForCommand(requestedExamples);
  }

  // Defaults shared by every outcome; each return below overrides a subset.
  const defaults = {
    schemaVersion: 2,
    generatedBy: "dravoice-v2-doctor",
    examples: shownExamples,
    status: "needs-work",
    exitCode: 1,
    summary: emptySummary(),
    files: [],
    unsupportedFiles: [],
    recommendations: [],
  };

  if (!fs.existsSync(resolvedExamples)) {
    const commandPath = displayPathForCommand(requestedExamples);
    // Suggesting the default path again would not help; show a placeholder instead.
    const suggestedPath = commandPath === "./articles" ? "~/path/to/writing" : commandPath;
    return {
      ...defaults,
      recommendations: [
        "Examples directory is missing.",
        "Run mkdir -p articles, then copy at least 3 representative .md, .mdx, or .txt pieces into it.",
        `If your writing lives elsewhere, run drav doctor --examples ${suggestedPath}.`,
      ],
    };
  }

  if (!fs.statSync(resolvedExamples).isDirectory()) {
    return {
      ...defaults,
      recommendations: [
        "Examples path is not a directory.",
        "Point --examples at a folder containing representative .md, .mdx, or .txt pieces.",
      ],
    };
  }

  const { supportedFiles, unsupportedFiles: rejectedPaths } = discoverCorpusFiles(resolvedExamples);
  const parsedDocuments = [];
  const fileRecords = [];
  let bytesSoFar = 0;

  for (const [position, filePath] of supportedFiles.entries()) {
    const relative = displayPath(filePath, resolvedExamples);
    const record = {
      path: relative,
      extension: path.extname(filePath).toLowerCase(),
      wordCount: 0,
      sentenceCount: 0,
    };

    try {
      if (position >= DEFAULT_MAX_FILES) {
        throw new Error(`Corpus contains more than ${DEFAULT_MAX_FILES} voice file(s) allowed.`);
      }
      const { size } = fs.statSync(filePath);
      if (size > DEFAULT_MAX_FILE_BYTES) {
        throw new Error(`Voice file ${relative} exceeds the ${DEFAULT_MAX_FILE_BYTES} byte limit.`);
      }
      // Oversized files never reach this point, so they do not count toward the total.
      bytesSoFar += size;
      if (bytesSoFar > DEFAULT_MAX_TOTAL_BYTES) {
        throw new Error(`Voice corpus exceeds the ${DEFAULT_MAX_TOTAL_BYTES} byte total limit.`);
      }
      const contents = fs.readFileSync(filePath, "utf8");
      if (contents.includes("\0")) {
        throw new Error(`Voice file ${relative} looks like binary-looking text and cannot be analyzed.`);
      }
      const parsed = parseDocument({ filePath, rootDir: resolvedExamples, contents });
      record.wordCount = parsed.wordCount;
      record.sentenceCount = parsed.sentences.length;
      parsedDocuments.push(parsed);
    } catch (error) {
      // Keep going: a bad file becomes an issue on its record, not a crash.
      record.issue = error.message;
    }

    fileRecords.push(record);
  }

  const summary = summaryFor(parsedDocuments);
  const unsupportedFiles = rejectedPaths.map((filePath) => ({
    path: displayPath(filePath, resolvedExamples),
    extension: path.extname(filePath).toLowerCase() || null,
  }));
  const status = statusFor({ summary, files: fileRecords, supportedCount: supportedFiles.length });
  const recommendations = recommendationsFor({
    status,
    summary,
    files: fileRecords,
    unsupportedFiles,
    examples: defaults.examples,
  });

  return {
    ...defaults,
    status,
    exitCode: status === "ready" ? 0 : 1,
    summary,
    files: fileRecords,
    unsupportedFiles,
    recommendations,
  };
}
|
|
112
|
+
|
|
113
|
+
/**
 * Renders a corpus doctor report as Markdown-style text.
 *
 * File issues and unsupported files are each previewed up to eight entries,
 * followed by an "...and N more" line when the list is longer.
 *
 * @param {object} result - Report in the shape returned by `diagnoseVoiceCorpusV2`.
 * @returns {string} Newline-joined report text ending with a trailing newline.
 */
export function renderCorpusDoctorV2(result) {
  const PREVIEW_LIMIT = 8;
  const { summary } = result;
  const output = [];

  // Emits a heading, one bullet per entry, an optional overflow line, then a blank line.
  const pushSection = (heading, bullets, hiddenCount = 0) => {
    output.push(heading);
    for (const bullet of bullets) {
      output.push(`- ${bullet}`);
    }
    if (hiddenCount > 0) {
      output.push(`- ...and ${hiddenCount} more`);
    }
    output.push("");
  };

  output.push(
    "# Corpus Doctor",
    "",
    `Examples: ${result.examples}`,
    `Status: ${result.status}`,
    `Documents: ${summary.documentCount}`,
    `Words: ${summary.wordCount}`,
    `Sentences: ${summary.sentenceCount}`,
    `Confidence: ${summary.confidence.band} - ${summary.confidence.message}`,
    `Supported files: ${result.files.length}`,
    `Unsupported files: ${result.unsupportedFiles.length}`,
    "",
  );

  const filesWithIssues = result.files.filter((file) => file.issue);
  if (filesWithIssues.length > 0) {
    pushSection(
      "File issues:",
      filesWithIssues.slice(0, PREVIEW_LIMIT).map((file) => `${file.path}: ${file.issue}`),
      filesWithIssues.length - PREVIEW_LIMIT,
    );
  }

  const unsupported = result.unsupportedFiles;
  if (unsupported.length > 0) {
    pushSection(
      "Unsupported files:",
      unsupported.slice(0, PREVIEW_LIMIT).map((file) => file.path),
      unsupported.length - PREVIEW_LIMIT,
    );
  }

  if (summary.quality?.warnings?.length) {
    pushSection("Corpus quality warnings:", summary.quality.warnings);
  }

  pushSection("Recommendations:", result.recommendations);
  output.push(`Next: ${nextStepFor(result)}`);
  output.push("");
  return output.join("\n");
}
|
|
168
|
+
|
|
169
|
+
/**
 * Walks a directory tree and splits regular files into supported and
 * unsupported voice files by extension.
 *
 * Directories named in `VOICE_SKIP_DIRS` are not descended into. Entries
 * that are neither directories nor regular files are ignored.
 *
 * @param {string} rootDir - Directory to walk.
 * @returns {{supportedFiles: string[], unsupportedFiles: string[]}} Full
 *   paths, each list sorted with `localeCompare`.
 */
function discoverCorpusFiles(rootDir) {
  const byEntryName = (left, right) => left.name.localeCompare(right.name);
  const byPath = (left, right) => left.localeCompare(right);

  const supported = [];
  const unsupported = [];
  const pending = [rootDir];

  while (pending.length > 0) {
    const directory = pending.pop();
    const children = fs.readdirSync(directory, { withFileTypes: true }).sort(byEntryName);
    const subdirectories = [];

    for (const child of children) {
      const fullPath = path.join(directory, child.name);
      if (child.isDirectory()) {
        if (!VOICE_SKIP_DIRS.has(child.name)) {
          subdirectories.push(fullPath);
        }
        continue;
      }
      if (!child.isFile()) {
        continue;
      }
      const extension = path.extname(child.name).toLowerCase();
      const bucket = VOICE_EXTENSIONS.has(extension) ? supported : unsupported;
      bucket.push(fullPath);
    }

    // Push in reverse so the stack pops sub-directories in name order.
    for (let index = subdirectories.length - 1; index >= 0; index -= 1) {
      pending.push(subdirectories[index]);
    }
  }

  return {
    supportedFiles: supported.sort(byPath),
    unsupportedFiles: unsupported.sort(byPath),
  };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Summarizes parsed documents using the `source` section of a freshly built
 * voice profile.
 *
 * @param {object[]} documents - Parsed documents.
 * @returns {object} Counts, confidence, and quality copied from the
 *   profile's `source`, or `emptySummary()` when there are no documents.
 */
function summaryFor(documents) {
  if (documents.length === 0) {
    return emptySummary();
  }
  const { documentCount, wordCount, sentenceCount, confidence, quality } = buildVoiceProfileV2({ documents }).source;
  return { documentCount, wordCount, sentenceCount, confidence, quality };
}
|
|
214
|
+
|
|
215
|
+
/**
 * Builds the summary used when no source document could be analyzed.
 *
 * @returns {object} A fresh summary object with zero counts, "weak"
 *   confidence, and an empty quality section.
 */
function emptySummary() {
  const confidence = {
    band: "weak",
    message: "No usable source documents were found.",
  };
  const lengthSpread = { minWords: 0, maxWords: 0, minSentences: 0, maxSentences: 0 };
  const quality = { lengthSpread, duplicateGroups: 0, warnings: [] };
  return {
    documentCount: 0,
    wordCount: 0,
    sentenceCount: 0,
    confidence,
    quality,
  };
}
|
|
231
|
+
|
|
232
|
+
/**
 * Classifies corpus readiness.
 *
 * @param {object} input
 * @param {object} input.summary - Summary with `documentCount` and `sentenceCount`.
 * @param {object[]} input.files - Per-file records; a truthy `issue` marks a failure.
 * @param {number} input.supportedCount - Number of supported files discovered.
 * @returns {string} "needs-work" when nothing usable was found or any file has
 *   an issue; "ready" with at least 3 documents and 10 sentences; otherwise "weak".
 */
function statusFor({ summary, files, supportedCount }) {
  const hasFileIssues = files.some((file) => file.issue);
  const nothingUsable = supportedCount === 0 || summary.documentCount === 0;
  if (nothingUsable || hasFileIssues) {
    return "needs-work";
  }
  const meetsMinimums = summary.documentCount >= 3 && summary.sentenceCount >= 10;
  return meetsMinimums ? "ready" : "weak";
}
|
|
245
|
+
|
|
246
|
+
/**
 * Builds the list of next-step recommendations for a corpus report.
 *
 * @param {object} input
 * @param {string} input.status - Corpus status; "ready" short-circuits to the
 *   ready-path advice.
 * @param {object} input.summary - Summary with counts and optional `quality.warnings`.
 * @param {object[]} input.files - Per-file records; a truthy `issue` marks a failure.
 * @param {object[]} input.unsupportedFiles - Files with unsupported extensions.
 * @param {string} input.examples - Examples path shown in the suggested command.
 * @returns {string[]} At least one recommendation.
 */
function recommendationsFor({ status, summary, files, unsupportedFiles, examples }) {
  const hasFileIssues = files.some((file) => file.issue);

  if (status === "ready") {
    const hasWarnings = Boolean(summary.quality?.warnings?.length);
    return [
      "Corpus is ready for cautious profile learning.",
      ...(hasWarnings ? ["Review corpus quality warnings before trusting strict calibration."] : []),
      `Run drav init --examples ${examples}.`,
    ];
  }

  // Each pair is [applies, message]; order here is the order shown to the user.
  const candidates = [
    [files.length === 0, "Convert or copy your writing into Markdown, MDX, or plain text."],
    [summary.documentCount < 3, `Add at least ${3 - summary.documentCount} more representative long-form piece(s).`],
    [summary.sentenceCount < 10, "Use longer source pieces; Dravoice needs at least 10 total sentences for a ready corpus."],
    [hasFileIssues, "Fix or remove supported files with issues, then rerun drav doctor."],
    [unsupportedFiles.length > 0, "Supported extensions: .md, .mdx, .txt."],
  ];
  const advice = candidates.filter(([applies]) => applies).map(([, message]) => message);

  if (advice.length > 0) {
    return advice;
  }
  return ["Add more representative long-form writing, then rerun drav doctor."];
}
|
|
286
|
+
|
|
287
|
+
/**
 * Chooses the one-line "Next:" hint for a rendered report.
 *
 * @param {object} result - Report with `status` and `examples`.
 * @returns {string} The `drav init` command when the corpus is ready,
 *   otherwise a prompt to add more writing.
 */
function nextStepFor(result) {
  const isReady = result.status === "ready";
  return isReady
    ? `drav init --examples ${result.examples}`
    : "add more writing, then run drav doctor";
}
|
|
293
|
+
|
|
294
|
+
/**
 * Resolves a possibly-relative path against a base directory.
 *
 * @param {string} cwd - Base directory for relative values.
 * @param {string} value - Path to resolve.
 * @returns {string} `value` unchanged when absolute, otherwise `cwd` joined with `value`.
 */
function resolvePath(cwd, value) {
  if (path.isAbsolute(value)) {
    return value;
  }
  return path.join(cwd, value);
}
|
|
297
|
+
|
|
298
|
+
/**
 * Formats a path for display, using forward slashes.
 *
 * @param {string} filePath - Path to display.
 * @param {string} rootDir - Directory the path should be shown relative to.
 * @returns {string} The path relative to `rootDir` when it lies strictly
 *   inside it; otherwise `filePath` itself. Separators are normalized to "/".
 */
function displayPath(filePath, rootDir) {
  const relative = path.relative(rootDir, filePath);
  // Only a leading ".." *segment* means the path escapes the root. A plain
  // prefix check would also reject in-root names such as "..drafts.md".
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (relative && !escapesRoot && !path.isAbsolute(relative)) {
    return relative.split(path.sep).join("/");
  }
  return filePath.split(path.sep).join("/");
}
|
|
305
|
+
|
|
306
|
+
/**
 * Normalizes a user-supplied path for display inside a suggested command.
 *
 * @param {*} value - Path as typed by the user; converted with `String`.
 * @returns {string} The text with every run of "/" or "\" collapsed to a single "/".
 */
function displayPathForCommand(value) {
  const segments = String(value).split(/[\\/]+/);
  return segments.join("/");
}
|
package/src/v2/document-model.js
CHANGED
|
@@ -3,9 +3,10 @@ import path from "node:path";
|
|
|
3
3
|
import { normalizeText, splitSentences, tokenizeWords } from "./text-utils.js";
|
|
4
4
|
|
|
5
5
|
export const VOICE_EXTENSIONS = new Set([".md", ".mdx", ".txt"]);
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
6
|
+
export const VOICE_SKIP_DIRS = new Set([".git", "node_modules", "dist", "build", "__pycache__", "prompts", "voice-pack", "dravoice-voice"]);
|
|
7
|
+
export const DEFAULT_MAX_FILES = 500;
|
|
8
|
+
export const DEFAULT_MAX_FILE_BYTES = 1024 * 1024;
|
|
9
|
+
export const DEFAULT_MAX_TOTAL_BYTES = 20 * 1024 * 1024;
|
|
9
10
|
|
|
10
11
|
export function loadDocuments({
|
|
11
12
|
examplesDir,
|
|
@@ -56,6 +57,8 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
56
57
|
let inFrontmatter = lines[0]?.trim() === "---";
|
|
57
58
|
let inHtmlComment = false;
|
|
58
59
|
let jsxBlockTag = null;
|
|
60
|
+
let mdxScaffoldDepth = 0;
|
|
61
|
+
let mdxExpressionDepth = 0;
|
|
59
62
|
|
|
60
63
|
const flushParagraph = () => {
|
|
61
64
|
if (currentParagraph?.lines.length) {
|
|
@@ -85,6 +88,16 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
85
88
|
return;
|
|
86
89
|
}
|
|
87
90
|
|
|
91
|
+
if (mdxScaffoldDepth > 0) {
|
|
92
|
+
mdxScaffoldDepth = Math.max(0, mdxScaffoldDepth + syntaxDepthDelta(trimmed));
|
|
93
|
+
return;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if (mdxExpressionDepth > 0) {
|
|
97
|
+
mdxExpressionDepth = Math.max(0, mdxExpressionDepth + curlyDepthDelta(trimmed));
|
|
98
|
+
return;
|
|
99
|
+
}
|
|
100
|
+
|
|
88
101
|
if (inHtmlComment) {
|
|
89
102
|
if (trimmed.includes("-->")) {
|
|
90
103
|
inHtmlComment = false;
|
|
@@ -114,7 +127,19 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
114
127
|
return;
|
|
115
128
|
}
|
|
116
129
|
|
|
117
|
-
if (
|
|
130
|
+
if (isMdxScaffold(trimmed)) {
|
|
131
|
+
flushParagraph();
|
|
132
|
+
mdxScaffoldDepth = Math.max(0, syntaxDepthDelta(trimmed));
|
|
133
|
+
return;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
if (mdxExpressionStart(trimmed)) {
|
|
137
|
+
flushParagraph();
|
|
138
|
+
mdxExpressionDepth = Math.max(0, curlyDepthDelta(trimmed));
|
|
139
|
+
return;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
if (!trimmed || isTableLine(trimmed) || isJsxLike(trimmed)) {
|
|
118
143
|
if (!trimmed) {
|
|
119
144
|
flushParagraph();
|
|
120
145
|
}
|
|
@@ -166,6 +191,7 @@ export function parseDocument({ filePath, rootDir = process.cwd(), contents }) {
|
|
|
166
191
|
sentences,
|
|
167
192
|
wordCount: sentences.reduce((sum, sentence) => sum + sentence.tokens.length, 0),
|
|
168
193
|
text: paragraphs.map((paragraph) => paragraph.text).join("\n\n"),
|
|
194
|
+
raw: String(contents ?? ""),
|
|
169
195
|
};
|
|
170
196
|
}
|
|
171
197
|
|
|
@@ -277,6 +303,53 @@ function isMdxScaffold(trimmed) {
|
|
|
277
303
|
return /^(?:import|export)\s/.test(trimmed);
|
|
278
304
|
}
|
|
279
305
|
|
|
306
|
+
/**
 * Heuristically detects a line that opens an MDX `{...}` expression.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean} True when the line starts with "{" and contains an
 *   identifier followed (after optional whitespace) by "." or "(".
 */
function mdxExpressionStart(trimmed) {
  if (!trimmed.startsWith("{")) {
    return false;
  }
  const memberAccessOrCall = /[A-Za-z_$][\w$]*\s*[.(]/;
  return memberAccessOrCall.test(trimmed);
}
|
|
309
|
+
|
|
310
|
+
/**
 * Net change in bracket nesting contributed by one line, summed across
 * curly braces, square brackets, and parentheses.
 *
 * @param {string} text - Line to scan.
 * @returns {number} Sum of the three per-pair deltas (may be negative).
 */
function syntaxDepthDelta(text) {
  const curly = curlyDepthDelta(text);
  const square = bracketDepthDelta(text);
  const round = parenDepthDelta(text);
  return curly + square + round;
}
|
|
313
|
+
|
|
314
|
+
/**
 * Net change in `{`/`}` nesting for one line, as computed by `depthDelta`.
 *
 * @param {string} text - Line to scan.
 * @returns {number} Opening braces minus closing braces counted by `depthDelta`.
 */
function curlyDepthDelta(text) {
  const [opener, closer] = ["{", "}"];
  return depthDelta(text, opener, closer);
}
|
|
317
|
+
|
|
318
|
+
/**
 * Net change in `[`/`]` nesting for one line, as computed by `depthDelta`.
 *
 * @param {string} text - Line to scan.
 * @returns {number} Opening brackets minus closing brackets counted by `depthDelta`.
 */
function bracketDepthDelta(text) {
  const [opener, closer] = ["[", "]"];
  return depthDelta(text, opener, closer);
}
|
|
321
|
+
|
|
322
|
+
/**
 * Net change in `(`/`)` nesting for one line, as computed by `depthDelta`.
 *
 * @param {string} text - Line to scan.
 * @returns {number} Opening parentheses minus closing parentheses counted by `depthDelta`.
 */
function parenDepthDelta(text) {
  const [opener, closer] = ["(", ")"];
  return depthDelta(text, opener, closer);
}
|
|
325
|
+
|
|
326
|
+
/**
 * Counts the net nesting change for one delimiter pair within a line,
 * ignoring delimiters that appear inside single- or double-quoted strings.
 *
 * String state does not carry across calls: every call starts outside any
 * string. Template literals (backticks) are not treated as strings.
 *
 * @param {string} text - Line to scan.
 * @param {string} open - Opening delimiter character, e.g. "{".
 * @param {string} close - Closing delimiter character, e.g. "}".
 * @returns {number} Count of `open` minus count of `close` outside quotes.
 */
function depthDelta(text, open, close) {
  let depth = 0;
  let inSingle = false;
  let inDouble = false;
  // Length of the backslash run immediately before the current character.
  let backslashRun = 0;

  for (const char of text) {
    // A quote is escaped only when preceded by an ODD number of backslashes;
    // looking at just the previous character misreads `"C:\\"`, where the
    // backslash before the closing quote is itself escaped.
    const escaped = backslashRun % 2 === 1;
    backslashRun = char === "\\" ? backslashRun + 1 : 0;

    if (char === "'" && !inDouble && !escaped) {
      inSingle = !inSingle;
      continue;
    }
    if (char === "\"" && !inSingle && !escaped) {
      inDouble = !inDouble;
      continue;
    }
    if (inSingle || inDouble) {
      continue;
    }
    if (char === open) {
      depth += 1;
    } else if (char === close) {
      depth -= 1;
    }
  }
  return depth;
}
|
|
352
|
+
|
|
280
353
|
/**
 * Detects a Markdown table row.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean} True when the line both starts and ends with a pipe
 *   (two distinct pipe characters are required).
 */
function isTableLine(trimmed) {
  const pipeDelimitedRow = /^\|.*\|$/;
  return pipeDelimitedRow.test(trimmed);
}
|
|
@@ -291,7 +364,6 @@ function walkVoiceFiles(rootDir, { maxFiles, excludePaths }) {
|
|
|
291
364
|
return result;
|
|
292
365
|
}
|
|
293
366
|
const stack = [rootDir];
|
|
294
|
-
const skipDirs = new Set([".git", "node_modules", "dist", "build", "__pycache__", "prompts", "voice-pack", "dravoice-voice"]);
|
|
295
367
|
while (stack.length) {
|
|
296
368
|
const current = stack.pop();
|
|
297
369
|
if (isExcludedPath(current, excludePaths)) {
|
|
@@ -305,7 +377,7 @@ function walkVoiceFiles(rootDir, { maxFiles, excludePaths }) {
|
|
|
305
377
|
continue;
|
|
306
378
|
}
|
|
307
379
|
if (entry.isDirectory()) {
|
|
308
|
-
if (!
|
|
380
|
+
if (!VOICE_SKIP_DIRS.has(entry.name)) {
|
|
309
381
|
directories.push(fullPath);
|
|
310
382
|
}
|
|
311
383
|
} else if (entry.isFile() && VOICE_EXTENSIONS.has(path.extname(entry.name).toLowerCase())) {
|
package/src/v2/inspect.js
CHANGED
|
@@ -49,7 +49,7 @@ function featureSummary(name, features) {
|
|
|
49
49
|
return `wordCount=${features.wordCount}; contentTypeTokenRatio=${features.vocabularyRichness.contentTypeTokenRatio}; wordLength.median=${features.wordLength.median}; maskedCharacterFourgrams=${features.maskedCharacterFourgrams?.length ?? 0}; functionWordBigrams=${features.functionWordBigrams?.length ?? 0}`;
|
|
50
50
|
}
|
|
51
51
|
if (name === "register") {
|
|
52
|
-
return `primary=${features.primary.value} (${features.primary.score}); alternates=${features.scores.slice(1, 4).map((score) => `${score.value}:${score.score}`).join(", ")}`;
|
|
52
|
+
return `primary=${features.primary.value} (${features.primary.score}); mixedRegister=${features.mixedRegister ? "yes" : "no"}; markerSets=${features.markerSets?.length ?? 0}; alternates=${features.scores.slice(1, 4).map((score) => `${score.value}:${score.score}`).join(", ")}`;
|
|
53
53
|
}
|
|
54
54
|
if (name === "discourse") {
|
|
55
55
|
return `transitionRates=${Object.entries(features.transitionRates).map(([key, value]) => `${key}:${value}`).join(", ")}; sentenceCallbacks=${features.sentenceCallbacks}`;
|
|
@@ -61,7 +61,7 @@ function featureSummary(name, features) {
|
|
|
61
61
|
return `evidenceSentenceRate=${features.evidenceSentenceRate}; claimSentenceRate=${features.claimSentenceRate}; supportedClaimRate=${features.supportedClaimRate}; unsupportedClaimRate=${features.unsupportedClaimRate}; evidenceTypes=${features.evidenceTypes.map((item) => `${item.value}:${item.count}`).join(", ") || "none"}`;
|
|
62
62
|
}
|
|
63
63
|
if (name === "structure") {
|
|
64
|
-
return `sectionWords.median=${features.sectionWords.median}; headingCount.median=${features.headingCount.median}; listDocumentRate=${features.listDocumentRate}; quoteDocumentRate=${features.quoteDocumentRate}`;
|
|
64
|
+
return `sectionWords.median=${features.sectionWords.median}; headingCount.median=${features.headingCount.median}; maxHeadingDepth.median=${features.maxHeadingDepth?.median ?? 0}; sectionOrderPatterns=${features.sectionOrderPatterns?.length ?? 0}; listDocumentRate=${features.listDocumentRate}; quoteDocumentRate=${features.quoteDocumentRate}`;
|
|
65
65
|
}
|
|
66
66
|
return JSON.stringify(features);
|
|
67
67
|
}
|