dravoice 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +126 -37
- package/bin/dravoice.js +11 -10
- package/package.json +47 -45
- package/src/index.js +967 -197
- package/src/v2/analyzers/discourse.js +69 -63
- package/src/v2/analyzers/evidence.js +82 -82
- package/src/v2/analyzers/lexical.js +114 -114
- package/src/v2/analyzers/register.js +70 -34
- package/src/v2/analyzers/rhetorical-shape.js +65 -59
- package/src/v2/analyzers/rhythm.js +39 -47
- package/src/v2/analyzers/structure.js +41 -24
- package/src/v2/benchmark.js +657 -568
- package/src/v2/brief.js +154 -146
- package/src/v2/config.js +78 -0
- package/src/v2/doctor.js +308 -0
- package/src/v2/document-model.js +422 -260
- package/src/v2/inspect.js +67 -67
- package/src/v2/io-utils.js +51 -0
- package/src/v2/profile.js +342 -203
- package/src/v2/prompt.js +65 -64
- package/src/v2/review.js +303 -173
- package/src/v2/revise-plan.js +540 -433
- package/src/v2/stylometry.js +346 -332
- package/src/v2/text-utils.js +123 -123
|
@@ -1,63 +1,69 @@
|
|
|
1
|
-
import { rate, topItems } from "../text-utils.js";
|
|
2
|
-
|
|
3
|
-
const TRANSITIONS = {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
temporal:
|
|
8
|
-
example:
|
|
9
|
-
|
|
10
|
-
};
|
|
11
|
-
|
|
12
|
-
export function analyzeDiscourse(documents) {
|
|
13
|
-
const sentences = documents.flatMap((document) => document.sentences);
|
|
14
|
-
const labels = sentences.map((sentence) => transitionLabel(sentence.text));
|
|
15
|
-
const nonPlainLabels = labels.filter((label) => label !== "plain");
|
|
16
|
-
const transitionRates = {};
|
|
17
|
-
for (const key of Object.keys(TRANSITIONS)) {
|
|
18
|
-
transitionRates[key] = rate(labels.filter((label) => label === key).length, sentences.length, 2);
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
return {
|
|
22
|
-
family: "discourse",
|
|
23
|
-
confidence: sentences.length >= 12 ? "medium" : "low",
|
|
24
|
-
features: {
|
|
25
|
-
transitionRates,
|
|
26
|
-
transitionSequence: labels.slice(0, 12),
|
|
27
|
-
transitionBigrams: topItems(sequenceNgrams(labels, 2), 12),
|
|
28
|
-
transitionTrigrams: topItems(sequenceNgrams(labels, 3), 12),
|
|
29
|
-
sentenceCallbacks: callbackRate(sentences),
|
|
30
|
-
},
|
|
31
|
-
examples: topItems(nonPlainLabels, 5).map((item) => item.value),
|
|
32
|
-
warnings: sentences.length < 12 ? ["Discourse confidence is limited because the corpus has fewer than 12 sentences."] : [],
|
|
33
|
-
revisionHandles: ["Compare how sentences turn, contrast, explain, and return to earlier ideas."],
|
|
34
|
-
};
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
export function transitionLabel(text) {
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
1
|
+
import { rate, topItems } from "../text-utils.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Opening-phrase patterns for each discourse transition family.
 *
 * transitionLabel() walks these entries in insertion order and returns the
 * first family that matches, so earlier keys take priority over later ones.
 * "therefore" is listed under `causal` only: repeating it under `conclusion`
 * would be dead code because `causal` is always tested first.
 *
 * Every pattern is anchored with `^(?:...)` so it describes a sentence
 * opener; embeddedTransitionPattern() derives the mid-sentence variant.
 */
const TRANSITIONS = {
  contrast: /^(?:but|however|although|yet|instead|still)\b/i,
  causal: /^(?:because|so|therefore|since|as a result)\b/i,
  conclusion: /^(?:finally|in the end|the lesson)\b/i,
  temporal: /^(?:then|before|after|while|when|first|second|later)\b/i,
  example: /^(?:for example|such as|including)\b/i,
  additive: /^(?:also|another|again|plus|and)\b/i,
};
|
|
11
|
+
|
|
12
|
+
/**
 * Builds the "discourse" analysis family for a corpus.
 *
 * Every sentence of every document is labelled with transitionLabel(); the
 * labels are then summarized as per-family rates, the first twelve labels,
 * the most frequent label bigrams/trigrams, and a sentence-callback rate.
 *
 * @param {Array<{sentences: Array<{text: string, tokens: string[]}>}>} documents
 *   Documents whose `sentences` carry `text` (used for labelling) and
 *   `tokens` (used by callbackRate).
 * @returns {object} Report with `family`, `confidence`, `features`,
 *   `examples`, `warnings` and `revisionHandles`.
 */
export function analyzeDiscourse(documents) {
  const sentences = documents.flatMap((doc) => doc.sentences);
  const total = sentences.length;
  const labels = sentences.map((sentence) => transitionLabel(sentence.text));

  // One rate per known transition family, in TRANSITIONS key order.
  const transitionRates = Object.fromEntries(
    Object.keys(TRANSITIONS).map((family) => {
      const hits = labels.filter((label) => label === family).length;
      return [family, rate(hits, total, 2)];
    }),
  );

  const features = {
    transitionRates,
    transitionSequence: labels.slice(0, 12),
    transitionBigrams: topItems(sequenceNgrams(labels, 2), 12),
    transitionTrigrams: topItems(sequenceNgrams(labels, 3), 12),
    sentenceCallbacks: callbackRate(sentences),
  };

  // Examples only surface real transitions, never the "plain" filler label.
  const markedLabels = labels.filter((label) => label !== "plain");

  const warnings = [];
  if (total < 12) {
    warnings.push("Discourse confidence is limited because the corpus has fewer than 12 sentences.");
  }

  return {
    family: "discourse",
    confidence: total >= 12 ? "medium" : "low",
    features,
    examples: topItems(markedLabels, 5).map((item) => item.value),
    warnings,
    revisionHandles: ["Compare how sentences turn, contrast, explain, and return to earlier ideas."],
  };
}
|
|
36
|
+
|
|
37
|
+
/**
 * Classifies a sentence by the first transition family it matches.
 *
 * Families are tried in TRANSITIONS key order. A family matches when its
 * pattern fits the start of the sentence, or when the same phrase follows
 * one of `. ; : ,` plus whitespace inside the sentence.
 *
 * NOTE(review): because both checks run per family, an embedded match for an
 * earlier family outranks a sentence-initial match for a later one — confirm
 * that priority is intended.
 *
 * @param {string} text Sentence text; null/undefined is treated as "".
 * @returns {string} A TRANSITIONS key, or "plain" when nothing matches.
 */
export function transitionLabel(text) {
  const sentence = String(text ?? "");
  const hit = Object.entries(TRANSITIONS).find(
    ([, opener]) => opener.test(sentence) || embeddedTransitionPattern(opener).test(sentence),
  );
  return hit ? hit[0] : "plain";
}
|
|
46
|
+
|
|
47
|
+
/**
 * Derives the mid-sentence variant of a sentence-opener pattern.
 *
 * The leading `^` anchor is removed and the remaining source is required to
 * follow one of `. ; : ,` plus whitespace, so "We waited; however, ..." is
 * recognized the same way as "However, ...".
 *
 * Any leading `^` is stripped (not only the exact `^(?:` prefix), and the
 * source is wrapped in a non-capturing group so a top-level alternation
 * cannot escape the punctuation prefix.
 *
 * @param {RegExp} pattern Opener pattern, normally anchored with `^`.
 * @returns {RegExp} New pattern matching the same phrase after clause
 *   punctuation. Stateful `g`/`y` flags are dropped; other flags are kept.
 */
function embeddedTransitionPattern(pattern) {
  const unanchored = pattern.source.replace(/^\^/, "");
  // A sticky pattern could only match at index 0, which defeats the purpose
  // of an embedded search; a global one would carry lastIndex state.
  const flags = pattern.flags.replace(/[gy]/g, "");
  return new RegExp(`[.;:,]\\s+(?:${unanchored})`, flags);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Measures how often a sentence reuses a word from the sentence before it.
 *
 * Only tokens longer than three characters from the previous sentence count
 * as candidates; the current sentence "calls back" when any of its tokens is
 * one of them.
 *
 * @param {Array<{tokens: string[]}>} sentences Sentences in reading order.
 * @returns {*} rate(callbacks, pairs, 2) from text-utils, where `pairs` is
 *   the number of adjacent sentence pairs (at least 1).
 */
function callbackRate(sentences) {
  const echoes = sentences.slice(1).filter((current, offset) => {
    // `offset` indexes the slice, so sentences[offset] is the predecessor.
    const carried = new Set(sentences[offset].tokens.filter((token) => token.length > 3));
    return current.tokens.some((token) => carried.has(token));
  }).length;
  const pairCount = Math.max(1, sentences.length - 1);
  return rate(echoes, pairCount, 2);
}
|
|
62
|
+
|
|
63
|
+
/**
 * Lists every run of `size` consecutive values, joined with " -> ".
 *
 * @param {string[]} values Ordered labels.
 * @param {number} size Number of labels per run.
 * @returns {string[]} One entry per sliding window, in order; empty when
 *   `values` is shorter than `size`.
 */
function sequenceNgrams(values, size) {
  const windowCount = Math.max(0, values.length - size + 1);
  return Array.from({ length: windowCount }, (_, start) =>
    values.slice(start, start + size).join(" -> "),
  );
}
|
|
@@ -1,82 +1,82 @@
|
|
|
1
|
-
import { rate, topItems } from "../text-utils.js";
|
|
2
|
-
|
|
3
|
-
const EVIDENCE_PATTERNS = {
|
|
4
|
-
date: /\b\d{1,2}:\d{2}\s?(?:am|pm)?\b|\b20\d{2}-\d{2}-\d{2}\b|\b(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b/i,
|
|
5
|
-
number: /\b\d+(?:\.\d+)?\b/,
|
|
6
|
-
quote: /"[^"]+"|'[^']+'|^>/,
|
|
7
|
-
url: /https?:\/\/\S+/i,
|
|
8
|
-
citation: /\[[^\]]+\]\([^)]+\)|\([A-Z][A-Za-z]+,\s*\d{4}\)/,
|
|
9
|
-
sourceAttribution: /\b(according to|reported|observed|noted|recorded|quoted|interviewed|surveyed|field notes said|data shows|study found|the memo|the log|the report)\b/i,
|
|
10
|
-
sensory: /\b(cold|warm|hot|cool|quiet|loud|bright|dark|red|blue|green|rough|smooth|sharp|soft|smelled|smell|scent|tasted|heard|sound|noise|flashed|visible|physical|rain|metal|smoke)\b/i,
|
|
11
|
-
specificExample: /\b(for example|for instance|such as|including|included|includes|sample|case in point|specifically|in one case)\b/i,
|
|
12
|
-
};
|
|
13
|
-
|
|
14
|
-
const ABSTRACT_CLAIM_RE = /\b(always|never|everyone|everything|nothing|best|better|worse|important|obvious|clearly|should|must|need to|have to|all|none|every)\b/i;
|
|
15
|
-
|
|
16
|
-
export function analyzeEvidence(documents) {
|
|
17
|
-
const documentResults = documents.map(documentEvidence);
|
|
18
|
-
const sentenceCount = documentResults.reduce((sum, item) => sum + item.sentenceCount, 0);
|
|
19
|
-
const evidenceSentenceCount = documentResults.reduce((sum, item) => sum + item.evidenceSentenceCount, 0);
|
|
20
|
-
const claimSentenceCount = documentResults.reduce((sum, item) => sum + item.claimSentenceCount, 0);
|
|
21
|
-
const supportedClaimCount = documentResults.reduce((sum, item) => sum + item.supportedClaimCount, 0);
|
|
22
|
-
const unsupportedClaimCount = documentResults.reduce((sum, item) => sum + item.unsupportedClaimCount, 0);
|
|
23
|
-
const typeValues = documentResults.flatMap((item) => item.typeValues);
|
|
24
|
-
|
|
25
|
-
return {
|
|
26
|
-
family: "evidence",
|
|
27
|
-
confidence: sentenceCount >= 12 ? "medium" : "low",
|
|
28
|
-
features: {
|
|
29
|
-
sentenceCount,
|
|
30
|
-
evidenceSentenceCount,
|
|
31
|
-
evidenceSentenceRate: rate(evidenceSentenceCount, sentenceCount, 2),
|
|
32
|
-
claimSentenceCount,
|
|
33
|
-
claimSentenceRate: rate(claimSentenceCount, sentenceCount, 2),
|
|
34
|
-
supportedClaimRate: rate(supportedClaimCount, Math.max(1, claimSentenceCount), 2),
|
|
35
|
-
unsupportedClaimRate: rate(unsupportedClaimCount, Math.max(1, claimSentenceCount), 2),
|
|
36
|
-
evidenceTypes: topItems(typeValues, 8),
|
|
37
|
-
},
|
|
38
|
-
examples: topItems(typeValues, 4).map((item) => `${item.value}: ${item.count}`),
|
|
39
|
-
warnings: sentenceCount < 12 ? ["Evidence confidence is limited because the corpus has fewer than 12 sentences."] : [],
|
|
40
|
-
revisionHandles: ["Compare how broad claims are supported by concrete scenes, numbers, quotes, citations, or examples."],
|
|
41
|
-
};
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
function documentEvidence(document) {
|
|
45
|
-
const sentences = document.sentences;
|
|
46
|
-
const sentenceEvidenceTypes = sentences.map((sentence) => evidenceTypes(sentence.text));
|
|
47
|
-
const evidenceSentences = sentences.filter((_, index) => sentenceEvidenceTypes[index].length > 0);
|
|
48
|
-
const claimIndexes = sentences
|
|
49
|
-
.map((sentence, index) => ({ sentence, index }))
|
|
50
|
-
.filter(({ sentence }) => ABSTRACT_CLAIM_RE.test(sentence.text))
|
|
51
|
-
.map(({ index }) => index);
|
|
52
|
-
const supportedClaimIndexes = claimIndexes.filter((index) => hasNearbyEvidence(sentenceEvidenceTypes, index));
|
|
53
|
-
return {
|
|
54
|
-
sentenceCount: sentences.length,
|
|
55
|
-
evidenceSentenceCount: evidenceSentences.length,
|
|
56
|
-
claimSentenceCount: claimIndexes.length,
|
|
57
|
-
supportedClaimCount: supportedClaimIndexes.length,
|
|
58
|
-
unsupportedClaimCount: claimIndexes.length - supportedClaimIndexes.length,
|
|
59
|
-
typeValues: evidenceSentences.flatMap((sentence) => evidenceTypes(sentence.text)),
|
|
60
|
-
};
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
function hasNearbyEvidence(sentenceEvidenceTypes, claimIndex) {
|
|
64
|
-
const start = Math.max(0, claimIndex - 2);
|
|
65
|
-
const end = Math.min(sentenceEvidenceTypes.length - 1, claimIndex + 2);
|
|
66
|
-
for (let index = start; index <= end; index += 1) {
|
|
67
|
-
if (sentenceEvidenceTypes[index].length > 0) {
|
|
68
|
-
return true;
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
return false;
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
export function evidenceTypes(text) {
|
|
75
|
-
return Object.entries(EVIDENCE_PATTERNS)
|
|
76
|
-
.filter(([, pattern]) => pattern.test(text))
|
|
77
|
-
.map(([type]) => type);
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
export function isAbstractClaim(text) {
|
|
81
|
-
return ABSTRACT_CLAIM_RE.test(text);
|
|
82
|
-
}
|
|
1
|
+
import { rate, topItems } from "../text-utils.js";
|
|
2
|
+
|
|
3
|
+
/**
 * One pattern per evidence type; evidenceTypes() reports every key whose
 * pattern matches a sentence. None of the patterns is global, so `.test()`
 * carries no state between calls.
 */
const EVIDENCE_PATTERNS = {
  // Clock times, ISO dates (20xx-mm-dd), weekday names, and "Month d, yyyy".
  date: /\b\d{1,2}:\d{2}\s?(?:am|pm)?\b|\b20\d{2}-\d{2}-\d{2}\b|\b(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b|\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s+\d{4}\b/i,
  number: /\b\d+(?:\.\d+)?\b/,
  // NOTE(review): the single-quote alternative also matches the span between
  // two apostrophes (e.g. two contractions in one sentence) — confirm whether
  // that false positive matters.
  quote: /"[^"]+"|'[^']+'|^>/,
  url: /https?:\/\/\S+/i,
  // Markdown links, footnote references ([^1]) and (Author, 2020) citations.
  citation: /\[[^\]]+\]\([^)]+\)|\[\^[^\]]+\]|\([A-Z][A-Za-z]+,\s*\d{4}\)/,
  // "source:" lives outside the \b(...)\b group: a trailing \b after ":" only
  // holds when a word character follows immediately, so "Source: memo" (colon
  // then space) could never match while it sat inside the group.
  sourceAttribution: /\b(according to|reported|observed|noted|recorded|quoted|interviewed|surveyed|field notes said|data shows|study found|the memo|the log|the report|internal memo)\b|\bsource:/i,
  sensory: /\b(cold|warm|hot|cool|quiet|loud|bright|dark|red|blue|green|rough|smooth|sharp|soft|smelled|smell|scent|tasted|heard|sound|noise|flashed|visible|physical|rain|metal|smoke)\b/i,
  specificExample: /\b(for example|for instance|such as|including|included|includes|sample|case in point|specifically|in one case)\b/i,
};

// Words that mark a sentence as a broad, abstract claim (absolutes,
// evaluations, obligations). Used for claim detection and isAbstractClaim().
const ABSTRACT_CLAIM_RE = /\b(always|never|everyone|everything|nothing|best|better|worse|important|obvious|clearly|should|must|need to|have to|all|none|every)\b/i;
|
|
15
|
+
|
|
16
|
+
/**
 * Builds the "evidence" analysis family for a corpus.
 *
 * Each document is summarized by documentEvidence(); the per-document counts
 * are summed and turned into rates of evidence-bearing sentences, abstract
 * claims, and claims with or without nearby evidence.
 *
 * @param {Array<{sentences: Array<{text: string}>}>} documents
 * @returns {object} Report with `family`, `confidence`, `features`,
 *   `examples`, `warnings` and `revisionHandles`.
 */
export function analyzeEvidence(documents) {
  const perDocument = documents.map(documentEvidence);
  const sumOf = (field) => perDocument.reduce((total, entry) => total + entry[field], 0);

  const sentenceCount = sumOf("sentenceCount");
  const evidenceSentenceCount = sumOf("evidenceSentenceCount");
  const claimSentenceCount = sumOf("claimSentenceCount");
  const supportedClaimCount = sumOf("supportedClaimCount");
  const unsupportedClaimCount = sumOf("unsupportedClaimCount");
  const typeValues = perDocument.flatMap((entry) => entry.typeValues);

  // Claim rates divide by at least 1 so a corpus without claims still
  // produces a defined denominator.
  const claimDenominator = Math.max(1, claimSentenceCount);

  const features = {
    sentenceCount,
    evidenceSentenceCount,
    evidenceSentenceRate: rate(evidenceSentenceCount, sentenceCount, 2),
    claimSentenceCount,
    claimSentenceRate: rate(claimSentenceCount, sentenceCount, 2),
    supportedClaimRate: rate(supportedClaimCount, claimDenominator, 2),
    unsupportedClaimRate: rate(unsupportedClaimCount, claimDenominator, 2),
    evidenceTypes: topItems(typeValues, 8),
  };

  const warnings = [];
  if (sentenceCount < 12) {
    warnings.push("Evidence confidence is limited because the corpus has fewer than 12 sentences.");
  }

  return {
    family: "evidence",
    confidence: sentenceCount >= 12 ? "medium" : "low",
    features,
    examples: topItems(typeValues, 4).map((item) => `${item.value}: ${item.count}`),
    warnings,
    revisionHandles: ["Compare how broad claims are supported by concrete scenes, numbers, quotes, citations, or examples."],
  };
}
|
|
43
|
+
|
|
44
|
+
/**
 * Counts evidence and abstract claims inside a single document.
 *
 * A claim counts as supported when hasNearbyEvidence() finds an
 * evidence-bearing sentence within its window in the same document.
 *
 * @param {{sentences: Array<{text: string}>}} document
 * @returns {{sentenceCount: number, evidenceSentenceCount: number,
 *   claimSentenceCount: number, supportedClaimCount: number,
 *   unsupportedClaimCount: number, typeValues: string[]}}
 */
function documentEvidence(document) {
  const { sentences } = document;
  const typesBySentence = sentences.map((sentence) => evidenceTypes(sentence.text));

  // Positions of sentences that read as abstract claims.
  const claimPositions = [];
  sentences.forEach((sentence, position) => {
    if (ABSTRACT_CLAIM_RE.test(sentence.text)) {
      claimPositions.push(position);
    }
  });

  const supportedCount = claimPositions.filter((position) =>
    hasNearbyEvidence(typesBySentence, position),
  ).length;
  const evidenceBearing = typesBySentence.filter((types) => types.length > 0);

  return {
    sentenceCount: sentences.length,
    evidenceSentenceCount: evidenceBearing.length,
    claimSentenceCount: claimPositions.length,
    supportedClaimCount: supportedCount,
    unsupportedClaimCount: claimPositions.length - supportedCount,
    // Flattening the per-sentence results yields the same list as matching
    // every evidence sentence again.
    typeValues: evidenceBearing.flat(),
  };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Tells whether any sentence within two positions of a claim (the claim
 * itself included) carries at least one evidence type.
 *
 * @param {string[][]} sentenceEvidenceTypes Evidence types per sentence.
 * @param {number} claimIndex Position of the claim sentence.
 * @returns {boolean} True when the window contains a non-empty entry.
 */
function hasNearbyEvidence(sentenceEvidenceTypes, claimIndex) {
  const from = Math.max(0, claimIndex - 2);
  const until = Math.min(sentenceEvidenceTypes.length, claimIndex + 3);
  // Clamping the end to `from` keeps an out-of-range window empty instead of
  // letting a negative end index wrap around inside slice().
  return sentenceEvidenceTypes
    .slice(from, Math.max(from, until))
    .some((types) => types.length > 0);
}
|
|
73
|
+
|
|
74
|
+
/**
 * Lists the evidence types whose pattern matches the given text.
 *
 * @param {string} text Sentence text.
 * @returns {string[]} Matching EVIDENCE_PATTERNS keys, in declaration order.
 */
export function evidenceTypes(text) {
  const matched = [];
  for (const [type, pattern] of Object.entries(EVIDENCE_PATTERNS)) {
    if (pattern.test(text)) {
      matched.push(type);
    }
  }
  return matched;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Tells whether the text contains one of the abstract-claim marker words
 * defined by ABSTRACT_CLAIM_RE.
 *
 * @param {string} text Sentence text.
 * @returns {boolean}
 */
export function isAbstractClaim(text) {
  const hasClaimMarker = ABSTRACT_CLAIM_RE.test(text);
  return hasClaimMarker;
}
|
|
@@ -1,114 +1,114 @@
|
|
|
1
|
-
import {
|
|
2
|
-
FUNCTION_WORDS,
|
|
3
|
-
characterNgrams,
|
|
4
|
-
contentWords,
|
|
5
|
-
distribution,
|
|
6
|
-
normalizeText,
|
|
7
|
-
rate,
|
|
8
|
-
tokenizeWords,
|
|
9
|
-
topItems,
|
|
10
|
-
} from "../text-utils.js";
|
|
11
|
-
|
|
12
|
-
export function analyzeLexical(documents) {
|
|
13
|
-
const text = documents.map((document) => document.text).join("\n\n");
|
|
14
|
-
const words = tokenizeWords(text);
|
|
15
|
-
const content = contentWords(text);
|
|
16
|
-
const sentences = documents.flatMap((document) => document.sentences);
|
|
17
|
-
const functionWordSet = new Set(FUNCTION_WORDS);
|
|
18
|
-
|
|
19
|
-
return {
|
|
20
|
-
family: "lexical",
|
|
21
|
-
confidence: confidenceFor(words.length),
|
|
22
|
-
features: {
|
|
23
|
-
wordCount: words.length,
|
|
24
|
-
contentWordCount: content.length,
|
|
25
|
-
vocabularyRichness: {
|
|
26
|
-
uniqueContentWords: new Set(content).size,
|
|
27
|
-
contentTypeTokenRatio: rate(new Set(content).size, content.length, 3),
|
|
28
|
-
},
|
|
29
|
-
wordLength: distribution(words.map((word) => word.length)),
|
|
30
|
-
functionWords: topItems(words.filter((word) => functionWordSet.has(word)), 24),
|
|
31
|
-
functionWordBigrams: topItems(tokenNgrams(words.filter((word) => functionWordSet.has(word)), 2), 36),
|
|
32
|
-
characterTrigrams: topItems(characterNgrams(text, 3), 24),
|
|
33
|
-
maskedCharacterFourgrams: topItems(maskedCharacterNgrams(text, 4), 48),
|
|
34
|
-
repeatedMotifs: topItems(content, 16).filter((item) => item.count > 1),
|
|
35
|
-
sentenceInitialTokens: topItems(sentences.map((sentence) => boundaryToken(sentence.tokens[0], functionWordSet)), 16),
|
|
36
|
-
sentenceFinalTokens: topItems(sentences.map((sentence) => boundaryToken(sentence.tokens.at(-1), functionWordSet)), 16),
|
|
37
|
-
punctuationNgrams: topItems(punctuationNgrams(text, 3), 16),
|
|
38
|
-
punctuation: {
|
|
39
|
-
commaRate: rate(count(text, /,/g), sentences.length, 2),
|
|
40
|
-
semicolonRate: rate(count(text, /;/g), sentences.length, 2),
|
|
41
|
-
colonRate: rate(count(text, /:/g), sentences.length, 2),
|
|
42
|
-
questionRate: rate(count(text, /\?/g), sentences.length, 2),
|
|
43
|
-
dashRate: rate(count(text, /--|-/g), sentences.length, 2),
|
|
44
|
-
},
|
|
45
|
-
},
|
|
46
|
-
examples: topItems(content, 5).map((item) => item.value),
|
|
47
|
-
warnings: words.length < 200 ? ["Lexical confidence is limited because the corpus has fewer than 200 words."] : [],
|
|
48
|
-
revisionHandles: ["Compare function words, vocabulary richness, motifs, and punctuation habits."],
|
|
49
|
-
};
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
function boundaryToken(word, functionWordSet) {
|
|
53
|
-
if (!word) {
|
|
54
|
-
return null;
|
|
55
|
-
}
|
|
56
|
-
if (functionWordSet.has(word)) {
|
|
57
|
-
return word;
|
|
58
|
-
}
|
|
59
|
-
if (/^\d+$/.test(word)) {
|
|
60
|
-
return "<number>";
|
|
61
|
-
}
|
|
62
|
-
return "<content>";
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
function tokenNgrams(tokens, size) {
|
|
66
|
-
const grams = [];
|
|
67
|
-
for (let index = 0; index <= tokens.length - size; index += 1) {
|
|
68
|
-
grams.push(tokens.slice(index, index + size).join(" "));
|
|
69
|
-
}
|
|
70
|
-
return grams;
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
function maskedCharacterNgrams(text, size) {
|
|
74
|
-
const functionWordSet = new Set(FUNCTION_WORDS);
|
|
75
|
-
const masked = normalizeText(text)
|
|
76
|
-
.replace(/\p{L}[\p{L}\p{N}'-]*|\p{N}+/gu, (word) => {
|
|
77
|
-
const normalized = word.toLowerCase().replace(/'s$/, "");
|
|
78
|
-
if (functionWordSet.has(normalized)) {
|
|
79
|
-
return normalized;
|
|
80
|
-
}
|
|
81
|
-
if (/^\p{N}+$/u.test(normalized)) {
|
|
82
|
-
return "@";
|
|
83
|
-
}
|
|
84
|
-
return "#";
|
|
85
|
-
})
|
|
86
|
-
.replace(/\s+/g, " ");
|
|
87
|
-
const grams = [];
|
|
88
|
-
for (let index = 0; index <= masked.length - size; index += 1) {
|
|
89
|
-
const gram = masked.slice(index, index + size);
|
|
90
|
-
if (gram.trim()) {
|
|
91
|
-
grams.push(gram);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
return grams;
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
function punctuationNgrams(text, size) {
|
|
98
|
-
const marks = Array.from(String(text ?? "").matchAll(/[.,;:!?-]/g)).map((match) => match[0]);
|
|
99
|
-
return tokenNgrams(marks, size);
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
function confidenceFor(wordCount) {
|
|
103
|
-
if (wordCount >= 2000) {
|
|
104
|
-
return "high";
|
|
105
|
-
}
|
|
106
|
-
if (wordCount >= 120) {
|
|
107
|
-
return "medium";
|
|
108
|
-
}
|
|
109
|
-
return "low";
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
function count(text, pattern) {
|
|
113
|
-
return Array.from(text.matchAll(pattern)).length;
|
|
114
|
-
}
|
|
1
|
+
import {
|
|
2
|
+
FUNCTION_WORDS,
|
|
3
|
+
characterNgrams,
|
|
4
|
+
contentWords,
|
|
5
|
+
distribution,
|
|
6
|
+
normalizeText,
|
|
7
|
+
rate,
|
|
8
|
+
tokenizeWords,
|
|
9
|
+
topItems,
|
|
10
|
+
} from "../text-utils.js";
|
|
11
|
+
|
|
12
|
+
/**
 * Builds the "lexical" analysis family for a corpus.
 *
 * All document texts are joined with blank lines and analysed as one string:
 * word and content-word counts, vocabulary richness, word-length
 * distribution, function-word habits, character n-grams, repeated motifs,
 * sentence boundary tokens, and punctuation rates per sentence.
 *
 * NOTE(review): the warning fires below 200 words while confidenceFor()
 * already reports "medium" from 120 words — confirm the two thresholds are
 * meant to differ.
 *
 * @param {Array<{text: string, sentences: Array<{tokens: string[]}>}>} documents
 * @returns {object} Report with `family`, `confidence`, `features`,
 *   `examples`, `warnings` and `revisionHandles`.
 */
export function analyzeLexical(documents) {
  const text = documents.map((doc) => doc.text).join("\n\n");
  const words = tokenizeWords(text);
  const content = contentWords(text);
  const sentences = documents.flatMap((doc) => doc.sentences);
  const functionWordSet = new Set(FUNCTION_WORDS);

  // Each consumer gets its own filtered array, so nothing is shared between
  // the two topItems() calls below.
  const functionWordsOnly = () => words.filter((word) => functionWordSet.has(word));
  const boundaryTokens = (pick) =>
    sentences.map((sentence) => boundaryToken(pick(sentence), functionWordSet));
  const perSentence = (pattern) => rate(count(text, pattern), sentences.length, 2);
  const distinctContentCount = new Set(content).size;

  const features = {
    wordCount: words.length,
    contentWordCount: content.length,
    vocabularyRichness: {
      uniqueContentWords: distinctContentCount,
      contentTypeTokenRatio: rate(distinctContentCount, content.length, 3),
    },
    wordLength: distribution(words.map((word) => word.length)),
    functionWords: topItems(functionWordsOnly(), 24),
    functionWordBigrams: topItems(tokenNgrams(functionWordsOnly(), 2), 36),
    characterTrigrams: topItems(characterNgrams(text, 3), 24),
    maskedCharacterFourgrams: topItems(maskedCharacterNgrams(text, 4), 48),
    repeatedMotifs: topItems(content, 16).filter((item) => item.count > 1),
    sentenceInitialTokens: topItems(boundaryTokens((sentence) => sentence.tokens[0]), 16),
    sentenceFinalTokens: topItems(boundaryTokens((sentence) => sentence.tokens.at(-1)), 16),
    punctuationNgrams: topItems(punctuationNgrams(text, 3), 16),
    punctuation: {
      commaRate: perSentence(/,/g),
      semicolonRate: perSentence(/;/g),
      colonRate: perSentence(/:/g),
      questionRate: perSentence(/\?/g),
      dashRate: perSentence(/--|-/g),
    },
  };

  const warnings = [];
  if (words.length < 200) {
    warnings.push("Lexical confidence is limited because the corpus has fewer than 200 words.");
  }

  return {
    family: "lexical",
    confidence: confidenceFor(words.length),
    features,
    examples: topItems(content, 5).map((item) => item.value),
    warnings,
    revisionHandles: ["Compare function words, vocabulary richness, motifs, and punctuation habits."],
  };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Reduces a sentence-boundary word to a comparable token.
 *
 * @param {string|undefined} word First or last token of a sentence.
 * @param {Set<string>} functionWordSet Words that are kept verbatim.
 * @returns {string|null} null for a missing/empty word, the word itself when
 *   it is a function word, "<number>" for all-digit words, and "<content>"
 *   for everything else.
 */
function boundaryToken(word, functionWordSet) {
  if (!word) {
    return null;
  }
  if (functionWordSet.has(word)) {
    return word;
  }
  return /^\d+$/.test(word) ? "<number>" : "<content>";
}
|
|
64
|
+
|
|
65
|
+
/**
 * Lists every run of `size` consecutive tokens, joined with a single space.
 *
 * @param {string[]} tokens Ordered tokens.
 * @param {number} size Number of tokens per run.
 * @returns {string[]} One entry per sliding window, in order; empty when
 *   `tokens` is shorter than `size`.
 */
function tokenNgrams(tokens, size) {
  const windowCount = Math.max(0, tokens.length - size + 1);
  return Array.from({ length: windowCount }, (_, start) =>
    tokens.slice(start, start + size).join(" "),
  );
}
|
|
72
|
+
|
|
73
|
+
/**
 * Character n-grams over a content-masked copy of the text.
 *
 * After normalizeText(), each word-like run is replaced: function words are
 * kept (lower-cased, with a trailing 's removed), all-digit runs become "@",
 * and every other word becomes "#". Whitespace runs collapse to one space,
 * and windows that are only whitespace are skipped.
 *
 * @param {string} text Raw text.
 * @param {number} size Characters per n-gram.
 * @returns {string[]} N-grams in text order.
 */
function maskedCharacterNgrams(text, size) {
  const keep = new Set(FUNCTION_WORDS);
  const maskWord = (word) => {
    const bare = word.toLowerCase().replace(/'s$/, "");
    if (keep.has(bare)) {
      return bare;
    }
    return /^\p{N}+$/u.test(bare) ? "@" : "#";
  };

  const masked = normalizeText(text)
    .replace(/\p{L}[\p{L}\p{N}'-]*|\p{N}+/gu, (word) => maskWord(word))
    .replace(/\s+/g, " ");

  const windowCount = Math.max(0, masked.length - size + 1);
  return Array.from({ length: windowCount }, (_, start) => masked.slice(start, start + size))
    .filter((gram) => gram.trim());
}
|
|
96
|
+
|
|
97
|
+
/**
 * N-grams over the sequence of punctuation marks (. , ; : ! ? -) in a text;
 * every other character is ignored.
 *
 * @param {string} text Raw text; null/undefined is treated as "".
 * @param {number} size Marks per n-gram.
 * @returns {string[]} Space-joined runs of marks, via tokenNgrams().
 */
function punctuationNgrams(text, size) {
  // match() with a global pattern yields the matched strings, or null.
  const marks = String(text ?? "").match(/[.,;:!?-]/g) ?? [];
  return tokenNgrams(marks, size);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Maps a corpus word count to a confidence level.
 *
 * @param {number} wordCount Total words in the corpus.
 * @returns {"high"|"medium"|"low"} "high" from 2000 words, "medium" from
 *   120 words, otherwise "low".
 */
function confidenceFor(wordCount) {
  // Ordered from the strictest minimum down; the first one met wins.
  const tiers = [
    [2000, "high"],
    [120, "medium"],
  ];
  const tier = tiers.find(([minimum]) => wordCount >= minimum);
  return tier ? tier[1] : "low";
}
|
|
111
|
+
|
|
112
|
+
/**
 * Counts the matches of a global pattern in a text.
 *
 * @param {string} text Text to scan.
 * @param {RegExp} pattern Pattern with the `g` flag (matchAll requires it).
 * @returns {number} Number of matches.
 */
function count(text, pattern) {
  let total = 0;
  for (const _match of text.matchAll(pattern)) {
    total += 1;
  }
  return total;
}
|
|
@@ -1,34 +1,70 @@
|
|
|
1
|
-
import { contentWords, rate, topItems } from "../text-utils.js";
|
|
2
|
-
|
|
3
|
-
const REGISTER_MARKERS = {
|
|
4
|
-
narrative: ["scene", "moment", "watched", "waited", "noticed", "remembered", "described", "story"],
|
|
5
|
-
explanatory: ["because", "so", "therefore", "means", "shows", "explains", "reason", "pattern"],
|
|
6
|
-
argumentative: ["should", "must", "better", "important", "claim", "therefore", "argue", "evidence"],
|
|
7
|
-
instructional: ["start", "fix", "use", "avoid", "keep", "write", "revise", "follow"],
|
|
8
|
-
reflective: ["lesson", "pause", "changed", "remember", "noticed", "felt", "learned", "realized"],
|
|
9
|
-
technical: ["file", "test", "system", "code", "api", "build"],
|
|
10
|
-
personal: ["i", "my", "we", "our"],
|
|
11
|
-
formal: ["requires", "outcomes", "process", "alignment", "therefore"],
|
|
12
|
-
};
|
|
13
|
-
|
|
14
|
-
export function analyzeRegister(documents) {
|
|
15
|
-
const text = documents.map((document) => document.text.toLowerCase()).join("\n\n");
|
|
16
|
-
const words = new Set(contentWords(text));
|
|
17
|
-
const
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
1
|
+
import { contentWords, rate, topItems } from "../text-utils.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Default marker words per register. analyzeRegister() scores a register by
 * the share of its markers that appear in the corpus; a marker may belong to
 * more than one register (for example "therefore").
 */
const REGISTER_MARKERS = {
  narrative: [
    "scene",
    "moment",
    "watched",
    "waited",
    "noticed",
    "remembered",
    "described",
    "story",
  ],
  explanatory: [
    "because",
    "so",
    "therefore",
    "means",
    "shows",
    "explains",
    "reason",
    "pattern",
  ],
  argumentative: [
    "should",
    "must",
    "better",
    "important",
    "claim",
    "therefore",
    "argue",
    "evidence",
  ],
  instructional: [
    "start",
    "fix",
    "use",
    "avoid",
    "keep",
    "write",
    "revise",
    "follow",
  ],
  reflective: [
    "lesson",
    "pause",
    "changed",
    "remember",
    "noticed",
    "felt",
    "learned",
    "realized",
  ],
  technical: ["file", "test", "system", "code", "api", "build"],
  personal: ["i", "my", "we", "our"],
  formal: ["requires", "outcomes", "process", "alignment", "therefore"],
};
|
|
13
|
+
|
|
14
|
+
/**
 * Builds the "register" analysis family for a corpus.
 *
 * Each register is scored by the share of its marker list that appears in
 * the lower-cased corpus text. Scores are sorted highest first, with ties
 * broken by register name.
 *
 * @param {Array<{text: string}>} documents Corpus documents.
 * @param {{markers?: Object<string, string[]>}} [options]
 *   `markers` maps a register name to its marker list; defaults to
 *   REGISTER_MARKERS.
 * @returns {object} Report with `family`, `confidence`, `features`,
 *   `examples`, `warnings` and `revisionHandles`.
 */
export function analyzeRegister(documents, { markers = REGISTER_MARKERS } = {}) {
  const text = documents.map((doc) => doc.text.toLowerCase()).join("\n\n");
  const contentTokens = contentWords(text);
  const words = new Set(contentTokens);

  const markerSets = [];
  for (const [value, markerList] of Object.entries(markers)) {
    const matchedMarkers = markerList.filter((marker) => markerAppears(text, words, marker));
    markerSets.push({
      value,
      markers: markerList,
      matchedMarkers,
      score: rate(matchedMarkers.length, markerList.length, 2),
    });
  }

  const byScoreThenName = (left, right) =>
    right.score - left.score || left.value.localeCompare(right.value);
  const scores = markerSets
    .map((entry) => ({ value: entry.value, score: entry.score }))
    .sort(byScoreThenName);
  const mixedRegister = isMixedRegister(scores);

  const warnings = [];
  if (documents.length < 3) {
    warnings.push("Register confidence is limited because the corpus has fewer than 3 documents.");
  }
  if (mixedRegister) {
    warnings.push("Mixed register signals detected; treat the primary register as a weak summary of the genre mix.");
  }

  return {
    family: "register",
    confidence: documents.length >= 3 ? "medium" : "low",
    features: {
      // Falls back to a placeholder when no registers were supplied at all.
      primary: scores[0] ?? { value: "unknown", score: 0 },
      scores,
      markerSets,
      mixedRegister,
      topContentWords: topItems(contentTokens, 12),
    },
    examples: scores.slice(0, 3).map((item) => `${item.value}: ${item.score}`),
    warnings,
    revisionHandles: ["Check whether the draft uses the same broad register and genre mix as the corpus."],
  };
}
|
|
50
|
+
|
|
51
|
+
/**
 * Decides whether the two strongest registers are close enough to call the
 * corpus mixed.
 *
 * @param {Array<{value: string, score: number}>} scores Register scores,
 *   expected highest first (the caller sorts them).
 * @returns {boolean} True when a second register with a positive score
 *   reaches at least 0.2 and at least 60% of the leading score.
 */
function isMixedRegister(scores) {
  const [leader, runnerUp] = scores.filter(({ score }) => score > 0);
  if (runnerUp === undefined) {
    // Fewer than two registers registered any marker at all.
    return false;
  }
  const threshold = Math.max(0.2, leader.score * 0.6);
  return runnerUp.score >= threshold;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Tells whether a register marker occurs in the corpus.
 *
 * Single-token markers are first looked up in the content-word set; anything
 * not found there (function words, phrases, markers with punctuation) is
 * searched for in the full text as a whole word or phrase.
 *
 * - Whitespace inside a phrase matches any whitespace run, so a phrase still
 *   matches across a line break or double space.
 * - Word edges are enforced with Unicode-aware lookarounds rather than `\b`,
 *   so markers that start or end with a non-word character ("c++",
 *   "source:") can match.
 * - An empty or whitespace-only marker never matches.
 *
 * @param {string} text Corpus text (the caller passes it lower-cased).
 * @param {Set<string>} words Content words of the corpus.
 * @param {string} marker Marker word or phrase.
 * @returns {boolean}
 */
function markerAppears(text, words, marker) {
  const normalized = String(marker ?? "").trim().toLowerCase();
  if (normalized === "") {
    return false;
  }
  if (/^[a-z0-9'-]+$/.test(normalized) && words.has(normalized)) {
    return true;
  }
  // escapeRegExp leaves whitespace untouched, so collapse it afterwards.
  const body = escapeRegExp(normalized).replace(/\s+/g, "\\s+");
  const pattern = new RegExp(`(?<![\\p{L}\\p{N}_])${body}(?![\\p{L}\\p{N}_])`, "iu");
  return pattern.test(text);
}

/**
 * Escapes regular-expression syntax characters so the value matches itself
 * literally inside a RegExp source.
 *
 * @param {*} value Value to escape; coerced with String().
 * @returns {string}
 */
function escapeRegExp(value) {
  return String(value).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|