dravoice 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,63 +1,63 @@
1
- import { rate, topItems } from "../text-utils.js";
2
-
3
- const TRANSITIONS = {
4
- additive: /\b(also|and|another|again|plus)\b/i,
5
- contrast: /\b(but|however|although|yet|instead|still)\b/i,
6
- causal: /\b(because|so|therefore|since|as a result)\b/i,
7
- temporal: /\b(then|before|after|while|when|first|second|later)\b/i,
8
- example: /\b(for example|such as|including|like)\b/i,
9
- conclusion: /\b(so|therefore|finally|in the end|the lesson)\b/i,
10
- };
11
-
12
- export function analyzeDiscourse(documents) {
13
- const sentences = documents.flatMap((document) => document.sentences);
14
- const labels = sentences.map((sentence) => transitionLabel(sentence.text));
15
- const nonPlainLabels = labels.filter((label) => label !== "plain");
16
- const transitionRates = {};
17
- for (const key of Object.keys(TRANSITIONS)) {
18
- transitionRates[key] = rate(labels.filter((label) => label === key).length, sentences.length, 2);
19
- }
20
-
21
- return {
22
- family: "discourse",
23
- confidence: sentences.length >= 12 ? "medium" : "low",
24
- features: {
25
- transitionRates,
26
- transitionSequence: labels.slice(0, 12),
27
- transitionBigrams: topItems(sequenceNgrams(labels, 2), 12),
28
- transitionTrigrams: topItems(sequenceNgrams(labels, 3), 12),
29
- sentenceCallbacks: callbackRate(sentences),
30
- },
31
- examples: topItems(nonPlainLabels, 5).map((item) => item.value),
32
- warnings: sentences.length < 12 ? ["Discourse confidence is limited because the corpus has fewer than 12 sentences."] : [],
33
- revisionHandles: ["Compare how sentences turn, contrast, explain, and return to earlier ideas."],
34
- };
35
- }
36
-
37
- export function transitionLabel(text) {
38
- for (const [label, pattern] of Object.entries(TRANSITIONS)) {
39
- if (pattern.test(text)) {
40
- return label;
41
- }
42
- }
43
- return "plain";
44
- }
45
-
46
- function callbackRate(sentences) {
47
- let callbacks = 0;
48
- for (let index = 1; index < sentences.length; index += 1) {
49
- const previous = new Set(sentences[index - 1].tokens.filter((word) => word.length > 3));
50
- if (sentences[index].tokens.some((word) => previous.has(word))) {
51
- callbacks += 1;
52
- }
53
- }
54
- return rate(callbacks, Math.max(1, sentences.length - 1), 2);
55
- }
56
-
57
- function sequenceNgrams(values, size) {
58
- const result = [];
59
- for (let index = 0; index <= values.length - size; index += 1) {
60
- result.push(values.slice(index, index + size).join(" -> "));
61
- }
62
- return result;
63
- }
1
+ import { rate, topItems } from "../text-utils.js";
2
+
3
+ const TRANSITIONS = {
4
+ contrast: /^(?:but|however|although|yet|instead|still)\b/i,
5
+ causal: /^(?:because|so|therefore|since|as a result)\b/i,
6
+ conclusion: /^(?:therefore|finally|in the end|the lesson)\b/i,
7
+ temporal: /^(?:then|before|after|while|when|first|second|later)\b/i,
8
+ example: /^(?:for example|such as|including)\b/i,
9
+ additive: /^(?:also|another|again|plus|and)\b/i,
10
+ };
11
+
12
+ export function analyzeDiscourse(documents) {
13
+ const sentences = documents.flatMap((document) => document.sentences);
14
+ const labels = sentences.map((sentence) => transitionLabel(sentence.text));
15
+ const nonPlainLabels = labels.filter((label) => label !== "plain");
16
+ const transitionRates = {};
17
+ for (const key of Object.keys(TRANSITIONS)) {
18
+ transitionRates[key] = rate(labels.filter((label) => label === key).length, sentences.length, 2);
19
+ }
20
+
21
+ return {
22
+ family: "discourse",
23
+ confidence: sentences.length >= 12 ? "medium" : "low",
24
+ features: {
25
+ transitionRates,
26
+ transitionSequence: labels.slice(0, 12),
27
+ transitionBigrams: topItems(sequenceNgrams(labels, 2), 12),
28
+ transitionTrigrams: topItems(sequenceNgrams(labels, 3), 12),
29
+ sentenceCallbacks: callbackRate(sentences),
30
+ },
31
+ examples: topItems(nonPlainLabels, 5).map((item) => item.value),
32
+ warnings: sentences.length < 12 ? ["Discourse confidence is limited because the corpus has fewer than 12 sentences."] : [],
33
+ revisionHandles: ["Compare how sentences turn, contrast, explain, and return to earlier ideas."],
34
+ };
35
+ }
36
+
37
+ export function transitionLabel(text) {
38
+ for (const [label, pattern] of Object.entries(TRANSITIONS)) {
39
+ if (pattern.test(text)) {
40
+ return label;
41
+ }
42
+ }
43
+ return "plain";
44
+ }
45
+
46
+ function callbackRate(sentences) {
47
+ let callbacks = 0;
48
+ for (let index = 1; index < sentences.length; index += 1) {
49
+ const previous = new Set(sentences[index - 1].tokens.filter((word) => word.length > 3));
50
+ if (sentences[index].tokens.some((word) => previous.has(word))) {
51
+ callbacks += 1;
52
+ }
53
+ }
54
+ return rate(callbacks, Math.max(1, sentences.length - 1), 2);
55
+ }
56
+
57
+ function sequenceNgrams(values, size) {
58
+ const result = [];
59
+ for (let index = 0; index <= values.length - size; index += 1) {
60
+ result.push(values.slice(index, index + size).join(" -> "));
61
+ }
62
+ return result;
63
+ }
@@ -1,82 +1,82 @@
1
- import { rate, topItems } from "../text-utils.js";
2
-
3
- const EVIDENCE_PATTERNS = {
4
- date: /\b\d{1,2}:\d{2}\s?(?:am|pm)?\b|\b20\d{2}-\d{2}-\d{2}\b|\b(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b/i,
5
- number: /\b\d+(?:\.\d+)?\b/,
6
- quote: /"[^"]+"|'[^']+'|^>/,
7
- url: /https?:\/\/\S+/i,
8
- citation: /\[[^\]]+\]\([^)]+\)|\([A-Z][A-Za-z]+,\s*\d{4}\)/,
9
- sourceAttribution: /\b(according to|reported|observed|noted|recorded|quoted|interviewed|surveyed|field notes said|data shows|study found|the memo|the log|the report)\b/i,
10
- sensory: /\b(cold|warm|hot|cool|quiet|loud|bright|dark|red|blue|green|rough|smooth|sharp|soft|smelled|smell|scent|tasted|heard|sound|noise|flashed|visible|physical|rain|metal|smoke)\b/i,
11
- specificExample: /\b(for example|for instance|such as|including|included|includes|sample|case in point|specifically|in one case)\b/i,
12
- };
13
-
14
- const ABSTRACT_CLAIM_RE = /\b(always|never|everyone|everything|nothing|best|better|worse|important|obvious|clearly|should|must|need to|have to|all|none|every)\b/i;
15
-
16
- export function analyzeEvidence(documents) {
17
- const documentResults = documents.map(documentEvidence);
18
- const sentenceCount = documentResults.reduce((sum, item) => sum + item.sentenceCount, 0);
19
- const evidenceSentenceCount = documentResults.reduce((sum, item) => sum + item.evidenceSentenceCount, 0);
20
- const claimSentenceCount = documentResults.reduce((sum, item) => sum + item.claimSentenceCount, 0);
21
- const supportedClaimCount = documentResults.reduce((sum, item) => sum + item.supportedClaimCount, 0);
22
- const unsupportedClaimCount = documentResults.reduce((sum, item) => sum + item.unsupportedClaimCount, 0);
23
- const typeValues = documentResults.flatMap((item) => item.typeValues);
24
-
25
- return {
26
- family: "evidence",
27
- confidence: sentenceCount >= 12 ? "medium" : "low",
28
- features: {
29
- sentenceCount,
30
- evidenceSentenceCount,
31
- evidenceSentenceRate: rate(evidenceSentenceCount, sentenceCount, 2),
32
- claimSentenceCount,
33
- claimSentenceRate: rate(claimSentenceCount, sentenceCount, 2),
34
- supportedClaimRate: rate(supportedClaimCount, Math.max(1, claimSentenceCount), 2),
35
- unsupportedClaimRate: rate(unsupportedClaimCount, Math.max(1, claimSentenceCount), 2),
36
- evidenceTypes: topItems(typeValues, 8),
37
- },
38
- examples: topItems(typeValues, 4).map((item) => `${item.value}: ${item.count}`),
39
- warnings: sentenceCount < 12 ? ["Evidence confidence is limited because the corpus has fewer than 12 sentences."] : [],
40
- revisionHandles: ["Compare how broad claims are supported by concrete scenes, numbers, quotes, citations, or examples."],
41
- };
42
- }
43
-
44
- function documentEvidence(document) {
45
- const sentences = document.sentences;
46
- const sentenceEvidenceTypes = sentences.map((sentence) => evidenceTypes(sentence.text));
47
- const evidenceSentences = sentences.filter((_, index) => sentenceEvidenceTypes[index].length > 0);
48
- const claimIndexes = sentences
49
- .map((sentence, index) => ({ sentence, index }))
50
- .filter(({ sentence }) => ABSTRACT_CLAIM_RE.test(sentence.text))
51
- .map(({ index }) => index);
52
- const supportedClaimIndexes = claimIndexes.filter((index) => hasNearbyEvidence(sentenceEvidenceTypes, index));
53
- return {
54
- sentenceCount: sentences.length,
55
- evidenceSentenceCount: evidenceSentences.length,
56
- claimSentenceCount: claimIndexes.length,
57
- supportedClaimCount: supportedClaimIndexes.length,
58
- unsupportedClaimCount: claimIndexes.length - supportedClaimIndexes.length,
59
- typeValues: evidenceSentences.flatMap((sentence) => evidenceTypes(sentence.text)),
60
- };
61
- }
62
-
63
- function hasNearbyEvidence(sentenceEvidenceTypes, claimIndex) {
64
- const start = Math.max(0, claimIndex - 2);
65
- const end = Math.min(sentenceEvidenceTypes.length - 1, claimIndex + 2);
66
- for (let index = start; index <= end; index += 1) {
67
- if (sentenceEvidenceTypes[index].length > 0) {
68
- return true;
69
- }
70
- }
71
- return false;
72
- }
73
-
74
- export function evidenceTypes(text) {
75
- return Object.entries(EVIDENCE_PATTERNS)
76
- .filter(([, pattern]) => pattern.test(text))
77
- .map(([type]) => type);
78
- }
79
-
80
- export function isAbstractClaim(text) {
81
- return ABSTRACT_CLAIM_RE.test(text);
82
- }
1
+ import { rate, topItems } from "../text-utils.js";
2
+
3
+ const EVIDENCE_PATTERNS = {
4
+ date: /\b\d{1,2}:\d{2}\s?(?:am|pm)?\b|\b20\d{2}-\d{2}-\d{2}\b|\b(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b/i,
5
+ number: /\b\d+(?:\.\d+)?\b/,
6
+ quote: /"[^"]+"|'[^']+'|^>/,
7
+ url: /https?:\/\/\S+/i,
8
+ citation: /\[[^\]]+\]\([^)]+\)|\([A-Z][A-Za-z]+,\s*\d{4}\)/,
9
+ sourceAttribution: /\b(according to|reported|observed|noted|recorded|quoted|interviewed|surveyed|field notes said|data shows|study found|the memo|the log|the report)\b/i,
10
+ sensory: /\b(cold|warm|hot|cool|quiet|loud|bright|dark|red|blue|green|rough|smooth|sharp|soft|smelled|smell|scent|tasted|heard|sound|noise|flashed|visible|physical|rain|metal|smoke)\b/i,
11
+ specificExample: /\b(for example|for instance|such as|including|included|includes|sample|case in point|specifically|in one case)\b/i,
12
+ };
13
+
14
+ const ABSTRACT_CLAIM_RE = /\b(always|never|everyone|everything|nothing|best|better|worse|important|obvious|clearly|should|must|need to|have to|all|none|every)\b/i;
15
+
16
+ export function analyzeEvidence(documents) {
17
+ const documentResults = documents.map(documentEvidence);
18
+ const sentenceCount = documentResults.reduce((sum, item) => sum + item.sentenceCount, 0);
19
+ const evidenceSentenceCount = documentResults.reduce((sum, item) => sum + item.evidenceSentenceCount, 0);
20
+ const claimSentenceCount = documentResults.reduce((sum, item) => sum + item.claimSentenceCount, 0);
21
+ const supportedClaimCount = documentResults.reduce((sum, item) => sum + item.supportedClaimCount, 0);
22
+ const unsupportedClaimCount = documentResults.reduce((sum, item) => sum + item.unsupportedClaimCount, 0);
23
+ const typeValues = documentResults.flatMap((item) => item.typeValues);
24
+
25
+ return {
26
+ family: "evidence",
27
+ confidence: sentenceCount >= 12 ? "medium" : "low",
28
+ features: {
29
+ sentenceCount,
30
+ evidenceSentenceCount,
31
+ evidenceSentenceRate: rate(evidenceSentenceCount, sentenceCount, 2),
32
+ claimSentenceCount,
33
+ claimSentenceRate: rate(claimSentenceCount, sentenceCount, 2),
34
+ supportedClaimRate: rate(supportedClaimCount, Math.max(1, claimSentenceCount), 2),
35
+ unsupportedClaimRate: rate(unsupportedClaimCount, Math.max(1, claimSentenceCount), 2),
36
+ evidenceTypes: topItems(typeValues, 8),
37
+ },
38
+ examples: topItems(typeValues, 4).map((item) => `${item.value}: ${item.count}`),
39
+ warnings: sentenceCount < 12 ? ["Evidence confidence is limited because the corpus has fewer than 12 sentences."] : [],
40
+ revisionHandles: ["Compare how broad claims are supported by concrete scenes, numbers, quotes, citations, or examples."],
41
+ };
42
+ }
43
+
44
+ function documentEvidence(document) {
45
+ const sentences = document.sentences;
46
+ const sentenceEvidenceTypes = sentences.map((sentence) => evidenceTypes(sentence.text));
47
+ const evidenceSentences = sentences.filter((_, index) => sentenceEvidenceTypes[index].length > 0);
48
+ const claimIndexes = sentences
49
+ .map((sentence, index) => ({ sentence, index }))
50
+ .filter(({ sentence }) => ABSTRACT_CLAIM_RE.test(sentence.text))
51
+ .map(({ index }) => index);
52
+ const supportedClaimIndexes = claimIndexes.filter((index) => hasNearbyEvidence(sentenceEvidenceTypes, index));
53
+ return {
54
+ sentenceCount: sentences.length,
55
+ evidenceSentenceCount: evidenceSentences.length,
56
+ claimSentenceCount: claimIndexes.length,
57
+ supportedClaimCount: supportedClaimIndexes.length,
58
+ unsupportedClaimCount: claimIndexes.length - supportedClaimIndexes.length,
59
+ typeValues: evidenceSentences.flatMap((sentence) => evidenceTypes(sentence.text)),
60
+ };
61
+ }
62
+
63
+ function hasNearbyEvidence(sentenceEvidenceTypes, claimIndex) {
64
+ const start = Math.max(0, claimIndex - 2);
65
+ const end = Math.min(sentenceEvidenceTypes.length - 1, claimIndex + 2);
66
+ for (let index = start; index <= end; index += 1) {
67
+ if (sentenceEvidenceTypes[index].length > 0) {
68
+ return true;
69
+ }
70
+ }
71
+ return false;
72
+ }
73
+
74
+ export function evidenceTypes(text) {
75
+ return Object.entries(EVIDENCE_PATTERNS)
76
+ .filter(([, pattern]) => pattern.test(text))
77
+ .map(([type]) => type);
78
+ }
79
+
80
+ export function isAbstractClaim(text) {
81
+ return ABSTRACT_CLAIM_RE.test(text);
82
+ }
@@ -1,114 +1,114 @@
1
- import {
2
- FUNCTION_WORDS,
3
- characterNgrams,
4
- contentWords,
5
- distribution,
6
- normalizeText,
7
- rate,
8
- tokenizeWords,
9
- topItems,
10
- } from "../text-utils.js";
11
-
12
- export function analyzeLexical(documents) {
13
- const text = documents.map((document) => document.text).join("\n\n");
14
- const words = tokenizeWords(text);
15
- const content = contentWords(text);
16
- const sentences = documents.flatMap((document) => document.sentences);
17
- const functionWordSet = new Set(FUNCTION_WORDS);
18
-
19
- return {
20
- family: "lexical",
21
- confidence: confidenceFor(words.length),
22
- features: {
23
- wordCount: words.length,
24
- contentWordCount: content.length,
25
- vocabularyRichness: {
26
- uniqueContentWords: new Set(content).size,
27
- contentTypeTokenRatio: rate(new Set(content).size, content.length, 3),
28
- },
29
- wordLength: distribution(words.map((word) => word.length)),
30
- functionWords: topItems(words.filter((word) => functionWordSet.has(word)), 24),
31
- functionWordBigrams: topItems(tokenNgrams(words.filter((word) => functionWordSet.has(word)), 2), 36),
32
- characterTrigrams: topItems(characterNgrams(text, 3), 24),
33
- maskedCharacterFourgrams: topItems(maskedCharacterNgrams(text, 4), 48),
34
- repeatedMotifs: topItems(content, 16).filter((item) => item.count > 1),
35
- sentenceInitialTokens: topItems(sentences.map((sentence) => boundaryToken(sentence.tokens[0], functionWordSet)), 16),
36
- sentenceFinalTokens: topItems(sentences.map((sentence) => boundaryToken(sentence.tokens.at(-1), functionWordSet)), 16),
37
- punctuationNgrams: topItems(punctuationNgrams(text, 3), 16),
38
- punctuation: {
39
- commaRate: rate(count(text, /,/g), sentences.length, 2),
40
- semicolonRate: rate(count(text, /;/g), sentences.length, 2),
41
- colonRate: rate(count(text, /:/g), sentences.length, 2),
42
- questionRate: rate(count(text, /\?/g), sentences.length, 2),
43
- dashRate: rate(count(text, /--|-/g), sentences.length, 2),
44
- },
45
- },
46
- examples: topItems(content, 5).map((item) => item.value),
47
- warnings: words.length < 200 ? ["Lexical confidence is limited because the corpus has fewer than 200 words."] : [],
48
- revisionHandles: ["Compare function words, vocabulary richness, motifs, and punctuation habits."],
49
- };
50
- }
51
-
52
- function boundaryToken(word, functionWordSet) {
53
- if (!word) {
54
- return null;
55
- }
56
- if (functionWordSet.has(word)) {
57
- return word;
58
- }
59
- if (/^\d+$/.test(word)) {
60
- return "<number>";
61
- }
62
- return "<content>";
63
- }
64
-
65
- function tokenNgrams(tokens, size) {
66
- const grams = [];
67
- for (let index = 0; index <= tokens.length - size; index += 1) {
68
- grams.push(tokens.slice(index, index + size).join(" "));
69
- }
70
- return grams;
71
- }
72
-
73
- function maskedCharacterNgrams(text, size) {
74
- const functionWordSet = new Set(FUNCTION_WORDS);
75
- const masked = normalizeText(text)
76
- .replace(/\p{L}[\p{L}\p{N}'-]*|\p{N}+/gu, (word) => {
77
- const normalized = word.toLowerCase().replace(/'s$/, "");
78
- if (functionWordSet.has(normalized)) {
79
- return normalized;
80
- }
81
- if (/^\p{N}+$/u.test(normalized)) {
82
- return "@";
83
- }
84
- return "#";
85
- })
86
- .replace(/\s+/g, " ");
87
- const grams = [];
88
- for (let index = 0; index <= masked.length - size; index += 1) {
89
- const gram = masked.slice(index, index + size);
90
- if (gram.trim()) {
91
- grams.push(gram);
92
- }
93
- }
94
- return grams;
95
- }
96
-
97
- function punctuationNgrams(text, size) {
98
- const marks = Array.from(String(text ?? "").matchAll(/[.,;:!?-]/g)).map((match) => match[0]);
99
- return tokenNgrams(marks, size);
100
- }
101
-
102
- function confidenceFor(wordCount) {
103
- if (wordCount >= 2000) {
104
- return "high";
105
- }
106
- if (wordCount >= 120) {
107
- return "medium";
108
- }
109
- return "low";
110
- }
111
-
112
- function count(text, pattern) {
113
- return Array.from(text.matchAll(pattern)).length;
114
- }
1
+ import {
2
+ FUNCTION_WORDS,
3
+ characterNgrams,
4
+ contentWords,
5
+ distribution,
6
+ normalizeText,
7
+ rate,
8
+ tokenizeWords,
9
+ topItems,
10
+ } from "../text-utils.js";
11
+
12
+ export function analyzeLexical(documents) {
13
+ const text = documents.map((document) => document.text).join("\n\n");
14
+ const words = tokenizeWords(text);
15
+ const content = contentWords(text);
16
+ const sentences = documents.flatMap((document) => document.sentences);
17
+ const functionWordSet = new Set(FUNCTION_WORDS);
18
+
19
+ return {
20
+ family: "lexical",
21
+ confidence: confidenceFor(words.length),
22
+ features: {
23
+ wordCount: words.length,
24
+ contentWordCount: content.length,
25
+ vocabularyRichness: {
26
+ uniqueContentWords: new Set(content).size,
27
+ contentTypeTokenRatio: rate(new Set(content).size, content.length, 3),
28
+ },
29
+ wordLength: distribution(words.map((word) => word.length)),
30
+ functionWords: topItems(words.filter((word) => functionWordSet.has(word)), 24),
31
+ functionWordBigrams: topItems(tokenNgrams(words.filter((word) => functionWordSet.has(word)), 2), 36),
32
+ characterTrigrams: topItems(characterNgrams(text, 3), 24),
33
+ maskedCharacterFourgrams: topItems(maskedCharacterNgrams(text, 4), 48),
34
+ repeatedMotifs: topItems(content, 16).filter((item) => item.count > 1),
35
+ sentenceInitialTokens: topItems(sentences.map((sentence) => boundaryToken(sentence.tokens[0], functionWordSet)), 16),
36
+ sentenceFinalTokens: topItems(sentences.map((sentence) => boundaryToken(sentence.tokens.at(-1), functionWordSet)), 16),
37
+ punctuationNgrams: topItems(punctuationNgrams(text, 3), 16),
38
+ punctuation: {
39
+ commaRate: rate(count(text, /,/g), sentences.length, 2),
40
+ semicolonRate: rate(count(text, /;/g), sentences.length, 2),
41
+ colonRate: rate(count(text, /:/g), sentences.length, 2),
42
+ questionRate: rate(count(text, /\?/g), sentences.length, 2),
43
+ dashRate: rate(count(text, /--|-/g), sentences.length, 2),
44
+ },
45
+ },
46
+ examples: topItems(content, 5).map((item) => item.value),
47
+ warnings: words.length < 200 ? ["Lexical confidence is limited because the corpus has fewer than 200 words."] : [],
48
+ revisionHandles: ["Compare function words, vocabulary richness, motifs, and punctuation habits."],
49
+ };
50
+ }
51
+
52
+ function boundaryToken(word, functionWordSet) {
53
+ if (!word) {
54
+ return null;
55
+ }
56
+ if (functionWordSet.has(word)) {
57
+ return word;
58
+ }
59
+ if (/^\d+$/.test(word)) {
60
+ return "<number>";
61
+ }
62
+ return "<content>";
63
+ }
64
+
65
+ function tokenNgrams(tokens, size) {
66
+ const grams = [];
67
+ for (let index = 0; index <= tokens.length - size; index += 1) {
68
+ grams.push(tokens.slice(index, index + size).join(" "));
69
+ }
70
+ return grams;
71
+ }
72
+
73
+ function maskedCharacterNgrams(text, size) {
74
+ const functionWordSet = new Set(FUNCTION_WORDS);
75
+ const masked = normalizeText(text)
76
+ .replace(/\p{L}[\p{L}\p{N}'-]*|\p{N}+/gu, (word) => {
77
+ const normalized = word.toLowerCase().replace(/'s$/, "");
78
+ if (functionWordSet.has(normalized)) {
79
+ return normalized;
80
+ }
81
+ if (/^\p{N}+$/u.test(normalized)) {
82
+ return "@";
83
+ }
84
+ return "#";
85
+ })
86
+ .replace(/\s+/g, " ");
87
+ const grams = [];
88
+ for (let index = 0; index <= masked.length - size; index += 1) {
89
+ const gram = masked.slice(index, index + size);
90
+ if (gram.trim()) {
91
+ grams.push(gram);
92
+ }
93
+ }
94
+ return grams;
95
+ }
96
+
97
+ function punctuationNgrams(text, size) {
98
+ const marks = Array.from(String(text ?? "").matchAll(/[.,;:!?-]/g)).map((match) => match[0]);
99
+ return tokenNgrams(marks, size);
100
+ }
101
+
102
+ function confidenceFor(wordCount) {
103
+ if (wordCount >= 2000) {
104
+ return "high";
105
+ }
106
+ if (wordCount >= 120) {
107
+ return "medium";
108
+ }
109
+ return "low";
110
+ }
111
+
112
+ function count(text, pattern) {
113
+ return Array.from(text.matchAll(pattern)).length;
114
+ }
@@ -1,34 +1,46 @@
1
- import { contentWords, rate, topItems } from "../text-utils.js";
2
-
3
- const REGISTER_MARKERS = {
4
- narrative: ["scene", "moment", "watched", "waited", "noticed", "remembered", "described", "story"],
5
- explanatory: ["because", "so", "therefore", "means", "shows", "explains", "reason", "pattern"],
6
- argumentative: ["should", "must", "better", "important", "claim", "therefore", "argue", "evidence"],
7
- instructional: ["start", "fix", "use", "avoid", "keep", "write", "revise", "follow"],
8
- reflective: ["lesson", "pause", "changed", "remember", "noticed", "felt", "learned", "realized"],
9
- technical: ["file", "test", "system", "code", "api", "build"],
10
- personal: ["i", "my", "we", "our"],
11
- formal: ["requires", "outcomes", "process", "alignment", "therefore"],
12
- };
13
-
14
- export function analyzeRegister(documents) {
15
- const text = documents.map((document) => document.text.toLowerCase()).join("\n\n");
16
- const words = new Set(contentWords(text));
17
- const scores = Object.entries(REGISTER_MARKERS).map(([value, markers]) => ({
18
- value,
19
- score: rate(markers.filter((marker) => text.includes(marker) || words.has(marker)).length, markers.length, 2),
20
- })).sort((left, right) => right.score - left.score || left.value.localeCompare(right.value));
21
-
22
- return {
23
- family: "register",
24
- confidence: documents.length >= 3 ? "medium" : "low",
25
- features: {
26
- primary: scores[0] ?? { value: "unknown", score: 0 },
27
- scores,
28
- topContentWords: topItems(contentWords(text), 12),
29
- },
30
- examples: scores.slice(0, 3).map((item) => `${item.value}: ${item.score}`),
31
- warnings: documents.length < 3 ? ["Register confidence is limited because the corpus has fewer than 3 documents."] : [],
32
- revisionHandles: ["Check whether the draft uses the same broad register and genre mix as the corpus."],
33
- };
34
- }
1
+ import { contentWords, rate, topItems } from "../text-utils.js";
2
+
3
+ const REGISTER_MARKERS = {
4
+ narrative: ["scene", "moment", "watched", "waited", "noticed", "remembered", "described", "story"],
5
+ explanatory: ["because", "so", "therefore", "means", "shows", "explains", "reason", "pattern"],
6
+ argumentative: ["should", "must", "better", "important", "claim", "therefore", "argue", "evidence"],
7
+ instructional: ["start", "fix", "use", "avoid", "keep", "write", "revise", "follow"],
8
+ reflective: ["lesson", "pause", "changed", "remember", "noticed", "felt", "learned", "realized"],
9
+ technical: ["file", "test", "system", "code", "api", "build"],
10
+ personal: ["i", "my", "we", "our"],
11
+ formal: ["requires", "outcomes", "process", "alignment", "therefore"],
12
+ };
13
+
14
+ export function analyzeRegister(documents) {
15
+ const text = documents.map((document) => document.text.toLowerCase()).join("\n\n");
16
+ const words = new Set(contentWords(text));
17
+ const scores = Object.entries(REGISTER_MARKERS).map(([value, markers]) => ({
18
+ value,
19
+ score: rate(markers.filter((marker) => markerAppears(text, words, marker)).length, markers.length, 2),
20
+ })).sort((left, right) => right.score - left.score || left.value.localeCompare(right.value));
21
+
22
+ return {
23
+ family: "register",
24
+ confidence: documents.length >= 3 ? "medium" : "low",
25
+ features: {
26
+ primary: scores[0] ?? { value: "unknown", score: 0 },
27
+ scores,
28
+ topContentWords: topItems(contentWords(text), 12),
29
+ },
30
+ examples: scores.slice(0, 3).map((item) => `${item.value}: ${item.score}`),
31
+ warnings: documents.length < 3 ? ["Register confidence is limited because the corpus has fewer than 3 documents."] : [],
32
+ revisionHandles: ["Check whether the draft uses the same broad register and genre mix as the corpus."],
33
+ };
34
+ }
35
+
36
+ function markerAppears(text, words, marker) {
37
+ const normalized = marker.toLowerCase();
38
+ if (/^[a-z0-9'-]+$/.test(normalized)) {
39
+ return words.has(normalized) || new RegExp(`\\b${escapeRegExp(normalized)}\\b`, "i").test(text);
40
+ }
41
+ return new RegExp(`\\b${escapeRegExp(normalized).replaceAll("\\ ", "\\s+")}\\b`, "i").test(text);
42
+ }
43
+
44
+ function escapeRegExp(value) {
45
+ return String(value).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
46
+ }