@tgies/megahal-js 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/LICENSE +21 -0
- package/README.md +156 -0
- package/index.d.ts +6 -0
- package/index.d.ts.map +1 -0
- package/index.js +29 -0
- package/package.json +83 -0
- package/src/binary.d.ts +18 -0
- package/src/binary.d.ts.map +1 -0
- package/src/binary.js +328 -0
- package/src/dict.d.ts +54 -0
- package/src/dict.d.ts.map +1 -0
- package/src/dict.js +115 -0
- package/src/engine.d.ts +140 -0
- package/src/engine.d.ts.map +1 -0
- package/src/engine.js +317 -0
- package/src/evaluator.d.ts +10 -0
- package/src/evaluator.d.ts.map +1 -0
- package/src/evaluator.js +101 -0
- package/src/generator.d.ts +36 -0
- package/src/generator.d.ts.map +1 -0
- package/src/generator.js +296 -0
- package/src/keywords.d.ts +34 -0
- package/src/keywords.d.ts.map +1 -0
- package/src/keywords.js +122 -0
- package/src/model.d.ts +73 -0
- package/src/model.d.ts.map +1 -0
- package/src/model.js +154 -0
- package/src/tokenizer.d.ts +8 -0
- package/src/tokenizer.d.ts.map +1 -0
- package/src/tokenizer.js +125 -0
- package/src/trie.d.ts +81 -0
- package/src/trie.d.ts.map +1 -0
- package/src/trie.js +164 -0
package/src/evaluator.js
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Score a candidate reply by keyword surprise (Shannon entropy of keywords in context).
|
|
3
|
+
*
|
|
4
|
+
* @param {import('./model.js').BidirectionalModel} model
|
|
5
|
+
* @param {string[]} candidate - Tokens of the candidate reply
|
|
6
|
+
* @param {Set<string>} keywords - Set of uppercase keywords
|
|
7
|
+
* @returns {number}
|
|
8
|
+
*/
|
|
9
|
+
export function evaluateReply(model, candidate, keywords) {
|
|
10
|
+
if (!candidate || candidate.length === 0) {
|
|
11
|
+
return 0.0;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
let entropy = 0.0;
|
|
15
|
+
let num = 0;
|
|
16
|
+
|
|
17
|
+
// Forward evaluation.
|
|
18
|
+
const fwdCtx = model.forwardContext();
|
|
19
|
+
for (const token of candidate) {
|
|
20
|
+
const symId = model.dictionary.find(token);
|
|
21
|
+
if (symId === undefined) {
|
|
22
|
+
continue;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const upperToken = token.toUpperCase();
|
|
26
|
+
if (keywords.has(upperToken)) {
|
|
27
|
+
let probability = 0.0;
|
|
28
|
+
let ctxCount = 0;
|
|
29
|
+
|
|
30
|
+
for (let j = 0; j < model.order; j++) {
|
|
31
|
+
const parentRef = fwdCtx.atDepth(j);
|
|
32
|
+
if (parentRef !== null && parentRef !== undefined) {
|
|
33
|
+
const childRef = model.forward.findChild(parentRef, symId);
|
|
34
|
+
if (childRef !== undefined) {
|
|
35
|
+
const childNode = model.forward.node(childRef);
|
|
36
|
+
const parentNode = model.forward.node(parentRef);
|
|
37
|
+
if (parentNode.usage > 0) {
|
|
38
|
+
probability += childNode.count / parentNode.usage;
|
|
39
|
+
ctxCount++;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
if (ctxCount > 0) {
|
|
46
|
+
entropy -= Math.log(probability / ctxCount);
|
|
47
|
+
}
|
|
48
|
+
num++;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
fwdCtx.advance(model.forward, symId);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Backward evaluation.
|
|
55
|
+
const bwdCtx = model.backwardContext();
|
|
56
|
+
for (let i = candidate.length - 1; i >= 0; i--) {
|
|
57
|
+
const token = candidate[i];
|
|
58
|
+
const symId = model.dictionary.find(token);
|
|
59
|
+
if (symId === undefined) {
|
|
60
|
+
continue;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
const upperToken = token.toUpperCase();
|
|
64
|
+
if (keywords.has(upperToken)) {
|
|
65
|
+
let probability = 0.0;
|
|
66
|
+
let ctxCount = 0;
|
|
67
|
+
|
|
68
|
+
for (let j = 0; j < model.order; j++) {
|
|
69
|
+
const parentRef = bwdCtx.atDepth(j);
|
|
70
|
+
if (parentRef !== null && parentRef !== undefined) {
|
|
71
|
+
const childRef = model.backward.findChild(parentRef, symId);
|
|
72
|
+
if (childRef !== undefined) {
|
|
73
|
+
const childNode = model.backward.node(childRef);
|
|
74
|
+
const parentNode = model.backward.node(parentRef);
|
|
75
|
+
if (parentNode.usage > 0) {
|
|
76
|
+
probability += childNode.count / parentNode.usage;
|
|
77
|
+
ctxCount++;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
if (ctxCount > 0) {
|
|
84
|
+
entropy -= Math.log(probability / ctxCount);
|
|
85
|
+
}
|
|
86
|
+
num++;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
bwdCtx.advance(model.backward, symId);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// Length penalty.
|
|
93
|
+
if (num >= 8) {
|
|
94
|
+
entropy /= Math.sqrt(num - 1);
|
|
95
|
+
}
|
|
96
|
+
if (num >= 16) {
|
|
97
|
+
entropy /= num;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
return entropy;
|
|
101
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generate a single candidate reply (forward + backward phases).
|
|
3
|
+
*
|
|
4
|
+
* @param {BidirectionalModel} model
|
|
5
|
+
* @param {Set<string>} keywords
|
|
6
|
+
* @param {Set<string>} auxSet
|
|
7
|
+
* @param {any} rng
|
|
8
|
+
* @returns {string[]}
|
|
9
|
+
*/
|
|
10
|
+
export function generateOneReply(model: BidirectionalModel, keywords: Set<string>, auxSet: Set<string>, rng: any): string[];
|
|
11
|
+
/**
|
|
12
|
+
* Generate the best reply for given input tokens and keywords.
|
|
13
|
+
* Runs the candidate generation loop for up to TIMEOUT milliseconds or ITERATIONS.
|
|
14
|
+
*
|
|
15
|
+
* @param {BidirectionalModel} model
|
|
16
|
+
* @param {string[]} inputTokens
|
|
17
|
+
* @param {Set<string>} keywords
|
|
18
|
+
* @param {Set<string>} auxSet
|
|
19
|
+
* @param {{ timeout?: number, maxIterations?: number }} limit
|
|
20
|
+
* @param {any} rng
|
|
21
|
+
* @returns {string[]}
|
|
22
|
+
*/
|
|
23
|
+
export function generateReply(model: BidirectionalModel, inputTokens: string[], keywords: Set<string>, auxSet: Set<string>, limit: {
|
|
24
|
+
timeout?: number;
|
|
25
|
+
maxIterations?: number;
|
|
26
|
+
}, rng: any): string[];
|
|
27
|
+
/**
|
|
28
|
+
* Capitalize a token sequence per MegaHAL sentence-case rules.
|
|
29
|
+
*
|
|
30
|
+
* @param {string[]} tokens
|
|
31
|
+
* @returns {string}
|
|
32
|
+
*/
|
|
33
|
+
export function capitalize(tokens: string[]): string;
|
|
34
|
+
export type BidirectionalModel = import("./model.js").BidirectionalModel;
|
|
35
|
+
export type ContextWindow = import("./model.js").ContextWindow;
|
|
36
|
+
//# sourceMappingURL=generator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generator.d.ts","sourceRoot":"","sources":["generator.js"],"names":[],"mappings":"AAoHA;;;;;;;;GAQG;AACH,wCANW,kBAAkB,YAClB,GAAG,CAAC,MAAM,CAAC,UACX,GAAG,CAAC,MAAM,CAAC,OACX,GAAG,GACD,MAAM,EAAE,CA8DpB;AAoBD;;;;;;;;;;;GAWG;AACH,qCARW,kBAAkB,eAClB,MAAM,EAAE,YACR,GAAG,CAAC,MAAM,CAAC,UACX,GAAG,CAAC,MAAM,CAAC,SACX;IAAE,OAAO,CAAC,EAAE,MAAM,CAAC;IAAC,aAAa,CAAC,EAAE,MAAM,CAAA;CAAE,OAC5C,GAAG,GACD,MAAM,EAAE,CA4CpB;AAED;;;;;GAKG;AACH,mCAHW,MAAM,EAAE,GACN,MAAM,CA8BlB;iCAnSY,OAAO,YAAY,EAAE,kBAAkB;4BACvC,OAAO,YAAY,EAAE,aAAa"}
|
package/src/generator.js
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import { ERROR_ID, FIN_ID } from './dict.js';
|
|
2
|
+
import { evaluateReply } from './evaluator.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @typedef {import('./model.js').BidirectionalModel} BidirectionalModel
|
|
6
|
+
* @typedef {import('./model.js').ContextWindow} ContextWindow
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Pick a random integer in [min, max) using the provided RNG.
|
|
11
|
+
* @param {any} rng
|
|
12
|
+
* @param {number} min
|
|
13
|
+
* @param {number} max
|
|
14
|
+
* @returns {number}
|
|
15
|
+
*/
|
|
16
|
+
function randomRange(rng, min, max) {
|
|
17
|
+
if (rng && typeof rng.randomRange === 'function') {
|
|
18
|
+
return rng.randomRange(min, max);
|
|
19
|
+
}
|
|
20
|
+
return Math.floor(Math.random() * (max - min)) + min;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/**
 * Choose the symbol ID that forward generation starts from.
 *
 * Returns ERROR_ID when the forward root has no children. Otherwise, if any
 * keywords were supplied, they are sorted, a random starting position is
 * drawn, and the list is scanned circularly for the first keyword that is in
 * the dictionary and not in the auxiliary set. If no keyword qualifies, the
 * symbol of a randomly chosen child of the forward root is returned.
 *
 * @param {BidirectionalModel} model
 * @param {Set<string>} keywords
 * @param {Set<string>} auxSet
 * @param {any} rng
 * @returns {number}
 */
function seed(model, keywords, auxSet, rng) {
  const rootRef = model.forward.root();
  const rootChildren = model.forward.children(rootRef);
  if (rootChildren.length === 0) {
    return ERROR_ID;
  }

  if (keywords.size > 0) {
    // Sorting makes the scan order independent of Set insertion order.
    const ordered = Array.from(keywords).sort();
    const total = ordered.length;
    const first = randomRange(rng, 0, total);

    let step = 0;
    while (step < total) {
      const word = ordered[(first + step) % total];
      const symId = model.dictionary.find(word);
      if (symId !== undefined && !auxSet.has(word)) {
        return symId;
      }
      step += 1;
    }
  }

  // No usable keyword: fall back to a random child of the forward root.
  const pick = randomRange(rng, 0, rootChildren.length);
  const chosen = model.forward.node(rootChildren[pick]);
  return chosen.symbol;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Pick the next symbol from the deepest available context, favouring keywords
 * (the "babble" function).
 *
 * A random starting child and a random count in [0, usage) are drawn. The
 * children are then walked circularly, at most once each. A child is returned
 * immediately (and `usedKey.val` set) when its word is a keyword, is not
 * already in the reply, and is either non-auxiliary or a keyword has already
 * been used. Otherwise the child's count is subtracted from the random count
 * and the child is returned once that goes negative.
 *
 * @param {import('./trie.js').Trie} trie
 * @param {ContextWindow} ctx
 * @param {import('./dict.js').SymbolDict} dict
 * @param {Set<string>} keywords
 * @param {Set<string>} auxSet
 * @param {number[]} reply - Array of symbol IDs currently in the reply
 * @param {{ val: boolean }} usedKey - Mutable flag shared across calls
 * @param {any} rng
 * @returns {number} Symbol ID, or ERROR_ID when nothing can be selected
 */
function babble(trie, ctx, dict, keywords, auxSet, reply, usedKey, rng) {
  const contextRef = ctx.deepest();
  if (contextRef === null || contextRef === undefined) {
    return ERROR_ID;
  }

  const contextNode = trie.node(contextRef);
  const successors = trie.children(contextRef);
  const fanout = successors.length;
  if (fanout === 0 || contextNode.usage === 0) {
    return ERROR_ID;
  }

  let cursor = randomRange(rng, 0, fanout);
  let remaining = randomRange(rng, 0, contextNode.usage);

  for (let visited = 0; visited < fanout; visited += 1) {
    const successor = trie.node(successors[cursor]);
    const symbolId = successor.symbol;
    const word = dict.resolve(symbolId);

    const keywordHit = keywords.has(word);
    const auxiliary = auxSet.has(word);
    const repeated = reply.includes(symbolId);

    // Auxiliary keywords only qualify after some keyword has been used.
    const blocked = !keywordHit || (!usedKey.val && auxiliary) || repeated;
    if (!blocked) {
      usedKey.val = true;
      return symbolId;
    }

    remaining -= successor.count;
    if (remaining < 0) {
      return symbolId;
    }

    cursor = (cursor + 1) % fanout;
  }

  return ERROR_ID;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Produce one candidate reply by growing forward from a seed symbol and then
 * backward from the start of what was generated.
 *
 * Each phase repeatedly asks `babble` for the next symbol and stops when it
 * yields ERROR_ID or FIN_ID. The forward phase is skipped entirely when the
 * seed itself is ERROR_ID or FIN_ID. Before the backward phase, the backward
 * context is primed with the leading reply symbols (up to index `model.order`),
 * fed in reverse order.
 *
 * @param {BidirectionalModel} model
 * @param {Set<string>} keywords
 * @param {Set<string>} auxSet
 * @param {any} rng
 * @returns {string[]} The reply as resolved token strings
 */
export function generateOneReply(model, keywords, auxSet, rng) {
  /** @type {number[]} */
  const replyIds = [];
  const usedKey = { val: false };

  /**
   * Ask babble for the next symbol in the given direction.
   * @param {any} trie
   * @param {any} ctx
   * @returns {number}
   */
  const nextSymbol = (trie, ctx) =>
    babble(trie, ctx, model.dictionary, keywords, auxSet, replyIds, usedKey, rng);

  /** @param {number} sym */
  const isTerminal = sym => sym === ERROR_ID || sym === FIN_ID;

  // Forward generation phase.
  const fwdCtx = model.forwardContext();
  const seedId = seed(model, keywords, auxSet, rng);

  if (!isTerminal(seedId)) {
    replyIds.push(seedId);
    fwdCtx.advance(model.forward, seedId);

    for (;;) {
      const sym = nextSymbol(model.forward, fwdCtx);
      if (isTerminal(sym)) {
        break;
      }
      replyIds.push(sym);
      fwdCtx.advance(model.forward, sym);
    }
  }

  // Backward generation phase.
  const bwdCtx = model.backwardContext();
  if (replyIds.length > 0) {
    // Prime the backward context with the head of the reply, last-to-first.
    let i = Math.min(replyIds.length - 1, model.order);
    while (i >= 0) {
      bwdCtx.advance(model.backward, replyIds[i]);
      i -= 1;
    }
  }

  for (;;) {
    const sym = nextSymbol(model.backward, bwdCtx);
    if (isTerminal(sym)) {
      break;
    }
    replyIds.unshift(sym);
    bwdCtx.advance(model.backward, sym);
  }

  return replyIds.map(id => model.dictionary.resolve(id));
}
|
|
187
|
+
|
|
188
|
+
/**
|
|
189
|
+
* Check if two token lists are equal (case-insensitive for MegaHAL comparison).
|
|
190
|
+
* @param {string[]} a
|
|
191
|
+
* @param {string[]} b
|
|
192
|
+
* @returns {boolean}
|
|
193
|
+
*/
|
|
194
|
+
function tokensEqual(a, b) {
|
|
195
|
+
if (a.length !== b.length) {
|
|
196
|
+
return false;
|
|
197
|
+
}
|
|
198
|
+
for (let i = 0; i < a.length; i++) {
|
|
199
|
+
if (a[i].toUpperCase() !== b[i].toUpperCase()) {
|
|
200
|
+
return false;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
return true;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
/**
 * Search for the most surprising reply to the given input.
 *
 * A baseline reply is first generated with no keywords; it is discarded
 * (replaced by an empty list) if it merely echoes the input. Keyword-biased
 * candidates are then generated repeatedly and the one with the highest
 * `evaluateReply` score that does not echo the input is kept.
 *
 * The loop stops when `limit.timeout` milliseconds have elapsed (default
 * 1000; only enforced when > 0) or `limit.maxIterations` candidates have been
 * tried (default 0; only enforced when > 0). When both are exactly 0, a single
 * candidate is tried.
 *
 * @param {BidirectionalModel} model
 * @param {string[]} inputTokens
 * @param {Set<string>} keywords
 * @param {Set<string>} auxSet
 * @param {{ timeout?: number, maxIterations?: number }} limit
 * @param {any} rng
 * @returns {string[]}
 */
export function generateReply(model, inputTokens, keywords, auxSet, limit, rng) {
  const noKeywords = new Set();
  const noAux = new Set();

  // Establish a baseline reply without keyword bias.
  const baseline = generateOneReply(model, noKeywords, noAux, rng);
  let best = tokensEqual(baseline, inputTokens) ? [] : baseline;

  let maxSurprise = -1.0;
  const startedAt = Date.now();
  let iterations = 0;

  // Destructuring defaults apply only when the property is undefined.
  const { timeout = 1000, maxIterations = 0 } = limit;

  const timedOut = () => timeout > 0 && Date.now() - startedAt >= timeout;
  const exhausted = () => maxIterations > 0 && iterations >= maxIterations;
  // With no limits specified, exactly one iteration is performed.
  const singleShot = timeout === 0 && maxIterations === 0;

  while (!timedOut() && !exhausted()) {
    const candidate = generateOneReply(model, keywords, auxSet, rng);
    const surprise = evaluateReply(model, candidate, keywords);

    if (surprise > maxSurprise && !tokensEqual(candidate, inputTokens)) {
      maxSurprise = surprise;
      best = candidate;
    }

    iterations += 1;

    if (singleShot) {
      break;
    }
  }

  return best;
}
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* Capitalize a token sequence per MegaHAL sentence-case rules.
|
|
264
|
+
*
|
|
265
|
+
* @param {string[]} tokens
|
|
266
|
+
* @returns {string}
|
|
267
|
+
*/
|
|
268
|
+
export function capitalize(tokens) {
|
|
269
|
+
const raw = tokens.join('');
|
|
270
|
+
/** @type {string[]} */
|
|
271
|
+
const result = [];
|
|
272
|
+
let start = true;
|
|
273
|
+
|
|
274
|
+
for (let i = 0; i < raw.length; i++) {
|
|
275
|
+
const char = raw[i];
|
|
276
|
+
if (/^[a-zA-Z]$/.test(char)) {
|
|
277
|
+
if (start) {
|
|
278
|
+
result.push(char.toUpperCase());
|
|
279
|
+
} else {
|
|
280
|
+
result.push(char.toLowerCase());
|
|
281
|
+
}
|
|
282
|
+
start = false;
|
|
283
|
+
} else {
|
|
284
|
+
result.push(char);
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
if (i > 2 && /^\s$/.test(char)) {
|
|
288
|
+
const prev = raw[i - 1];
|
|
289
|
+
if (prev === '!' || prev === '.' || prev === '?') {
|
|
290
|
+
start = true;
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
return result.join('');
|
|
296
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract keywords from tokens based on MegaHAL's two-pass algorithm.
|
|
3
|
+
* @param {string[]} tokens
|
|
4
|
+
* @param {import('./dict.js').SymbolDict} dict
|
|
5
|
+
* @param {KeywordConfig} config
|
|
6
|
+
* @returns {Set<string>}
|
|
7
|
+
*/
|
|
8
|
+
export function extractKeywords(tokens: string[], dict: import("./dict.js").SymbolDict, config: KeywordConfig): Set<string>;
|
|
9
|
+
/**
|
|
10
|
+
* Perspective-swapping substitution table.
|
|
11
|
+
*/
|
|
12
|
+
export class SwapTable {
|
|
13
|
+
/** @type {[string, string][]} Array of [from, to] swap pairs. */
|
|
14
|
+
pairs: [string, string][];
|
|
15
|
+
/**
|
|
16
|
+
* Apply swap substitutions to a token.
|
|
17
|
+
* Returns all matching "to" values. If no match, returns [token].
|
|
18
|
+
* @param {string} token
|
|
19
|
+
* @returns {string[]}
|
|
20
|
+
*/
|
|
21
|
+
apply(token: string): string[];
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Configuration for keyword extraction.
|
|
25
|
+
*/
|
|
26
|
+
export class KeywordConfig {
|
|
27
|
+
/** @type {Set<string>} Banned words (uppercase). */
|
|
28
|
+
banned: Set<string>;
|
|
29
|
+
/** @type {Set<string>} Auxiliary words (uppercase). */
|
|
30
|
+
auxiliary: Set<string>;
|
|
31
|
+
/** @type {SwapTable} Perspective-swapping table. */
|
|
32
|
+
swap: SwapTable;
|
|
33
|
+
}
|
|
34
|
+
//# sourceMappingURL=keywords.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"keywords.d.ts","sourceRoot":"","sources":["keywords.js"],"names":[],"mappings":"AAuFA;;;;;;GAMG;AACH,wCALW,MAAM,EAAE,QACR,OAAO,WAAW,EAAE,UAAU,UAC9B,aAAa,GACX,GAAG,CAAC,MAAM,CAAC,CA6BvB;AAxHD;;GAEG;AACH;IAEI,iEAAiE;IACjE,OADW,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CACd;IAGjB;;;;;OAKG;IACH,aAHW,MAAM,GACJ,MAAM,EAAE,CAiBpB;CACF;AAED;;GAEG;AACH;IAEI,oDAAoD;IACpD,QADW,GAAG,CAAC,MAAM,CAAC,CACC;IAEvB,uDAAuD;IACvD,WADW,GAAG,CAAC,MAAM,CAAC,CACI;IAE1B,oDAAoD;IACpD,MADW,SAAS,CACO;CAE9B"}
|
package/src/keywords.js
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
|
|
2
|
+
/**
|
|
3
|
+
* Perspective-swapping substitution table.
|
|
4
|
+
*/
|
|
5
|
+
export class SwapTable {
|
|
6
|
+
constructor() {
|
|
7
|
+
/** @type {[string, string][]} Array of [from, to] swap pairs. */
|
|
8
|
+
this.pairs = [];
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Apply swap substitutions to a token.
|
|
13
|
+
* Returns all matching "to" values. If no match, returns [token].
|
|
14
|
+
* @param {string} token
|
|
15
|
+
* @returns {string[]}
|
|
16
|
+
*/
|
|
17
|
+
apply(token) {
|
|
18
|
+
const upperTok = token.toUpperCase();
|
|
19
|
+
/** @type {string[]} */
|
|
20
|
+
const results = [];
|
|
21
|
+
|
|
22
|
+
for (const [from, to] of this.pairs) {
|
|
23
|
+
if (from.toUpperCase() === upperTok) {
|
|
24
|
+
results.push(to.toUpperCase());
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
if (results.length === 0) {
|
|
29
|
+
return [upperTok];
|
|
30
|
+
}
|
|
31
|
+
return results;
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
 * Word lists and swap table that drive keyword extraction.
 * Every collection starts out empty.
 */
export class KeywordConfig {
  constructor() {
    /** @type {Set<string>} Upper-case words that may never become keywords. */
    this.banned = new Set();

    /** @type {Set<string>} Upper-case auxiliary words. */
    this.auxiliary = new Set();

    /** @type {SwapTable} Substitutions applied to each token before selection. */
    this.swap = new SwapTable();
  }
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Checks if a candidate is eligible for keyword selection.
|
|
53
|
+
* @param {string} candidate
|
|
54
|
+
* @param {import('./dict.js').SymbolDict} dict
|
|
55
|
+
* @param {KeywordConfig} config
|
|
56
|
+
* @param {boolean} auxPass
|
|
57
|
+
* @returns {boolean}
|
|
58
|
+
*/
|
|
59
|
+
function isKeywordEligible(candidate, dict, config, auxPass) {
|
|
60
|
+
if (!candidate || candidate.length === 0) {
|
|
61
|
+
return false;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
const firstChar = candidate[0];
|
|
65
|
+
if (!/^[A-Z0-9]$/.test(firstChar)) {
|
|
66
|
+
return false;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if (dict.find(candidate) === undefined) {
|
|
70
|
+
return false;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
const upper = candidate.toUpperCase();
|
|
74
|
+
|
|
75
|
+
if (auxPass) {
|
|
76
|
+
return config.auxiliary.has(upper);
|
|
77
|
+
} else {
|
|
78
|
+
if (config.banned.has(upper)) {
|
|
79
|
+
return false;
|
|
80
|
+
}
|
|
81
|
+
if (config.auxiliary.has(upper)) {
|
|
82
|
+
return false;
|
|
83
|
+
}
|
|
84
|
+
return true;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/**
 * Build the keyword set for an input using MegaHAL's two-pass selection.
 *
 * Each token is first expanded through the swap table. The primary pass adds
 * every eligible non-auxiliary candidate; the auxiliary pass then adds
 * eligible auxiliary candidates, but only if the primary pass found at least
 * one keyword.
 *
 * @param {string[]} tokens
 * @param {import('./dict.js').SymbolDict} dict
 * @param {KeywordConfig} config
 * @returns {Set<string>}
 */
export function extractKeywords(tokens, dict, config) {
  /** @type {Set<string>} */
  const found = new Set();

  const swappedGroups = tokens.map(tok => config.swap.apply(tok));

  /**
   * Add every candidate that is eligible in the given pass, in token order.
   * @param {boolean} auxPass
   */
  const collect = auxPass => {
    swappedGroups.forEach(group => {
      group.forEach(word => {
        if (isKeywordEligible(word, dict, config, auxPass)) {
          found.add(word);
        }
      });
    });
  };

  // Primary keyword selection pass.
  collect(false);

  // Auxiliary keyword selection pass (only if primary pass found matches).
  if (found.size > 0) {
    collect(true);
  }

  return found;
}
|
package/src/model.d.ts
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A sliding context window tracking position in an n-gram trie.
|
|
3
|
+
*/
|
|
4
|
+
export class ContextWindow {
|
|
5
|
+
/**
|
|
6
|
+
* @param {number} order - Markov model order
|
|
7
|
+
*/
|
|
8
|
+
constructor(order: number);
|
|
9
|
+
order: number;
|
|
10
|
+
/** @type {(number|null)[]} Context slots matching the model order. */
|
|
11
|
+
slots: (number | null)[];
|
|
12
|
+
/**
|
|
13
|
+
* Reset the context window using the specified root reference.
|
|
14
|
+
* @param {number} rootRef
|
|
15
|
+
*/
|
|
16
|
+
initialize(rootRef: number): void;
|
|
17
|
+
/**
|
|
18
|
+
* Update the context window without creating new trie nodes.
|
|
19
|
+
* @param {Trie} trie
|
|
20
|
+
* @param {number} symbolId
|
|
21
|
+
*/
|
|
22
|
+
advance(trie: Trie, symbolId: number): void;
|
|
23
|
+
/**
|
|
24
|
+
* Update the context window, creating new trie nodes if necessary.
|
|
25
|
+
* @param {Trie} trie
|
|
26
|
+
* @param {number} symbolId
|
|
27
|
+
*/
|
|
28
|
+
advanceAndLearn(trie: Trie, symbolId: number): void;
|
|
29
|
+
/**
|
|
30
|
+
* Get the context node at depth j.
|
|
31
|
+
* @param {number} j
|
|
32
|
+
* @returns {number|null}
|
|
33
|
+
*/
|
|
34
|
+
atDepth(j: number): number | null;
|
|
35
|
+
/**
|
|
36
|
+
* Get the deepest non-null context node.
|
|
37
|
+
* Scans from slot 0 up to slot `order` (inclusive), returning the last non-null.
|
|
38
|
+
* @returns {number|null}
|
|
39
|
+
*/
|
|
40
|
+
deepest(): number | null;
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Bidirectional Markov model: forward trie + backward trie + shared dictionary.
|
|
44
|
+
*/
|
|
45
|
+
export class BidirectionalModel {
|
|
46
|
+
/**
|
|
47
|
+
* @param {number} order - Markov model order (default: 5)
|
|
48
|
+
*/
|
|
49
|
+
constructor(order?: number);
|
|
50
|
+
order: number;
|
|
51
|
+
forward: Trie;
|
|
52
|
+
backward: Trie;
|
|
53
|
+
dictionary: SymbolDict;
|
|
54
|
+
/**
|
|
55
|
+
* Learn from a sequence of token strings.
|
|
56
|
+
* Skips learning if tokens.length <= order.
|
|
57
|
+
* @param {string[]} tokens
|
|
58
|
+
*/
|
|
59
|
+
learn(tokens: string[]): void;
|
|
60
|
+
/**
|
|
61
|
+
* Create a context window initialized to the forward root.
|
|
62
|
+
* @returns {ContextWindow}
|
|
63
|
+
*/
|
|
64
|
+
forwardContext(): ContextWindow;
|
|
65
|
+
/**
|
|
66
|
+
* Create a context window initialized to the backward root.
|
|
67
|
+
* @returns {ContextWindow}
|
|
68
|
+
*/
|
|
69
|
+
backwardContext(): ContextWindow;
|
|
70
|
+
}
|
|
71
|
+
import { Trie } from './trie.js';
|
|
72
|
+
import { SymbolDict } from './dict.js';
|
|
73
|
+
//# sourceMappingURL=model.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model.d.ts","sourceRoot":"","sources":["model.js"],"names":[],"mappings":"AAGA;;GAEG;AACH;IACE;;OAEG;IACH,mBAFW,MAAM,EAMhB;IAHC,cAAkB;IAClB,sEAAsE;IACtE,OADW,CAAC,MAAM,GAAC,IAAI,CAAC,EAAE,CACkB;IAG9C;;;OAGG;IACH,oBAFW,MAAM,QAKhB;IAED;;;;OAIG;IACH,cAHW,IAAI,YACJ,MAAM,QAYhB;IAED;;;;OAIG;IACH,sBAHW,IAAI,YACJ,MAAM,QAWhB;IAED;;;;OAIG;IACH,WAHW,MAAM,GACJ,MAAM,GAAC,IAAI,CAOvB;IAED;;;;OAIG;IACH,WAFa,MAAM,GAAC,IAAI,CAUvB;CACF;AAED;;GAEG;AACH;IACE;;OAEG;IACH,oBAFW,MAAM,EAOhB;IAJC,cAAkB;IAClB,cAAyB;IACzB,eAA0B;IAC1B,uBAAkC;IAGpC;;;;OAIG;IACH,cAFW,MAAM,EAAE,QA6BlB;IAED;;;OAGG;IACH,kBAFa,aAAa,CAMzB;IAED;;;OAGG;IACH,mBAFa,aAAa,CAMzB;CACF;qBAzJoB,WAAW;2BACG,WAAW"}
|