@tgies/megahal-js 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/model.js ADDED
@@ -0,0 +1,154 @@
1
+ import { Trie } from './trie.js';
2
+ import { SymbolDict, FIN_ID } from './dict.js';
3
+
4
/**
 * Sliding window of trie node references, one slot per context depth.
 *
 * Slot 0 is set by `initialize`; every update derives slot d from slot d - 1,
 * so deeper slots describe longer runs of the most recent symbols.
 */
export class ContextWindow {
  /**
   * @param {number} order - Markov model order
   */
  constructor(order) {
    this.order = order;
    /** @type {(number|null)[]} One slot per depth 0..order + 1 (order + 2 slots in total). */
    this.slots = new Array(order + 2).fill(null);
  }

  /**
   * Clear every slot, then anchor slot 0 at the given root reference.
   * @param {number} rootRef
   */
  initialize(rootRef) {
    this.slots.fill(null);
    this.slots[0] = rootRef;
  }

  /**
   * Shift the window by one symbol using lookups only (no trie nodes are created).
   * A slot becomes null when its parent slot is empty or the child does not exist.
   * @param {Trie} trie
   * @param {number} symbolId
   */
  advance(trie, symbolId) {
    // Deepest slot first, so each slot still sees its parent's previous value.
    let depth = this.order + 1;
    while (depth >= 1) {
      const above = this.slots[depth - 1];
      let next = null;
      if (above != null) {
        const hit = trie.findChild(above, symbolId);
        if (hit !== undefined) {
          next = hit;
        }
      }
      this.slots[depth] = next;
      depth -= 1;
    }
  }

  /**
   * Shift the window by one symbol, recording the symbol in the trie via `addChild`
   * under every non-empty parent slot.
   * @param {Trie} trie
   * @param {number} symbolId
   */
  advanceAndLearn(trie, symbolId) {
    // Deepest slot first, so each slot still sees its parent's previous value.
    let depth = this.order + 1;
    while (depth >= 1) {
      const above = this.slots[depth - 1];
      this.slots[depth] = above != null ? trie.addChild(above, symbolId) : null;
      depth -= 1;
    }
  }

  /**
   * Read the slot at depth j; indexes outside the slot array give null.
   * @param {number} j
   * @returns {number|null}
   */
  atDepth(j) {
    const outside = j < 0 || j >= this.slots.length;
    return outside ? null : this.slots[j];
  }

  /**
   * Find the deepest occupied slot among depths 0..order (inclusive).
   * The extra slot at depth order + 1 is never considered.
   * @returns {number|null}
   */
  deepest() {
    let found = null;
    for (let depth = 0; depth <= this.order; depth += 1) {
      const ref = this.slots[depth];
      if (ref != null) {
        found = ref;
      }
    }
    return found;
  }
}
86
+
87
/**
 * Two Markov tries over one shared symbol dictionary: `forward` is trained on
 * each sequence left-to-right, `backward` on the same sequence right-to-left.
 */
export class BidirectionalModel {
  /**
   * @param {number} order - Markov model order (default: 5)
   */
  constructor(order = 5) {
    this.order = order;
    this.forward = new Trie();
    this.backward = new Trie();
    this.dictionary = new SymbolDict();
  }

  /**
   * Train both tries on one token sequence.
   * Sequences with `order` tokens or fewer are ignored entirely.
   * @param {string[]} tokens
   */
  learn(tokens) {
    const tooShort = tokens.length <= this.order;
    if (tooShort) {
      return;
    }

    // Forward pass: intern each token and feed it to the forward trie as we go.
    const fwd = this.forwardContext();
    /** @type {number[]} */
    const ids = [];
    for (const token of tokens) {
      const id = this.dictionary.intern(token);
      ids.push(id);
      fwd.advanceAndLearn(this.forward, id);
    }
    // Close the forward sequence with the FIN symbol.
    fwd.advanceAndLearn(this.forward, FIN_ID);

    // Backward pass: replay the interned IDs last-to-first into the backward trie.
    const bwd = this.backwardContext();
    for (let k = ids.length; k-- > 0; ) {
      bwd.advanceAndLearn(this.backward, ids[k]);
    }
    // Close the backward sequence with the FIN symbol as well.
    bwd.advanceAndLearn(this.backward, FIN_ID);
  }

  /**
   * Build a fresh context window anchored at the forward trie's root.
   * @returns {ContextWindow}
   */
  forwardContext() {
    const window = new ContextWindow(this.order);
    window.initialize(this.forward.root());
    return window;
  }

  /**
   * Build a fresh context window anchored at the backward trie's root.
   * @returns {ContextWindow}
   */
  backwardContext() {
    const window = new ContextWindow(this.order);
    window.initialize(this.backward.root());
    return window;
  }
}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Tokenize input text per MegaHAL rules: uppercase it and split it into alternating word and separator tokens.
3
+ * The final token always ends in '.', '!' or '?'; empty or whitespace-only input yields ['.'].
4
+ * @param {string} input - Raw text to tokenize.
5
+ * @returns {string[]} Uppercase tokens in input order.
6
+ */
7
+ export function tokenize(input: string): string[];
8
+ //# sourceMappingURL=tokenizer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["tokenizer.js"],"names":[],"mappings":"AAmFA;;;;;GAKG;AACH,gCAHW,MAAM,GACJ,MAAM,EAAE,CAqCpB"}
@@ -0,0 +1,125 @@
1
+ /**
2
+ * MegaHAL Tokenizer.
3
+ * Splits input text into an alternating sequence of word tokens and separator tokens.
4
+ */
5
+
6
/**
 * Checks if a character is an ASCII letter (A-Z or a-z).
 *
 * Anything that is not a one-character string (including undefined, as
 * produced by indexing past the end of a string) is reported as non-alphabetic.
 *
 * @param {string} char
 * @returns {boolean}
 */
function isAlpha(char) {
  if (typeof char !== 'string' || char.length !== 1) {
    return false;
  }
  const code = char.charCodeAt(0);
  // 0x41-0x5A is 'A'-'Z', 0x61-0x7A is 'a'-'z'.
  return (code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a);
}
14
+
15
/**
 * Checks if a character is an ASCII decimal digit (0-9).
 * Anything that is not a one-character string is reported as a non-digit.
 * @param {string} char
 * @returns {boolean}
 */
function isDigit(char) {
  if (typeof char !== 'string' || char.length !== 1) {
    return false;
  }
  // Single-character strings compare by code unit, so this is the 0x30-0x39 range.
  return char >= '0' && char <= '9';
}
23
+
24
/**
 * Checks if a character is an ASCII letter (A-Z or a-z) or an ASCII digit (0-9).
 *
 * Anything that is not a one-character string is reported as non-alphanumeric.
 *
 * @param {string} char
 * @returns {boolean}
 */
function isAlphanumeric(char) {
  if (typeof char !== 'string' || char.length !== 1) {
    return false;
  }
  const code = char.charCodeAt(0);
  return (
    (code >= 0x30 && code <= 0x39) || // '0'-'9'
    (code >= 0x41 && code <= 0x5a) || // 'A'-'Z'
    (code >= 0x61 && code <= 0x7a) // 'a'-'z'
  );
}
32
+
33
/**
 * Decide whether a token break falls immediately before index `pos` of `input`.
 *
 * Rules (from MEGAHAL_SPEC.md Section 4.1), checked in this order:
 * 1. pos == 0: never a boundary.
 * 2. pos == len: always a boundary.
 * 3. Apostrophe rule: an apostrophe with a letter on each side stays inside the
 *    word, so there is no boundary directly before it or directly after it.
 * 4. Alpha transition: exactly one of pos and pos-1 is alphabetic -> boundary.
 * 5. Digit transition: digit status differs between pos and pos-1 -> boundary.
 * Otherwise there is no boundary.
 *
 * @param {string} input - Uppercase string
 * @param {number} pos - 0-indexed position to test
 * @returns {boolean}
 */
function isBoundary(input, pos) {
  // Rules 1 and 2: the two ends of the string have fixed answers.
  if (pos === 0) {
    return false;
  }
  if (pos === input.length) {
    return true;
  }

  const before = input[pos - 1];
  const here = input[pos];
  const beforeIsAlpha = isAlpha(before);
  const hereIsAlpha = isAlpha(here);

  // Rule 3, first half: `here` is an apostrophe flanked by letters.
  const apostropheHere =
    here === '\'' && pos + 1 < input.length && beforeIsAlpha && isAlpha(input[pos + 1]);
  if (apostropheHere) {
    return false;
  }

  // Rule 3, second half: `before` is an apostrophe flanked by letters.
  const apostropheBefore =
    before === '\'' && pos >= 2 && isAlpha(input[pos - 2]) && hereIsAlpha;
  if (apostropheBefore) {
    return false;
  }

  // Rules 4 and 5: a change in letter status or in digit status splits tokens.
  return hereIsAlpha !== beforeIsAlpha || isDigit(here) !== isDigit(before);
}
83
+
84
/**
 * Tokenize input text per MegaHAL rules.
 *
 * The text is uppercased and cut at every boundary reported by `isBoundary`.
 * The result is then normalized so its last token ends in '.', '!' or '?'.
 * Empty or whitespace-only input yields ['.'].
 *
 * @param {string} input
 * @returns {string[]}
 */
export function tokenize(input) {
  if (!input || input.trim() === '') {
    return ['.'];
  }

  const text = input.toUpperCase();
  /** @type {string[]} */
  const tokens = [];
  let tokenStart = 0;

  for (let cut = 1; cut <= text.length; cut += 1) {
    if (!isBoundary(text, cut)) {
      continue;
    }
    if (cut > tokenStart) {
      tokens.push(text.slice(tokenStart, cut));
    }
    tokenStart = cut;
  }

  if (tokens.length === 0) {
    return ['.'];
  }

  // Sentence-terminal normalization.
  const finalIndex = tokens.length - 1;
  const finalToken = tokens[finalIndex];
  const endsSentence = ['!', '.', '?'].includes(finalToken[finalToken.length - 1]);

  if (isAlphanumeric(finalToken[0])) {
    // The text ended on a word: append a terminator token.
    tokens.push('.');
  } else if (!endsSentence) {
    // The text ended on a separator without terminal punctuation: replace it.
    tokens[finalIndex] = '.';
  }

  return tokens;
}
package/src/trie.d.ts ADDED
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Node in the frequency trie; nodes are stored in a Trie's arena and refer to each other by index.
3
+ */
4
+ export class TrieNode {
5
+ /**
6
+ * @param {number} symbolId - ID of the symbol this node represents.
7
+ */
8
+ constructor(symbolId: number);
9
+ /** @type {number} Symbol ID this node represents. */
10
+ symbol: number;
11
+ /** @type {number} Total count of all child observations recorded under this node. */
12
+ usage: number;
13
+ /** @type {number} Observation count of this symbol in its parent's context. */
14
+ count: number;
15
+ /** @type {number[]} References (arena indexes) to this node's child nodes. */
16
+ children: number[];
17
+ }
18
+ /**
19
+ * Arena-based frequency trie: nodes live in one array and refer to each other by index.
20
+ */
21
+ export class Trie {
22
+ /** @type {TrieNode[]} Arena storing all trie nodes (root at index 0). */
23
+ nodes: TrieNode[];
24
+ /**
25
+ * Get the root node reference (always index 0).
26
+ * @returns {number} Always 0.
27
+ */
28
+ root(): number;
29
+ /**
30
+ * Helper to perform binary search by symbol ID on a parent node's children list.
31
+ * @private
32
+ * @param {number} parentRef - The index of the parent node in nodes.
33
+ * @param {number} symbolId - The symbol ID to search for.
34
+ * @returns {{ found: boolean, index: number }} Position of the match, or the insertion position when not found.
35
+ */
36
+ private _findChildIndex;
37
+ /**
38
+ * Find an existing child node of parent matching symbolId, without modifying the trie.
39
+ * Returns undefined if no such child exists.
40
+ * @param {number} parentRef - Arena index of the parent node.
41
+ * @param {number} symbolId - Symbol ID of the child to look up.
42
+ * @returns {number|undefined} Arena index of the child, or undefined.
43
+ */
44
+ findChild(parentRef: number, symbolId: number): number | undefined;
45
+ /**
46
+ * Find or create a child node of parent matching symbolId, incrementing counts (an existing child's count stops at 65535).
47
+ * @param {number} parentRef - Arena index of the parent node.
48
+ * @param {number} symbolId - Symbol ID of the child to record.
49
+ * @returns {number} NodeRef (index in nodes arena)
50
+ */
51
+ addChild(parentRef: number, symbolId: number): number;
52
+ /**
53
+ * Get the children node references for a node.
54
+ * @param {number} parentRef - Arena index of the node.
55
+ * @returns {number[]} The node's own child list (not a copy).
56
+ */
57
+ children(parentRef: number): number[];
58
+ /**
59
+ * Get the number of children of a node.
60
+ * @param {number} parentRef - Arena index of the node.
61
+ * @returns {number}
62
+ */
63
+ branchCount(parentRef: number): number;
64
+ /**
65
+ * Access a node by its reference index; throws RangeError when ref is out of bounds.
66
+ * @param {number} ref - Arena index of the node.
67
+ * @returns {TrieNode}
68
+ */
69
+ node(ref: number): TrieNode;
70
+ /**
71
+ * Total number of nodes in the trie (including root).
72
+ * @returns {number}
73
+ */
74
+ get size(): number;
75
+ /**
76
+ * Whether the trie contains only the root node.
77
+ * @returns {boolean}
78
+ */
79
+ isEmpty(): boolean;
80
+ }
81
+ //# sourceMappingURL=trie.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"trie.d.ts","sourceRoot":"","sources":["trie.js"],"names":[],"mappings":"AAIA;;GAEG;AACH;IACE;;OAEG;IACH,sBAFW,MAAM,EAchB;IAXC,gCAAgC;IAChC,QADW,MAAM,CACK;IAEtB,4DAA4D;IAC5D,OADW,MAAM,CACH;IAEd,+EAA+E;IAC/E,OADW,MAAM,CACH;IAEd,+DAA+D;IAC/D,UADW,MAAM,EAAE,CACD;CAErB;AAED;;GAEG;AACH;IAEI,yEAAyE;IACzE,OADW,QAAQ,EAAE,CACgB;IAGvC;;;OAGG;IACH,QAFa,MAAM,CAIlB;IAED;;;;;;OAMG;IACH,wBAsBC;IAED;;;;;;OAMG;IACH,qBAJW,MAAM,YACN,MAAM,GACJ,MAAM,GAAC,SAAS,CAQ5B;IAED;;;;;OAKG;IACH,oBAJW,MAAM,YACN,MAAM,GACJ,MAAM,CAuBlB;IAED;;;;OAIG;IACH,oBAHW,MAAM,GACJ,MAAM,EAAE,CAIpB;IAED;;;;OAIG;IACH,uBAHW,MAAM,GACJ,MAAM,CAIlB;IAED;;;;OAIG;IACH,UAHW,MAAM,GACJ,QAAQ,CAOpB;IAED;;;OAGG;IACH,YAFa,MAAM,CAIlB;IAED;;;OAGG;IACH,WAFa,OAAO,CAInB;CACF"}
package/src/trie.js ADDED
@@ -0,0 +1,164 @@
1
+ import { ERROR_ID } from './dict.js';
2
+
3
+ const U16_MAX = 65535; // Ceiling for TrieNode.count: Trie.addChild stops incrementing a child's count (and the parent's usage) once the count reaches this value.
4
+
5
/**
 * One node of the frequency trie. Nodes are kept in a Trie's arena and point
 * at their children by arena index rather than by object reference.
 */
export class TrieNode {
  /**
   * Create a node for the given symbol with zeroed counters and no children.
   * @param {number} symbolId
   */
  constructor(symbolId) {
    /** @type {number} ID of the symbol this node stands for. */
    this.symbol = symbolId;

    /** @type {number} Sum of the observations recorded across this node's children. */
    this.usage = 0;

    /** @type {number} How often this symbol was observed in its parent's context. */
    this.count = 0;

    /** @type {number[]} Arena indexes of this node's children. */
    this.children = [];
  }
}
26
+
27
/**
 * Frequency trie backed by a single array ("arena") of nodes. A node
 * reference is simply that node's index in the arena; the root is index 0.
 */
export class Trie {
  constructor() {
    /** @type {TrieNode[]} Arena of all nodes; index 0 is the root, created with ERROR_ID as its symbol. */
    this.nodes = [new TrieNode(ERROR_ID)];
  }

  /**
   * Reference of the root node.
   * @returns {number} Always 0.
   */
  root() {
    return 0;
  }

  /**
   * Binary-search a node's child list by symbol ID.
   * @private
   * @param {number} parentRef - Arena index of the parent node.
   * @param {number} symbolId - Symbol ID to look for.
   * @returns {{ found: boolean, index: number }} Position of the match in the
   *   child list, or the position where the symbol would have to be inserted.
   */
  _findChildIndex(parentRef, symbolId) {
    const siblings = this.nodes[parentRef].children;

    let lo = 0;
    let hi = siblings.length - 1;

    while (lo <= hi) {
      const mid = (lo + hi) >> 1;
      const sym = this.nodes[siblings[mid]].symbol;

      if (sym > symbolId) {
        hi = mid - 1;
      } else if (sym < symbolId) {
        lo = mid + 1;
      } else {
        return { found: true, index: mid };
      }
    }

    return { found: false, index: lo };
  }

  /**
   * Look up the child of `parentRef` that carries `symbolId`, without changing the trie.
   * @param {number} parentRef
   * @param {number} symbolId
   * @returns {number|undefined} Reference of the child, or undefined when there is none.
   */
  findChild(parentRef, symbolId) {
    const probe = this._findChildIndex(parentRef, symbolId);
    return probe.found ? this.nodes[parentRef].children[probe.index] : undefined;
  }

  /**
   * Record one observation of `symbolId` under `parentRef`, creating the child
   * node when it does not exist yet.
   * @param {number} parentRef
   * @param {number} symbolId
   * @returns {number} NodeRef (index in nodes arena)
   */
  addChild(parentRef, symbolId) {
    const { found, index } = this._findChildIndex(parentRef, symbolId);
    const parent = this.nodes[parentRef];

    if (!found) {
      // New symbol in this context: append a node to the arena and link it
      // into the parent's child list at the search's insertion position.
      const created = new TrieNode(symbolId);
      created.count = 1;
      const createdRef = this.nodes.push(created) - 1;

      parent.usage += 1;
      parent.children.splice(index, 0, createdRef);
      return createdRef;
    }

    const existingRef = parent.children[index];
    const existing = this.nodes[existingRef];
    // Once the child's count hits U16_MAX, neither it nor the parent's usage grows.
    if (existing.count < U16_MAX) {
      existing.count += 1;
      parent.usage += 1;
    }
    return existingRef;
  }

  /**
   * Child references of a node. This is the node's own array, not a copy.
   * @param {number} parentRef
   * @returns {number[]}
   */
  children(parentRef) {
    const { children } = this.nodes[parentRef];
    return children;
  }

  /**
   * How many children a node has.
   * @param {number} parentRef
   * @returns {number}
   */
  branchCount(parentRef) {
    const { children } = this.nodes[parentRef];
    return children.length;
  }

  /**
   * Fetch a node by reference, rejecting references outside the arena.
   * @param {number} ref
   * @returns {TrieNode}
   * @throws {RangeError} When `ref` is negative or not below the arena length.
   */
  node(ref) {
    const outOfBounds = ref < 0 || ref >= this.nodes.length;
    if (outOfBounds) {
      throw new RangeError(`Node reference ${ref} is out of bounds`);
    }
    return this.nodes[ref];
  }

  /**
   * Number of nodes in the arena, root included.
   * @returns {number}
   */
  get size() {
    return this.nodes.length;
  }

  /**
   * True while the arena holds nothing beyond the root node.
   * @returns {boolean}
   */
  isEmpty() {
    const total = this.nodes.length;
    return total <= 1;
  }
}