@tgies/megahal-js 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/LICENSE +21 -0
- package/README.md +156 -0
- package/index.d.ts +6 -0
- package/index.d.ts.map +1 -0
- package/index.js +29 -0
- package/package.json +83 -0
- package/src/binary.d.ts +18 -0
- package/src/binary.d.ts.map +1 -0
- package/src/binary.js +328 -0
- package/src/dict.d.ts +54 -0
- package/src/dict.d.ts.map +1 -0
- package/src/dict.js +115 -0
- package/src/engine.d.ts +140 -0
- package/src/engine.d.ts.map +1 -0
- package/src/engine.js +317 -0
- package/src/evaluator.d.ts +10 -0
- package/src/evaluator.d.ts.map +1 -0
- package/src/evaluator.js +101 -0
- package/src/generator.d.ts +36 -0
- package/src/generator.d.ts.map +1 -0
- package/src/generator.js +296 -0
- package/src/keywords.d.ts +34 -0
- package/src/keywords.d.ts.map +1 -0
- package/src/keywords.js +122 -0
- package/src/model.d.ts +73 -0
- package/src/model.d.ts.map +1 -0
- package/src/model.js +154 -0
- package/src/tokenizer.d.ts +8 -0
- package/src/tokenizer.d.ts.map +1 -0
- package/src/tokenizer.js +125 -0
- package/src/trie.d.ts +81 -0
- package/src/trie.d.ts.map +1 -0
- package/src/trie.js +164 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [1.0.0] - 2026-05-22
|
|
9
|
+
|
|
10
|
+
Initial release. A JavaScript port of the MegaHAL conversational engine
|
|
11
|
+
(Jason Hutchens, 1998), targeting both Node.js and browser environments.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- Forward and backward 5th-order Markov trie models with case-insensitive symbols.
|
|
15
|
+
- Tokenization matching the original C boundary rules, including apostrophe
|
|
16
|
+
handling for contractions and sentence-terminal normalization.
|
|
17
|
+
- Two-pass keyword extraction with banned, auxiliary, and swap-table support.
|
|
18
|
+
- Reply generation with seeded forward and backward babble phases, keyword
|
|
19
|
+
priority, and the `used_key` discipline.
|
|
20
|
+
- Surprise-based reply scoring with depth-averaged probability and the
|
|
21
|
+
num >= 8 / num >= 16 length penalties.
|
|
22
|
+
- Binary brain persistence compatible with the `MegaHALv8` cookie format,
|
|
23
|
+
with optional 64-bit count/usage extensions.
|
|
24
|
+
- Default support file data (banned, auxiliary, greeting, swap) bundled.
|
|
25
|
+
- TypeScript declarations generated from JSDoc.
|
|
26
|
+
|
|
27
|
+
[1.0.0]: https://github.com/tgies/megahal-js/releases/tag/v1.0.0
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tony Gies
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# MegaHAL-JS
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/tgies/megahal-js/actions/workflows/ci.yml/badge.svg)](https://github.com/tgies/megahal-js/actions/workflows/ci.yml)
|
|
4
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
A JavaScript port of Jason Hutchens' famous 1998 MegaHAL conversational engine.
|
|
7
|
+
|
|
8
|
+
MegaHAL-JS runs natively in Node.js (>= 20) and in all modern browsers.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npm install @tgies/megahal-js
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
### Modern ESM (Node & Browser)
|
|
23
|
+
|
|
24
|
+
```javascript
|
|
25
|
+
import { MegaHal } from '@tgies/megahal-js';
|
|
26
|
+
|
|
27
|
+
// Instantiate with order N (default is 5)
|
|
28
|
+
const hal = new MegaHal(5);
|
|
29
|
+
|
|
30
|
+
// Train the engine
|
|
31
|
+
hal.learn('The cat sat on the mat.');
|
|
32
|
+
hal.learn('The dog chased the cat around the yard.');
|
|
33
|
+
|
|
34
|
+
// Generate a reply (this will also learn from the prompt before generating)
|
|
35
|
+
const reply = hal.respond('Tell me about the cat.');
|
|
36
|
+
console.log(reply);
|
|
37
|
+
// e.g., "The dog chased the cat sat on the mat."
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Browser direct integration (Vanilla HTML/JS)
|
|
41
|
+
|
|
42
|
+
```html
|
|
43
|
+
<!DOCTYPE html>
|
|
44
|
+
<html>
|
|
45
|
+
<head>
|
|
46
|
+
<title>MegaHAL Chatbot</title>
|
|
47
|
+
</head>
|
|
48
|
+
<body>
|
|
49
|
+
<script type="module">
|
|
50
|
+
import { MegaHal } from './node_modules/@tgies/megahal-js/index.js';
|
|
51
|
+
|
|
52
|
+
const hal = new MegaHal(3);
|
|
53
|
+
hal.learn('Hello world!');
|
|
54
|
+
hal.learn('Welcome to the web browser version of MegaHAL.');
|
|
55
|
+
|
|
56
|
+
console.log(hal.respond('hello'));
|
|
57
|
+
</script>
|
|
58
|
+
</body>
|
|
59
|
+
</html>
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## API Reference
|
|
65
|
+
|
|
66
|
+
### `class MegaHal`
|
|
67
|
+
|
|
68
|
+
#### `constructor(order = 5, rng = null)`
|
|
69
|
+
Creates a new MegaHAL engine.
|
|
70
|
+
- `order`: The Markov n-gram depth (trie depth). Defaults to `5`.
|
|
71
|
+
- `rng`: An optional custom random number generator. Must implement `randomRange(min, max)` returning an integer in `[min, max)`. If omitted, a default generator backed by `Math.random` is used.
|
|
72
|
+
|
|
73
|
+
#### `respond(input)`
|
|
74
|
+
Learns from the input sentence, extracts its keywords, generates a response biased toward those keywords, capitalizes the response according to sentence-casing rules, and returns it.
|
|
75
|
+
- `input`: The prompt string.
|
|
76
|
+
- Returns: `string`
|
|
77
|
+
|
|
78
|
+
#### `generate(input)`
|
|
79
|
+
Generates a response to the prompt *without* learning from it. Returns `null` if no reply can be generated.
|
|
80
|
+
- `input`: The prompt string.
|
|
81
|
+
- Returns: `string | null`
|
|
82
|
+
|
|
83
|
+
#### `greet()`
|
|
84
|
+
Generates an initial greeting seeded with a random word selected from the greeting keywords list. Falls back to a default greeting if no greeting can be generated.
|
|
85
|
+
- Returns: `string`
|
|
86
|
+
|
|
87
|
+
#### `learn(input)`
|
|
88
|
+
Tokenizes and trains both forward and backward models on the given sentence.
|
|
89
|
+
- `input`: Sentence to train on.
|
|
90
|
+
|
|
91
|
+
#### `setLimit({ timeout, maxIterations })`
|
|
92
|
+
Configures generation limits for the reply loop.
|
|
93
|
+
- `timeout`: Maximum milliseconds to spend generating candidate responses (defaults to `1000`).
|
|
94
|
+
- `maxIterations`: Maximum candidates to generate.
|
|
95
|
+
|
|
96
|
+
#### `setKeywordConfig(config)`
|
|
97
|
+
Overrides the extraction config containing banned words, auxiliary words, and the swap table.
|
|
98
|
+
- `config`: A `KeywordConfig` instance.
|
|
99
|
+
|
|
100
|
+
#### `setGreetings(greetings)`
|
|
101
|
+
Sets the keywords list used to seed the initial greeting.
|
|
102
|
+
- `greetings`: Array of string greeting words.
|
|
103
|
+
|
|
104
|
+
#### `exportBrain()`
|
|
105
|
+
Serializes the engine's internal dictionary and tries into a spec-compliant C-compatible `.brn` binary format and returns a `Uint8Array`.
|
|
106
|
+
- Returns: `Uint8Array`
|
|
107
|
+
|
|
108
|
+
#### `importBrain(data)`
|
|
109
|
+
Deserializes a binary brain from a `Uint8Array` or `ArrayBuffer` into the engine, restoring dictionary and tries.
|
|
110
|
+
- `data`: Binary data buffer.
|
|
111
|
+
|
|
112
|
+
#### `trainFromContent(content)`
|
|
113
|
+
Trains the model on a multi-line text corpus. Lines starting with `#` are ignored as comments.
|
|
114
|
+
- `content`: Plain text string.
|
|
115
|
+
|
|
116
|
+
#### `saveBrain(path)` *(Node-only)*
|
|
117
|
+
Asynchronously saves the serialized binary brain to a file.
|
|
118
|
+
- `path`: Target file path.
|
|
119
|
+
|
|
120
|
+
#### `loadBrain(path)` *(Node-only)*
|
|
121
|
+
Asynchronously loads a serialized binary brain from a file.
|
|
122
|
+
- `path`: Source file path.
|
|
123
|
+
|
|
124
|
+
#### `trainFromFile(path)` *(Node-only)*
|
|
125
|
+
Asynchronously trains the model from a corpus file.
|
|
126
|
+
- `path`: Text file path.
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Quality & Verification Commands
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Run unit tests
|
|
134
|
+
npm test
|
|
135
|
+
|
|
136
|
+
# Run test coverage
|
|
137
|
+
npm run test:coverage
|
|
138
|
+
|
|
139
|
+
# Perform TypeScript check
|
|
140
|
+
npm run typecheck
|
|
141
|
+
|
|
142
|
+
# Build declaration types
|
|
143
|
+
npm run build:types
|
|
144
|
+
|
|
145
|
+
# Run ESLint linter
|
|
146
|
+
npm run lint
|
|
147
|
+
|
|
148
|
+
# Run Stryker Mutation Testing
|
|
149
|
+
npm run test:mutation
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT © [Tony Gies](mailto:tgies@tgies.net)
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { tokenize } from "./src/tokenizer.js";
|
|
2
|
+
export { SymbolDict } from "./src/dict.js";
|
|
3
|
+
export { Trie } from "./src/trie.js";
|
|
4
|
+
export { MegaHal, parseWordList, parseSwapFile, loadWordList, loadSwapFile } from "./src/engine.js";
|
|
5
|
+
export { extractKeywords, KeywordConfig, SwapTable } from "./src/keywords.js";
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
package/index.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["index.js"],"names":[],"mappings":""}
|
package/index.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MegaHAL conversational engine entry point.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export {
|
|
6
|
+
MegaHal,
|
|
7
|
+
parseWordList,
|
|
8
|
+
parseSwapFile,
|
|
9
|
+
loadWordList,
|
|
10
|
+
loadSwapFile
|
|
11
|
+
} from './src/engine.js';
|
|
12
|
+
|
|
13
|
+
export {
|
|
14
|
+
tokenize
|
|
15
|
+
} from './src/tokenizer.js';
|
|
16
|
+
|
|
17
|
+
export {
|
|
18
|
+
extractKeywords,
|
|
19
|
+
KeywordConfig,
|
|
20
|
+
SwapTable
|
|
21
|
+
} from './src/keywords.js';
|
|
22
|
+
|
|
23
|
+
export {
|
|
24
|
+
SymbolDict
|
|
25
|
+
} from './src/dict.js';
|
|
26
|
+
|
|
27
|
+
export {
|
|
28
|
+
Trie
|
|
29
|
+
} from './src/trie.js';
|
package/package.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@tgies/megahal-js",
|
|
3
|
+
"publishConfig": {
|
|
4
|
+
"access": "public"
|
|
5
|
+
},
|
|
6
|
+
"version": "1.0.0",
|
|
7
|
+
"description": "A JavaScript port of the MegaHAL conversational engine supporting both Node.js and Browser environments.",
|
|
8
|
+
"type": "module",
|
|
9
|
+
"main": "./index.js",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"types": "./index.d.ts",
|
|
13
|
+
"import": "./index.js"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"types": "./index.d.ts",
|
|
17
|
+
"engines": {
|
|
18
|
+
"node": ">= 20"
|
|
19
|
+
},
|
|
20
|
+
"files": [
|
|
21
|
+
"index.js",
|
|
22
|
+
"index.d.ts",
|
|
23
|
+
"index.d.ts.map",
|
|
24
|
+
"src/**/*.js",
|
|
25
|
+
"src/**/*.d.ts",
|
|
26
|
+
"src/**/*.d.ts.map",
|
|
27
|
+
"README.md",
|
|
28
|
+
"LICENSE",
|
|
29
|
+
"CHANGELOG.md"
|
|
30
|
+
],
|
|
31
|
+
"scripts": {
|
|
32
|
+
"test": "vitest run",
|
|
33
|
+
"test:coverage": "vitest run --coverage",
|
|
34
|
+
"test:mutation": "stryker run",
|
|
35
|
+
"typecheck": "tsc --noEmit",
|
|
36
|
+
"lint": "eslint .",
|
|
37
|
+
"lint:fix": "eslint . --fix",
|
|
38
|
+
"build:types": "node scripts/clean-types.js && tsc --declaration --emitDeclarationOnly --noEmit false",
|
|
39
|
+
"check": "npm run lint && npm run typecheck && npm run test",
|
|
40
|
+
"prepare": "husky",
|
|
41
|
+
"prepack": "npm run build:types"
|
|
42
|
+
},
|
|
43
|
+
"keywords": [
|
|
44
|
+
"megahal",
|
|
45
|
+
"chatbot",
|
|
46
|
+
"markov",
|
|
47
|
+
"n-gram",
|
|
48
|
+
"conversational-ai"
|
|
49
|
+
],
|
|
50
|
+
"author": "Tony Gies <tgies@tgies.net>",
|
|
51
|
+
"license": "MIT",
|
|
52
|
+
"repository": {
|
|
53
|
+
"type": "git",
|
|
54
|
+
"url": "git+https://github.com/tgies/megahal-js.git"
|
|
55
|
+
},
|
|
56
|
+
"bugs": {
|
|
57
|
+
"url": "https://github.com/tgies/megahal-js/issues"
|
|
58
|
+
},
|
|
59
|
+
"homepage": "https://github.com/tgies/megahal-js#readme",
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@arethetypeswrong/cli": "^0.18.2",
|
|
62
|
+
"@commitlint/cli": "^20.4.3",
|
|
63
|
+
"@commitlint/config-conventional": "^20.5.0",
|
|
64
|
+
"@eslint/js": "^10.0.1",
|
|
65
|
+
"@stryker-mutator/core": "^9.6.0",
|
|
66
|
+
"@stryker-mutator/vitest-runner": "^9.6.0",
|
|
67
|
+
"@types/node": "^25.6.0",
|
|
68
|
+
"@vitest/coverage-v8": "^3.0.0",
|
|
69
|
+
"eslint": "^10.0.3",
|
|
70
|
+
"globals": "^17.4.0",
|
|
71
|
+
"husky": "^9.1.7",
|
|
72
|
+
"lint-staged": "^16.3.2",
|
|
73
|
+
"publint": "^0.3.2",
|
|
74
|
+
"typescript": "^6.0.3",
|
|
75
|
+
"vitest": "^3.0.0"
|
|
76
|
+
},
|
|
77
|
+
"lint-staged": {
|
|
78
|
+
"*.js": "eslint --fix"
|
|
79
|
+
},
|
|
80
|
+
"overrides": {
|
|
81
|
+
"fflate": "0.8.2"
|
|
82
|
+
}
|
|
83
|
+
}
|
package/src/binary.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Serialize a BidirectionalModel into a binary buffer.
|
|
3
|
+
*
|
|
4
|
+
* @param {import('./model.js').BidirectionalModel} model
|
|
5
|
+
* @param {{ use64Bit?: boolean }} [options] - Options for serialization
|
|
6
|
+
* @returns {Uint8Array}
|
|
7
|
+
*/
|
|
8
|
+
export function serializeBrain(model: import("./model.js").BidirectionalModel, options?: {
|
|
9
|
+
use64Bit?: boolean;
|
|
10
|
+
}): Uint8Array;
|
|
11
|
+
/**
|
|
12
|
+
* Deserialize binary brain data into a BidirectionalModel.
|
|
13
|
+
*
|
|
14
|
+
* @param {Uint8Array|ArrayBuffer} data
|
|
15
|
+
* @param {import('./model.js').BidirectionalModel} model
|
|
16
|
+
*/
|
|
17
|
+
export function deserializeBrain(data: Uint8Array | ArrayBuffer, model: import("./model.js").BidirectionalModel): void;
|
|
18
|
+
//# sourceMappingURL=binary.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"binary.d.ts","sourceRoot":"","sources":["binary.js"],"names":[],"mappings":"AA0NA;;;;;;GAMG;AACH,sCAJW,OAAO,YAAY,EAAE,kBAAkB,YACvC;IAAE,QAAQ,CAAC,EAAE,OAAO,CAAA;CAAE,GACpB,UAAU,CA2CtB;AAED;;;;;GAKG;AACH,uCAHW,UAAU,GAAC,WAAW,SACtB,OAAO,YAAY,EAAE,kBAAkB,QAuDjD"}
|
package/src/binary.js
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
import { Trie, TrieNode } from './trie.js';
|
|
2
|
+
import { SymbolDict } from './dict.js';
|
|
3
|
+
|
|
4
|
+
const COOKIE = 'MegaHALv8';
|
|
5
|
+
|
|
6
|
+
/**
 * Growable, append-only byte sink used to build a brain file in memory.
 *
 * Multi-byte integers are written little-endian. The backing buffer starts at
 * 4096 bytes and doubles whenever a write would not fit.
 */
class BinaryWriter {
  constructor() {
    this.buffer = new Uint8Array(4096);
    this.offset = 0;
    this.view = new DataView(this.buffer.buffer);
    // One encoder per writer instead of one per writeString() call.
    this.encoder = new TextEncoder();
  }

  /**
   * Ensure the internal buffer can hold `size` more bytes past the current
   * offset, reallocating (and copying the existing contents) if it cannot.
   * @private
   * @param {number} size
   */
  _ensure(size) {
    if (this.offset + size > this.buffer.byteLength) {
      let newLength = this.buffer.byteLength * 2;
      while (this.offset + size > newLength) {
        newLength *= 2;
      }
      const newBuffer = new Uint8Array(newLength);
      newBuffer.set(this.buffer);
      this.buffer = newBuffer;
      // The DataView is bound to the old ArrayBuffer, so it must be rebuilt.
      this.view = new DataView(this.buffer.buffer);
    }
  }

  /**
   * Write a uint8 byte.
   * @param {number} val
   */
  writeUint8(val) {
    this._ensure(1);
    this.view.setUint8(this.offset, val);
    this.offset += 1;
  }

  /**
   * Write a little-endian uint16 word.
   * @param {number} val
   */
  writeUint16(val) {
    this._ensure(2);
    this.view.setUint16(this.offset, val, true);
    this.offset += 2;
  }

  /**
   * Write a little-endian uint32 double word.
   * @param {number} val
   */
  writeUint32(val) {
    this._ensure(4);
    this.view.setUint32(this.offset, val, true);
    this.offset += 4;
  }

  /**
   * Write a raw byte array.
   * @param {Uint8Array} bytes
   */
  writeBytes(bytes) {
    this._ensure(bytes.length);
    this.buffer.set(bytes, this.offset);
    this.offset += bytes.length;
  }

  /**
   * Write a string as UTF-8 bytes (no length prefix, no terminator).
   * @param {string} str
   */
  writeString(str) {
    this.writeBytes(this.encoder.encode(str));
  }

  /**
   * Get the written contents as a Uint8Array.
   *
   * The result is an exact-size copy: its `.buffer` holds only the written
   * bytes (not the writer's spare capacity), and it is unaffected by any
   * writes made after this call.
   * @returns {Uint8Array}
   */
  getUint8Array() {
    return this.buffer.slice(0, this.offset);
  }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Sequential little-endian reader over an in-memory byte buffer.
 *
 * Every read advances `offset`; a read that would run past the end of the
 * buffer throws before the offset is moved.
 */
class BinaryReader {
  /**
   * @param {ArrayBuffer|Uint8Array} buffer - Bytes to read. Other
   *   ArrayBuffer views (e.g. DataView) are wrapped byte-for-byte as well.
   */
  constructor(buffer) {
    if (buffer instanceof Uint8Array) {
      this.buffer = buffer;
    } else if (ArrayBuffer.isView(buffer)) {
      // Wrap the underlying bytes; `new Uint8Array(view)` would convert
      // elements (or fail) rather than reinterpret the raw bytes.
      this.buffer = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
    } else {
      this.buffer = new Uint8Array(buffer);
    }
    // Respect byteOffset/byteLength so sub-views of a larger buffer work.
    this.view = new DataView(this.buffer.buffer, this.buffer.byteOffset, this.buffer.byteLength);
    this.offset = 0;
    // One decoder per reader instead of one per readString() call.
    this.decoder = new TextDecoder();
  }

  /**
   * Throw a descriptive error if fewer than `size` bytes remain.
   * @private
   * @param {number} size
   * @param {string} what - Name of the value being read, for the message.
   * @throws {RangeError} When the read would pass the end of the buffer.
   */
  _need(size, what) {
    if (this.offset + size > this.buffer.byteLength) {
      throw new RangeError(`Unexpected end of file while reading ${what}`);
    }
  }

  /**
   * Read a uint8 byte.
   * @returns {number}
   * @throws {RangeError} On truncated input.
   */
  readUint8() {
    this._need(1, 'uint8');
    const val = this.view.getUint8(this.offset);
    this.offset += 1;
    return val;
  }

  /**
   * Read a little-endian uint16 word.
   * @returns {number}
   * @throws {RangeError} On truncated input.
   */
  readUint16() {
    this._need(2, 'uint16');
    const val = this.view.getUint16(this.offset, true);
    this.offset += 2;
    return val;
  }

  /**
   * Read a little-endian uint32 double word.
   * @returns {number}
   * @throws {RangeError} On truncated input.
   */
  readUint32() {
    this._need(4, 'uint32');
    const val = this.view.getUint32(this.offset, true);
    this.offset += 4;
    return val;
  }

  /**
   * Read raw bytes. The result is a view into the reader's buffer, not a copy.
   * @param {number} length
   * @returns {Uint8Array}
   * @throws {RangeError} When `length` is not a non-negative integer.
   * @throws {Error} When fewer than `length` bytes remain.
   */
  readBytes(length) {
    if (!Number.isInteger(length) || length < 0) {
      // A negative length would otherwise move the offset backwards.
      throw new RangeError(`Invalid byte length: ${length}`);
    }
    if (this.offset + length > this.buffer.byteLength) {
      throw new Error('Unexpected end of file while reading bytes');
    }
    const bytes = this.buffer.subarray(this.offset, this.offset + length);
    this.offset += length;
    return bytes;
  }

  /**
   * Read `length` bytes and decode them as UTF-8.
   * @param {number} length - Byte length (not character count).
   * @returns {string}
   */
  readString(length) {
    return this.decoder.decode(this.readBytes(length));
  }

  /**
   * Check if there are more bytes to read.
   * @returns {boolean}
   */
  hasMore() {
    return this.offset < this.buffer.byteLength;
  }
}
|
|
163
|
+
|
|
164
|
+
/**
 * Serialize a Trie node and, depth-first, all of its descendants.
 *
 * Per-node layout (little-endian): symbol (uint16), usage (uint32, or
 * uint32 low word + uint32 high word when `use64Bit`), count (uint16),
 * child count (uint16), followed by each child in order.
 *
 * @param {Trie} trie
 * @param {number} ref - Reference of the node to serialize, as accepted by `trie.node()`.
 * @param {BinaryWriter} writer
 * @param {boolean} use64Bit - Write `usage` as 8 bytes instead of 4.
 */
function serializeNode(trie, ref, writer, use64Bit) {
  const node = trie.node(ref);
  const { usage, children } = node;

  writer.writeUint16(node.symbol);

  // Low 32 bits of usage; `>>> 0` reduces modulo 2^32.
  writer.writeUint32(usage >>> 0);
  if (use64Bit) {
    // High 32 bits. Previously always written as 0, which silently dropped
    // the upper part of any usage value >= 2^32.
    writer.writeUint32(Math.floor(usage / 0x100000000) >>> 0);
  }

  writer.writeUint16(node.count);
  writer.writeUint16(children.length);

  for (const childRef of children) {
    serializeNode(trie, childRef, writer, use64Bit);
  }
}
|
|
187
|
+
|
|
188
|
+
/**
 * Deserialize a Trie node and, depth-first, all of its descendants,
 * appending each to `trie.nodes`.
 *
 * Per-node layout (little-endian): symbol (uint16), usage (uint32, or
 * uint32 low word + uint32 high word when `byte4Size` is 8), count (uint16),
 * child count (uint16), followed by each child in order.
 *
 * @param {Trie} trie
 * @param {BinaryReader} reader
 * @param {number} byte4Size - Width in bytes of the usage field: 4 or 8.
 * @returns {number} NodeRef - Index of the deserialized node in `trie.nodes`.
 */
function deserializeNode(trie, reader, byte4Size) {
  const symbol = reader.readUint16();
  let usage = reader.readUint32();
  if (byte4Size === 8) {
    // Combine with the high word instead of discarding it. Values above
    // Number.MAX_SAFE_INTEGER (2^53 - 1) lose precision as a JS number.
    usage += reader.readUint32() * 0x100000000;
  }
  const count = reader.readUint16();
  const branch = reader.readUint16();

  const node = new TrieNode(symbol);
  node.usage = usage;
  node.count = count;

  // Register the parent before its children so it gets the lower index.
  const ref = trie.nodes.length;
  trie.nodes.push(node);

  for (let i = 0; i < branch; i++) {
    const childRef = deserializeNode(trie, reader, byte4Size);
    node.children.push(childRef);
  }

  return ref;
}
|
|
218
|
+
|
|
219
|
+
/**
 * Serialize a BidirectionalModel into a binary buffer.
 *
 * Layout: 9-byte cookie, order (uint8), forward trie, backward trie,
 * dictionary size (uint32, followed by a zero uint32 high word when
 * `use64Bit`), then each dictionary word as a uint8 byte length plus its
 * UTF-8 bytes.
 *
 * @param {import('./model.js').BidirectionalModel} model
 * @param {{ use64Bit?: boolean }} [options] - Options for serialization
 * @returns {Uint8Array}
 * @throws {RangeError} If `model.order` does not fit in one byte, or the
 *   dictionary holds more than 65536 symbols.
 * @throws {Error} If a dictionary word encodes to more than 255 bytes.
 */
export function serializeBrain(model, options = {}) {
  const use64Bit = Boolean(options && options.use64Bit);
  const { order } = model;
  const dict = model.dictionary;

  // Validate everything that is cheap to check before walking the tries, so
  // an unrepresentable model fails fast instead of after the expensive part.
  if (!Number.isInteger(order) || order < 0 || order > 255) {
    // The order is stored in a single byte; larger values would wrap silently.
    throw new RangeError(`Model order (${order}) must be an integer between 0 and 255`);
  }
  if (dict.entries.length > 65536) {
    throw new RangeError(
      `Dictionary size (${dict.entries.length}) exceeds maximum of 65536 symbols supported by the binary format`
    );
  }

  const encoder = new TextEncoder();
  const cookieBytes = encoder.encode(COOKIE);
  if (cookieBytes.length !== 9) {
    throw new Error('Cookie size must be exactly 9 bytes');
  }

  const writer = new BinaryWriter();
  writer.writeBytes(cookieBytes);
  writer.writeUint8(order);

  serializeNode(model.forward, model.forward.root(), writer, use64Bit);
  serializeNode(model.backward, model.backward.root(), writer, use64Bit);

  writer.writeUint32(dict.entries.length);
  if (use64Bit) {
    writer.writeUint32(0); // High 4 bytes; the size is capped at 65536 above.
  }

  for (const word of dict.entries) {
    const wordBytes = encoder.encode(word);
    if (wordBytes.length > 255) {
      // Word length is stored in a single byte.
      throw new Error(`Symbol '${word}' exceeds maximum byte size of 255`);
    }
    writer.writeUint8(wordBytes.length);
    writer.writeBytes(wordBytes);
  }

  return writer.getUint8Array();
}
|
|
268
|
+
|
|
269
|
+
/**
 * Deserialize binary brain data into a BidirectionalModel.
 *
 * The data is parsed completely before `model` is touched, so a truncated or
 * corrupt file leaves the model exactly as it was. On success `model.order`,
 * `model.forward`, `model.backward` and `model.dictionary` are replaced.
 *
 * @param {Uint8Array|ArrayBuffer} data
 * @param {import('./model.js').BidirectionalModel} model
 * @throws {Error} If the magic cookie does not match or the data is truncated.
 * @throws {RangeError} If the stored dictionary size exceeds 65536 symbols.
 */
export function deserializeBrain(data, model) {
  const reader = new BinaryReader(data);

  const cookie = reader.readString(9);
  if (cookie !== COOKIE) {
    throw new Error('Invalid brain file: Magic cookie mismatch');
  }

  const order = reader.readUint8();

  // Auto-detect byte4Size (4 or 8 bytes) by inspecting the root node of the
  // forward tree, which starts at offset 10 (9-byte cookie + 1-byte order).
  // Its child count sits at offset 18 with a 4-byte usage field and at
  // offset 22 with an 8-byte one; in an 8-byte file, offset 18 falls inside
  // the high word of usage. A zero at 18 with a non-zero count at 22 is
  // therefore taken to mean the 8-byte layout.
  let byte4Size = 4;
  if (reader.buffer.byteLength >= 24) {
    const branch4 = reader.view.getUint16(18, true);
    const branch8 = reader.view.getUint16(22, true);
    if (branch4 === 0 && branch8 > 0) {
      byte4Size = 8;
    }
  }

  const forward = new Trie();
  forward.nodes = []; // Clear default root
  deserializeNode(forward, reader, byte4Size);

  const backward = new Trie();
  backward.nodes = []; // Clear default root
  deserializeNode(backward, reader, byte4Size);

  const dictSize = reader.readUint32();
  const dictSizeHigh = byte4Size === 8 ? reader.readUint32() : 0;
  // Symbols are stored as uint16, so no valid file can hold more than 65536
  // words; rejecting here avoids looping over garbage in a corrupt file.
  if (dictSizeHigh !== 0 || dictSize > 65536) {
    throw new RangeError(
      'Invalid brain file: Dictionary size exceeds maximum of 65536 symbols supported by the binary format'
    );
  }

  const dict = new SymbolDict();
  dict.entries = [];
  dict.sortedIndex = [];

  for (let i = 0; i < dictSize; i++) {
    const len = reader.readUint8();
    const word = reader.readString(len);
    dict.entries.push(word);

    // Reconstruct sorted index using binary search insert position.
    const { index } = dict._binarySearch(word);
    dict.sortedIndex.splice(index, 0, i);
  }

  // Commit only after the whole file parsed successfully.
  model.order = order;
  model.forward = forward;
  model.backward = backward;
  model.dictionary = dict;
}
|