node-predict 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +24 -0
- package/predictjs.js +562 -0
- package/readme.md +171 -0
package/package.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "node-predict",
|
|
3
|
+
"version": "3.0.0",
|
|
4
|
+
"description": "Full lightweight and fast native offline, trainable text prediction engine for Node.js",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"predict",
|
|
7
|
+
"autocomplete",
|
|
8
|
+
"text-prediction",
|
|
9
|
+
"ngram",
|
|
10
|
+
"trie",
|
|
11
|
+
"tfidf",
|
|
12
|
+
"predictjs"
|
|
13
|
+
],
|
|
14
|
+
"license": "MIT",
|
|
15
|
+
"author": "Ismail Gidado",
|
|
16
|
+
"type": "commonjs",
|
|
17
|
+
"main": "predictjs.js",
|
|
18
|
+
"engines": {
|
|
19
|
+
"node": ">=14"
|
|
20
|
+
},
|
|
21
|
+
"scripts": {
|
|
22
|
+
"test": "node app.js"
|
|
23
|
+
}
|
|
24
|
+
}
|
package/predictjs.js
ADDED
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
'use strict'
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* ╔══════════════════════════════════════════════════════════════════╗
|
|
5
|
+
* ║ PredictJS v3.0 ║
|
|
6
|
+
* ║ Offline text prediction engine — learns purely from text ║
|
|
7
|
+
* ║ ────────────────────────────────────────────────────────── ║
|
|
8
|
+
* ║ • Trie → partial word completion ║
|
|
9
|
+
* ║ • Ensemble NGram → multi-size n-gram voting for accuracy ║
|
|
10
|
+
* ║ • TF-IDF → meaningful word scoring over filler ║
|
|
11
|
+
* ║ • Loop guard → prevents repetitive generation cycles ║
|
|
12
|
+
* ║ • Predict mode → suggest / complete / next word ║
|
|
13
|
+
* ║ • Persistent → save/load trained index from JSON file ║
|
|
14
|
+
* ╚══════════════════════════════════════════════════════════════════╝
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const fs = require('fs')
|
|
18
|
+
const path = require('path')
|
|
19
|
+
|
|
20
|
+
// ─────────────────────────────────────────────
|
|
21
|
+
// DEFAULT CONFIG — tuned for best accuracy
|
|
22
|
+
// ─────────────────────────────────────────────
|
|
23
|
+
// Baseline engine configuration. Every key here is merged under user
// options in the PredictJS constructor, so user overrides always win.
const DEFAULTS = {
  // N-gram ensemble
  nMin: 2, // bigram — broad coverage
  nMax: 4, // 4-gram — specific context
  smoothing: true,
  smoothingAlpha: 0.05, // light additive smoothing — keeps predictions tight to actual data

  // TF-IDF
  useTFIDF: true,
  tfidfBlend: 0.5, // 50/50 blend — balances raw frequency vs meaningfulness

  // Ensemble weights — give more weight to larger n-grams (more specific)
  // [weight for nMin, ..., weight for nMax] must match nMax - nMin + 1 entries
  ensembleWeights: [0.15, 0.35, 0.50], // bigram=15%, trigram=35%, 4-gram=50%

  // Trie
  minWordLength: 2, // tokens shorter than this are dropped by the tokenizer
  maxSuggestions: 5,

  // Sentence completion
  maxCompletionWords: 20,
  completionTemp: 0.6, // lower = more faithful to dataset patterns
  // NOTE(review): sentenceEndTokens is not referenced anywhere in the
  // visible engine code — possibly vestigial; confirm before relying on it.
  sentenceEndTokens: ['<END>'],

  // Loop / repetition guard
  maxRepeatBigram: 2, // stop if any 2-word sequence repeats this many times
  penalizeRepeats: true, // reduce score of recently used words

  // Tokenizer
  caseSensitive: false,
  keepPunctuation: false,
  keepNumbers: false, // strip numbers — reduces noise

  // Skip lines that look like metadata (comments, labels, etc.)
  skipLinePatterns: [
    /^#/, // comment lines
    /^(input|response|output|user|bot|assistant|human)\s*:/i, // dialogue labels
    /^\s*$/, // blank lines
  ],

  // Dataset
  encoding: 'utf8',
  datasetFormat: 'auto', // 'auto' | 'text' | 'json' | 'jsonl'
  jsonTextField: 'text', // object field read from JSON/JSONL records

  // Index
  indexPath: './model-index.json',
  autoSave: false, // when true, saveIndex() runs after every training call
}
|
|
72
|
+
|
|
73
|
+
// ─────────────────────────────────────────────
|
|
74
|
+
// TOKENIZER
|
|
75
|
+
// ─────────────────────────────────────────────
|
|
76
|
+
class Tokenizer {
  /**
   * Text normalizer driven by the engine config
   * (case/punctuation/number flags, skip patterns, dataset format).
   * @param {object} cfg - engine configuration
   */
  constructor(cfg) {
    this.cfg = cfg
  }

  /**
   * Normalize a raw string: optional lowercasing, punctuation stripping,
   * standalone-number stripping, then whitespace collapsing.
   * @param {string} text
   * @returns {string} cleaned, single-spaced, trimmed text
   */
  clean(text) {
    let out = this.cfg.caseSensitive ? text : text.toLowerCase()
    if (!this.cfg.keepPunctuation) out = out.replace(/[^\w\s']/g, ' ')
    if (!this.cfg.keepNumbers) out = out.replace(/\b\d+\b/g, ' ')
    return out.replace(/\s+/g, ' ').trim()
  }

  /**
   * Clean and split into words, dropping wrapping apostrophes and
   * anything shorter than cfg.minWordLength.
   * @param {string} text
   * @returns {string[]}
   */
  tokenize(text) {
    const words = []
    for (const raw of this.clean(text).split(' ')) {
      const word = raw.replace(/^'+|'+$/g, '') // strip quoting apostrophes, keep internal ones ("it's")
      if (word.length >= this.cfg.minWordLength) words.push(word)
    }
    return words
  }

  /**
   * Split text into clean sentence token arrays.
   * Whole lines matching any cfg.skipLinePatterns (comments, dialogue
   * labels, blanks) are dropped; remaining lines are split on .!? and
   * only sentences with at least 3 tokens are kept.
   * @param {string} text
   * @returns {string[][]}
   */
  toSentences(text) {
    const out = []
    for (const rawLine of text.split('\n')) {
      const line = rawLine.trim()
      if (this.cfg.skipLinePatterns.some(rx => rx.test(line))) continue
      for (const piece of line.split(/[.!?]+/)) {
        const candidate = piece.trim()
        if (!candidate) continue
        const tokens = this.tokenize(candidate)
        if (tokens.length >= 3) out.push(tokens) // minimum 3 tokens per sentence
      }
    }
    return out
  }

  /**
   * Decide how to interpret a dataset file: explicit config wins, then
   * file extension, then a peek at the first non-space character
   * ('[' → json array, '{' → jsonl records), else plain text.
   * @param {string} filePath
   * @param {string} content
   * @returns {'text'|'json'|'jsonl'|string}
   */
  detectFormat(filePath, content) {
    const { datasetFormat } = this.cfg
    if (datasetFormat !== 'auto') return datasetFormat
    const ext = path.extname(filePath).toLowerCase()
    if (ext === '.jsonl') return 'jsonl'
    if (ext === '.json') return 'json'
    const head = content.trimStart()
    if (head.startsWith('[')) return 'json'
    if (head.startsWith('{')) return 'jsonl'
    return 'text'
  }
}
|
|
131
|
+
|
|
132
|
+
// ─────────────────────────────────────────────
|
|
133
|
+
// TRIE
|
|
134
|
+
// ─────────────────────────────────────────────
|
|
135
|
+
class TrieNode {
  constructor() {
    // Children keyed by single character; null prototype avoids
    // prototype-pollution collisions with keys like "constructor".
    this.children = Object.create(null)
    this.isEnd = false // a stored word terminates at this node
    this.count = 0     // how many times that word was inserted
  }
}

class Trie {
  /** Prefix tree over words, with per-word insertion counts. */
  constructor() {
    this.root = new TrieNode()
    this.wordCount = 0 // number of DISTINCT words stored
  }

  /**
   * Record one occurrence of `word`. The distinct-word counter is only
   * bumped the first time the word is seen.
   */
  insert(word) {
    let node = this.root
    for (const ch of word) {
      node = node.children[ch] || (node.children[ch] = new TrieNode())
    }
    if (!node.isEnd) this.wordCount += 1
    node.isEnd = true
    node.count += 1
  }

  /**
   * Exact lookup: only complete stored words match ("hi" does not match
   * "higher"). Returns the occurrence count, or `false` when absent —
   * callers rely on the truthy/falsy distinction.
   */
  searchExact(word) {
    let node = this.root
    for (const ch of word) {
      const child = node.children[ch]
      if (!child) return false
      node = child
    }
    return node.isEnd ? node.count : false
  }

  /**
   * Up to `limit` complete words beginning with `prefix`, most frequently
   * inserted first.
   */
  search(prefix, limit = 5) {
    let node = this.root
    for (const ch of prefix) {
      const child = node.children[ch]
      if (!child) return []
      node = child
    }
    const matches = []
    this._dfs(node, prefix, matches)
    matches.sort((a, b) => b.count - a.count)
    return matches.slice(0, limit).map(entry => entry.word)
  }

  // Depth-first collection of every complete word under `node`.
  _dfs(node, cur, acc) {
    if (node.isEnd) acc.push({ word: cur, count: node.count })
    for (const key of Object.keys(node.children)) {
      this._dfs(node.children[key], cur + key, acc)
    }
  }

  /** Plain-object form suitable for JSON persistence. */
  serialize() {
    return { wordCount: this.wordCount, root: this._sn(this.root) }
  }

  // Serialize one node: e = end flag (0/1), c = count, ch = children map.
  _sn(node) {
    const ch = {}
    for (const [key, child] of Object.entries(node.children)) ch[key] = this._sn(child)
    return { e: node.isEnd ? 1 : 0, c: node.count, ch }
  }

  /** Rebuild a Trie from the output of serialize(). */
  static deserialize(data) {
    const trie = new Trie()
    trie.wordCount = data.wordCount
    trie.root = Trie._dn(data.root)
    return trie
  }

  // Recursively rebuild one serialized node.
  static _dn(raw) {
    const node = new TrieNode()
    node.isEnd = raw.e === 1
    node.count = raw.c
    for (const [key, child] of Object.entries(raw.ch)) node.children[key] = Trie._dn(child)
    return node
  }
}
|
|
192
|
+
|
|
193
|
+
// ─────────────────────────────────────────────
|
|
194
|
+
// TF-IDF
|
|
195
|
+
// ─────────────────────────────────────────────
|
|
196
|
+
class TFIDF {
  /** Term-frequency / document-frequency accumulator over sentences. */
  constructor() {
    this.df = Object.create(null) // sentences containing the word at least once
    this.tf = Object.create(null) // total occurrences of the word
    this.total = 0                // sentences seen so far
  }

  /**
   * Accumulate counts from token arrays (one array per sentence).
   * Can be called repeatedly — counts keep growing.
   * @param {string[][]} sentences
   */
  train(sentences) {
    this.total += sentences.length
    for (const tokens of sentences) {
      const inSentence = new Set()
      for (const word of tokens) {
        this.tf[word] = (this.tf[word] || 0) + 1
        if (inSentence.has(word)) continue
        inSentence.add(word)
        this.df[word] = (this.df[word] || 0) + 1
      }
    }
  }

  /** Smoothed inverse document frequency; 0 for words never seen. */
  idf(word) {
    const df = this.df[word] || 0
    if (df === 0) return 0
    return Math.log((this.total + 1) / (df + 1)) + 1
  }

  /** TF-IDF score: raw frequency weighted by informativeness. */
  score(word) {
    return (this.tf[word] || 0) * this.idf(word)
  }

  /** Plain-object form for JSON persistence. */
  serialize() {
    return { df: this.df, tf: this.tf, total: this.total }
  }

  /** Rebuild from serialize() output. */
  static deserialize(d) {
    const inst = new TFIDF()
    inst.df = d.df
    inst.tf = d.tf
    inst.total = d.total
    return inst
  }
}
|
|
216
|
+
|
|
217
|
+
// ─────────────────────────────────────────────
|
|
218
|
+
// ENSEMBLE N-GRAM
|
|
219
|
+
// ─────────────────────────────────────────────
|
|
220
|
+
class EnsembleNGram {
  /**
   * Ensemble of n-gram language models of orders nMin..nMax. Each model
   * votes on the next word; larger orders carry more specific context.
   * @param {object} cfg - needs nMin, nMax, smoothing, smoothingAlpha
   */
  constructor(cfg) {
    this.nMin = cfg.nMin
    this.nMax = cfg.nMax
    this.alpha = cfg.smoothing ? cfg.smoothingAlpha : 0 // additive smoothing strength
    this.models = {}
    for (let n = cfg.nMin; n <= cfg.nMax; n++) {
      // table: "context words" -> { nextWord: count }; vocab: words seen
      this.models[n] = { table: Object.create(null), vocab: new Set() }
    }
  }

  /**
   * Count context→next transitions for every model. For model n, all
   * context sizes 1..n are recorded so predict() can back off to shorter
   * contexts when the full-length one was never seen.
   * @param {string[][]} sentences - token arrays
   */
  train(sentences) {
    for (const tokens of sentences) {
      for (let n = this.nMin; n <= this.nMax; n++) {
        const model = this.models[n]
        for (const t of tokens) model.vocab.add(t)
        for (let size = 1; size <= n; size++) {
          for (let i = 0; i + size < tokens.length; i++) {
            const ctx = tokens.slice(i, i + size).join(' ')
            const next = tokens[i + size]
            if (!model.table[ctx]) model.table[ctx] = Object.create(null)
            model.table[ctx][next] = (model.table[ctx][next] || 0) + 1
          }
        }
      }
    }
  }

  /**
   * Score candidate next words given the trailing context tokens.
   * Each model backs off from its longest matching context to shorter
   * ones and contributes a smoothed, weighted probability estimate.
   * @param {string[]} contextTokens
   * @param {number} [limit=5] - max candidates returned
   * @param {number[]|null} [weights=null] - per-model weights, index 0 =
   *   model nMin; missing entries default to 1. When omitted entirely,
   *   each model is weighted n / sum(all n).
   * @returns {{word: string, score: number}[]} best-first, scores rounded to 5 dp
   */
  predict(contextTokens, limit = 5, weights = null) {
    const scores = Object.create(null)
    const nSizes = Object.keys(this.models).map(Number)
    const totalN = nSizes.reduce((a, b) => a + b, 0)

    for (let idx = 0; idx < nSizes.length; idx++) {
      const n = nSizes[idx]
      const model = this.models[n]
      const vocab = model.vocab.size || 1
      // BUGFIX: use ?? instead of || so an explicit weight of 0 disables
      // a model; || treated 0 as missing and silently substituted 1.
      const w = weights ? (weights[idx] ?? 1) : n / totalN

      // Back off from the longest usable context down to a single word.
      for (let size = Math.min(n, contextTokens.length); size >= 1; size--) {
        const ctx = contextTokens.slice(-size).join(' ')
        const dist = model.table[ctx]
        if (!dist) continue
        const total = Object.values(dist).reduce((a, b) => a + b, 0)
        for (const [word, count] of Object.entries(dist)) {
          scores[word] = (scores[word] || 0) + ((count + this.alpha) / (total + this.alpha * vocab)) * w
        }
        break // first (longest) matching context wins for this model
      }
    }

    return Object.entries(scores)
      .map(([word, score]) => ({ word, score: +score.toFixed(5) }))
      .sort((a, b) => b.score - a.score)
      .slice(0, limit)
  }

  /**
   * Sample one word from scored predictions.
   * temp=0 → greedy (top prediction). Otherwise scores are raised to
   * 1/temp and sampled proportionally. Words in `recentWords` are
   * down-weighted by 60% to discourage repetition.
   * @param {{word: string, score: number}[]} predictions
   * @param {number} [temp=0.6]
   * @param {Set<string>|null} [recentWords=null]
   * @returns {string|null} null when there is nothing to sample
   */
  sample(predictions, temp = 0.6, recentWords = null) {
    if (!predictions.length) return null
    if (temp === 0) return predictions[0].word

    let pool = predictions
    if (recentWords && recentWords.size) {
      pool = predictions
        .map(p => ({ word: p.word, score: recentWords.has(p.word) ? p.score * 0.4 : p.score }))
        .sort((a, b) => b.score - a.score)
    }

    const scaled = pool.map(p => ({ word: p.word, w: Math.pow(Math.max(p.score, 1e-9), 1 / temp) }))
    const total = scaled.reduce((s, p) => s + p.w, 0)
    let rand = Math.random() * total
    for (const p of scaled) {
      rand -= p.w
      if (rand <= 0) return p.word
    }
    // Floating-point slack: fall back to the last candidate.
    return scaled[scaled.length - 1].word
  }

  /** Plain-object form for JSON persistence (vocab Sets become arrays). */
  serialize() {
    const out = { nMin: this.nMin, nMax: this.nMax, alpha: this.alpha, models: {} }
    for (const [n, m] of Object.entries(this.models)) out.models[n] = { table: m.table, vocab: [...m.vocab] }
    return out
  }

  /** Rebuild from serialize() output. */
  static deserialize(data) {
    const e = new EnsembleNGram({ nMin: data.nMin, nMax: data.nMax, smoothing: false, smoothingAlpha: data.alpha })
    e.alpha = data.alpha
    for (const [n, m] of Object.entries(data.models)) e.models[n] = { table: m.table, vocab: new Set(m.vocab) }
    return e
  }
}
|
|
314
|
+
|
|
315
|
+
// ─────────────────────────────────────────────
|
|
316
|
+
// PIPELINE
|
|
317
|
+
// ─────────────────────────────────────────────
|
|
318
|
+
class Pipeline {
  /**
   * Glues the trie (word completion), ensemble n-gram (next word) and
   * TF-IDF (score blending) together behind the prediction API.
   * All components are injected, which keeps this class testable.
   */
  constructor(trie, ngram, tfidf, tokenizer, cfg) {
    this.trie = trie
    this.ngram = ngram
    this.tfidf = tfidf
    this.tokenizer = tokenizer
    this.cfg = cfg
  }

  /**
   * Blend raw n-gram scores with normalized TF-IDF scores so meaningful
   * words outrank filler. No-op when TF-IDF is disabled or the list is empty.
   */
  _blend(predictions) {
    if (!this.cfg.useTFIDF || !this.tfidf) return predictions
    if (!predictions.length) return predictions
    const blend = this.cfg.tfidfBlend
    const tfidfScores = predictions.map(p => this.tfidf.score(p.word))
    const maxTF = Math.max(...tfidfScores, 1e-9) // avoid divide-by-zero when all scores are 0
    return predictions
      .map((p, i) => ({
        ...p,
        score: +(p.score * (1 - blend) + (tfidfScores[i] / maxTF) * blend).toFixed(5),
      }))
      .sort((a, b) => b.score - a.score)
  }

  /**
   * Suggest completions for live input.
   * Trailing space → the context is complete, only next words are
   * predicted; otherwise the last token is treated as a partial word and
   * completed via the trie, while earlier tokens feed next-word prediction.
   * @returns {{input, partialWord, wordCompletions, nextWords}}
   */
  suggest(input, options = {}) {
    const limit = options.limit || this.cfg.maxSuggestions
    const endsSpace = input.endsWith(' ')
    const tokens = this.tokenizer.tokenize(input)
    const last = tokens[tokens.length - 1] || ''
    const ctx = endsSpace ? tokens : tokens.slice(0, -1)
    const result = { input, partialWord: endsSpace ? null : last, wordCompletions: [], nextWords: [] }

    // Word completion — only when there is a partial word.
    if (!endsSpace && last.length >= 1) {
      result.wordCompletions = this.trie.search(last, limit)
    }

    // Next word — whenever we have any completed context.
    if (ctx.length > 0) {
      const raw = this.ngram.predict(ctx, limit * 2, this.cfg.ensembleWeights)
      result.nextWords = this._blend(raw).slice(0, limit)
    }

    return result
  }

  /**
   * Complete a sentence from a seed phrase.
   * - A trailing partial word is first resolved to its best trie completion.
   * - Bigram loop detection stops "X Y X Y ..." cycles before the
   *   offending repeat is appended.
   * - Recently generated words are penalized during sampling.
   *
   * BUGFIX: `completed` previously re-tokenized the raw input, so a
   * resolved partial word (e.g. "hel" → "hello") showed up unresolved in
   * the output. The resolved seed tokens are now used for both generation
   * context and the returned `completed` string.
   *
   * @returns {{seed, completed, generated, wordCount}}
   */
  complete(input, options = {}) {
    const maxWords = options.maxWords || this.cfg.maxCompletionWords
    const temp = options.temperature !== undefined ? options.temperature : this.cfg.completionTemp
    const limit = options.limit || this.cfg.maxSuggestions

    const tokens = this.tokenizer.tokenize(input)
    if (!tokens.length) return { seed: input.trim(), completed: '', generated: '', wordCount: 0 }

    // Resolve a trailing partial word before generating.
    if (!input.endsWith(' ')) {
      const completions = this.trie.search(tokens[tokens.length - 1], 1)
      if (completions.length) tokens[tokens.length - 1] = completions[0]
    }
    const seedTokens = [...tokens] // snapshot before generation appends

    const generated = []
    const bigramCounts = Object.create(null) // generated-bigram occurrence counts
    const recentWindow = 5                   // last N words considered "recent"

    for (let i = 0; i < maxWords; i++) {
      const raw = this.ngram.predict(tokens, limit * 2, this.cfg.ensembleWeights)
      const pred = this._blend(raw).slice(0, limit)
      if (!pred.length) break

      const recentWords = this.cfg.penalizeRepeats ? new Set(generated.slice(-recentWindow)) : null
      const next = this.ngram.sample(pred, temp, recentWords)
      if (!next) break

      // Stop BEFORE appending when this bigram would repeat too often.
      const bigram = tokens.length > 0 ? `${tokens[tokens.length - 1]} ${next}` : next
      bigramCounts[bigram] = (bigramCounts[bigram] || 0) + 1
      if (bigramCounts[bigram] >= this.cfg.maxRepeatBigram) break

      tokens.push(next)
      generated.push(next)
    }

    return {
      seed: input.trim(),
      completed: [...seedTokens, ...generated].join(' '),
      generated: generated.join(' '),
      wordCount: generated.length,
    }
  }

  /**
   * Produce up to `options.count` (default 3) DISTINCT completions by
   * re-sampling; gives up after count*5 attempts.
   */
  completions(input, options = {}) {
    const count = options.count || 3
    const seen = new Set()
    const out = []
    for (let i = 0; i < count * 5 && out.length < count; i++) {
      const attempt = this.complete(input, options)
      if (attempt.completed && !seen.has(attempt.completed)) {
        seen.add(attempt.completed)
        out.push(attempt)
      }
    }
    return out
  }
}
|
|
432
|
+
|
|
433
|
+
// ─────────────────────────────────────────────
|
|
434
|
+
// MAIN CLASS — PredictJS
|
|
435
|
+
// ─────────────────────────────────────────────
|
|
436
|
+
class PredictJS {
  /**
   * Main engine: owns the tokenizer, trie, n-gram ensemble and TF-IDF
   * model, and exposes the train / predict / persist API.
   * @param {object} [options] - overrides merged over DEFAULTS
   */
  constructor(options = {}) {
    this.config = { ...DEFAULTS, ...options }
    this.tokenizer = new Tokenizer(this.config)
    this.trie = new Trie()
    this.ngram = new EnsembleNGram(this.config)
    this.tfidf = new TFIDF()
    this._rebuild()
    this.trained = false
    this.stats = { tokens: 0, sentences: 0, uniqueWords: 0, ngramContexts: 0 }
  }

  // Recreate the pipeline after any component or config swap.
  _rebuild() {
    this.pipeline = new Pipeline(this.trie, this.ngram, this.tfidf, this.tokenizer, this.config)
  }

  // ── Training ───────────────────────────────

  /** Train on a raw text blob. Returns `this` for chaining. */
  trainText(text) {
    this._trainSentences(this.tokenizer.toSentences(text))
    return this
  }

  /** Train on an array of text blobs. Returns `this` for chaining. */
  trainArray(arr) {
    this._trainSentences(arr.flatMap(t => this.tokenizer.toSentences(t)))
    return this
  }

  /**
   * Train from a file; format is auto-detected (text / json / jsonl)
   * unless config.datasetFormat pins it. JSON and JSONL records may be
   * plain strings or objects carrying config.jsonTextField.
   * @param {string} filePath
   * @returns {PredictJS} this, for chaining
   * @throws {Error} when the file does not exist
   */
  trainFile(filePath) {
    const absPath = path.resolve(filePath)
    if (!fs.existsSync(absPath)) throw new Error(`File not found: ${absPath}`)
    const content = fs.readFileSync(absPath, this.config.encoding)
    const format = this.tokenizer.detectFormat(absPath, content)
    let texts = []

    if (format === 'text') {
      texts = [content]
    } else if (format === 'json') {
      // FIX: parse once (previous code called JSON.parse(content) twice).
      const parsed = JSON.parse(content)
      const arr = Array.isArray(parsed) ? parsed : [parsed]
      texts = arr.map(i => typeof i === 'string' ? i : (i[this.config.jsonTextField] || '')).filter(Boolean)
    } else if (format === 'jsonl') {
      texts = content.split('\n').filter(Boolean).map(line => {
        // Malformed lines are skipped rather than aborting the whole file.
        try { const o = JSON.parse(line); return typeof o === 'string' ? o : (o[this.config.jsonTextField] || '') } catch { return '' }
      }).filter(Boolean)
    }

    return this.trainArray(texts)
  }

  // Feed tokenized sentences to every sub-model and refresh stats.
  _trainSentences(sentences) {
    if (!sentences.length) return this
    const allTokens = sentences.flat()
    for (const t of allTokens) this.trie.insert(t)
    this.ngram.train(sentences)
    this.tfidf.train(sentences)

    this.stats.tokens += allTokens.length
    this.stats.sentences += sentences.length
    this.stats.uniqueWords = this.trie.wordCount
    this.stats.ngramContexts = Object.keys(this.ngram.models[this.config.nMax]?.table || {}).length
    this.trained = true

    if (this.config.autoSave) this.saveIndex()
    return this
  }

  // ── Prediction API ─────────────────────────

  /** Combined partial-word completions + next-word predictions. */
  suggest(input, options = {}) { this._assert(); return this.pipeline.suggest(input, options) }

  /** Trie-based completions for a word prefix. */
  completeWord(partial, limit) { this._assert(); return this.trie.search(partial, limit || this.config.maxSuggestions) }

  /** Generate a sentence completion from a seed phrase. */
  complete(input, options = {}) { this._assert(); return this.pipeline.complete(input, options) }

  /** Generate several distinct sentence completions. */
  completions(input, options = {}) { this._assert(); return this.pipeline.completions(input, options) }

  /** Scored next-word candidates for a context string. */
  nextWord(context, limit) {
    this._assert()
    const tokens = this.tokenizer.tokenize(context)
    const raw = this.ngram.predict(tokens, (limit || this.config.maxSuggestions) * 2, this.config.ensembleWeights)
    return this.pipeline._blend(raw).slice(0, limit || this.config.maxSuggestions)
  }

  // ── Index ──────────────────────────────────

  /**
   * Persist the trained state (config, stats, all three models) as JSON.
   * @param {string} [filePath] - defaults to config.indexPath
   * @returns {string} absolute path written
   */
  saveIndex(filePath) {
    const dest = path.resolve(filePath || this.config.indexPath)
    const index = {
      version: '3.0.0',
      savedAt: new Date().toISOString(),
      config: this.config,
      stats: this.stats,
      trie: this.trie.serialize(),
      ngram: this.ngram.serialize(),
      tfidf: this.tfidf.serialize(),
    }
    fs.writeFileSync(dest, JSON.stringify(index, null, 2))
    console.log(`[PredictJS] Index saved → ${dest}`)
    return dest
  }

  /**
   * Restore trained state from a saved index file.
   * NOTE(review): the merge `{ ...index.config, ...this.config }` lets the
   * CURRENT config win, and since the constructor always fills every
   * DEFAULTS key, the saved config is effectively ignored except for keys
   * absent from DEFAULTS. Confirm whether saved settings should take
   * precedence before changing this.
   * @param {string} [filePath] - defaults to config.indexPath
   * @returns {PredictJS} this, for chaining
   * @throws {Error} when the index file does not exist
   */
  loadIndex(filePath) {
    const src = path.resolve(filePath || this.config.indexPath)
    if (!fs.existsSync(src)) throw new Error(`Index not found: ${src}`)
    const index = JSON.parse(fs.readFileSync(src, 'utf8'))
    this.config = { ...index.config, ...this.config }
    this.stats = index.stats
    this.trie = Trie.deserialize(index.trie)
    this.ngram = EnsembleNGram.deserialize(index.ngram)
    this.tfidf = TFIDF.deserialize(index.tfidf)
    this._rebuild()
    this.trained = true
    console.log(`[PredictJS] Index loaded ← ${src} (${this.stats.uniqueWords} words, ${this.stats.sentences} sentences)`)
    return this
  }

  // ── Utils ──────────────────────────────────

  /** Snapshot of training stats plus trained flag and live config. */
  getStats() { return { ...this.stats, trained: this.trained, config: this.config } }

  /** Merge new options into the config and rebuild the pipeline. */
  configure(options) { this.config = { ...this.config, ...options }; this._rebuild(); return this }

  /** Drop all learned state; config is kept. Returns `this`. */
  reset() {
    this.trie = new Trie(); this.ngram = new EnsembleNGram(this.config)
    this.tfidf = new TFIDF(); this._rebuild(); this.trained = false
    this.stats = { tokens: 0, sentences: 0, uniqueWords: 0, ngramContexts: 0 }
    return this
  }

  // Guard: every prediction entry point requires a trained or loaded model.
  _assert() { if (!this.trained) throw new Error('[PredictJS] Model not trained. Call trainText(), trainArray(), or trainFile() first.') }
}
|
|
561
|
+
|
|
562
|
+
// Public surface: the engine class plus the default configuration object
// (exported so callers can inspect or override baseline settings).
module.exports = { PredictJS, DEFAULTS }
|
package/readme.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# node-predict
|
|
2
|
+
|
|
3
|
+
A lightweight, fully offline, trainable native text prediction engine for Node.js. Provides word completions, next-word predictions, and sentence generation using n-gram language models with TF-IDF weighting.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
Load a pre-trained model and make predictions:
|
|
8
|
+
|
|
9
|
+
```js
|
|
10
|
+
const { PredictJS } = require('node-predict')
|
|
11
|
+
|
|
12
|
+
const predictor = new PredictJS()
|
|
13
|
+
predictor.loadIndex('./model-index.json')
|
|
14
|
+
|
|
15
|
+
// 1) Word completions
|
|
16
|
+
console.log(predictor.completeWord('app'))
|
|
17
|
+
|
|
18
|
+
// 2) Next-word predictions
|
|
19
|
+
console.log(predictor.nextWord('The best way to'))
|
|
20
|
+
|
|
21
|
+
// 3) Complete a sentence
|
|
22
|
+
console.log(predictor.complete('Learning new skills'))
|
|
23
|
+
|
|
24
|
+
// 4) Multiple completions
|
|
25
|
+
console.log(predictor.completions('In my opinion', { count: 3 }))
|
|
26
|
+
|
|
27
|
+
// 5) Combined suggestion (partial + next words)
|
|
28
|
+
console.log(predictor.suggest('I en'))
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Training Your Own Model
|
|
34
|
+
|
|
35
|
+
To train the model on your custom dataset, edit `dataset.txt` with your text samples, then run `node train.js`. The training script reads `dataset.txt`, builds n-gram models, calculates TF-IDF weights, and saves the trained model to `model-index.json`.
|
|
36
|
+
|
|
37
|
+
### Training Code Example
|
|
38
|
+
|
|
39
|
+
Here's how `train.js` trains the model:
|
|
40
|
+
|
|
41
|
+
```js
|
|
42
|
+
const { PredictJS } = require('./predictjs')
|
|
43
|
+
|
|
44
|
+
const DATASET_PATH = './dataset.txt'
|
|
45
|
+
const INDEX_PATH = './model-index.json'
|
|
46
|
+
|
|
47
|
+
const predictor = new PredictJS({
|
|
48
|
+
// N-gram range
|
|
49
|
+
nMin: 2,
|
|
50
|
+
nMax: 4,
|
|
51
|
+
|
|
52
|
+
// Smoothing — keep low for large datasets, raise towards 0.3 for small ones
|
|
53
|
+
smoothing: true,
|
|
54
|
+
smoothingAlpha: 0.05,
|
|
55
|
+
|
|
56
|
+
// Ensemble weights [bigram, trigram, 4-gram]
|
|
57
|
+
// Higher weight = more influence on predictions
|
|
58
|
+
ensembleWeights: [0.15, 0.35, 0.50],
|
|
59
|
+
|
|
60
|
+
// TF-IDF blending
|
|
61
|
+
useTFIDF: true,
|
|
62
|
+
tfidfBlend: 0.5,
|
|
63
|
+
|
|
64
|
+
// Word settings
|
|
65
|
+
minWordLength: 2,
|
|
66
|
+
maxSuggestions: 5,
|
|
67
|
+
|
|
68
|
+
// Completion settings
|
|
69
|
+
maxCompletionWords: 20,
|
|
70
|
+
completionTemp: 0.6,
|
|
71
|
+
|
|
72
|
+
// Loop prevention
|
|
73
|
+
maxRepeatBigram: 2,
|
|
74
|
+
penalizeRepeats: true,
|
|
75
|
+
|
|
76
|
+
// Strip numbers, lowercase, no punctuation
|
|
77
|
+
caseSensitive: false,
|
|
78
|
+
keepPunctuation: false,
|
|
79
|
+
keepNumbers: false,
|
|
80
|
+
})
|
|
81
|
+
|
|
82
|
+
console.log('Training...')
|
|
83
|
+
|
|
84
|
+
try {
|
|
85
|
+
const start = Date.now()
|
|
86
|
+
predictor.trainFile(DATASET_PATH)
|
|
87
|
+
const elapsed = Date.now() - start
|
|
88
|
+
const stats = predictor.getStats()
|
|
89
|
+
|
|
90
|
+
console.log('✔ Training complete')
|
|
91
|
+
console.log('Sentences:', stats.sentences)
|
|
92
|
+
console.log('Total tokens:', stats.tokens)
|
|
93
|
+
console.log('Unique words:', stats.uniqueWords)
|
|
94
|
+
console.log('Time taken:', elapsed, 'ms')
|
|
95
|
+
|
|
96
|
+
predictor.saveIndex(INDEX_PATH)
|
|
97
|
+
console.log('✔ Model saved to', INDEX_PATH)
|
|
98
|
+
|
|
99
|
+
} catch (err) {
|
|
100
|
+
console.error('✘ Training failed:', err.message)
|
|
101
|
+
process.exit(1)
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
To train with your own data:
|
|
106
|
+
|
|
107
|
+
1. Edit `dataset.txt` with your text samples
|
|
108
|
+
2. Run `node train.js`
|
|
109
|
+
3. The trained model is saved to `model-index.json`
|
|
110
|
+
4. Use it as shown in the Quick Start section
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Configuration Options
|
|
115
|
+
|
|
116
|
+
Edit `train.js` to customize training behavior. Common options:
|
|
117
|
+
|
|
118
|
+
- **`nMin`, `nMax`** — n-gram range (default `2–4`). Controls context window size for predictions.
|
|
119
|
+
- **`smoothingAlpha`** — smoothing strength for unseen word pairs (default `0.05`). Keep it low for large datasets; raise it towards 0.3 for small ones.
|
|
120
|
+
- **`ensembleWeights`** — weights for [bigram, trigram, 4-gram] predictions (default `[0.15, 0.35, 0.50]`). Higher weights give more influence.
|
|
121
|
+
- **`tfidfBlend`** — blend between raw frequency and TF-IDF scoring (default `0.5`). Higher values make results less common-word-heavy.
|
|
122
|
+
- **`maxCompletionWords`** — maximum words to generate (default `20`).
|
|
123
|
+
- **`completionTemp`** — sampling temperature for completions (default `0.6`).
|
|
124
|
+
- **`minWordLength`** — minimum word length to consider (default `2`).
|
|
125
|
+
- **`maxSuggestions`** — maximum suggestions to return (default `5`).
|
|
126
|
+
|
|
127
|
+
After editing options, re-run `node train.js` to retrain with new settings.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## API Reference
|
|
132
|
+
|
|
133
|
+
### `predictor.completeWord(prefix)`
|
|
134
|
+
Returns word completions for a given prefix.
|
|
135
|
+
|
|
136
|
+
### `predictor.nextWord(context)`
|
|
137
|
+
Predicts the next word given preceding context.
|
|
138
|
+
|
|
139
|
+
### `predictor.complete(seed)`
|
|
140
|
+
Generates a complete sentence starting from a seed phrase.
|
|
141
|
+
|
|
142
|
+
### `predictor.completions(context, options)`
|
|
143
|
+
Returns multiple completion options with customizable count.
|
|
144
|
+
|
|
145
|
+
### `predictor.suggest(partial)`
|
|
146
|
+
Combined suggestion: completes partial word + predicts next words.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Temperature (Sampling)
|
|
151
|
+
|
|
152
|
+
The `completionTemp` option controls creativity of generated suggestions:
|
|
153
|
+
|
|
154
|
+
- **`0.0–0.3`** — very predictable, deterministic
|
|
155
|
+
- **`0.6`** — balanced (default), good for most uses
|
|
156
|
+
- **`0.8–1.0`** — more creative and varied, may diverge from dataset style
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Tips & Troubleshooting
|
|
161
|
+
|
|
162
|
+
- **Poor predictions?** Add more in-style text to `dataset.txt` and retrain.
|
|
163
|
+
- **Too many common words?** Increase dataset variety or raise `tfidfBlend` towards 1.0.
|
|
164
|
+
- **No matches found?** Ensure your seed text matches your dataset language/style.
|
|
165
|
+
- **Slow training?** Large datasets may take longer; consider using representative samples.
|
|
166
|
+
- **Memory usage?** Larger n-gram ranges and bigger datasets consume more memory.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
Author: Ismail Gidado
|
|
171
|
+
License: MIT
|