node-predict 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/package.json +24 -0
  2. package/predictjs.js +562 -0
  3. package/readme.md +171 -0
package/package.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "name": "node-predict",
3
+ "version": "3.0.0",
4
+ "description": "A fully offline, lightweight, fast, trainable native text prediction engine for Node.js",
5
+ "keywords": [
6
+ "predict",
7
+ "autocomplete",
8
+ "text-prediction",
9
+ "ngram",
10
+ "trie",
11
+ "tfidf",
12
+ "predictjs"
13
+ ],
14
+ "license": "MIT",
15
+ "author": "Ismail Gidado",
16
+ "type": "commonjs",
17
+ "main": "predictjs.js",
18
+ "engines": {
19
+ "node": ">=14"
20
+ },
21
+ "scripts": {
22
+ "test": "node app.js"
23
+ }
24
+ }
package/predictjs.js ADDED
@@ -0,0 +1,562 @@
1
+ 'use strict'
2
+
3
+ /**
4
+ * ╔══════════════════════════════════════════════════════════════════╗
5
+ * ║ PredictJS v3.0 ║
6
+ * ║ Offline text prediction engine — learns purely from text ║
7
+ * ║ ────────────────────────────────────────────────────────── ║
8
+ * ║ • Trie → partial word completion ║
9
+ * ║ • Ensemble NGram → multi-size n-gram voting for accuracy ║
10
+ * ║ • TF-IDF → meaningful word scoring over filler ║
11
+ * ║ • Loop guard → prevents repetitive generation cycles ║
12
+ * ║ • Predict mode → suggest / complete / next word ║
13
+ * ║ • Persistent → save/load trained index from JSON file ║
14
+ * ╚══════════════════════════════════════════════════════════════════╝
15
+ */
16
+
17
+ const fs = require('fs')
18
+ const path = require('path')
19
+
20
+ // ─────────────────────────────────────────────
21
+ // DEFAULT CONFIG — tuned for best accuracy
22
+ // ─────────────────────────────────────────────
23
// ─────────────────────────────────────────────
// DEFAULT CONFIG — tuned for best accuracy
// ─────────────────────────────────────────────
const DEFAULTS = {
  // N-gram ensemble
  nMin: 2, // bigram — broad coverage
  nMax: 4, // 4-gram — specific context
  smoothing: true,
  smoothingAlpha: 0.05, // light additive smoothing — keeps predictions tight to actual data

  // TF-IDF
  useTFIDF: true,
  tfidfBlend: 0.5, // 0..1 fraction of the final score taken from TF-IDF (0.5 = 50/50 blend)

  // Ensemble weights — give more weight to larger n-grams (more specific)
  // [weight for nMin, ..., weight for nMax] must match nMax - nMin + 1 entries
  ensembleWeights: [0.15, 0.35, 0.50], // bigram=15%, trigram=35%, 4-gram=50%

  // Trie
  minWordLength: 2, // tokens shorter than this are dropped by the tokenizer
  maxSuggestions: 5, // default cap on returned suggestions

  // Sentence completion
  maxCompletionWords: 20, // hard cap on generated words per completion
  completionTemp: 0.6, // sampling temperature; lower = more faithful to dataset patterns
  sentenceEndTokens: ['<END>'],

  // Loop / repetition guard
  maxRepeatBigram: 2, // stop generating once any 2-word sequence repeats this many times
  penalizeRepeats: true, // reduce score of recently used words while generating

  // Tokenizer
  caseSensitive: false,
  keepPunctuation: false,
  keepNumbers: false, // strip numbers — reduces noise

  // Skip lines that look like metadata (comments, labels, etc.)
  skipLinePatterns: [
    /^#/, // comment lines
    /^(input|response|output|user|bot|assistant|human)\s*:/i, // dialogue labels
    /^\s*$/, // blank lines
  ],

  // Dataset loading
  encoding: 'utf8', // encoding passed to fs.readFileSync
  datasetFormat: 'auto', // 'auto' | 'text' | 'json' | 'jsonl'
  jsonTextField: 'text', // object field holding the text in JSON/JSONL records

  // Index persistence
  indexPath: './model-index.json',
  autoSave: false, // save the index automatically after each training call
}
72
+
73
+ // ─────────────────────────────────────────────
74
+ // TOKENIZER
75
+ // ─────────────────────────────────────────────
76
// ─────────────────────────────────────────────
// TOKENIZER — normalizes raw text into clean token arrays
// ─────────────────────────────────────────────
class Tokenizer {
  /** @param {object} cfg - engine configuration (see DEFAULTS) */
  constructor(cfg) {
    this.cfg = cfg
  }

  /**
   * Normalize raw text: optional lowercasing, punctuation stripping
   * (apostrophes survive), number stripping, whitespace collapsing.
   */
  clean(text) {
    let out = text
    if (!this.cfg.caseSensitive) {
      out = out.toLowerCase()
    }
    if (!this.cfg.keepPunctuation) {
      out = out.replace(/[^\w\s']/g, ' ')
    }
    if (!this.cfg.keepNumbers) {
      out = out.replace(/\b\d+\b/g, ' ')
    }
    return out.replace(/\s+/g, ' ').trim()
  }

  /**
   * Clean the text and split it into words, trimming stray leading or
   * trailing apostrophes and dropping words shorter than cfg.minWordLength.
   */
  tokenize(text) {
    const rawWords = this.clean(text).split(' ')
    const stripped = rawWords.map(w => w.replace(/^'+|'+$/g, ''))
    return stripped.filter(w => w.length >= this.cfg.minWordLength)
  }

  /**
   * Split text into clean sentence token arrays.
   * Whole lines matching any cfg.skipLinePatterns (comments, dialogue
   * labels, blanks) are discarded; remaining lines are split on sentence
   * punctuation and only sentences with at least 3 tokens are kept.
   */
  toSentences(text) {
    const sentences = []
    for (const rawLine of text.split('\n')) {
      const line = rawLine.trim()
      const isMetadata = this.cfg.skipLinePatterns.some(p => p.test(line))
      if (isMetadata) continue

      const pieces = line
        .split(/[.!?]+/)
        .map(s => s.trim())
        .filter(Boolean)
      for (const piece of pieces) {
        const tokens = this.tokenize(piece)
        // minimum 3 tokens per sentence
        if (tokens.length >= 3) {
          sentences.push(tokens)
        }
      }
    }
    return sentences
  }

  /**
   * Decide how a dataset file should be parsed: honour an explicit
   * cfg.datasetFormat, otherwise guess from the file extension and,
   * failing that, from the first non-space character of the content.
   * @returns {'text'|'json'|'jsonl'}
   */
  detectFormat(filePath, content) {
    if (this.cfg.datasetFormat !== 'auto') {
      return this.cfg.datasetFormat
    }
    const ext = path.extname(filePath).toLowerCase()
    if (ext === '.jsonl') return 'jsonl'
    if (ext === '.json') return 'json'
    const head = content.trimStart()
    if (head.startsWith('[')) return 'json'
    if (head.startsWith('{')) return 'jsonl'
    return 'text'
  }
}
131
+
132
+ // ─────────────────────────────────────────────
133
+ // TRIE
134
+ // ─────────────────────────────────────────────
135
// ─────────────────────────────────────────────
// TRIE — prefix index over the training vocabulary
// ─────────────────────────────────────────────

// One trie node: child links keyed by character, an end-of-word flag,
// and how many times that complete word was inserted.
class TrieNode {
  constructor() {
    // prototype-less map avoids collisions with Object.prototype keys
    this.children = Object.create(null)
    this.isEnd = false
    this.count = 0
  }
}

class Trie {
  constructor() {
    this.root = new TrieNode()
    this.wordCount = 0 // number of DISTINCT words stored
  }

  /** Record one occurrence of `word`, creating nodes as needed. */
  insert(word) {
    let node = this.root
    for (const ch of word) {
      if (node.children[ch] === undefined) {
        node.children[ch] = new TrieNode()
      }
      node = node.children[ch]
    }
    if (!node.isEnd) {
      this.wordCount += 1
    }
    node.isEnd = true
    node.count += 1
  }

  /**
   * Look up `word` as a COMPLETE entry only — a prefix of a stored word
   * does not match ("hi" must NOT match "higher").
   * @returns {number|false} occurrence count, or false when absent
   */
  searchExact(word) {
    let node = this.root
    for (const ch of word) {
      node = node.children[ch]
      if (!node) return false
    }
    return node.isEnd ? node.count : false
  }

  /**
   * All complete words starting with `prefix`, most frequent first,
   * capped at `limit`.
   */
  search(prefix, limit = 5) {
    let node = this.root
    for (const ch of prefix) {
      node = node.children[ch]
      if (!node) return []
    }
    const found = []
    this._dfs(node, prefix, found)
    found.sort((a, b) => b.count - a.count)
    return found.slice(0, limit).map(entry => entry.word)
  }

  // Depth-first collection of every complete word below `node`.
  _dfs(node, cur, acc) {
    if (node.isEnd) {
      acc.push({ word: cur, count: node.count })
    }
    for (const [ch, child] of Object.entries(node.children)) {
      this._dfs(child, cur + ch, acc)
    }
  }

  /** Plain-object form suitable for JSON persistence. */
  serialize() {
    return { wordCount: this.wordCount, root: this._sn(this.root) }
  }

  // Compact node encoding: e = isEnd flag (0/1), c = count, ch = children.
  _sn(n) {
    const ch = {}
    for (const [k, v] of Object.entries(n.children)) {
      ch[k] = this._sn(v)
    }
    return { e: n.isEnd ? 1 : 0, c: n.count, ch }
  }

  /** Rebuild a Trie from serialize() output. */
  static deserialize(data) {
    const trie = new Trie()
    trie.wordCount = data.wordCount
    trie.root = Trie._dn(data.root)
    return trie
  }

  static _dn(raw) {
    const node = new TrieNode()
    node.isEnd = raw.e === 1
    node.count = raw.c
    for (const [k, v] of Object.entries(raw.ch)) {
      node.children[k] = Trie._dn(v)
    }
    return node
  }
}
192
+
193
+ // ─────────────────────────────────────────────
194
+ // TF-IDF
195
+ // ─────────────────────────────────────────────
196
// ─────────────────────────────────────────────
// TF-IDF — scores words by frequency × distinctiveness
// ─────────────────────────────────────────────
class TFIDF {
  constructor() {
    this.df = Object.create(null) // document frequency: sentences containing the word
    this.tf = Object.create(null) // term frequency: total occurrences
    this.total = 0 // number of sentences seen so far
  }

  /**
   * Accumulate term and document frequencies from tokenized sentences.
   * @param {string[][]} sentences - arrays of tokens
   */
  train(sentences) {
    this.total += sentences.length
    for (const tokens of sentences) {
      const countedHere = new Set()
      for (const word of tokens) {
        this.tf[word] = (this.tf[word] || 0) + 1
        if (!countedHere.has(word)) {
          this.df[word] = (this.df[word] || 0) + 1
          countedHere.add(word)
        }
      }
    }
  }

  /** Smoothed inverse document frequency; 0 for unseen words. */
  idf(word) {
    const df = this.df[word] || 0
    if (!df) return 0
    return Math.log((this.total + 1) / (df + 1)) + 1
  }

  /** TF × IDF score (0 for unseen words). */
  score(word) {
    return (this.tf[word] || 0) * this.idf(word)
  }

  /** Plain-object form suitable for JSON persistence. */
  serialize() {
    return { df: this.df, tf: this.tf, total: this.total }
  }

  /** Rebuild a TFIDF model from serialize() output. */
  static deserialize(d) {
    const inst = new TFIDF()
    inst.df = d.df
    inst.tf = d.tf
    inst.total = d.total
    return inst
  }
}
216
+
217
+ // ─────────────────────────────────────────────
218
+ // ENSEMBLE N-GRAM
219
+ // ─────────────────────────────────────────────
220
// ─────────────────────────────────────────────
// ENSEMBLE N-GRAM — several n-gram orders voting together
// ─────────────────────────────────────────────
class EnsembleNGram {
  /**
   * @param {object} cfg - needs nMin, nMax, smoothing, smoothingAlpha
   */
  constructor(cfg) {
    this.nMin = cfg.nMin
    this.nMax = cfg.nMax
    // alpha is the additive (Lidstone) smoothing constant; 0 disables it
    this.alpha = cfg.smoothing ? cfg.smoothingAlpha : 0
    this.models = {}
    for (let n = cfg.nMin; n <= cfg.nMax; n++) {
      this.models[n] = { table: Object.create(null), vocab: new Set() }
    }
  }

  /**
   * Count context → next-word transitions for every model.
   * Each order-n model stores contexts of ALL sizes 1..n, so predict()
   * can back off to shorter contexts without consulting other models.
   * @param {string[][]} sentences - arrays of tokens
   */
  train(sentences) {
    for (const tokens of sentences) {
      for (let n = this.nMin; n <= this.nMax; n++) {
        const model = this.models[n]
        for (const t of tokens) model.vocab.add(t)
        for (let size = 1; size <= n; size++) {
          for (let i = 0; i + size < tokens.length; i++) {
            const ctx = tokens.slice(i, i + size).join(' ')
            const next = tokens[i + size]
            if (!model.table[ctx]) model.table[ctx] = Object.create(null)
            model.table[ctx][next] = (model.table[ctx][next] || 0) + 1
          }
        }
      }
    }
  }

  /**
   * Score candidate next words for the given context.
   * Each model backs off from its largest usable context size down to 1,
   * and the first size with data contributes its (smoothed) conditional
   * probabilities scaled by that model's ensemble weight.
   * @param {string[]} contextTokens
   * @param {number} [limit] - max candidates returned
   * @param {number[]|null} [weights] - per-model weights aligned with
   *   ascending n (nMin..nMax). An explicit 0 disables that model; a
   *   missing entry defaults to 1. With no weights array, each model is
   *   weighted n / sum(n) so larger n-grams count more.
   * @returns {{word: string, score: number}[]} sorted descending by score
   */
  predict(contextTokens, limit = 5, weights = null) {
    const scores = Object.create(null)
    const nSizes = Object.keys(this.models).map(Number)
    const totalN = nSizes.reduce((a, b) => a + b, 0)

    for (let idx = 0; idx < nSizes.length; idx++) {
      const n = nSizes[idx]
      const model = this.models[n]
      const vocab = model.vocab.size || 1
      // BUGFIX: use ?? instead of || so an explicit weight of 0 actually
      // disables a model rather than silently falling back to 1.
      const w = weights ? (weights[idx] ?? 1) : n / totalN

      for (let size = Math.min(n, contextTokens.length); size >= 1; size--) {
        const ctx = contextTokens.slice(-size).join(' ')
        const dist = model.table[ctx]
        if (!dist) continue
        const total = Object.values(dist).reduce((a, b) => a + b, 0)
        for (const [word, count] of Object.entries(dist)) {
          scores[word] = (scores[word] || 0) + ((count + this.alpha) / (total + this.alpha * vocab)) * w
        }
        break // longest matching context wins for this model
      }
    }

    return Object.entries(scores)
      .map(([word, score]) => ({ word, score: +score.toFixed(5) }))
      .sort((a, b) => b.score - a.score)
      .slice(0, limit)
  }

  /**
   * Sample a word from scored predictions with temperature + repeat penalty.
   * temp === 0 is greedy (always the top prediction, no penalty applied).
   * @param {{word: string, score: number}[]} predictions
   * @param {number} [temp] - sampling temperature (lower = more greedy)
   * @param {Set<string>|null} [recentWords] - recently generated words to penalize
   * @returns {string|null} chosen word, or null when predictions is empty
   */
  sample(predictions, temp = 0.6, recentWords = null) {
    if (!predictions.length) return null
    if (temp === 0) return predictions[0].word

    let pool = predictions
    if (recentWords && recentWords.size) {
      // reduce score of recently used words by 60%
      pool = predictions.map(p => ({
        word: p.word,
        score: recentWords.has(p.word) ? p.score * 0.4 : p.score
      })).sort((a, b) => b.score - a.score)
    }

    // Temperature scaling, then weighted random pick (roulette wheel).
    const scaled = pool.map(p => ({ word: p.word, w: Math.pow(Math.max(p.score, 1e-9), 1 / temp) }))
    const total = scaled.reduce((s, p) => s + p.w, 0)
    let rand = Math.random() * total
    for (const p of scaled) { rand -= p.w; if (rand <= 0) return p.word }
    return scaled[scaled.length - 1].word
  }

  /** Plain-object form suitable for JSON persistence. */
  serialize() {
    const out = { nMin: this.nMin, nMax: this.nMax, alpha: this.alpha, models: {} }
    for (const [n, m] of Object.entries(this.models)) out.models[n] = { table: m.table, vocab: [...m.vocab] }
    return out
  }

  /** Rebuild an EnsembleNGram from serialize() output. */
  static deserialize(data) {
    const e = new EnsembleNGram({ nMin: data.nMin, nMax: data.nMax, smoothing: false, smoothingAlpha: data.alpha })
    e.alpha = data.alpha
    for (const [n, m] of Object.entries(data.models)) e.models[n] = { table: m.table, vocab: new Set(m.vocab) }
    return e
  }
}
314
+
315
+ // ─────────────────────────────────────────────
316
+ // PIPELINE
317
+ // ─────────────────────────────────────────────
318
// ─────────────────────────────────────────────
// PIPELINE — combines trie completion, n-gram voting and TF-IDF scoring
// ─────────────────────────────────────────────
class Pipeline {
  /**
   * @param {Trie} trie - word-completion index
   * @param {EnsembleNGram} ngram - next-word model
   * @param {TFIDF|null} tfidf - optional re-ranking model
   * @param {Tokenizer} tokenizer
   * @param {object} cfg - engine configuration (see DEFAULTS)
   */
  constructor(trie, ngram, tfidf, tokenizer, cfg) {
    this.trie = trie
    this.ngram = ngram
    this.tfidf = tfidf
    this.tokenizer = tokenizer
    this.cfg = cfg
  }

  /**
   * Blend raw n-gram scores with max-normalized TF-IDF scores
   * (cfg.tfidfBlend is the TF-IDF fraction). No-op when TF-IDF is
   * disabled or there are no predictions.
   */
  _blend(predictions) {
    if (!this.cfg.useTFIDF || !this.tfidf) return predictions
    if (!predictions.length) return predictions
    const blend = this.cfg.tfidfBlend
    const scores = predictions.map(p => this.tfidf.score(p.word))
    const maxTF = Math.max(...scores, 1e-9) // guard against divide-by-zero
    return predictions.map((p, i) => ({
      ...p,
      score: +(p.score * (1 - blend) + (scores[i] / maxTF) * blend).toFixed(5)
    })).sort((a, b) => b.score - a.score)
  }

  /**
   * Suggest: if input ends with space → next words only;
   * if input doesn't end with space → word completions for the partial
   * last word + next words predicted from the preceding context.
   */
  suggest(input, options = {}) {
    const limit = options.limit || this.cfg.maxSuggestions
    const endsSpace = input.endsWith(' ')
    const tokens = this.tokenizer.tokenize(input)
    const last = tokens[tokens.length - 1] || ''
    const ctx = endsSpace ? tokens : tokens.slice(0, -1)
    const result = { input, partialWord: endsSpace ? null : last, wordCompletions: [], nextWords: [] }

    // Word completion — only when there is a partial word
    if (!endsSpace && last.length >= 1) {
      result.wordCompletions = this.trie.search(last, limit)
    }

    // Next word — always from context if we have any
    if (ctx.length > 0) {
      const raw = this.ngram.predict(ctx, limit * 2, this.cfg.ensembleWeights)
      result.nextWords = this._blend(raw).slice(0, limit)
    }

    return result
  }

  /**
   * Complete a sentence from a seed.
   * - Resolves a trailing partial word via the trie before generating
   * - Bigram loop detection: stops when any 2-word sequence repeats
   *   cfg.maxRepeatBigram times
   * - Optionally penalizes recently generated words (cfg.penalizeRepeats)
   * BUGFIX: `completed` now starts from the trie-RESOLVED seed tokens.
   * Previously it was rebuilt from the raw input, so a resolved partial
   * word ("en" → "enjoy") was dropped from the output even though
   * generation had continued from the resolved word.
   * @returns {{seed: string, completed: string, generated: string, wordCount: number}}
   */
  complete(input, options = {}) {
    const maxWords = options.maxWords || this.cfg.maxCompletionWords
    const temp = options.temperature !== undefined ? options.temperature : this.cfg.completionTemp
    const limit = options.limit || this.cfg.maxSuggestions

    const tokens = this.tokenizer.tokenize(input)
    if (!tokens.length) return { seed: input.trim(), completed: '', generated: '', wordCount: 0 }

    // Resolve partial word at end if input doesn't end with space
    if (!input.endsWith(' ') && tokens.length > 0) {
      const completions = this.trie.search(tokens[tokens.length - 1], 1)
      if (completions.length) tokens[tokens.length - 1] = completions[0]
    }

    // Snapshot the resolved seed BEFORE generation mutates `tokens`
    const seedTokens = [...tokens]

    const generated = []
    const bigramCounts = Object.create(null) // bigram loop detection
    const recentWindow = 5 // last N words considered for repeat penalty

    for (let i = 0; i < maxWords; i++) {
      const raw = this.ngram.predict(tokens, limit * 2, this.cfg.ensembleWeights)
      const pred = this._blend(raw).slice(0, limit)
      if (!pred.length) break

      // Build recent word set for penalty
      const recentWords = this.cfg.penalizeRepeats
        ? new Set(generated.slice(-recentWindow))
        : null

      const next = this.ngram.sample(pred, temp, recentWords)
      if (!next) break

      // Bigram loop detection — "X Y X Y X Y" = stop
      const bigram = tokens.length > 0 ? `${tokens[tokens.length - 1]} ${next}` : next
      bigramCounts[bigram] = (bigramCounts[bigram] || 0) + 1
      if (bigramCounts[bigram] >= this.cfg.maxRepeatBigram) break

      tokens.push(next)
      generated.push(next)
    }

    return {
      seed: input.trim(),
      completed: [...seedTokens, ...generated].join(' '),
      generated: generated.join(' '),
      wordCount: generated.length,
    }
  }

  /**
   * Generate up to `options.count` DISTINCT completions (default 3),
   * retrying at most count*5 times to collect unique outputs.
   */
  completions(input, options = {}) {
    const count = options.count || 3
    const seen = new Set()
    const out = []
    for (let i = 0; i < count * 5 && out.length < count; i++) {
      const r = this.complete(input, options)
      if (r.completed && !seen.has(r.completed)) { seen.add(r.completed); out.push(r) }
    }
    return out
  }
}
432
+
433
+ // ─────────────────────────────────────────────
434
+ // MAIN CLASS — PredictJS
435
+ // ─────────────────────────────────────────────
436
// ─────────────────────────────────────────────
// MAIN CLASS — PredictJS
// Facade tying together Tokenizer, Trie, EnsembleNGram, TFIDF and
// Pipeline; owns training, the prediction API and index persistence.
// ─────────────────────────────────────────────
class PredictJS {
  /** @param {object} [options] - config overrides merged over DEFAULTS */
  constructor(options = {}) {
    this.config = { ...DEFAULTS, ...options }
    this.tokenizer = new Tokenizer(this.config)
    this.trie = new Trie()
    this.ngram = new EnsembleNGram(this.config)
    this.tfidf = new TFIDF()
    this._rebuild()
    this.trained = false
    this.stats = { tokens: 0, sentences: 0, uniqueWords: 0, ngramContexts: 0 }
  }

  // Recreate the pipeline so it references the current components/config.
  _rebuild() {
    this.pipeline = new Pipeline(this.trie, this.ngram, this.tfidf, this.tokenizer, this.config)
  }

  // ── Training ───────────────────────────────

  /** Train on one raw text string. @returns {PredictJS} this (chainable) */
  trainText(text) {
    this._trainSentences(this.tokenizer.toSentences(text))
    return this
  }

  /** Train on an array of text strings. @returns {PredictJS} this (chainable) */
  trainArray(arr) {
    this._trainSentences(arr.flatMap(t => this.tokenizer.toSentences(t)))
    return this
  }

  /**
   * Train from a dataset file; format ('text' | 'json' | 'jsonl') is
   * resolved by Tokenizer.detectFormat. For JSON/JSONL, non-string
   * records are read from config.jsonTextField.
   * @throws {Error} when the file does not exist
   * @returns {PredictJS} this (chainable)
   */
  trainFile(filePath) {
    const absPath = path.resolve(filePath)
    if (!fs.existsSync(absPath)) throw new Error(`File not found: ${absPath}`)
    const content = fs.readFileSync(absPath, this.config.encoding)
    const format = this.tokenizer.detectFormat(absPath, content)
    let texts = []

    if (format === 'text') {
      texts = [content]
    } else if (format === 'json') {
      // NOTE(review): JSON.parse runs on the same content more than once — redundant but harmless
      const arr = Array.isArray(JSON.parse(content)) ? JSON.parse(content) : [JSON.parse(content)]
      texts = arr.map(i => typeof i === 'string' ? i : (i[this.config.jsonTextField] || '')).filter(Boolean)
    } else if (format === 'jsonl') {
      // One JSON value per line; unparseable lines become '' and are filtered out
      texts = content.split('\n').filter(Boolean).map(line => {
        try { const o = JSON.parse(line); return typeof o === 'string' ? o : (o[this.config.jsonTextField] || '') } catch { return '' }
      }).filter(Boolean)
    }

    return this.trainArray(texts)
  }

  // Core training step: feed tokenized sentences into the trie, n-gram
  // ensemble and TF-IDF model, then refresh the running stats.
  _trainSentences(sentences) {
    if (!sentences.length) return this
    const allTokens = sentences.flat()
    for (const t of allTokens) this.trie.insert(t)
    this.ngram.train(sentences)
    this.tfidf.train(sentences)

    this.stats.tokens += allTokens.length
    this.stats.sentences += sentences.length
    this.stats.uniqueWords = this.trie.wordCount
    // context count is reported for the largest n-gram model only
    this.stats.ngramContexts = Object.keys(this.ngram.models[this.config.nMax]?.table || {}).length
    this.trained = true

    if (this.config.autoSave) this.saveIndex()
    return this
  }

  // ── Prediction API ─────────────────────────
  // All entry points throw via _assert() until the model is trained/loaded.

  /** Combined suggestion: partial-word completions + next-word predictions. */
  suggest(input, options = {}) { this._assert(); return this.pipeline.suggest(input, options) }
  /** Trie completions for a partial word, most frequent first. */
  completeWord(partial, limit) { this._assert(); return this.trie.search(partial, limit || this.config.maxSuggestions) }
  /** Generate a sentence completion from a seed phrase. */
  complete(input, options = {}) { this._assert(); return this.pipeline.complete(input, options) }
  /** Generate several distinct sentence completions. */
  completions(input, options = {}) { this._assert(); return this.pipeline.completions(input, options) }

  /** Next-word candidates (TF-IDF blended) for a context string. */
  nextWord(context, limit) {
    this._assert()
    const tokens = this.tokenizer.tokenize(context)
    const raw = this.ngram.predict(tokens, (limit || this.config.maxSuggestions) * 2, this.config.ensembleWeights)
    return this.pipeline._blend(raw).slice(0, limit || this.config.maxSuggestions)
  }

  // ── Index ──────────────────────────────────

  /**
   * Persist the trained model (config, stats, trie, n-grams, TF-IDF) as
   * pretty-printed JSON.
   * @param {string} [filePath] - defaults to config.indexPath
   * @returns {string} absolute destination path
   */
  saveIndex(filePath) {
    const dest = path.resolve(filePath || this.config.indexPath)
    const index = {
      version: '3.0.0',
      savedAt: new Date().toISOString(),
      config: this.config,
      stats: this.stats,
      trie: this.trie.serialize(),
      ngram: this.ngram.serialize(),
      tfidf: this.tfidf.serialize(),
    }
    fs.writeFileSync(dest, JSON.stringify(index, null, 2))
    console.log(`[PredictJS] Index saved → ${dest}`)
    return dest
  }

  /**
   * Load a previously saved index and mark the model as trained.
   * @param {string} [filePath] - defaults to config.indexPath
   * @throws {Error} when the index file does not exist
   * @returns {PredictJS} this (chainable)
   */
  loadIndex(filePath) {
    const src = path.resolve(filePath || this.config.indexPath)
    if (!fs.existsSync(src)) throw new Error(`Index not found: ${src}`)
    const index = JSON.parse(fs.readFileSync(src, 'utf8'))
    // NOTE(review): this instance's current config keys override the saved
    // ones, so most saved settings are shadowed by DEFAULTS/ctor options —
    // confirm this precedence is intended
    this.config = { ...index.config, ...this.config }
    this.stats = index.stats
    this.trie = Trie.deserialize(index.trie)
    this.ngram = EnsembleNGram.deserialize(index.ngram)
    this.tfidf = TFIDF.deserialize(index.tfidf)
    this._rebuild()
    this.trained = true
    console.log(`[PredictJS] Index loaded ← ${src} (${this.stats.uniqueWords} words, ${this.stats.sentences} sentences)`)
    return this
  }

  // ── Utils ──────────────────────────────────

  /** Snapshot of training stats plus the trained flag and active config. */
  getStats() { return { ...this.stats, trained: this.trained, config: this.config } }
  /** Merge new options into the config and rebuild the pipeline. */
  configure(options) { this.config = { ...this.config, ...options }; this._rebuild(); return this }
  /** Discard all trained state and start fresh with the current config. */
  reset() {
    this.trie = new Trie(); this.ngram = new EnsembleNGram(this.config)
    this.tfidf = new TFIDF(); this._rebuild(); this.trained = false
    this.stats = { tokens: 0, sentences: 0, uniqueWords: 0, ngramContexts: 0 }
    return this
  }
  // Guard shared by every prediction entry point.
  _assert() { if (!this.trained) throw new Error('[PredictJS] Model not trained. Call trainText(), trainArray(), or trainFile() first.') }
}
561
+
562
// Public API: the engine class plus the default configuration object.
module.exports = { PredictJS, DEFAULTS }
package/readme.md ADDED
@@ -0,0 +1,171 @@
1
+ # node-predict
2
+
3
+ A lightweight, fully offline, trainable native text prediction engine for Node.js. Provides word completions, next-word predictions, and sentence generation using n-gram language models with TF-IDF weighting.
4
+
5
+ ## Quick Start
6
+
7
+ Load a pre-trained model and make predictions:
8
+
9
+ ```js
10
+ const { PredictJS } = require('node-predict')
11
+
12
+ const predictor = new PredictJS()
13
+ predictor.loadIndex('./model-index.json')
14
+
15
+ // 1) Word completions
16
+ console.log(predictor.completeWord('app'))
17
+
18
+ // 2) Next-word predictions
19
+ console.log(predictor.nextWord('The best way to'))
20
+
21
+ // 3) Complete a sentence
22
+ console.log(predictor.complete('Learning new skills'))
23
+
24
+ // 4) Multiple completions
25
+ console.log(predictor.completions('In my opinion', { count: 3 }))
26
+
27
+ // 5) Combined suggestion (partial + next words)
28
+ console.log(predictor.suggest('I en'))
29
+ ```
30
+
31
+ ---
32
+
33
+ ## Training Your Own Model
34
+
35
+ To train the model on your custom dataset, create a `dataset.txt` file containing your text samples and a `train.js` script (example below), then run `node train.js`. The training script reads `dataset.txt`, builds n-gram models, calculates TF-IDF weights, and saves the trained model to `model-index.json`.
36
+
37
+ ### Training Code Example
38
+
39
+ Here's how `train.js` trains the model:
40
+
41
+ ```js
42
+ const { PredictJS } = require('./predictjs')
43
+
44
+ const DATASET_PATH = './dataset.txt'
45
+ const INDEX_PATH = './model-index.json'
46
+
47
+ const predictor = new PredictJS({
48
+ // N-gram range
49
+ nMin: 2,
50
+ nMax: 4,
51
+
52
+ // Smoothing — keep low for large datasets, raise towards 0.3 for small ones
53
+ smoothing: true,
54
+ smoothingAlpha: 0.05,
55
+
56
+ // Ensemble weights [bigram, trigram, 4-gram]
57
+ // Higher weight = more influence on predictions
58
+ ensembleWeights: [0.15, 0.35, 0.50],
59
+
60
+ // TF-IDF blending
61
+ useTFIDF: true,
62
+ tfidfBlend: 0.5,
63
+
64
+ // Word settings
65
+ minWordLength: 2,
66
+ maxSuggestions: 5,
67
+
68
+ // Completion settings
69
+ maxCompletionWords: 20,
70
+ completionTemp: 0.6,
71
+
72
+ // Loop prevention
73
+ maxRepeatBigram: 2,
74
+ penalizeRepeats: true,
75
+
76
+ // Strip numbers, lowercase, no punctuation
77
+ caseSensitive: false,
78
+ keepPunctuation: false,
79
+ keepNumbers: false,
80
+ })
81
+
82
+ console.log('Training...')
83
+
84
+ try {
85
+ const start = Date.now()
86
+ predictor.trainFile(DATASET_PATH)
87
+ const elapsed = Date.now() - start
88
+ const stats = predictor.getStats()
89
+
90
+ console.log('✔ Training complete')
91
+ console.log('Sentences:', stats.sentences)
92
+ console.log('Total tokens:', stats.tokens)
93
+ console.log('Unique words:', stats.uniqueWords)
94
+ console.log('Time taken:', elapsed, 'ms')
95
+
96
+ predictor.saveIndex(INDEX_PATH)
97
+ console.log('✔ Model saved to', INDEX_PATH)
98
+
99
+ } catch (err) {
100
+ console.error('✘ Training failed:', err.message)
101
+ process.exit(1)
102
+ }
103
+ ```
104
+
105
+ To train with your own data:
106
+
107
+ 1. Edit `dataset.txt` with your text samples
108
+ 2. Run `node train.js`
109
+ 3. The trained model is saved to `model-index.json`
110
+ 4. Use it as shown in the Quick Start section
111
+
112
+ ---
113
+
114
+ ## Configuration Options
115
+
116
+ Edit `train.js` to customize training behavior. Common options:
117
+
118
+ - **`nMin`, `nMax`** — n-gram range (default `2–4`). Controls context window size for predictions.
119
+ - **`smoothingAlpha`** — smoothing strength for unseen word pairs (default `0.05`). Keeps low for large datasets, raise towards 0.3 for small ones.
120
+ - **`ensembleWeights`** — weights for [bigram, trigram, 4-gram] predictions (default `[0.15, 0.35, 0.50]`). Higher weights give more influence.
121
+ - **`tfidfBlend`** — blend between raw frequency and TF-IDF scoring (default `0.5`). Higher values make results less common-word-heavy.
122
+ - **`maxCompletionWords`** — maximum words to generate (default `20`).
123
+ - **`completionTemp`** — sampling temperature for completions (default `0.6`).
124
+ - **`minWordLength`** — minimum word length to consider (default `2`).
125
+ - **`maxSuggestions`** — maximum suggestions to return (default `5`).
126
+
127
+ After editing options, re-run `node train.js` to retrain with new settings.
128
+
129
+ ---
130
+
131
+ ## API Reference
132
+
133
+ ### `predictor.completeWord(prefix)`
134
+ Returns word completions for a given prefix.
135
+
136
+ ### `predictor.nextWord(context)`
137
+ Predicts the next word given preceding context.
138
+
139
+ ### `predictor.complete(seed)`
140
+ Generates a complete sentence starting from a seed phrase.
141
+
142
+ ### `predictor.completions(context, options)`
143
+ Returns multiple completion options with customizable count.
144
+
145
+ ### `predictor.suggest(partial)`
146
+ Combined suggestion: completes partial word + predicts next words.
147
+
148
+ ---
149
+
150
+ ## Temperature (Sampling)
151
+
152
+ The `completionTemp` option controls creativity of generated suggestions:
153
+
154
+ - **`0.0–0.3`** — very predictable; `0` is fully deterministic (always picks the top prediction)
155
+ - **`0.6`** — balanced (default), good for most uses
156
+ - **`0.8–1.0`** — more creative and varied, may diverge from dataset style
157
+
158
+ ---
159
+
160
+ ## Tips & Troubleshooting
161
+
162
+ - **Poor predictions?** Add more in-style text to `dataset.txt` and retrain.
163
+ - **Too many common words?** Increase dataset variety or raise `tfidfBlend` towards 1.0.
164
+ - **No matches found?** Ensure your seed text matches your dataset language/style.
165
+ - **Slow training?** Large datasets may take longer; consider using representative samples.
166
+ - **Memory usage?** Larger n-gram ranges and bigger datasets consume more memory.
167
+
168
+ ---
169
+
170
+ Author: Ismail Gidado
171
+ License: MIT