@onmars/lunar-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +13 -0
- package/package.json +32 -0
- package/src/__tests__/clear-command.test.ts +214 -0
- package/src/__tests__/command-handler.test.ts +169 -0
- package/src/__tests__/compact-command.test.ts +80 -0
- package/src/__tests__/config-command.test.ts +240 -0
- package/src/__tests__/config-loader.test.ts +1512 -0
- package/src/__tests__/config.test.ts +429 -0
- package/src/__tests__/cron-command.test.ts +418 -0
- package/src/__tests__/cron-parser.test.ts +259 -0
- package/src/__tests__/daemon.test.ts +346 -0
- package/src/__tests__/dedup.test.ts +404 -0
- package/src/__tests__/e2e-sanitization.ts +168 -0
- package/src/__tests__/e2e-skill-loader.test.ts +176 -0
- package/src/__tests__/fixtures/AGENTS.md +4 -0
- package/src/__tests__/fixtures/IDENTITY.md +2 -0
- package/src/__tests__/fixtures/SOUL.md +3 -0
- package/src/__tests__/fixtures/moons/athena/IDENTITY.md +2 -0
- package/src/__tests__/fixtures/moons/athena/SOUL.md +3 -0
- package/src/__tests__/fixtures/moons/hermes/SOUL.md +3 -0
- package/src/__tests__/fixtures/skills/brain/SKILL.md +6 -0
- package/src/__tests__/fixtures/skills/empty/SKILL.md +3 -0
- package/src/__tests__/fixtures/skills/multiline/SKILL.md +7 -0
- package/src/__tests__/fixtures/skills/no-desc/SKILL.md +5 -0
- package/src/__tests__/fixtures/skills/notion/SKILL.md +6 -0
- package/src/__tests__/fixtures/skills/quoted/SKILL.md +6 -0
- package/src/__tests__/hook-runner.test.ts +1689 -0
- package/src/__tests__/input-sanitization.test.ts +367 -0
- package/src/__tests__/logger.test.ts +163 -0
- package/src/__tests__/memory-orchestrator.test.ts +552 -0
- package/src/__tests__/model-catalog.test.ts +215 -0
- package/src/__tests__/model-command.test.ts +185 -0
- package/src/__tests__/moon-loader.test.ts +398 -0
- package/src/__tests__/ping-command.test.ts +85 -0
- package/src/__tests__/plugin.test.ts +258 -0
- package/src/__tests__/remind-command.test.ts +368 -0
- package/src/__tests__/reset-command.test.ts +92 -0
- package/src/__tests__/router.test.ts +1246 -0
- package/src/__tests__/scheduler.test.ts +469 -0
- package/src/__tests__/security.test.ts +214 -0
- package/src/__tests__/session-meta.test.ts +101 -0
- package/src/__tests__/session-tracker.test.ts +389 -0
- package/src/__tests__/session.test.ts +241 -0
- package/src/__tests__/skill-loader.test.ts +153 -0
- package/src/__tests__/status-command.test.ts +153 -0
- package/src/__tests__/stop-command.test.ts +60 -0
- package/src/__tests__/think-command.test.ts +146 -0
- package/src/__tests__/usage-api.test.ts +222 -0
- package/src/__tests__/usage-command-api-fail.test.ts +48 -0
- package/src/__tests__/usage-command-no-oauth.test.ts +48 -0
- package/src/__tests__/usage-command.test.ts +173 -0
- package/src/__tests__/whoami-command.test.ts +124 -0
- package/src/index.ts +122 -0
- package/src/lib/command-handler.ts +135 -0
- package/src/lib/commands/clear.ts +69 -0
- package/src/lib/commands/compact.ts +14 -0
- package/src/lib/commands/config-show.ts +49 -0
- package/src/lib/commands/cron.ts +118 -0
- package/src/lib/commands/help.ts +26 -0
- package/src/lib/commands/model.ts +71 -0
- package/src/lib/commands/ping.ts +24 -0
- package/src/lib/commands/remind.ts +75 -0
- package/src/lib/commands/status.ts +118 -0
- package/src/lib/commands/stop.ts +18 -0
- package/src/lib/commands/think.ts +42 -0
- package/src/lib/commands/usage.ts +56 -0
- package/src/lib/commands/whoami.ts +23 -0
- package/src/lib/config-loader.ts +1449 -0
- package/src/lib/config.ts +202 -0
- package/src/lib/cron-parser.ts +388 -0
- package/src/lib/daemon.ts +216 -0
- package/src/lib/dedup.ts +414 -0
- package/src/lib/hook-runner.ts +1270 -0
- package/src/lib/logger.ts +55 -0
- package/src/lib/memory-orchestrator.ts +415 -0
- package/src/lib/model-catalog.ts +240 -0
- package/src/lib/moon-loader.ts +291 -0
- package/src/lib/plugin.ts +148 -0
- package/src/lib/router.ts +1135 -0
- package/src/lib/scheduler.ts +422 -0
- package/src/lib/security.ts +259 -0
- package/src/lib/session-tracker.ts +222 -0
- package/src/lib/session.ts +158 -0
- package/src/lib/skill-loader.ts +166 -0
- package/src/lib/usage-api.ts +145 -0
- package/src/types/agent.ts +86 -0
- package/src/types/channel.ts +93 -0
- package/src/types/index.ts +32 -0
- package/src/types/memory.ts +92 -0
- package/src/types/moon.ts +56 -0
- package/src/types/voice.ts +74 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'
|
|
2
|
+
import { dirname, resolve } from 'node:path'
|
|
3
|
+
import type { MemoryProvider } from '../types/memory'
|
|
4
|
+
import type { ChannelPersona } from '../types/moon'
|
|
5
|
+
import type { LunarConfig } from './config'
|
|
6
|
+
import type { MemoryConfig, RecallConfig, SecurityConfig, VoiceConfig } from './config-loader'
|
|
7
|
+
import { log } from './logger'
|
|
8
|
+
import type { MoonLoader } from './moon-loader'
|
|
9
|
+
import type { PluginRegistry } from './plugin'
|
|
10
|
+
import { Router } from './router'
|
|
11
|
+
import type { Scheduler } from './scheduler'
|
|
12
|
+
import type { SessionStore } from './session'
|
|
13
|
+
|
|
14
|
+
/** Everything a {@link Daemon} needs to be constructed. */
export interface DaemonOptions {
  /** Core configuration; `dataDir` decides where the PID file lives and `dev` is logged at startup. */
  config: LunarConfig
  /** Plugin registry: initialised on start, destroyed on stop, and queried for channels. */
  registry: PluginRegistry
  /** Session store; closed during shutdown. */
  sessions: SessionStore
  /** Moon loader handed to the router and listed in the "ready" log entry. */
  moonLoader: MoonLoader
  /** Channel personas handed to the router. */
  channels: ChannelPersona[]
  /** Optional security configuration handed to the router. */
  security?: SecurityConfig
  /** Memory provider for auto-recall */
  memory?: MemoryProvider
  /** Auto-recall configuration (enabled, limit, minScore, budget) */
  recallConfig?: RecallConfig
  /** Workspace path (for loading prompts/) */
  workspacePath?: string
  /** Full memory config (for provider name, instructionsFile) */
  memoryConfig?: MemoryConfig
  /** Scheduler instance (optional — created by CLI start if config exists) */
  scheduler?: Scheduler
  /** Voice configuration (TTS mode, STT mode) */
  voiceConfig?: VoiceConfig
}
|
|
35
|
+
|
|
36
|
+
/**
 * Daemon — The main process lifecycle manager.
 *
 * Handles: PID lock, signal handlers, graceful shutdown, startup orchestration.
 *
 * Lifecycle guarantees:
 * - `start()` takes the PID lock first; if any later startup step throws, the
 *   lock file is removed again before the error is rethrown.
 * - `stop()` is idempotent (concurrent callers share one promise) and always
 *   removes the PID file, even when a shutdown step fails.
 * - SIGINT / SIGTERM always end in `process.exit`, with exit code 1 when the
 *   shutdown itself failed.
 */
export class Daemon {
  private pidPath: string
  private router: Router
  private shutdownPromise?: Promise<void>
  private startedAt = 0
  private scheduler?: Scheduler

  constructor(private options: DaemonOptions) {
    this.pidPath = resolve(options.config.dataDir, 'lunar.pid')
    this.scheduler = options.scheduler
    this.router = new Router(
      options.registry,
      options.sessions,
      options.config,
      options.moonLoader,
      options.channels,
      options.security,
      options.memory,
      options.recallConfig,
      options.workspacePath,
      options.memoryConfig,
      options.scheduler,
      options.voiceConfig,
    )
  }

  /**
   * Start the daemon.
   *
   * @throws Error when another live process already holds the PID lock, or
   *         when plugin initialisation / scheduler startup fails.
   */
  async start(): Promise<void> {
    // Acquire PID lock. Deliberately outside the try block below: if this
    // throws because another instance is running, its lock must stay intact.
    this.acquirePidLock()

    try {
      // Register signal handlers
      this.registerSignals()

      this.startedAt = Date.now()
      this.router.daemonStartedAt = this.startedAt

      log.info({ pid: process.pid, dev: this.options.config.dev }, '🌑 Lunar starting')

      // Initialize all plugins
      await this.options.registry.initAll()

      // Wire up message routing
      this.wireRouting()

      // Start scheduler if configured
      if (this.scheduler) {
        await this.scheduler.start()
      }
    } catch (err) {
      // Startup failed after we took the lock — do not leave a PID file behind
      // that claims a running daemon.
      this.releasePidLock()
      throw err
    }

    log.info(
      {
        channels: this.options.registry.listChannels(),
        agents: this.options.registry.listAgents(),
        memory: this.options.registry.listMemory(),
        moons: this.options.moonLoader.list(),
        scheduler: !!this.scheduler,
      },
      '🌑 Lunar ready',
    )
  }

  /**
   * Graceful shutdown.
   *
   * Safe to call repeatedly: every call returns the same in-flight promise.
   * The promise rejects if a shutdown step throws, but the PID file is removed
   * regardless.
   */
  async stop(): Promise<void> {
    if (this.shutdownPromise) return this.shutdownPromise

    this.shutdownPromise = (async () => {
      log.info('🌑 Lunar shutting down...')

      try {
        // Stop scheduler first (no new dispatches during shutdown)
        if (this.scheduler) {
          await this.scheduler.stop()
        }

        // Graceful session close — triggers memory lifecycle hooks
        // (summarize, promote, etc.) before providers are destroyed
        try {
          await this.router.closeAllSessions()
        } catch (err) {
          log.warn({ err }, 'Error during session close — continuing shutdown')
        }

        // Close sessions DB
        this.options.sessions.close()

        // Destroy all plugins
        await this.options.registry.destroyAll()
      } finally {
        // Release PID lock even when a step above threw, so the next start is
        // not blocked by a file describing a daemon that is going away.
        this.releasePidLock()
      }

      log.info('🌑 Lunar stopped')
    })()

    return this.shutdownPromise
  }

  /** Get the router instance */
  getRouter(): Router {
    return this.router
  }

  // --- Private ---

  /** Forward every incoming message of every registered channel to the router. */
  private wireRouting(): void {
    const { registry } = this.options

    for (const channelId of registry.listChannels()) {
      const channel = registry.getChannel(channelId)
      if (!channel) continue

      channel.onMessage(async (msg) => {
        await this.router.route(channelId, msg)
      })

      log.debug({ channel: channelId }, 'Message routing wired')
    }
  }

  /**
   * Write our PID to the lock file, first clearing a stale one.
   *
   * @throws Error when the PID recorded in an existing lock file belongs to a
   *         process that is still alive.
   */
  private acquirePidLock(): void {
    const dir = dirname(this.pidPath)
    if (!existsSync(dir)) mkdirSync(dir, { recursive: true })

    if (existsSync(this.pidPath)) {
      const existingPid = readFileSync(this.pidPath, 'utf-8').trim()

      if (this.isLockHolderAlive(existingPid)) {
        throw new Error(`Lunar is already running (PID ${existingPid})`)
      }

      log.warn({ stalePid: existingPid }, 'Removing stale PID file')
      unlinkSync(this.pidPath)
    }

    writeFileSync(this.pidPath, String(process.pid))
    log.debug({ pid: process.pid, path: this.pidPath }, 'PID lock acquired')
  }

  /**
   * Decide whether the PID stored in a lock file refers to a live process.
   *
   * - Unparseable, non-positive or out-of-range content is treated as stale:
   *   passing it to `process.kill` would either throw a TypeError or address a
   *   process group instead of a single process.
   * - `ESRCH` means no such process (stale).
   * - `EPERM` means the process exists but we may not signal it (alive).
   * - Any other error is unexpected and rethrown.
   */
  private isLockHolderAlive(rawPid: string): boolean {
    const pid = Number.parseInt(rawPid, 10)
    if (!Number.isSafeInteger(pid) || pid <= 0 || pid > 0x7fffffff) return false

    try {
      // Signal 0 performs the existence/permission check without delivering a signal.
      process.kill(pid, 0)
      return true
    } catch (err: unknown) {
      const code =
        err instanceof Error && 'code' in err ? (err as NodeJS.ErrnoException).code : undefined
      if (code === 'ESRCH') return false
      if (code === 'EPERM') return true
      throw err
    }
  }

  /** Remove the PID file if present. Never throws. */
  private releasePidLock(): void {
    try {
      if (existsSync(this.pidPath)) {
        unlinkSync(this.pidPath)
      }
    } catch {
      // Best effort
    }
  }

  /** Install process-level handlers for termination signals and fatal errors. */
  private registerSignals(): void {
    const shutdown = async (signal: string): Promise<void> => {
      log.info({ signal }, 'Received signal')
      let exitCode = 0
      try {
        await this.stop()
      } catch (err) {
        // Without this catch a failed stop() would skip process.exit and leave
        // the process hanging after SIGINT/SIGTERM.
        log.error({ err }, 'Error during shutdown')
        exitCode = 1
      }
      process.exit(exitCode)
    }

    process.on('SIGINT', () => {
      void shutdown('SIGINT')
    })
    process.on('SIGTERM', () => {
      void shutdown('SIGTERM')
    })
    process.on('uncaughtException', (err) => {
      log.fatal({ err }, 'Uncaught exception')
      void this.stop()
        .catch((stopErr: unknown) => {
          log.error({ err: stopErr }, 'Error during shutdown')
        })
        .finally(() => process.exit(1))
    })
    process.on('unhandledRejection', (err) => {
      log.error({ err }, 'Unhandled rejection')
    })
  }
}
|
package/src/lib/dedup.ts
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @module dedup — Near-duplicate detection for memory results
|
|
3
|
+
*
|
|
4
|
+
* Zero external dependencies. Three strategies:
|
|
5
|
+
*
|
|
6
|
+
* 1. **Dice (bigram)** — Character bigram overlap (Sørensen–Dice coefficient).
|
|
7
|
+
* Best for short texts (<300 chars). O(n) time, O(n) space.
|
|
8
|
+
*
|
|
9
|
+
* 2. **MinHash** — Approximate Jaccard similarity via locality-sensitive hashing.
|
|
10
|
+
* Uses word-level trigram shingling + FNV-1a hash + random permutations.
|
|
11
|
+
* Best for medium-long texts. O(n·k) time where k = numPermutations.
|
|
12
|
+
* Based on: Broder (1997), BigCode/HuggingFace dedup pipeline, GPT-3 paper.
|
|
13
|
+
*
|
|
14
|
+
* 3. **Adaptive** — Auto-selects Dice or MinHash based on text length.
|
|
15
|
+
* Default strategy. Crossover at ~300 chars.
|
|
16
|
+
*
|
|
17
|
+
* Design priorities:
|
|
18
|
+
* - Conservative: high precision (don't wrongly remove unique content) over
|
|
19
|
+
* high recall (catch every duplicate). Missing a duplicate = wasted tokens.
|
|
20
|
+
* Removing unique content = lost context.
|
|
21
|
+
* - Zero dependencies: pure TypeScript, no npm packages.
|
|
22
|
+
* - Framework-ready: pluggable DedupStrategy interface for custom implementations.
|
|
23
|
+
*
|
|
24
|
+
* References:
|
|
25
|
+
* - Sørensen–Dice: https://en.wikipedia.org/wiki/Dice-S%C3%B8rensen_coefficient
|
|
26
|
+
* - MinHash: Broder (1997), "On the resemblance and containment of documents"
|
|
27
|
+
* - "In Defense of MinHash Over SimHash" (Shrivastava & Li, AISTATS 2014)
|
|
28
|
+
* - HuggingFace BigCode: word-level trigram shingling standard
|
|
29
|
+
* - FNV-1a: Fowler–Noll–Vo non-cryptographic hash
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
// ════════════════════════════════════════════════════════════
|
|
33
|
+
// Types
|
|
34
|
+
// ════════════════════════════════════════════════════════════
|
|
35
|
+
|
|
36
|
+
/** Outcome of a deduplication pass. */
export interface DedupResult<T> {
  /** Items that survived deduplication. */
  kept: T[]
  /**
   * Items dropped as duplicates. Each entry records the dropped item, the kept
   * item it was judged to duplicate, and the similarity score between them.
   */
  removed: Array<{ item: T; duplicateOf: T; similarity: number }>
}
|
|
42
|
+
|
|
43
|
+
/** Pluggable text-similarity measure used by the deduplication engine. */
export interface DedupStrategy {
  /**
   * Score how similar two texts are.
   *
   * @returns a value between 0 (unrelated) and 1 (identical)
   */
  compare(a: string, b: string): number
  /** Short name identifying the strategy in logs and debugging output. */
  readonly id: string
}
|
|
49
|
+
|
|
50
|
+
/** Tuning knobs for a deduplication pass. All fields are optional. */
export interface DedupOptions {
  /** Items scoring strictly above this similarity count as duplicates. Default: 0.85 */
  threshold?: number
  /** Upper bound on the number of kept items (a proxy for a token budget). Default: Infinity */
  maxResults?: number
  /** Produces the text to compare for an item. Defaults to string coercion of the item. */
  textExtractor?: (item: unknown) => string
}
|
|
58
|
+
|
|
59
|
+
// ════════════════════════════════════════════════════════════
|
|
60
|
+
// Text normalization
|
|
61
|
+
// ════════════════════════════════════════════════════════════
|
|
62
|
+
|
|
63
|
+
const WHITESPACE_RE = /\s+/g
// Matches everything that should become a separator:
//  1. any character that is not a letter, combining mark, number or whitespace;
//  2. a combining mark that is NOT attached to a letter/number (for example a
//     variation selector trailing an emoji), so it cannot survive as a stray token.
// Marks that follow a letter or number are kept: many scripts (Devanagari,
// Thai, ...) write vowels as combining marks, and stripping them would split
// one word into several.
const PUNCTUATION_RE = /[^\p{L}\p{M}\p{N}\s]|(?<![\p{L}\p{N}]\p{M}*)\p{M}/gu

/**
 * Normalize text for comparison: lowercase, compose to Unicode NFC, strip
 * punctuation and symbols (keeping letters, their combining marks and numbers
 * in any language), then collapse whitespace.
 *
 * NFC composition makes a decomposed sequence such as `e` + U+0301 compare
 * equal to the precomposed `é`.
 *
 * @param text - Raw input text
 * @returns The normalized text, without leading or trailing whitespace
 */
export function normalize(text: string): string {
  return text
    .toLowerCase()
    .normalize('NFC')
    .replace(PUNCTUATION_RE, ' ')
    .replace(WHITESPACE_RE, ' ')
    .trim()
}
|
|
73
|
+
|
|
74
|
+
// ════════════════════════════════════════════════════════════
|
|
75
|
+
// FNV-1a hash (32-bit)
|
|
76
|
+
// ════════════════════════════════════════════════════════════
|
|
77
|
+
|
|
78
|
+
const FNV_OFFSET = 0x811c9dc5
const FNV_PRIME = 0x01000193

/**
 * 32-bit FNV-1a hash of a string (non-cryptographic).
 *
 * Each UTF-16 code unit is XOR-ed into the running hash, which is then
 * multiplied by the FNV prime with 32-bit wrap-around.
 *
 * @param str - Text to hash
 * @returns The hash as an unsigned 32-bit integer; the FNV offset basis for an empty string
 */
export function fnv1a32(str: string): number {
  let acc = FNV_OFFSET
  let pos = 0
  while (pos < str.length) {
    // `>>> 0` reinterprets the signed result of Math.imul as unsigned.
    acc = Math.imul(acc ^ str.charCodeAt(pos), FNV_PRIME) >>> 0
    pos += 1
  }
  return acc
}
|
|
93
|
+
|
|
94
|
+
// ════════════════════════════════════════════════════════════
|
|
95
|
+
// Dice coefficient (bigram-based)
|
|
96
|
+
// ════════════════════════════════════════════════════════════
|
|
97
|
+
|
|
98
|
+
/**
 * Sørensen–Dice coefficient over character bigrams, with multiset semantics:
 * a bigram that occurs k times in one string and m times in the other
 * contributes min(k, m) to the overlap (so "GG" vs "GGGG" is not 1.0).
 *
 * Strings are compared exactly as given; callers that want case- or
 * punctuation-insensitive matching must pre-process the inputs.
 *
 * @param a - First text
 * @param b - Second text
 * @returns 1 for identical strings, 0 when either string has fewer than two
 *          characters (and they differ), otherwise 2·|overlap| / (|bigrams(a)| + |bigrams(b)|)
 */
export function diceSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a.length < 2 || b.length < 2) return 0

  // Count how often each adjacent character pair occurs in a string.
  const tally = (source: string): Map<string, number> => {
    const counts = new Map<string, number>()
    for (let end = 1; end < source.length; end++) {
      const pair = source.charAt(end - 1) + source.charAt(end)
      counts.set(pair, (counts.get(pair) || 0) + 1)
    }
    return counts
  }

  const left = tally(a)
  const right = tally(b)

  let shared = 0
  left.forEach((occurrences, pair) => {
    shared += Math.min(occurrences, right.get(pair) || 0)
  })

  // A string of length n has n - 1 bigrams.
  return (2 * shared) / (a.length - 1 + b.length - 1)
}
|
|
131
|
+
|
|
132
|
+
/** Dedup strategy that scores normalized texts with the character-bigram Dice coefficient. */
export class DiceDedupStrategy implements DedupStrategy {
  readonly id = 'dice'

  /**
   * Normalize both texts, then return their Dice similarity.
   *
   * @returns 0–1 similarity score
   */
  compare(a: string, b: string): number {
    const left = normalize(a)
    const right = normalize(b)
    return diceSimilarity(left, right)
  }
}
|
|
139
|
+
|
|
140
|
+
// ════════════════════════════════════════════════════════════
|
|
141
|
+
// MinHash (approximate Jaccard via LSH)
|
|
142
|
+
// ════════════════════════════════════════════════════════════
|
|
143
|
+
|
|
144
|
+
/**
 * Split text into overlapping word n-grams ("shingles").
 *
 * The text is normalized first and split on single spaces. With the default
 * n = 3, "the quick brown fox" yields ["the quick brown", "quick brown fox"].
 *
 * @param text - Input text
 * @param n - Words per shingle (default 3)
 * @returns One shingle per sliding-window position; if the text has fewer than
 *          `n` words, a single shingle holding all of its words (the empty
 *          string when there are none)
 */
export function shingle(text: string, n = 3): string[] {
  const tokens = normalize(text)
    .split(' ')
    .filter((tok) => tok.length > 0)

  // Too short for even one full window: the whole text is the only shingle.
  if (tokens.length < n) return [tokens.join(' ')]

  const windowCount = tokens.length - n + 1
  return Array.from({ length: windowCount }, (_unused, start) =>
    tokens.slice(start, start + n).join(' '),
  )
}
|
|
162
|
+
|
|
163
|
+
// 2^31 - 1, a Mersenne prime; used as the modulus of the simulated MinHash permutations.
const MERSENNE_PRIME = 0x7fffffff

/** Coefficients of `numPerm` simulated permutations: one (a, b) pair per index. */
export interface PermutationCoeffs {
  /** Multipliers, each in [1, MERSENNE_PRIME - 1]. */
  a: Uint32Array
  /** Offsets, each in [0, MERSENNE_PRIME - 1]. */
  b: Uint32Array
  /** Number of permutations, i.e. the length of `a` and `b`. */
  numPerm: number
}

/**
 * Build the coefficient tables for MinHash.
 *
 * Values come from a xorshift32 generator seeded with `seed`, so the same
 * arguments always produce the same tables. A seed of 0 is replaced by 1
 * because xorshift cannot leave the all-zero state.
 *
 * @param numPerm - How many (a, b) pairs to generate (default 128)
 * @param seed - Generator seed, truncated to 32 bits (default 42)
 * @returns Coefficient tables that can be reused for every signature
 */
export function generatePermutations(numPerm = 128, seed = 42): PermutationCoeffs {
  const multipliers = new Uint32Array(numPerm)
  const offsets = new Uint32Array(numPerm)

  let rng = seed >>> 0 || 1
  // One xorshift32 step; returns the new state as an unsigned 32-bit value.
  const draw = (): number => {
    rng ^= rng << 13
    rng ^= rng >>> 17
    rng ^= rng << 5
    return rng >>> 0
  }

  let slot = 0
  while (slot < numPerm) {
    // The multiplier is drawn first and shifted into [1, prime - 1] so it is never zero.
    multipliers[slot] = 1 + (draw() % (MERSENNE_PRIME - 1))
    offsets[slot] = draw() % MERSENNE_PRIME
    slot++
  }

  return { a: multipliers, b: offsets, numPerm }
}
|
|
200
|
+
|
|
201
|
+
/**
 * Compute the MinHash signature of a list of shingles.
 *
 * Every shingle is hashed once with FNV-1a. For permutation i the permuted
 * value is `((a_i * hash) wrapped to 32 bits + b_i) mod MERSENNE_PRIME`, and
 * slot i of the signature holds the smallest permuted value over all
 * shingles. Note that the product is truncated to 32 bits (Math.imul) before
 * the modulo is applied.
 *
 * @param shingles - Shingles of one document
 * @param coeffs - Permutation coefficients; use the same ones for every signature that will be compared
 * @returns A signature of `coeffs.numPerm` values; slots stay at 0xffffffff when `shingles` is empty
 */
export function computeMinHash(shingles: string[], coeffs: PermutationCoeffs): Uint32Array {
  const slots = coeffs.numPerm
  const sig = new Uint32Array(slots).fill(0xffffffff)

  // Hash each shingle once up front, then scan the hashes per permutation.
  const hashes = shingles.map((piece) => fnv1a32(piece))

  for (let slot = 0; slot < slots; slot++) {
    const mul = coeffs.a[slot]
    const add = coeffs.b[slot]
    for (const base of hashes) {
      const permuted = ((Math.imul(mul, base) >>> 0) + add) % MERSENNE_PRIME
      if (permuted < sig[slot]) {
        sig[slot] = permuted
      }
    }
  }

  return sig
}
|
|
227
|
+
|
|
228
|
+
/**
 * Estimate Jaccard similarity from two MinHash signatures.
 * The fraction of positions holding the same value approximates Jaccard(A, B);
 * the estimate gets tighter as the number of permutations grows (error on the
 * order of 1/√numPerm).
 *
 * Zero-length signatures carry no evidence, so they score 0 rather than the
 * NaN that 0/0 would produce.
 *
 * @param sigA - Signature of the first document
 * @param sigB - Signature of the second document, built with the same coefficients
 * @returns Similarity estimate in [0, 1]
 * @throws Error when the two signatures differ in length
 */
export function minhashSimilarity(sigA: Uint32Array, sigB: Uint32Array): number {
  if (sigA.length !== sigB.length) {
    throw new Error(`Signature length mismatch: ${sigA.length} vs ${sigB.length}`)
  }
  if (sigA.length === 0) return 0

  let matches = 0
  for (let i = 0; i < sigA.length; i++) {
    if (sigA[i] === sigB[i]) matches++
  }
  return matches / sigA.length
}
|
|
246
|
+
|
|
247
|
+
/**
 * Exact Jaccard similarity |A ∩ B| / |A ∪ B| of two string sets.
 * Two empty sets are treated as identical (1).
 */
function jaccardExact(a: Set<string>, b: Set<string>): number {
  const shared = Array.from(a).filter((member) => b.has(member)).length
  const combined = a.size + b.size - shared
  if (combined === 0) return 1
  return shared / combined
}
|
|
256
|
+
|
|
257
|
+
/**
 * Dedup strategy based on word shingles and MinHash signatures.
 *
 * When either text produces three shingles or fewer, the exact Jaccard
 * similarity of the shingle sets is returned instead of the MinHash estimate.
 */
export class MinHashDedupStrategy implements DedupStrategy {
  readonly id = 'minhash'
  private readonly coeffs: PermutationCoeffs
  private readonly shingleSize: number

  /**
   * @param numPerm - Number of permutations, i.e. the signature length (default 128).
   *                  More permutations give a tighter estimate at higher cost.
   * @param shingleSize - Words per shingle (default 3).
   * @param seed - Seed for the permutation coefficients; the same seed reproduces the same signatures.
   */
  constructor(numPerm = 128, shingleSize = 3, seed = 42) {
    this.coeffs = generatePermutations(numPerm, seed)
    this.shingleSize = shingleSize
  }

  /**
   * Compare two texts.
   *
   * @returns 0–1 similarity: exact Jaccard for very short inputs, MinHash estimate otherwise
   */
  compare(a: string, b: string): number {
    const left = shingle(a, this.shingleSize)
    const right = shingle(b, this.shingleSize)

    // With this few shingles the exact computation is used instead of the estimate.
    const bothLongEnough = left.length > 3 && right.length > 3
    if (!bothLongEnough) {
      return jaccardExact(new Set(left), new Set(right))
    }

    const leftSig = computeMinHash(left, this.coeffs)
    const rightSig = computeMinHash(right, this.coeffs)
    return minhashSimilarity(leftSig, rightSig)
  }
}
|
|
287
|
+
|
|
288
|
+
// ════════════════════════════════════════════════════════════
|
|
289
|
+
// Adaptive strategy (auto-selects based on text length)
|
|
290
|
+
// ════════════════════════════════════════════════════════════
|
|
291
|
+
|
|
292
|
+
/**
 * Dedup strategy that delegates to Dice or MinHash depending on text length:
 * Dice while the longer of the two raw texts is at most `crossoverLength`
 * characters, MinHash beyond that.
 */
export class AdaptiveDedupStrategy implements DedupStrategy {
  readonly id = 'adaptive'
  private readonly dice = new DiceDedupStrategy()
  private readonly minhash: MinHashDedupStrategy
  private readonly crossoverLength: number

  /**
   * @param crossoverLength - Character count up to which Dice is used (default 300).
   * @param numPerm - Number of MinHash permutations, forwarded to the MinHash strategy (default 128).
   */
  constructor(crossoverLength = 300, numPerm = 128) {
    this.minhash = new MinHashDedupStrategy(numPerm)
    this.crossoverLength = crossoverLength
  }

  /**
   * Compare two texts with whichever strategy fits their length.
   *
   * @returns 0–1 similarity score from the selected strategy
   */
  compare(a: string, b: string): number {
    const longest = Math.max(a.length, b.length)
    const delegate: DedupStrategy = longest <= this.crossoverLength ? this.dice : this.minhash
    return delegate.compare(a, b)
  }
}
|
|
317
|
+
|
|
318
|
+
// ════════════════════════════════════════════════════════════
|
|
319
|
+
// Deduplication engine
|
|
320
|
+
// ════════════════════════════════════════════════════════════
|
|
321
|
+
|
|
322
|
+
/**
 * Fallback text extractor used when `options.textExtractor` is not given.
 *
 * Strings are returned as-is and other values are coerced with `String()`.
 * Objects whose coercion is the generic "[object Object]" are serialised to
 * JSON instead; otherwise every plain object would yield the same text and be
 * treated as a duplicate of the first one.
 */
function defaultTextExtractor(item: unknown): string {
  if (typeof item === 'string') return item

  const coerced = String(item)
  if (typeof item !== 'object' || item === null || coerced !== '[object Object]') {
    return coerced
  }

  try {
    const json: unknown = JSON.stringify(item)
    return typeof json === 'string' ? json : coerced
  } catch {
    // Not serialisable (e.g. circular references) — keep the plain coercion.
    return coerced
  }
}

/**
 * Deduplicate an array of items using the given strategy.
 *
 * Items are visited in order and compared against the items kept so far. The
 * first kept item whose similarity is strictly greater than the threshold
 * marks the candidate as a duplicate, so earlier items win — sort by
 * relevance (best first) before calling this.
 *
 * Processing stops as soon as `maxResults` items have been kept; remaining
 * items are then reported neither as kept nor as removed.
 *
 * The text of each item is extracted once and reused for later comparisons.
 *
 * Complexity: O(n² · C) where C = cost of one comparison.
 *
 * @param items - Items to deduplicate (pre-sorted by relevance, best first)
 * @param strategy - Comparison strategy (default: adaptive)
 * @param options - Threshold (default 0.85), maxResults (default Infinity),
 *                  textExtractor (default: strings as-is, plain objects as JSON,
 *                  anything else via `String()`)
 * @returns Kept items + removed items with metadata
 */
export function deduplicate<T>(
  items: T[],
  strategy: DedupStrategy = new AdaptiveDedupStrategy(),
  options: DedupOptions = {},
): DedupResult<T> {
  const {
    threshold = 0.85,
    maxResults = Infinity,
    textExtractor = defaultTextExtractor,
  } = options

  // Kept items paired with their extracted text, so the extractor runs once per item.
  const survivors: Array<{ item: T; text: string }> = []
  const removed: DedupResult<T>['removed'] = []

  for (const item of items) {
    if (survivors.length >= maxResults) break

    const text = textExtractor(item)
    let isDuplicate = false

    for (const survivor of survivors) {
      const similarity = strategy.compare(text, survivor.text)

      if (similarity > threshold) {
        removed.push({ item, duplicateOf: survivor.item, similarity })
        isDuplicate = true
        break
      }
    }

    if (!isDuplicate) {
      survivors.push({ item, text })
    }
  }

  return { kept: survivors.map((survivor) => survivor.item), removed }
}
|
|
375
|
+
|
|
376
|
+
// ════════════════════════════════════════════════════════════
|
|
377
|
+
// Factory — create strategy from config
|
|
378
|
+
// ════════════════════════════════════════════════════════════
|
|
379
|
+
|
|
380
|
+
/** Declarative deduplication settings. All fields are optional. */
export interface DedupConfig {
  /** Which strategy to build: 'dice', 'minhash' or 'adaptive'. Default: 'adaptive' */
  strategy?: 'dice' | 'minhash' | 'adaptive'
  /** Similarity threshold for duplicate detection. Default: 0.85 */
  threshold?: number
  /** MinHash: number of permutations. Default: 128 */
  numPermutations?: number
  /** MinHash: words per shingle. Default: 3 */
  shingleSize?: number
  /** Adaptive: text length in characters at which MinHash takes over from Dice. Default: 300 */
  crossoverLength?: number
  /** Maximum number of results after dedup. Default: Infinity */
  maxResults?: number
}
|
|
394
|
+
|
|
395
|
+
/**
 * Build a DedupStrategy from configuration.
 *
 * - 'dice' ignores all numeric settings.
 * - 'minhash' uses `numPermutations` and `shingleSize`.
 * - 'adaptive' (also the fallback for any unrecognised value) uses
 *   `crossoverLength` and `numPermutations`.
 *
 * @param config - Strategy selection and tuning; missing fields take their defaults
 * @returns A freshly constructed strategy
 */
export function createStrategy(config: DedupConfig = {}): DedupStrategy {
  const {
    strategy: kind = 'adaptive',
    numPermutations: permutations = 128,
    shingleSize: wordsPerShingle = 3,
    crossoverLength: crossover = 300,
  } = config

  if (kind === 'dice') {
    return new DiceDedupStrategy()
  }
  if (kind === 'minhash') {
    return new MinHashDedupStrategy(permutations, wordsPerShingle)
  }
  // 'adaptive', plus anything unexpected that slipped past the type checker.
  return new AdaptiveDedupStrategy(crossover, permutations)
}
|