@onmars/lunar-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +13 -0
  3. package/package.json +32 -0
  4. package/src/__tests__/clear-command.test.ts +214 -0
  5. package/src/__tests__/command-handler.test.ts +169 -0
  6. package/src/__tests__/compact-command.test.ts +80 -0
  7. package/src/__tests__/config-command.test.ts +240 -0
  8. package/src/__tests__/config-loader.test.ts +1512 -0
  9. package/src/__tests__/config.test.ts +429 -0
  10. package/src/__tests__/cron-command.test.ts +418 -0
  11. package/src/__tests__/cron-parser.test.ts +259 -0
  12. package/src/__tests__/daemon.test.ts +346 -0
  13. package/src/__tests__/dedup.test.ts +404 -0
  14. package/src/__tests__/e2e-sanitization.ts +168 -0
  15. package/src/__tests__/e2e-skill-loader.test.ts +176 -0
  16. package/src/__tests__/fixtures/AGENTS.md +4 -0
  17. package/src/__tests__/fixtures/IDENTITY.md +2 -0
  18. package/src/__tests__/fixtures/SOUL.md +3 -0
  19. package/src/__tests__/fixtures/moons/athena/IDENTITY.md +2 -0
  20. package/src/__tests__/fixtures/moons/athena/SOUL.md +3 -0
  21. package/src/__tests__/fixtures/moons/hermes/SOUL.md +3 -0
  22. package/src/__tests__/fixtures/skills/brain/SKILL.md +6 -0
  23. package/src/__tests__/fixtures/skills/empty/SKILL.md +3 -0
  24. package/src/__tests__/fixtures/skills/multiline/SKILL.md +7 -0
  25. package/src/__tests__/fixtures/skills/no-desc/SKILL.md +5 -0
  26. package/src/__tests__/fixtures/skills/notion/SKILL.md +6 -0
  27. package/src/__tests__/fixtures/skills/quoted/SKILL.md +6 -0
  28. package/src/__tests__/hook-runner.test.ts +1689 -0
  29. package/src/__tests__/input-sanitization.test.ts +367 -0
  30. package/src/__tests__/logger.test.ts +163 -0
  31. package/src/__tests__/memory-orchestrator.test.ts +552 -0
  32. package/src/__tests__/model-catalog.test.ts +215 -0
  33. package/src/__tests__/model-command.test.ts +185 -0
  34. package/src/__tests__/moon-loader.test.ts +398 -0
  35. package/src/__tests__/ping-command.test.ts +85 -0
  36. package/src/__tests__/plugin.test.ts +258 -0
  37. package/src/__tests__/remind-command.test.ts +368 -0
  38. package/src/__tests__/reset-command.test.ts +92 -0
  39. package/src/__tests__/router.test.ts +1246 -0
  40. package/src/__tests__/scheduler.test.ts +469 -0
  41. package/src/__tests__/security.test.ts +214 -0
  42. package/src/__tests__/session-meta.test.ts +101 -0
  43. package/src/__tests__/session-tracker.test.ts +389 -0
  44. package/src/__tests__/session.test.ts +241 -0
  45. package/src/__tests__/skill-loader.test.ts +153 -0
  46. package/src/__tests__/status-command.test.ts +153 -0
  47. package/src/__tests__/stop-command.test.ts +60 -0
  48. package/src/__tests__/think-command.test.ts +146 -0
  49. package/src/__tests__/usage-api.test.ts +222 -0
  50. package/src/__tests__/usage-command-api-fail.test.ts +48 -0
  51. package/src/__tests__/usage-command-no-oauth.test.ts +48 -0
  52. package/src/__tests__/usage-command.test.ts +173 -0
  53. package/src/__tests__/whoami-command.test.ts +124 -0
  54. package/src/index.ts +122 -0
  55. package/src/lib/command-handler.ts +135 -0
  56. package/src/lib/commands/clear.ts +69 -0
  57. package/src/lib/commands/compact.ts +14 -0
  58. package/src/lib/commands/config-show.ts +49 -0
  59. package/src/lib/commands/cron.ts +118 -0
  60. package/src/lib/commands/help.ts +26 -0
  61. package/src/lib/commands/model.ts +71 -0
  62. package/src/lib/commands/ping.ts +24 -0
  63. package/src/lib/commands/remind.ts +75 -0
  64. package/src/lib/commands/status.ts +118 -0
  65. package/src/lib/commands/stop.ts +18 -0
  66. package/src/lib/commands/think.ts +42 -0
  67. package/src/lib/commands/usage.ts +56 -0
  68. package/src/lib/commands/whoami.ts +23 -0
  69. package/src/lib/config-loader.ts +1449 -0
  70. package/src/lib/config.ts +202 -0
  71. package/src/lib/cron-parser.ts +388 -0
  72. package/src/lib/daemon.ts +216 -0
  73. package/src/lib/dedup.ts +414 -0
  74. package/src/lib/hook-runner.ts +1270 -0
  75. package/src/lib/logger.ts +55 -0
  76. package/src/lib/memory-orchestrator.ts +415 -0
  77. package/src/lib/model-catalog.ts +240 -0
  78. package/src/lib/moon-loader.ts +291 -0
  79. package/src/lib/plugin.ts +148 -0
  80. package/src/lib/router.ts +1135 -0
  81. package/src/lib/scheduler.ts +422 -0
  82. package/src/lib/security.ts +259 -0
  83. package/src/lib/session-tracker.ts +222 -0
  84. package/src/lib/session.ts +158 -0
  85. package/src/lib/skill-loader.ts +166 -0
  86. package/src/lib/usage-api.ts +145 -0
  87. package/src/types/agent.ts +86 -0
  88. package/src/types/channel.ts +93 -0
  89. package/src/types/index.ts +32 -0
  90. package/src/types/memory.ts +92 -0
  91. package/src/types/moon.ts +56 -0
  92. package/src/types/voice.ts +74 -0
@@ -0,0 +1,216 @@
1
+ import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs'
2
+ import { dirname, resolve } from 'node:path'
3
+ import type { MemoryProvider } from '../types/memory'
4
+ import type { ChannelPersona } from '../types/moon'
5
+ import type { LunarConfig } from './config'
6
+ import type { MemoryConfig, RecallConfig, SecurityConfig, VoiceConfig } from './config-loader'
7
+ import { log } from './logger'
8
+ import type { MoonLoader } from './moon-loader'
9
+ import type { PluginRegistry } from './plugin'
10
+ import { Router } from './router'
11
+ import type { Scheduler } from './scheduler'
12
+ import type { SessionStore } from './session'
13
+
14
/** Dependencies and settings handed to the `Daemon` constructor. */
export interface DaemonOptions {
  /** Core configuration; the daemon reads `dataDir` (PID file location) and `dev` from it. */
  config: LunarConfig
  /** Plugin registry: initialised on start, destroyed on stop, queried for channels. */
  registry: PluginRegistry
  /** Session store; closed during shutdown. */
  sessions: SessionStore
  /** Moon loader; forwarded to the router and listed in the startup log. */
  moonLoader: MoonLoader
  /** Channel personas, forwarded to the router. */
  channels: ChannelPersona[]
  /** Optional security configuration, forwarded to the router. */
  security?: SecurityConfig
  /** Memory provider for auto-recall */
  memory?: MemoryProvider
  /** Recall configuration (enabled, limit, minScore, budget) */
  recallConfig?: RecallConfig
  /** Workspace path (for loading prompts/) */
  workspacePath?: string
  /** Full memory config (for provider name, instructionsFile) */
  memoryConfig?: MemoryConfig
  /** Scheduler instance (optional — created by CLI start if config exists) */
  scheduler?: Scheduler
  /** Voice configuration (TTS mode, STT mode) */
  voiceConfig?: VoiceConfig
}
35
+
36
/**
 * Daemon — The main process lifecycle manager.
 *
 * Handles: PID lock, signal handlers, graceful shutdown, startup orchestration.
 *
 * The PID lock is a file (`<dataDir>/lunar.pid`) holding the owning process id.
 * It is created exclusively on start and removed on stop, but only while it
 * still names this process.
 */
export class Daemon {
  /** Absolute path of the PID lock file. */
  private pidPath: string
  private router: Router
  /** Memoised shutdown so repeated or concurrent `stop()` calls share one run. */
  private shutdownPromise?: Promise<void>
  /** Epoch milliseconds recorded by `start()`; 0 until then. */
  private startedAt = 0
  private scheduler?: Scheduler

  constructor(private options: DaemonOptions) {
    this.pidPath = resolve(options.config.dataDir, 'lunar.pid')
    this.scheduler = options.scheduler
    this.router = new Router(
      options.registry,
      options.sessions,
      options.config,
      options.moonLoader,
      options.channels,
      options.security,
      options.memory,
      options.recallConfig,
      options.workspacePath,
      options.memoryConfig,
      options.scheduler,
      options.voiceConfig,
    )
  }

  /**
   * Start the daemon: take the PID lock, install signal handlers, initialise
   * plugins, wire channel → router message delivery and start the scheduler
   * when one was supplied.
   *
   * @throws Error when another live process already holds the PID lock.
   */
  async start(): Promise<void> {
    // Acquire PID lock
    this.acquirePidLock()

    // Register signal handlers
    this.registerSignals()

    this.startedAt = Date.now()
    this.router.daemonStartedAt = this.startedAt

    log.info({ pid: process.pid, dev: this.options.config.dev }, '🌑 Lunar starting')

    // Initialize all plugins
    await this.options.registry.initAll()

    // Wire up message routing
    this.wireRouting()

    // Start scheduler if configured
    if (this.scheduler) {
      await this.scheduler.start()
    }

    log.info(
      {
        channels: this.options.registry.listChannels(),
        agents: this.options.registry.listAgents(),
        memory: this.options.registry.listMemory(),
        moons: this.options.moonLoader.list(),
        scheduler: !!this.scheduler,
      },
      '🌑 Lunar ready',
    )
  }

  /**
   * Graceful shutdown. Idempotent: every call returns the same promise.
   *
   * The PID lock is released even when a teardown step throws, so a failed
   * shutdown does not leave a stale lock behind. The error itself still
   * propagates to the caller.
   */
  async stop(): Promise<void> {
    if (this.shutdownPromise) return this.shutdownPromise

    this.shutdownPromise = (async () => {
      log.info('🌑 Lunar shutting down...')

      try {
        // Stop scheduler first (no new dispatches during shutdown)
        if (this.scheduler) {
          await this.scheduler.stop()
        }

        // Graceful session close — triggers memory lifecycle hooks
        // (summarize, promote, etc.) before providers are destroyed
        try {
          await this.router.closeAllSessions()
        } catch (err) {
          log.warn({ err }, 'Error during session close — continuing shutdown')
        }

        // Close sessions DB
        this.options.sessions.close()

        // Destroy all plugins
        await this.options.registry.destroyAll()
      } finally {
        // Release PID lock regardless of how teardown went
        this.releasePidLock()
      }

      log.info('🌑 Lunar stopped')
    })()

    return this.shutdownPromise
  }

  /** Get the router instance */
  getRouter(): Router {
    return this.router
  }

  // --- Private ---

  /** Subscribe the router to every channel currently known to the registry. */
  private wireRouting(): void {
    const { registry } = this.options

    for (const channelId of registry.listChannels()) {
      const channel = registry.getChannel(channelId)
      if (!channel) continue

      channel.onMessage(async (msg) => {
        await this.router.route(channelId, msg)
      })

      log.debug({ channel: channelId }, 'Message routing wired')
    }
  }

  /** Extract the `code` property from a Node.js system error, if there is one. */
  private static errnoCode(err: unknown): string | undefined {
    if (err instanceof Error && 'code' in err) {
      return (err as NodeJS.ErrnoException).code
    }
    return undefined
  }

  /**
   * Report whether the PID recorded in a lock file belongs to a live process.
   *
   * Anything that is not a positive 32-bit integer (empty file, garbage) is
   * reported as not alive, so a corrupt lock file cannot block startup.
   */
  private isProcessAlive(rawPid: string): boolean {
    if (!/^\d+$/.test(rawPid)) return false
    const pid = Number.parseInt(rawPid, 10)
    if (pid <= 0 || pid > 0x7fffffff) return false

    try {
      // Signal 0 performs the existence/permission check without signalling
      process.kill(pid, 0)
      return true
    } catch (err: unknown) {
      const code = Daemon.errnoCode(err)
      if (code === 'ESRCH') return false
      // EPERM: the process exists but belongs to someone else
      if (code === 'EPERM') return true
      throw err
    }
  }

  /**
   * Create the PID lock file, removing a stale one first.
   *
   * @throws Error when the lock is held by a live process, or when another
   *   process creates the file between the stale check and the write.
   */
  private acquirePidLock(): void {
    const dir = dirname(this.pidPath)
    if (!existsSync(dir)) mkdirSync(dir, { recursive: true })

    // Check for stale PID
    if (existsSync(this.pidPath)) {
      const existingPid = readFileSync(this.pidPath, 'utf-8').trim()
      if (this.isProcessAlive(existingPid)) {
        throw new Error(`Lunar is already running (PID ${existingPid})`)
      }
      log.warn({ stalePid: existingPid }, 'Removing stale PID file')
      unlinkSync(this.pidPath)
    }

    try {
      // 'wx' fails with EEXIST instead of overwriting a lock created in the meantime
      writeFileSync(this.pidPath, String(process.pid), { flag: 'wx' })
    } catch (err: unknown) {
      if (Daemon.errnoCode(err) === 'EEXIST') {
        throw new Error('Lunar is already running (PID file was created concurrently)')
      }
      throw err
    }
    log.debug({ pid: process.pid, path: this.pidPath }, 'PID lock acquired')
  }

  /**
   * Remove the PID lock file if it names this process. Best effort: a lock
   * owned by another process is left alone and I/O errors are ignored.
   */
  private releasePidLock(): void {
    try {
      if (!existsSync(this.pidPath)) return
      const owner = readFileSync(this.pidPath, 'utf-8').trim()
      if (owner !== String(process.pid)) return
      unlinkSync(this.pidPath)
    } catch {
      // Best effort
    }
  }

  /**
   * Install process-level handlers: SIGINT/SIGTERM trigger a graceful stop and
   * exit, an uncaught exception triggers a stop followed by exit code 1, and
   * unhandled rejections are logged.
   */
  private registerSignals(): void {
    const shutdown = async (signal: string): Promise<void> => {
      log.info({ signal }, 'Received signal')
      let exitCode = 0
      try {
        await this.stop()
      } catch (err: unknown) {
        // Without this the rejection would skip process.exit and leave the process hanging
        log.error({ err }, 'Error during shutdown')
        exitCode = 1
      }
      process.exit(exitCode)
    }

    process.on('SIGINT', () => {
      void shutdown('SIGINT')
    })
    process.on('SIGTERM', () => {
      void shutdown('SIGTERM')
    })
    process.on('uncaughtException', (err) => {
      log.fatal({ err }, 'Uncaught exception')
      this.stop()
        .catch((stopErr: unknown) => {
          log.error({ err: stopErr }, 'Error during shutdown')
        })
        .finally(() => process.exit(1))
    })
    process.on('unhandledRejection', (err) => {
      log.error({ err }, 'Unhandled rejection')
    })
  }
}
@@ -0,0 +1,414 @@
1
+ /**
2
+ * @module dedup — Near-duplicate detection for memory results
3
+ *
4
+ * Zero external dependencies. Three strategies:
5
+ *
6
+ * 1. **Dice (bigram)** — Character bigram overlap (Sørensen–Dice coefficient).
7
+ * Best for short texts (<300 chars). O(n) time, O(n) space.
8
+ *
9
+ * 2. **MinHash** — Approximate Jaccard similarity via locality-sensitive hashing.
10
+ * Uses word-level trigram shingling + FNV-1a hash + random permutations.
11
+ * Best for medium-long texts. O(n·k) time where k = numPermutations.
12
+ * Based on: Broder (1997), BigCode/HuggingFace dedup pipeline, GPT-3 paper.
13
+ *
14
+ * 3. **Adaptive** — Auto-selects Dice or MinHash based on text length.
15
+ * Default strategy. Crossover at ~300 chars.
16
+ *
17
+ * Design priorities:
18
+ * - Conservative: high precision (don't wrongly remove unique content) over
19
+ * high recall (catch every duplicate). Missing a duplicate = wasted tokens.
20
+ * Removing unique content = lost context.
21
+ * - Zero dependencies: pure TypeScript, no npm packages.
22
+ * - Framework-ready: pluggable DedupStrategy interface for custom implementations.
23
+ *
24
+ * References:
25
+ * - Sørensen–Dice: https://en.wikipedia.org/wiki/Dice-S%C3%B8rensen_coefficient
26
+ * - MinHash: Broder (1997), "On the resemblance and containment of documents"
27
+ * - "In Defense of MinHash Over SimHash" (Shrivastava & Li, AISTATS 2014)
28
+ * - HuggingFace BigCode: word-level trigram shingling standard
29
+ * - FNV-1a: Fowler–Noll–Vo non-cryptographic hash
30
+ */
31
+
32
+ // ════════════════════════════════════════════════════════════
33
+ // Types
34
+ // ════════════════════════════════════════════════════════════
35
+
36
/** Outcome of one deduplication pass over items of type `T`. */
export interface DedupResult<T> {
  /** Items that survived deduplication. */
  kept: T[]
  /** Dropped items, each paired with the kept item it matched and the similarity score. */
  removed: { item: T; duplicateOf: T; similarity: number }[]
}
42
+
43
/** Pluggable text-similarity measure used by the deduplication engine. */
export interface DedupStrategy {
  /** Short identifier of the strategy, for logging/debugging. */
  readonly id: string
  /** Score how alike two texts are, from 0 to 1. */
  compare(a: string, b: string): number
}
49
+
50
/** Tuning knobs for a deduplication pass; every field is optional. */
export interface DedupOptions {
  /** Items scoring strictly above this similarity count as duplicates. Default: 0.85 */
  threshold?: number
  /** Cap on the number of kept items (token budget proxy). Default: Infinity */
  maxResults?: number
  /** Turns an item into the text that gets compared. Default: String(item) */
  textExtractor?: (item: unknown) => string
}
58
+
59
+ // ════════════════════════════════════════════════════════════
60
+ // Text normalization
61
+ // ════════════════════════════════════════════════════════════
62
+
63
/** One or more consecutive whitespace characters. */
const WHITESPACE_RE = /\s+/g
/**
 * Any character that is not a letter, combining mark, number or whitespace.
 * Marks (`\p{M}`) are kept because many scripts spell vowels and accents with
 * them; removing them would split words apart.
 */
const PUNCTUATION_RE = /[^\p{L}\p{M}\p{N}\s]/gu

/**
 * Normalize text for comparison: compose to Unicode NFC, lowercase, replace
 * punctuation with spaces (keeping letters, combining marks and numbers in any
 * language), collapse whitespace runs to a single space and trim.
 *
 * NFC composition makes precomposed and decomposed spellings of the same
 * character (e.g. "é" vs "e" + U+0301) normalize to the same string.
 *
 * @param text - Raw input text.
 * @returns The normalized text; empty when the input has no letters or digits.
 */
export function normalize(text: string): string {
  return text
    .normalize('NFC')
    .toLowerCase()
    .replace(PUNCTUATION_RE, ' ')
    .replace(WHITESPACE_RE, ' ')
    .trim()
}
73
+
74
+ // ════════════════════════════════════════════════════════════
75
+ // FNV-1a hash (32-bit)
76
+ // ════════════════════════════════════════════════════════════
77
+
78
/** FNV-1a 32-bit offset basis (initial hash value). */
const FNV_OFFSET = 0x811c9dc5
/** FNV 32-bit prime multiplier. */
const FNV_PRIME = 0x01000193

/**
 * FNV-1a 32-bit hash over the UTF-16 code units of a string.
 * Non-cryptographic; used internally by MinHash to hash shingles.
 *
 * @param str - Text to hash.
 * @returns Unsigned 32-bit hash; the offset basis for the empty string.
 */
export function fnv1a32(str: string): number {
  let acc = FNV_OFFSET
  let idx = 0
  while (idx < str.length) {
    // XOR in the code unit, multiply modulo 2^32, then force back to unsigned
    acc = Math.imul(acc ^ str.charCodeAt(idx), FNV_PRIME) >>> 0
    idx += 1
  }
  return acc
}
93
+
94
+ // ════════════════════════════════════════════════════════════
95
+ // Dice coefficient (bigram-based)
96
+ // ════════════════════════════════════════════════════════════
97
+
98
/**
 * Sørensen–Dice coefficient over character bigrams, treated as multisets
 * (repeated bigrams are counted, so "GG" vs "GGGG" is not 1.0).
 *
 * Identical strings score 1; otherwise any string shorter than two characters
 * scores 0 because it has no bigram.
 *
 * @param a - First text (compared as-is, no normalization).
 * @param b - Second text.
 * @returns 0–1 similarity score
 */
export function diceSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a.length < 2 || b.length < 2) return 0

  // Bigram counts of `a` that have not yet been matched by a bigram of `b`
  const unmatched = new Map<string, number>()
  for (let end = 1; end < a.length; end++) {
    const gram = a.slice(end - 1, end + 1)
    unmatched.set(gram, (unmatched.get(gram) ?? 0) + 1)
  }

  // Each bigram of `b` consumes at most one occurrence from `a`,
  // which yields the multiset intersection size.
  let shared = 0
  for (let end = 1; end < b.length; end++) {
    const gram = b.slice(end - 1, end + 1)
    const remaining = unmatched.get(gram) ?? 0
    if (remaining > 0) {
      unmatched.set(gram, remaining - 1)
      shared += 1
    }
  }

  const totalBigrams = a.length - 1 + b.length - 1
  return (2 * shared) / totalBigrams
}
131
+
132
/** Strategy that scores two texts by the Dice coefficient of their normalized forms. */
export class DiceDedupStrategy implements DedupStrategy {
  readonly id = 'dice'

  /** Normalize both texts, then compare their character bigrams. */
  compare(a: string, b: string): number {
    const left = normalize(a)
    const right = normalize(b)
    return diceSimilarity(left, right)
  }
}
139
+
140
+ // ════════════════════════════════════════════════════════════
141
+ // MinHash (approximate Jaccard via LSH)
142
+ // ════════════════════════════════════════════════════════════
143
+
144
/**
 * Split text into overlapping word-level n-grams ("shingles").
 * The text is normalized first and split on single spaces.
 *
 * "the quick brown fox" with n=3 → ["the quick brown", "quick brown fox"]
 *
 * @param text - Input text.
 * @param n - Words per shingle. Default: 3.
 * @returns One shingle per window of `n` consecutive words. When the text has
 *   fewer than `n` words, a single shingle holding all of them (the empty
 *   string for empty text).
 */
export function shingle(text: string, n = 3): string[] {
  const tokens = normalize(text)
    .split(' ')
    .filter((token) => token.length > 0)

  if (tokens.length < n) return [tokens.join(' ')]

  const windowCount = tokens.length - n + 1
  return Array.from({ length: windowCount }, (_, start) =>
    tokens.slice(start, start + n).join(' '),
  )
}
162
+
163
// Mersenne prime 2^31 - 1, the modulus for MinHash permutation simulation
const MERSENNE_PRIME = 0x7fffffff

/** Coefficient tables for the `numPerm` simulated MinHash permutations. */
export interface PermutationCoeffs {
  /** Multipliers, one per permutation. */
  a: Uint32Array
  /** Offsets, one per permutation. */
  b: Uint32Array
  /** Number of permutations (length of `a` and `b`). */
  numPerm: number
}

/**
 * Build the coefficient tables for MinHash.
 *
 * Values come from a xorshift32 generator seeded with `seed`, so the same
 * arguments always produce the same tables. A seed of 0 is replaced by 1
 * because xorshift cannot leave the all-zero state.
 *
 * @param numPerm - Number of permutations to generate. Default: 128.
 * @param seed - PRNG seed. Default: 42.
 * @returns Tables with `a[i]` in [1, 2^31 - 2] and `b[i]` in [0, 2^31 - 2].
 */
export function generatePermutations(numPerm = 128, seed = 42): PermutationCoeffs {
  const a = new Uint32Array(numPerm)
  const b = new Uint32Array(numPerm)

  let rngState = seed >>> 0 || 1
  const nextUint32 = (): number => {
    rngState ^= rngState << 13
    rngState ^= rngState >>> 17
    rngState ^= rngState << 5
    return rngState >>> 0
  }

  for (let slot = 0; slot < numPerm; slot += 1) {
    // The +1 keeps the multiplier non-zero
    a[slot] = 1 + (nextUint32() % (MERSENNE_PRIME - 1))
    b[slot] = nextUint32() % MERSENNE_PRIME
  }

  return { a, b, numPerm }
}
200
+
201
/**
 * Compute a MinHash signature for a list of shingles.
 *
 * Each shingle is hashed with FNV-1a. For permutation `i` the hash is mixed as
 *   ((a_i * hash mod 2^32) + b_i) mod (2^31 - 1)
 * and the smallest mixed value over all shingles becomes `signature[i]`.
 * Note the product is truncated to 32 bits (`Math.imul`) before the modulus.
 *
 * @param shingles - Shingles of one text; duplicates do not change the result.
 * @param coeffs - Permutation coefficients, shared by all texts being compared.
 * @returns Uint32Array of `numPerm` minima. Slots stay at 0xffffffff when
 *   `shingles` is empty.
 */
export function computeMinHash(shingles: string[], coeffs: PermutationCoeffs): Uint32Array {
  const { a, b, numPerm } = coeffs
  // Hash every shingle once up front; the permutation loop reuses the values
  const hashes = shingles.map((text) => fnv1a32(text))
  const signature = new Uint32Array(numPerm)

  for (let slot = 0; slot < numPerm; slot++) {
    let lowest = 0xffffffff
    for (const hash of hashes) {
      const mixed = ((Math.imul(a[slot], hash) >>> 0) + b[slot]) % MERSENNE_PRIME
      if (mixed < lowest) lowest = mixed
    }
    signature[slot] = lowest
  }

  return signature
}
227
+
228
/**
 * Estimate Jaccard similarity from two MinHash signatures as the fraction of
 * positions holding the same value.
 *
 * The estimate gets tighter as the signature gets longer; the file's own
 * figures are roughly ±8.8% for 128 permutations and ±6.3% for 256.
 *
 * @param sigA - First signature.
 * @param sigB - Second signature; must have the same length as `sigA`.
 * @returns Score in 0–1. Two zero-length signatures score 0: they carry no
 *   evidence of similarity, and 0 avoids the NaN that 0/0 would produce.
 * @throws Error when the signatures differ in length.
 */
export function minhashSimilarity(sigA: Uint32Array, sigB: Uint32Array): number {
  if (sigA.length !== sigB.length) {
    throw new Error(`Signature length mismatch: ${sigA.length} vs ${sigB.length}`)
  }

  const total = sigA.length
  if (total === 0) return 0

  let matches = 0
  for (let i = 0; i < total; i++) {
    if (sigA[i] === sigB[i]) matches++
  }
  return matches / total
}
246
+
247
/**
 * Exact Jaccard similarity |A ∩ B| / |A ∪ B| of two string sets.
 * Two empty sets are treated as identical and score 1.
 */
function jaccardExact(a: Set<string>, b: Set<string>): number {
  const shared = [...a].filter((member) => b.has(member)).length
  const unionSize = a.size + b.size - shared
  if (unionSize === 0) return 1
  return shared / unionSize
}
256
+
257
/**
 * Strategy that estimates Jaccard similarity of word shingles with MinHash.
 * The permutation coefficients are generated once per instance and reused for
 * every comparison.
 */
export class MinHashDedupStrategy implements DedupStrategy {
  readonly id = 'minhash'
  private readonly coeffs: PermutationCoeffs
  private readonly shingleSize: number

  /**
   * @param numPerm - Number of permutations (more = more accurate, slower).
   * @param shingleSize - Word n-gram size used when shingling.
   * @param seed - PRNG seed for reproducible permutations.
   */
  constructor(numPerm = 128, shingleSize = 3, seed = 42) {
    this.shingleSize = shingleSize
    this.coeffs = generatePermutations(numPerm, seed)
  }

  /**
   * Compare two texts. When either text yields three shingles or fewer, the
   * exact Jaccard similarity of the shingle sets is returned instead of the
   * MinHash estimate.
   */
  compare(a: string, b: string): number {
    const left = shingle(a, this.shingleSize)
    const right = shingle(b, this.shingleSize)

    const tooFewShingles = Math.min(left.length, right.length) <= 3
    if (tooFewShingles) {
      return jaccardExact(new Set(left), new Set(right))
    }

    return minhashSimilarity(
      computeMinHash(left, this.coeffs),
      computeMinHash(right, this.coeffs),
    )
  }
}
287
+
288
+ // ════════════════════════════════════════════════════════════
289
+ // Adaptive strategy (auto-selects based on text length)
290
+ // ════════════════════════════════════════════════════════════
291
+
292
/**
 * Strategy that picks Dice or MinHash per comparison, based on the length of
 * the longer (raw, un-normalized) text.
 */
export class AdaptiveDedupStrategy implements DedupStrategy {
  readonly id = 'adaptive'
  private readonly dice = new DiceDedupStrategy()
  private readonly minhash: MinHashDedupStrategy
  private readonly crossoverLength: number

  /**
   * @param crossoverLength - Char count threshold to switch Dice → MinHash.
   *   Texts up to and including this length use Dice. Default: 300.
   * @param numPerm - MinHash permutations (passed through). Default: 128.
   * @param shingleSize - MinHash word n-gram size (passed through). Default: 3,
   *   which matches the behaviour before this parameter existed.
   */
  constructor(crossoverLength = 300, numPerm = 128, shingleSize = 3) {
    this.crossoverLength = crossoverLength
    this.minhash = new MinHashDedupStrategy(numPerm, shingleSize)
  }

  /** Compare two texts with Dice when both are short, MinHash otherwise. */
  compare(a: string, b: string): number {
    const maxLen = Math.max(a.length, b.length)
    if (maxLen <= this.crossoverLength) {
      return this.dice.compare(a, b)
    }
    return this.minhash.compare(a, b)
  }
}
317
+
318
+ // ════════════════════════════════════════════════════════════
319
+ // Deduplication engine
320
+ // ════════════════════════════════════════════════════════════
321
+
322
/**
 * Default text extraction: strings are used as-is, everything else goes
 * through `String(item)`. Plain objects would all stringify to the same
 * "[object Object]" and be treated as duplicates of each other, so for those
 * the JSON form is used when it can be produced.
 */
function defaultTextExtractor(item: unknown): string {
  if (typeof item === 'string') return item

  const text = String(item)
  if (typeof item !== 'object' || item === null || text !== '[object Object]') {
    return text
  }

  try {
    return JSON.stringify(item) ?? text
  } catch {
    // Circular references or BigInt values: keep the generic string
    return text
  }
}

/**
 * Deduplicate an array of items using the given strategy.
 *
 * Each item is compared against the items kept so far. The first kept item
 * scoring strictly above `threshold` makes the newcomer a duplicate, so
 * earlier items have priority — sort by relevance before calling this.
 *
 * Processing stops once `maxResults` items are kept; items after that point
 * appear in neither `kept` nor `removed`.
 *
 * Complexity: O(n² · C) comparisons where C = cost of one comparison. The text
 * of each item is extracted once.
 *
 * @param items - Items to deduplicate (pre-sorted by relevance, best first)
 * @param strategy - Comparison strategy (default: adaptive)
 * @param options - Threshold, maxResults, text extractor
 * @returns Kept items + removed items with metadata
 */
export function deduplicate<T>(
  items: T[],
  strategy: DedupStrategy = new AdaptiveDedupStrategy(),
  options: DedupOptions = {},
): DedupResult<T> {
  const {
    threshold = 0.85,
    maxResults = Infinity,
    textExtractor = defaultTextExtractor,
  } = options

  const kept: T[] = []
  // Kept items together with their extracted text, so it is not re-extracted per comparison
  const keptEntries: Array<{ item: T; text: string }> = []
  const removed: DedupResult<T>['removed'] = []

  for (const item of items) {
    if (kept.length >= maxResults) break

    const text = textExtractor(item)
    let isDuplicate = false

    for (const existing of keptEntries) {
      const similarity = strategy.compare(text, existing.text)

      if (similarity > threshold) {
        removed.push({ item, duplicateOf: existing.item, similarity })
        isDuplicate = true
        break
      }
    }

    if (!isDuplicate) {
      kept.push(item)
      keptEntries.push({ item, text })
    }
  }

  return { kept, removed }
}
375
+
376
+ // ════════════════════════════════════════════════════════════
377
+ // Factory — create strategy from config
378
+ // ════════════════════════════════════════════════════════════
379
+
380
/** Declarative settings from which a dedup strategy is built; every field is optional. */
export interface DedupConfig {
  /** Which strategy to build: 'dice' | 'minhash' | 'adaptive' (default) */
  strategy?: 'dice' | 'minhash' | 'adaptive'
  /** Similarity threshold for duplicate detection. Default: 0.85 */
  threshold?: number
  /** MinHash: number of permutations. Default: 128 */
  numPermutations?: number
  /** MinHash: word n-gram size. Default: 3 */
  shingleSize?: number
  /** Adaptive: crossover length in chars. Default: 300 */
  crossoverLength?: number
  /** Maximum results after dedup. Default: Infinity */
  maxResults?: number
}
394
+
395
/**
 * Create a DedupStrategy from config.
 *
 * - 'dice' ignores every other setting.
 * - 'minhash' receives `numPermutations` and `shingleSize`.
 * - 'adaptive' (also used for a missing or unrecognised name) receives
 *   `crossoverLength` and `numPermutations`; `shingleSize` is not forwarded.
 *
 * `threshold` and `maxResults` are not consumed here.
 */
export function createStrategy(config: DedupConfig = {}): DedupStrategy {
  const {
    strategy: kind = 'adaptive',
    numPermutations: permutations = 128,
    shingleSize: wordsPerShingle = 3,
    crossoverLength: crossover = 300,
  } = config

  if (kind === 'dice') {
    return new DiceDedupStrategy()
  }
  if (kind === 'minhash') {
    return new MinHashDedupStrategy(permutations, wordsPerShingle)
  }
  // 'adaptive', plus the fallback for any name that is not recognised
  return new AdaptiveDedupStrategy(crossover, permutations)
}