ahok-skill 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc +8 -0
- package/Dockerfile +59 -0
- package/RAW_SKILL.md +219 -0
- package/README.md +277 -0
- package/SKILL.md +58 -0
- package/bin/opm.js +268 -0
- package/data/openmemory.sqlite +0 -0
- package/data/openmemory.sqlite-shm +0 -0
- package/data/openmemory.sqlite-wal +0 -0
- package/dist/ai/graph.js +293 -0
- package/dist/ai/mcp.js +397 -0
- package/dist/cli.js +78 -0
- package/dist/core/cfg.js +87 -0
- package/dist/core/db.js +636 -0
- package/dist/core/memory.js +116 -0
- package/dist/core/migrate.js +227 -0
- package/dist/core/models.js +105 -0
- package/dist/core/telemetry.js +57 -0
- package/dist/core/types.js +2 -0
- package/dist/core/vector/postgres.js +52 -0
- package/dist/core/vector/valkey.js +246 -0
- package/dist/core/vector_store.js +2 -0
- package/dist/index.js +44 -0
- package/dist/memory/decay.js +301 -0
- package/dist/memory/embed.js +675 -0
- package/dist/memory/hsg.js +959 -0
- package/dist/memory/reflect.js +131 -0
- package/dist/memory/user_summary.js +99 -0
- package/dist/migrate.js +9 -0
- package/dist/ops/compress.js +255 -0
- package/dist/ops/dynamics.js +189 -0
- package/dist/ops/extract.js +333 -0
- package/dist/ops/ingest.js +214 -0
- package/dist/server/index.js +109 -0
- package/dist/server/middleware/auth.js +137 -0
- package/dist/server/routes/auth.js +186 -0
- package/dist/server/routes/compression.js +108 -0
- package/dist/server/routes/dashboard.js +399 -0
- package/dist/server/routes/docs.js +241 -0
- package/dist/server/routes/dynamics.js +312 -0
- package/dist/server/routes/ide.js +280 -0
- package/dist/server/routes/index.js +33 -0
- package/dist/server/routes/keys.js +132 -0
- package/dist/server/routes/langgraph.js +61 -0
- package/dist/server/routes/memory.js +213 -0
- package/dist/server/routes/sources.js +140 -0
- package/dist/server/routes/system.js +63 -0
- package/dist/server/routes/temporal.js +293 -0
- package/dist/server/routes/users.js +101 -0
- package/dist/server/routes/vercel.js +57 -0
- package/dist/server/server.js +211 -0
- package/dist/server.js +3 -0
- package/dist/sources/base.js +223 -0
- package/dist/sources/github.js +171 -0
- package/dist/sources/google_drive.js +166 -0
- package/dist/sources/google_sheets.js +112 -0
- package/dist/sources/google_slides.js +139 -0
- package/dist/sources/index.js +34 -0
- package/dist/sources/notion.js +165 -0
- package/dist/sources/onedrive.js +143 -0
- package/dist/sources/web_crawler.js +166 -0
- package/dist/temporal_graph/index.js +20 -0
- package/dist/temporal_graph/query.js +240 -0
- package/dist/temporal_graph/store.js +116 -0
- package/dist/temporal_graph/timeline.js +241 -0
- package/dist/temporal_graph/types.js +2 -0
- package/dist/utils/chunking.js +60 -0
- package/dist/utils/index.js +31 -0
- package/dist/utils/keyword.js +94 -0
- package/dist/utils/text.js +120 -0
- package/nodemon.json +7 -0
- package/package.json +50 -0
- package/references/api_reference.md +66 -0
- package/references/examples.md +45 -0
- package/src/ai/graph.ts +363 -0
- package/src/ai/mcp.ts +494 -0
- package/src/cli.ts +94 -0
- package/src/core/cfg.ts +110 -0
- package/src/core/db.ts +1052 -0
- package/src/core/memory.ts +99 -0
- package/src/core/migrate.ts +302 -0
- package/src/core/models.ts +107 -0
- package/src/core/telemetry.ts +47 -0
- package/src/core/types.ts +130 -0
- package/src/core/vector/postgres.ts +61 -0
- package/src/core/vector/valkey.ts +261 -0
- package/src/core/vector_store.ts +9 -0
- package/src/index.ts +5 -0
- package/src/memory/decay.ts +427 -0
- package/src/memory/embed.ts +707 -0
- package/src/memory/hsg.ts +1245 -0
- package/src/memory/reflect.ts +158 -0
- package/src/memory/user_summary.ts +110 -0
- package/src/migrate.ts +8 -0
- package/src/ops/compress.ts +296 -0
- package/src/ops/dynamics.ts +272 -0
- package/src/ops/extract.ts +360 -0
- package/src/ops/ingest.ts +286 -0
- package/src/server/index.ts +159 -0
- package/src/server/middleware/auth.ts +156 -0
- package/src/server/routes/auth.ts +223 -0
- package/src/server/routes/compression.ts +106 -0
- package/src/server/routes/dashboard.ts +420 -0
- package/src/server/routes/docs.ts +380 -0
- package/src/server/routes/dynamics.ts +516 -0
- package/src/server/routes/ide.ts +283 -0
- package/src/server/routes/index.ts +32 -0
- package/src/server/routes/keys.ts +131 -0
- package/src/server/routes/langgraph.ts +71 -0
- package/src/server/routes/memory.ts +440 -0
- package/src/server/routes/sources.ts +111 -0
- package/src/server/routes/system.ts +68 -0
- package/src/server/routes/temporal.ts +335 -0
- package/src/server/routes/users.ts +111 -0
- package/src/server/routes/vercel.ts +55 -0
- package/src/server/server.js +215 -0
- package/src/server.ts +1 -0
- package/src/sources/base.ts +257 -0
- package/src/sources/github.ts +156 -0
- package/src/sources/google_drive.ts +144 -0
- package/src/sources/google_sheets.ts +85 -0
- package/src/sources/google_slides.ts +115 -0
- package/src/sources/index.ts +19 -0
- package/src/sources/notion.ts +148 -0
- package/src/sources/onedrive.ts +131 -0
- package/src/sources/web_crawler.ts +161 -0
- package/src/temporal_graph/index.ts +4 -0
- package/src/temporal_graph/query.ts +299 -0
- package/src/temporal_graph/store.ts +156 -0
- package/src/temporal_graph/timeline.ts +319 -0
- package/src/temporal_graph/types.ts +41 -0
- package/src/utils/chunking.ts +66 -0
- package/src/utils/index.ts +25 -0
- package/src/utils/keyword.ts +137 -0
- package/src/utils/text.ts +115 -0
- package/tests/test_api_workspace_management.ts +413 -0
- package/tests/test_bulk_delete.ts +267 -0
- package/tests/test_omnibus.ts +166 -0
- package/tests/test_workspace_management.ts +278 -0
- package/tests/verify.ts +104 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
import { all_async } from '../core/db'
|
|
4
|
+
import { TemporalFact, TimelineEntry } from './types'
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
/**
 * Builds the chronological event list for one subject.
 *
 * Every matching row of `temporal_facts` contributes a `'created'` entry at
 * its `valid_from`; rows with a truthy `valid_to` additionally contribute an
 * `'invalidated'` entry at that time.
 *
 * @param subject   Subject to look up (exact match).
 * @param predicate Optional predicate filter (exact match); ignored when falsy.
 * @returns Timeline entries sorted by ascending timestamp.
 */
export const get_subject_timeline = async (
    subject: string,
    predicate?: string
): Promise<TimelineEntry[]> => {
    const filter = predicate
        ? { clause: 'subject = ? AND predicate = ?', args: [subject, predicate] }
        : { clause: 'subject = ?', args: [subject] }

    const query = `
        SELECT subject, predicate, object, confidence, valid_from, valid_to
        FROM temporal_facts
        WHERE ${filter.clause}
        ORDER BY valid_from ASC
    `

    const facts = await all_async(query, filter.args)
    const events: TimelineEntry[] = []

    // Appends one timeline entry for `row`, stamped at `at`.
    const record = (row: any, at: any, change_type: TimelineEntry['change_type']): void => {
        events.push({
            timestamp: new Date(at),
            subject: row.subject,
            predicate: row.predicate,
            object: row.object,
            confidence: row.confidence,
            change_type
        })
    }

    for (const fact of facts) {
        record(fact, fact.valid_from, 'created')
        // Only closed facts produce an invalidation event.
        if (fact.valid_to) record(fact, fact.valid_to, 'invalidated')
    }

    // Creation and invalidation events interleave, so re-sort by timestamp.
    events.sort((left, right) => left.timestamp.getTime() - right.timestamp.getTime())
    return events
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
/**
 * Builds the chronological event list for one predicate across all subjects.
 *
 * The optional `from` / `to` bounds are both applied to `valid_from`
 * (inclusive), compared against `Date#getTime()` values. Each selected row
 * yields a `'created'` entry and, when `valid_to` is truthy, an
 * `'invalidated'` entry; the latter is not re-checked against the bounds.
 *
 * @param predicate Predicate to look up (exact match).
 * @param from      Optional lower bound on `valid_from`.
 * @param to        Optional upper bound on `valid_from`.
 * @returns Timeline entries sorted by ascending timestamp.
 */
export const get_predicate_timeline = async (
    predicate: string,
    from?: Date,
    to?: Date
): Promise<TimelineEntry[]> => {
    const clauses = ['predicate = ?']
    const args: any[] = [predicate]

    const bounds: Array<[string, Date | undefined]> = [
        ['valid_from >= ?', from],
        ['valid_from <= ?', to]
    ]
    for (const [clause, bound] of bounds) {
        if (!bound) continue
        clauses.push(clause)
        args.push(bound.getTime())
    }

    const query = `
        SELECT subject, predicate, object, confidence, valid_from, valid_to
        FROM temporal_facts
        WHERE ${clauses.join(' AND ')}
        ORDER BY valid_from ASC
    `

    const facts = await all_async(query, args)
    const events: TimelineEntry[] = []

    for (const fact of facts) {
        const stamps: Array<[any, TimelineEntry['change_type']]> = [[fact.valid_from, 'created']]
        // Only closed facts produce an invalidation event.
        if (fact.valid_to) stamps.push([fact.valid_to, 'invalidated'])

        for (const [at, change_type] of stamps) {
            events.push({
                timestamp: new Date(at),
                subject: fact.subject,
                predicate: fact.predicate,
                object: fact.object,
                confidence: fact.confidence,
                change_type
            })
        }
    }

    events.sort((left, right) => left.timestamp.getTime() - right.timestamp.getTime())
    return events
}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
/**
 * Lists every creation or invalidation that happened inside a time window.
 *
 * A row is selected when its `valid_from` or its `valid_to` lies in
 * `[from, to]` (inclusive). Only the endpoint(s) that actually fall inside
 * the window are emitted as events.
 *
 * @param from    Start of the window.
 * @param to      End of the window.
 * @param subject Optional subject filter (exact match); ignored when falsy.
 * @returns Timeline entries sorted by ascending timestamp.
 */
export const get_changes_in_window = async (
    from: Date,
    to: Date,
    subject?: string
): Promise<TimelineEntry[]> => {
    const from_ts = from.getTime()
    const to_ts = to.getTime()

    const subject_clause = subject ? 'AND subject = ?' : ''
    const subject_args: any[] = subject ? [subject] : []

    const query = `
        SELECT subject, predicate, object, confidence, valid_from, valid_to
        FROM temporal_facts
        WHERE ((valid_from >= ? AND valid_from <= ?) OR (valid_to >= ? AND valid_to <= ?))
        ${subject_clause}
        ORDER BY valid_from ASC
    `

    const facts = await all_async(query, [from_ts, to_ts, from_ts, to_ts, ...subject_args])
    const events: TimelineEntry[] = []

    const inside = (ts: any): boolean => ts >= from_ts && ts <= to_ts

    // Appends one timeline entry for `row`, stamped at `at`.
    const record = (row: any, at: any, change_type: TimelineEntry['change_type']): void => {
        events.push({
            timestamp: new Date(at),
            subject: row.subject,
            predicate: row.predicate,
            object: row.object,
            confidence: row.confidence,
            change_type
        })
    }

    for (const fact of facts) {
        // The row matched on at least one endpoint; emit only those in range.
        if (inside(fact.valid_from)) {
            record(fact, fact.valid_from, 'created')
        }
        if (fact.valid_to && inside(fact.valid_to)) {
            record(fact, fact.valid_to, 'invalidated')
        }
    }

    events.sort((left, right) => left.timestamp.getTime() - right.timestamp.getTime())
    return events
}
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
/**
 * Diffs the facts known about a subject at two points in time.
 *
 * A fact counts as valid at time `t` when `valid_from <= t` and `valid_to` is
 * NULL or `>= t`. Facts are indexed by predicate; if several valid rows share
 * a predicate, the last one returned by the query wins.
 *
 * @param subject Subject to compare (exact match).
 * @param time1   Earlier reference point ("before").
 * @param time2   Later reference point ("after").
 * @returns
 *  - `added`: predicates present only at `time2`
 *  - `removed`: predicates present only at `time1`
 *  - `changed`: predicates whose `object` or row `id` differs between the two
 *  - `unchanged`: predicates backed by the same row with the same object
 */
export const compare_time_points = async (
    subject: string,
    time1: Date,
    time2: Date
): Promise<{
    added: TemporalFact[]
    removed: TemporalFact[]
    changed: Array<{ before: TemporalFact; after: TemporalFact }>
    unchanged: TemporalFact[]
}> => {
    const t1_ts = time1.getTime()
    const t2_ts = time2.getTime()

    // Fetches the rows that were valid for `subject` at instant `ts`.
    const valid_at = (ts: number) => all_async(`
        SELECT id, subject, predicate, object, valid_from, valid_to, confidence, last_updated, metadata
        FROM temporal_facts
        WHERE subject = ?
        AND valid_from <= ? AND (valid_to IS NULL OR valid_to >= ?)
    `, [subject, ts, ts])

    // Indexes rows by predicate; later rows overwrite earlier ones.
    const by_predicate = (rows: any[]): Map<string, any> => {
        const index = new Map<string, any>()
        for (const row of rows) index.set(row.predicate, row)
        return index
    }

    const rows_t1 = await valid_at(t1_ts)
    const rows_t2 = await valid_at(t2_ts)
    const at_t1 = by_predicate(rows_t1)
    const at_t2 = by_predicate(rows_t2)

    const added: TemporalFact[] = []
    const removed: TemporalFact[] = []
    const changed: Array<{ before: TemporalFact; after: TemporalFact }> = []
    const unchanged: TemporalFact[] = []

    // Walk the later snapshot: classify each predicate as added/changed/unchanged.
    for (const [pred, later] of at_t2) {
        const earlier = at_t1.get(pred)
        if (!earlier) {
            added.push(row_to_fact(later))
            continue
        }
        const same_row = earlier.object === later.object && earlier.id === later.id
        if (same_row) {
            unchanged.push(row_to_fact(later))
        } else {
            changed.push({
                before: row_to_fact(earlier),
                after: row_to_fact(later)
            })
        }
    }

    // Anything only in the earlier snapshot has been removed.
    for (const [pred, earlier] of at_t1) {
        if (at_t2.has(pred)) continue
        removed.push(row_to_fact(earlier))
    }

    return { added, removed, changed, unchanged }
}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
/**
 * Summarises how often a (subject, predicate) pair received new facts within
 * the trailing `window_days` days.
 *
 * @param subject     Subject to inspect (exact match).
 * @param predicate   Predicate to inspect (exact match).
 * @param window_days Size of the trailing window in days (default 30).
 * @returns
 *  - `predicate`: echoed back
 *  - `total_changes`: number of rows whose `valid_from` is inside the window
 *  - `avg_duration_ms`: mean of `valid_to - valid_from` over rows that have a
 *    truthy `valid_to`; `0` when there are none
 *  - `change_rate_per_day`: `total_changes / window_days`, or `0` when
 *    `window_days` is not a positive number
 */
export const get_change_frequency = async (
    subject: string,
    predicate: string,
    window_days: number = 30
): Promise<{
    predicate: string
    total_changes: number
    avg_duration_ms: number
    change_rate_per_day: number
}> => {
    const now = Date.now()
    // 86_400_000 ms per day.
    const window_start = now - (window_days * 86400000)

    const rows = await all_async(`
        SELECT valid_from, valid_to
        FROM temporal_facts
        WHERE subject = ? AND predicate = ?
        AND valid_from >= ?
        ORDER BY valid_from ASC
    `, [subject, predicate, window_start])

    const total_changes = rows.length
    let total_duration = 0
    let valid_durations = 0

    for (const row of rows) {
        // Still-open facts have no duration yet and are left out of the average.
        if (row.valid_to) {
            total_duration += row.valid_to - row.valid_from
            valid_durations++
        }
    }

    const avg_duration_ms = valid_durations > 0 ? total_duration / valid_durations : 0
    // A zero, negative or NaN window would otherwise produce NaN/Infinity,
    // which JSON-serialises to null.
    const change_rate_per_day = window_days > 0 ? total_changes / window_days : 0

    return {
        predicate,
        total_changes,
        avg_duration_ms,
        change_rate_per_day
    }
}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
/**
 * Ranks (subject, predicate) pairs by how many fact rows they have.
 *
 * Only pairs with more than one row are returned, ordered by row count
 * (descending) and then by average confidence (ascending).
 *
 * @param subject Optional subject filter (exact match); ignored when falsy.
 * @param limit   Maximum number of pairs to return (default 10).
 * @returns Rows of `{ subject, predicate, change_count, avg_confidence }`.
 */
export const get_volatile_facts = async (
    subject?: string,
    limit: number = 10
): Promise<Array<{
    subject: string
    predicate: string
    change_count: number
    avg_confidence: number
}>> => {
    const where = subject ? 'WHERE subject = ?' : ''
    const params = subject ? [subject] : []

    // HAVING repeats the aggregate instead of using the `change_count` alias:
    // SQLite accepts the alias there, but standard SQL / PostgreSQL does not.
    const sql = `
        SELECT subject, predicate, COUNT(*) as change_count, AVG(confidence) as avg_confidence
        FROM temporal_facts
        ${where}
        GROUP BY subject, predicate
        HAVING COUNT(*) > 1
        ORDER BY change_count DESC, avg_confidence ASC
        LIMIT ?
    `

    return await all_async(sql, [...params, limit])
}
|
|
305
|
+
|
|
306
|
+
// Helper function
|
|
307
|
+
/**
 * Converts a raw `temporal_facts` row into a `TemporalFact`.
 *
 * `valid_from`, `valid_to` and `last_updated` are wrapped in `Date`; a falsy
 * `valid_to` becomes `null`. A truthy `metadata` value is run through
 * `JSON.parse` (which throws on malformed JSON); otherwise it is `undefined`.
 *
 * @param row Row containing the columns selected by the fact queries.
 * @returns The typed fact.
 */
function row_to_fact(row: any): TemporalFact {
    const { id, subject, predicate, object, confidence } = row

    const valid_from = new Date(row.valid_from)
    const valid_to = row.valid_to ? new Date(row.valid_to) : null
    const last_updated = new Date(row.last_updated)
    const metadata = row.metadata ? JSON.parse(row.metadata) : undefined

    return {
        id,
        subject,
        predicate,
        object,
        valid_from,
        valid_to,
        confidence,
        last_updated,
        metadata
    }
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * A subject–predicate–object statement together with the interval during
 * which it is considered valid.
 */
export interface TemporalFact {
    /** Identifier of the fact. */
    id: string
    /** What the statement is about. */
    subject: string
    /** The relation or attribute being stated. */
    predicate: string
    /** The value of the relation or attribute. */
    object: string
    /** Start of the validity interval. */
    valid_from: Date
    /** End of the validity interval; `null` when the interval is still open. */
    valid_to: Date | null
    /** Confidence score attached to the fact (scale not specified here). */
    confidence: number
    /** Timestamp of the most recent update. */
    last_updated: Date
    /** Optional free-form metadata. */
    metadata?: Record<string, any>
}
|
|
12
|
+
|
|
13
|
+
/**
 * A typed, weighted link between two identifiers together with the interval
 * during which it is considered valid.
 */
export interface TemporalEdge {
    /** Identifier of the edge. */
    id: string
    /** Identifier of the edge's source (presumably a fact id; confirm against the store). */
    source_id: string
    /** Identifier of the edge's target (presumably a fact id; confirm against the store). */
    target_id: string
    /** Label describing the kind of relation. */
    relation_type: string
    /** Start of the validity interval. */
    valid_from: Date
    /** End of the validity interval; `null` when the interval is still open. */
    valid_to: Date | null
    /** Weight of the edge (scale not specified here). */
    weight: number
    /** Optional free-form metadata. */
    metadata?: Record<string, any>
}
|
|
23
|
+
|
|
24
|
+
/** One event on a fact timeline. */
export interface TimelineEntry {
    /** When the event happened. */
    timestamp: Date
    /** Subject of the fact the event refers to. */
    subject: string
    /** Predicate of the fact the event refers to. */
    predicate: string
    /** Object of the fact the event refers to. */
    object: string
    /** Confidence of the fact the event refers to. */
    confidence: number
    /** Kind of event: the fact was created, updated, or invalidated. */
    change_type: 'created' | 'updated' | 'invalidated'
}
|
|
32
|
+
|
|
33
|
+
/**
 * Optional filter criteria for looking up temporal facts. Every field may be
 * omitted; how the fields are combined is decided by the consumer of this type.
 */
export interface TemporalQuery {
    /** Restrict to this subject. */
    subject?: string
    /** Restrict to this predicate. */
    predicate?: string
    /** Restrict to this object. */
    object?: string
    /** Point in time of interest. */
    at?: Date
    /** Lower time bound. */
    from?: Date
    /** Upper time bound. */
    to?: Date
    /** Minimum confidence to accept. */
    min_confidence?: number
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/** A slice of a longer text produced by `chunk_text`. */
export type chunk = {
  /** The chunk's text. */
  text: string;
  /** Approximate start offset of the chunk (see `chunk_text`). */
  start: number;
  /** `start` plus the length of `text`. */
  end: number;
  /** Estimated token count of `text`. */
  tokens: number;
};

/** Characters-per-token ratio used for the rough token estimate. */
const cpt = 4;
/** Estimates the token count of a string from its length. */
const est = (t: string) => Math.ceil(t.length / cpt);

/**
 * Splits text into chunks of roughly `tgt` tokens, with each chunk after the
 * first beginning with the tail of the previous one.
 *
 * Text whose estimated size is at most `tgt` is returned as a single chunk.
 * Otherwise the text is split into paragraphs (blank lines) and sentences
 * (whitespace after `.`, `!` or `?`), and sentences are packed greedily,
 * joined by single spaces. A sentence longer than the budget is kept whole.
 *
 * `start` / `end` are offsets into the text as re-joined with single spaces;
 * they match the original string exactly only where its separators are
 * single spaces.
 *
 * @param txt Text to split.
 * @param tgt Target chunk size in estimated tokens (default 768).
 * @param ovr Fraction of the chunk budget repeated as overlap (default 0.1);
 *            `0` or a negative value disables overlap.
 * @returns The chunks in document order.
 */
export const chunk_text = (txt: string, tgt = 768, ovr = 0.1): chunk[] => {
  const tot = est(txt);
  if (tot <= tgt)
    return [{ text: txt, start: 0, end: txt.length, tokens: tot }];

  const tch = tgt * cpt,
    och = Math.floor(tch * ovr);
  const paras = txt.split(/\n\n+/);

  const chks: chunk[] = [];
  let cur = "",
    cs = 0;

  for (const p of paras) {
    const sents = p.split(/(?<=[.!?])\s+/);
    for (const s of sents) {
      const pot = cur ? cur + " " + s : s;
      if (pot.length > tch && cur.length > 0) {
        chks.push({
          text: cur,
          start: cs,
          end: cs + cur.length,
          tokens: est(cur),
        });
        // `slice(-0)` is `slice(0)` and would return the whole string, so a
        // zero-length overlap has to be handled explicitly.
        const ovt = och > 0 ? cur.slice(-och) : "";
        // `s` begins one separator after the end of the chunk just emitted.
        const sentence_start = cs + cur.length + 1;
        if (ovt) {
          cur = ovt + " " + s;
          // The new chunk begins where the overlap begins.
          cs = sentence_start - ovt.length - 1;
        } else {
          cur = s;
          cs = sentence_start;
        }
      } else cur = pot;
    }
  }

  if (cur.length > 0)
    chks.push({
      text: cur,
      start: cs,
      end: cs + cur.length,
      tokens: est(cur),
    });
  return chks;
};
|
|
51
|
+
|
|
52
|
+
/**
 * Averages a list of vectors component-wise.
 *
 * The dimension is taken from the first vector. A single input vector is
 * returned as a shallow copy.
 *
 * @param vecs Vectors to average.
 * @returns The mean vector.
 * @throws Error("no vecs") when `vecs` is empty.
 */
export const agg_vec = (vecs: number[][]): number[] => {
  const count = vecs.length;
  if (count === 0) throw new Error("no vecs");
  if (count === 1) return vecs[0].slice();

  const dims = vecs[0].length;
  const sums: number[] = new Array(dims).fill(0);
  for (const vec of vecs) {
    for (let k = 0; k < dims; k++) {
      sums[k] += vec[k];
    }
  }

  // Multiply by the reciprocal once instead of dividing every component.
  const inv = 1 / count;
  for (let k = 0; k < dims; k++) {
    sums[k] *= inv;
  }
  return sums;
};
|
|
64
|
+
|
|
65
|
+
/**
 * Concatenates the text of the given chunks, separated by single spaces.
 *
 * @param cks Chunks to join, in order.
 * @returns The joined text; an empty string for an empty list.
 */
export const join_chunks = (cks: chunk[]) => {
  const parts: string[] = [];
  for (const c of cks) {
    parts.push(c.text);
  }
  return parts.join(" ");
};
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/** Returns the current time as milliseconds since the Unix epoch. */
export const now = (): number => {
  return Date.now();
};

/** Returns a fresh random UUID string from the global `crypto.randomUUID()`. */
export const rid = (): string => {
  return crypto.randomUUID();
};
|
|
3
|
+
/**
 * Computes the cosine similarity of two vectors.
 *
 * Iterates over the length of `a`; `b` is read at the same indices.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns `dot(a, b) / (|a| * |b|)`, or `0` when that denominator is zero
 *          or NaN.
 */
export const cos_sim = (a: Float32Array, b: Float32Array): number => {
  let dot = 0;
  let norm_a = 0;
  let norm_b = 0;

  a.forEach((x, idx) => {
    const y = b[idx];
    dot += x * y;
    norm_a += x * x;
    norm_b += y * y;
  });

  const denom = Math.sqrt(norm_a) * Math.sqrt(norm_b);
  if (!denom) return 0;
  return dot / denom;
};
|
|
17
|
+
/** Shorthand alias for `JSON.stringify`. */
export const j = JSON.stringify;

/**
 * Shorthand for `JSON.parse` with a caller-chosen result type.
 *
 * The type parameter is not checked at runtime. Throws whatever `JSON.parse`
 * throws on malformed input.
 */
export const p = <t = any>(x: string): t => {
  return JSON.parse(x);
};
|
|
19
|
+
/**
 * Packs a numeric vector into a Buffer of 32-bit floats (4 bytes per
 * component, platform byte order).
 *
 * @param v Vector components.
 * @returns Buffer over the bytes of a `Float32Array` built from `v`.
 */
export const vec_to_buf = (v: number[]): Buffer =>
  Buffer.from(new Float32Array(v).buffer);
|
|
23
|
+
/**
 * Reinterprets a Buffer of packed 32-bit floats as a `Float32Array`.
 *
 * When the Buffer starts on a 4-byte boundary of its backing `ArrayBuffer`,
 * the result is a zero-copy view over the same memory. Otherwise (e.g. a
 * pooled or sliced Buffer) the bytes are copied into a fresh, aligned buffer,
 * because a `Float32Array` cannot be created at a misaligned offset.
 * Trailing bytes that do not form a whole float are ignored.
 *
 * @param buf Buffer holding floats in platform byte order.
 * @returns The decoded vector.
 */
export const buf_to_vec = (buf: Buffer): Float32Array => {
  const width = Float32Array.BYTES_PER_ELEMENT;
  const count = Math.floor(buf.byteLength / width);

  if (buf.byteOffset % width === 0) {
    return new Float32Array(buf.buffer, buf.byteOffset, count);
  }

  // Misaligned view: the typed-array constructor would throw a RangeError.
  const aligned = new Uint8Array(count * width);
  aligned.set(buf.subarray(0, count * width));
  return new Float32Array(aligned.buffer);
};
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { canonical_tokens_from_text } from "./text";
|
|
2
|
+
import { env } from "../core/cfg";
|
|
3
|
+
|
|
4
|
+
/** Result record describing how well one item matched a keyword query. */
export interface keyword_match {
  /** Identifier of the matched item. */
  id: string;
  /** Match score (higher is presumably better; confirm against the producer). */
  score: number;
  /** The terms that matched. */
  matched_terms: string[];
}
|
|
9
|
+
|
|
10
|
+
/**
 * Derives a keyword set from text.
 *
 * The text is reduced to canonical tokens, then the set is filled with:
 *  1. every token of at least `min_length` characters, plus each of its
 *     3-character substrings;
 *  2. every adjacent token pair joined by `_`, when the joined string is at
 *     least `min_length` long;
 *  3. every adjacent token triple joined by `_` (no length check).
 *
 * @param text       Source text.
 * @param min_length Minimum length for single tokens and pairs (default 3).
 * @returns The set of keywords.
 */
export function extract_keywords(
  text: string,
  min_length: number = 3,
): Set<string> {
  const tokens = canonical_tokens_from_text(text);
  const found = new Set<string>();

  for (const tok of tokens) {
    if (tok.length < min_length) continue;
    found.add(tok);
    // Sliding 3-character window; runs zero times for tokens shorter than 3.
    for (let at = 0; at + 3 <= tok.length; at++) {
      found.add(tok.slice(at, at + 3));
    }
  }

  for (let k = 1; k < tokens.length; k++) {
    const pair = `${tokens[k - 1]}_${tokens[k]}`;
    if (pair.length >= min_length) found.add(pair);
  }

  for (let k = 2; k < tokens.length; k++) {
    found.add(`${tokens[k - 2]}_${tokens[k - 1]}_${tokens[k]}`);
  }

  return found;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Computes the weighted fraction of query keywords that occur in the content.
 *
 * Keywords containing `_` (joined multi-token keywords) weigh 2, all others
 * weigh 1.
 *
 * @param query_keywords   Keywords from the query.
 * @param content_keywords Keywords from the content.
 * @returns Matched weight divided by total query weight, in `[0, 1]`;
 *          `0` when the query set is empty.
 */
export function compute_keyword_overlap(
  query_keywords: Set<string>,
  content_keywords: Set<string>,
): number {
  const weight_of = (kw: string): number => (kw.includes("_") ? 2.0 : 1.0);

  let matched = 0;
  let possible = 0;
  query_keywords.forEach((kw) => {
    const w = weight_of(kw);
    possible += w;
    if (content_keywords.has(kw)) matched += w;
  });

  return possible === 0 ? 0 : matched / possible;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Reports whether the content contains the query as a literal,
 * case-insensitive substring.
 *
 * The query is lower-cased and trimmed; the content is lower-cased only.
 * An empty or whitespace-only query never matches: `includes("")` is always
 * true, which would otherwise mark every document as an exact match.
 *
 * @param query   Phrase to look for.
 * @param content Text to search in.
 * @returns `true` when the normalised query is non-empty and occurs in the
 *          content.
 */
export function exact_phrase_match(query: string, content: string): boolean {
  const q_norm = query.toLowerCase().trim();
  if (q_norm.length === 0) return false;
  return content.toLowerCase().includes(q_norm);
}
|
|
68
|
+
|
|
69
|
+
/**
 * Scores content terms against query terms with a BM25-style formula
 * (k1 = 1.5, b = 0.75).
 *
 * For every query term that occurs in the content, adds
 * `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_length / avg_doc_length))`.
 * No per-term document frequency is available here, so the idf factor is
 * computed as `log((corpus_size + 1) / (tf + 0.5))` from the in-document
 * term frequency. Repeated query terms are counted once per occurrence.
 *
 * @param query_terms    Terms of the query.
 * @param content_terms  Terms of the document being scored.
 * @param corpus_size    Assumed corpus size (default 10000).
 * @param avg_doc_length Assumed average document length in terms (default 100).
 * @returns The accumulated score; `0` when no query term occurs.
 */
export function compute_bm25_score(
  query_terms: string[],
  content_terms: string[],
  corpus_size: number = 10000,
  avg_doc_length: number = 100,
): number {
  const k1 = 1.5;
  const b = 0.75;

  const counts = new Map<string, number>();
  content_terms.forEach((term) => {
    counts.set(term, (counts.get(term) || 0) + 1);
  });

  // Length normalisation is the same for every term of this document.
  const doc_length = content_terms.length;
  const length_norm = k1 * (1 - b + b * (doc_length / avg_doc_length));

  let score = 0;
  for (const q_term of query_terms) {
    const tf = counts.get(q_term) || 0;
    if (tf !== 0) {
      const idf = Math.log((corpus_size + 1) / (tf + 0.5));
      const numerator = tf * (k1 + 1);
      const denominator = tf + length_norm;
      score += idf * (numerator / denominator);
    }
  }

  return score;
}
|
|
100
|
+
|
|
101
|
+
/**
 * Scores memories against a query with keyword heuristics and keeps those
 * above a threshold.
 *
 * A memory's score is the sum of:
 *  - `1.0` when the content contains the query as an exact phrase;
 *  - `0.8 ×` the weighted keyword overlap;
 *  - `0.5 ×` the BM25 score scaled by 1/10 and capped at 1.
 *
 * @param query        Search query.
 * @param all_memories Candidate memories (`id` and `content`).
 * @param threshold    Scores must be strictly greater than this (default 0.1).
 * @returns Map from memory id to score, for the memories that passed.
 */
export async function keyword_filter_memories(
  query: string,
  all_memories: Array<{ id: string; content: string }>,
  threshold: number = 0.1,
): Promise<Map<string, number>> {
  const query_keywords = extract_keywords(query, env.keyword_min_length);
  const query_terms = canonical_tokens_from_text(query);
  const kept = new Map<string, number>();

  all_memories.forEach(({ id, content }) => {
    const phrase_part = exact_phrase_match(query, content) ? 1.0 : 0;

    const overlap = compute_keyword_overlap(
      query_keywords,
      extract_keywords(content, env.keyword_min_length),
    );

    const bm25 = compute_bm25_score(
      query_terms,
      canonical_tokens_from_text(content),
    );

    const total = phrase_part + overlap * 0.8 + Math.min(1.0, bm25 / 10) * 0.5;
    if (total > threshold) kept.set(id, total);
  });

  return kept;
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
 * Synonym groups. The first word of each group is the canonical form that
 * every other word in the group is mapped to.
 */
const syn_grps = [
  ["prefer", "like", "love", "enjoy", "favor"],
  ["theme", "mode", "style", "layout"],
  ["meeting", "meet", "session", "call", "sync"],
  ["dark", "night", "black"],
  ["light", "bright", "day"],
  ["user", "person", "people", "customer"],
  ["task", "todo", "job"],
  ["note", "memo", "reminder"],
  ["time", "schedule", "when", "date"],
  ["project", "initiative", "plan"],
  ["issue", "problem", "bug"],
  ["document", "doc", "file"],
  ["question", "query", "ask"],
];
/** Maps every word of every group (the head included) to the group's head. */
const cmap = new Map<string, string>();
/** Maps each group's head to the full set of words in that group. */
const slook = new Map<string, Set<string>>();

syn_grps.forEach((grp) => {
  const head = grp[0];
  slook.set(head, new Set(grp));
  grp.forEach((word) => {
    cmap.set(word, head);
  });
});
|
|
27
|
+
|
|
28
|
+
/**
 * Suffix-rewriting rules as `[pattern, replacement]` pairs, listed in the
 * order they are meant to be tried: `-ies` → `-y`, drop `-ing`,
 * `-er`/`-ers` → `-er`, drop `-ed`, drop a trailing `-s`.
 */
const stem_rules: Array<[RegExp, string]> = [
  [/ies$/, "y"],
  [/ing$/, ""],
  [/ers?$/, "er"],
  [/ed$/, ""],
  [/s$/, ""],
];
|
|
35
|
+
/**
 * Matches one run of ASCII letters or digits. The regex is global, so it
 * keeps `lastIndex` state between `exec` calls.
 */
const tok_pat = /[a-z0-9]+/gi;

/**
 * Splits text into lower-cased alphanumeric tokens.
 *
 * @param text Text to tokenize.
 * @returns Every run of ASCII letters/digits, lower-cased, in order.
 */
export const tokenize = (text: string): string[] => {
  const out: string[] = [];
  // Running `exec` until it returns null also resets `lastIndex` to 0, which
  // leaves the shared regex ready for the next call.
  for (
    let hit = tok_pat.exec(text);
    hit !== null;
    hit = tok_pat.exec(text)
  ) {
    out.push(hit[0].toLowerCase());
  }
  return out;
};
|
|
45
|
+
|
|
46
|
+
/**
 * Applies the first suitable suffix rule from `stem_rules` to a token.
 *
 * Tokens of three characters or fewer are returned unchanged. A rule is used
 * only when its pattern matches and the rewritten token keeps at least three
 * characters; otherwise the next rule is tried.
 *
 * @param tok Lower-cased token.
 * @returns The stemmed token, or `tok` itself when no rule applies.
 */
const stem = (tok: string): string => {
  if (tok.length <= 3) return tok;

  for (const [pattern, replacement] of stem_rules) {
    if (!pattern.test(tok)) continue;
    const candidate = tok.replace(pattern, replacement);
    // Reject stems that would become too short and keep looking.
    if (candidate.length < 3) continue;
    return candidate;
  }

  return tok;
};
|
|
56
|
+
|
|
57
|
+
/**
 * Reduces a token to its canonical form.
 *
 * The token is lower-cased and looked up in the synonym map; on a miss it is
 * stemmed and the stem is looked up instead. If neither lookup hits, the stem
 * is returned.
 *
 * @param tok Token to canonicalize.
 * @returns The canonical form; an empty string for an empty token.
 */
export const canonicalize_token = (tok: string): string => {
  if (!tok) return "";

  const lowered = tok.toLowerCase();
  if (cmap.has(lowered)) {
    return cmap.get(lowered)!;
  }

  const stemmed = stem(lowered);
  const mapped = cmap.get(stemmed);
  return mapped ? mapped : stemmed;
};
|
|
64
|
+
|
|
65
|
+
/**
 * Tokenizes text and canonicalizes each token.
 *
 * Canonical forms of one character or less are dropped. Order and duplicates
 * are preserved.
 *
 * @param text Text to process.
 * @returns Canonical tokens in document order.
 */
export const canonical_tokens_from_text = (text: string): string[] => {
  return tokenize(text)
    .map((tok) => canonicalize_token(tok))
    .filter((can) => can.length > 1);
};
|
|
75
|
+
|
|
76
|
+
/**
 * Returns the synonym group a token belongs to.
 *
 * @param tok Token to look up; it is canonicalized first.
 * @returns The group's word set when the canonical form heads a group,
 *          otherwise a new one-element set holding the canonical form. A
 *          group hit returns the module's own Set instance, not a copy.
 */
export const synonyms_for = (tok: string): Set<string> => {
  const can = canonicalize_token(tok);
  const group = slook.get(can);
  if (group) return group;
  return new Set([can]);
};
|
|
80
|
+
|
|
81
|
+
/**
 * Builds a synonym-expanded search document from text.
 *
 * Each canonical token is emitted once, followed by every word of its
 * synonym group when it heads one.
 *
 * @param text Text to index.
 * @returns Unique terms joined by single spaces, in first-seen order.
 */
export const build_search_doc = (text: string): string => {
  const terms = new Set<string>();

  for (const tok of canonical_tokens_from_text(text)) {
    terms.add(tok);
    const group = slook.get(tok);
    if (!group) continue;
    for (const word of group) terms.add(word);
  }

  return Array.from(terms).join(" ");
};
|
|
93
|
+
|
|
94
|
+
/**
 * Builds a query string of double-quoted canonical terms joined by ` OR `.
 *
 * @param text Free-text query.
 * @returns e.g. `"dark" OR "theme"`; an empty string when the text yields no
 *          canonical tokens.
 */
export const build_fts_query = (text: string): string => {
  const can = canonical_tokens_from_text(text);
  if (can.length === 0) return "";

  // Deduplicate while keeping first-seen order.
  const uniq = new Set<string>();
  for (const t of can) {
    if (t.length > 1) uniq.add(t);
  }

  return Array.from(uniq, (t) => `"${t}"`).join(" OR ");
};
|
|
100
|
+
|
|
101
|
+
/**
 * Returns the distinct canonical tokens of a text.
 *
 * @param text Text to process.
 * @returns Set of canonical tokens, in first-seen order.
 */
export const canonical_token_set = (text: string): Set<string> => {
  const distinct = new Set<string>();
  for (const tok of canonical_tokens_from_text(text)) {
    distinct.add(tok);
  }
  return distinct;
};
|
|
104
|
+
|
|
105
|
+
/**
 * Copies tokens into a set and, for each token that heads a synonym group,
 * also adds the canonical form of every word in that group.
 *
 * NOTE(review): every group member canonicalizes to the group head, so the
 * synonym pass only re-adds the head token itself. Confirm whether raw
 * synonyms were meant to be added instead.
 *
 * @param toks Tokens to expand (looked up as-is, without canonicalizing).
 * @returns The resulting set.
 */
export const add_synonym_tokens = (toks: Iterable<string>): Set<string> => {
  const expanded = new Set<string>();

  for (const tok of toks) {
    expanded.add(tok);
    const group = slook.get(tok);
    if (!group) continue;
    for (const word of group) {
      expanded.add(canonicalize_token(word));
    }
  }

  return expanded;
};
|