typeclaw 0.36.7 → 0.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/README.md +2 -2
  2. package/package.json +3 -2
  3. package/src/agent/index.ts +31 -11
  4. package/src/agent/live-sessions.ts +12 -0
  5. package/src/agent/model-fallback.ts +17 -15
  6. package/src/agent/model-overrides.ts +2 -2
  7. package/src/agent/session-meta.ts +10 -0
  8. package/src/agent/subagents.ts +11 -2
  9. package/src/agent/system-prompt.ts +9 -3
  10. package/src/agent/todo/continuation-policy.ts +6 -3
  11. package/src/agent/todo/continuation-wiring.ts +4 -2
  12. package/src/agent/todo/continuation.ts +3 -3
  13. package/src/agent/tools/todo/index.ts +27 -4
  14. package/src/bundled-plugins/agent-browser/index.ts +33 -108
  15. package/src/bundled-plugins/agent-browser/shim.ts +3 -94
  16. package/src/bundled-plugins/agent-browser/skills/agent-browser/SKILL.md +8 -33
  17. package/src/bundled-plugins/doc-render/skills/typeclaw-render-pdf/SKILL.md +2 -2
  18. package/src/bundled-plugins/guard/policies/memory-retrieval-cache-write.ts +7 -1
  19. package/src/bundled-plugins/memory/README.md +80 -23
  20. package/src/bundled-plugins/memory/append-tool.ts +74 -53
  21. package/src/bundled-plugins/memory/citation-superset.ts +4 -0
  22. package/src/bundled-plugins/memory/citations.ts +54 -0
  23. package/src/bundled-plugins/memory/dreaming-metrics.ts +30 -0
  24. package/src/bundled-plugins/memory/dreaming.ts +444 -21
  25. package/src/bundled-plugins/memory/index.ts +544 -400
  26. package/src/bundled-plugins/memory/load-memory.ts +87 -10
  27. package/src/bundled-plugins/memory/load-shards.ts +48 -22
  28. package/src/bundled-plugins/memory/memory-logger.ts +95 -106
  29. package/src/bundled-plugins/memory/memory-retrieval.ts +3 -3
  30. package/src/bundled-plugins/memory/parent-link.ts +33 -0
  31. package/src/bundled-plugins/memory/paths.ts +12 -0
  32. package/src/bundled-plugins/memory/references/frontmatter.ts +197 -0
  33. package/src/bundled-plugins/memory/references/load-references.ts +212 -0
  34. package/src/bundled-plugins/memory/references/store-reference-tool.ts +59 -0
  35. package/src/bundled-plugins/memory/search-tool.ts +282 -45
  36. package/src/bundled-plugins/memory/stream-events.ts +1 -0
  37. package/src/bundled-plugins/memory/stream-io.ts +28 -3
  38. package/src/bundled-plugins/memory/turn-dedup.ts +40 -0
  39. package/src/bundled-plugins/memory/vector/cache-write.ts +19 -0
  40. package/src/bundled-plugins/memory/vector/config.ts +28 -0
  41. package/src/bundled-plugins/memory/vector/doctor.ts +124 -0
  42. package/src/bundled-plugins/memory/vector/embedder.ts +246 -0
  43. package/src/bundled-plugins/memory/vector/hybrid.ts +439 -0
  44. package/src/bundled-plugins/memory/vector/index-on-write.ts +34 -0
  45. package/src/bundled-plugins/memory/vector/inspect.ts +111 -0
  46. package/src/bundled-plugins/memory/vector/passages.ts +125 -0
  47. package/src/bundled-plugins/memory/vector/reference-index-on-write.ts +50 -0
  48. package/src/bundled-plugins/memory/vector/relevance-gate.ts +93 -0
  49. package/src/bundled-plugins/memory/vector/startup.ts +71 -0
  50. package/src/bundled-plugins/memory/vector/store.ts +203 -0
  51. package/src/bundled-plugins/memory/vector/truncation.ts +124 -0
  52. package/src/bundled-plugins/security/policies/outbound-secret-scan.ts +2 -0
  53. package/src/channels/router.ts +239 -40
  54. package/src/cli/incomplete-init.ts +57 -0
  55. package/src/cli/init.ts +143 -12
  56. package/src/cli/inspect.ts +11 -5
  57. package/src/cli/model.ts +112 -34
  58. package/src/cli/restart.ts +24 -0
  59. package/src/cli/start.ts +24 -0
  60. package/src/cli/tunnel.ts +53 -8
  61. package/src/config/config.ts +110 -19
  62. package/src/config/index.ts +5 -1
  63. package/src/config/models-mutation.ts +29 -11
  64. package/src/config/providers-mutation.ts +2 -2
  65. package/src/config/providers.ts +146 -12
  66. package/src/container/shared.ts +9 -0
  67. package/src/container/start.ts +87 -4
  68. package/src/cron/consumer.ts +13 -7
  69. package/src/hostd/models.ts +64 -0
  70. package/src/hostd/paths.ts +6 -0
  71. package/src/hostd/portbroker-manager.ts +2 -2
  72. package/src/init/checkpoint.ts +201 -0
  73. package/src/init/dockerfile.ts +164 -51
  74. package/src/init/gitignore.ts +7 -7
  75. package/src/init/index.ts +41 -9
  76. package/src/init/line-auth.ts +50 -21
  77. package/src/init/models-dev.ts +96 -21
  78. package/src/init/oauth-login.ts +3 -3
  79. package/src/init/progress.ts +29 -0
  80. package/src/init/validate-api-key.ts +4 -0
  81. package/src/inspect/index.ts +13 -6
  82. package/src/inspect/item-list.ts +11 -2
  83. package/src/inspect/live-list.ts +65 -0
  84. package/src/inspect/open-item.ts +22 -1
  85. package/src/inspect/session-list.ts +29 -0
  86. package/src/models/embedding-model.ts +114 -0
  87. package/src/models/transformers-version.ts +55 -0
  88. package/src/plugin/types.ts +3 -0
  89. package/src/portbroker/container-server.ts +23 -0
  90. package/src/portbroker/forward-request-bus.ts +35 -0
  91. package/src/portbroker/forward-result-bus.ts +2 -3
  92. package/src/portbroker/hostd-client.ts +182 -36
  93. package/src/portbroker/index.ts +6 -1
  94. package/src/portbroker/protocol.ts +9 -2
  95. package/src/run/channel-session-factory.ts +11 -1
  96. package/src/run/index.ts +41 -7
  97. package/src/server/command-runner.ts +24 -1
  98. package/src/server/index.ts +42 -8
  99. package/src/shared/index.ts +2 -0
  100. package/src/shared/protocol.ts +31 -0
  101. package/src/skills/typeclaw-channels/SKILL.md +4 -4
  102. package/src/skills/typeclaw-config/SKILL.md +2 -2
  103. package/src/skills/typeclaw-memory/SKILL.md +3 -1
  104. package/src/skills/typeclaw-permissions/SKILL.md +3 -3
  105. package/src/skills/typeclaw-skills/SKILL.md +1 -1
  106. package/src/skills/typeclaw-tunnels/SKILL.md +22 -1
  107. package/src/tunnels/providers/cloudflare-quick.ts +65 -7
  108. package/src/tunnels/upstream-probe.ts +25 -0
  109. package/typeclaw.schema.json +156 -67
  110. package/src/bundled-plugins/agent-browser/dashboard-discovery.ts +0 -170
  111. package/src/bundled-plugins/agent-browser/dashboard-proxy.ts +0 -421
  112. package/src/portbroker/bind-with-forward.ts +0 -102
@@ -0,0 +1,439 @@
1
+ import { createHash } from 'node:crypto'
2
+
3
+ import { loadAllShards, type TopicShard } from '../load-shards'
4
+ import { buildParentLinks } from '../parent-link'
5
+ import { loadAllReferences, type Reference } from '../references/load-references'
6
+ import {
7
+ buildMatcher,
8
+ distinctTokens,
9
+ hasNonAscii,
10
+ searchAll,
11
+ searchAllRanked,
12
+ type MemorySearchMatch,
13
+ type StreamMatch,
14
+ } from '../search-tool'
15
+ import type { StreamEvent } from '../stream-events'
16
+ import { readAllUndreamedStreamDays, type UndreamedStreamDay } from '../stream-io'
17
+ import { embed, EMBEDDING_MODEL_ID, type EmbedType } from './embedder'
18
+ import type { Passage } from './passages'
19
+ import { clearsBaseline, gateRelevance, streamAdmissionBaseline } from './relevance-gate'
20
+ import { VectorStore, type VectorRow } from './store'
21
+
22
+ export { collectPassages, findMissingPassages, type Passage } from './passages'
23
+
24
/**
 * Smoothing constant `k` in the Reciprocal Rank Fusion term `1 / (k + rank)`.
 * 60 is the value conventionally used for RRF.
 */
const RRF_K = 60

/** One fused search hit, as returned by `hybridSearch`. */
export type HybridSearchResult = {
  /** Which corpus the hit came from. */
  source: 'topic' | 'stream' | 'reference'
  /** Topic slug, reference slug, or `<date>#<id>` for a stream event. */
  key: string
  /** Display heading (topic heading, reference title, or fragment topic). */
  heading: string
  /** Leading lines of the body, or the heading when the body is empty. */
  excerpt: string
  /** Sum of the per-lane reciprocal-rank contributions; higher ranks first. */
  rrfScore: number
}

/** Embeds a batch of texts; injectable so callers can substitute the embedder. */
export type EmbedFn = (texts: string[], type: EmbedType) => Promise<Float32Array[]>
35
+
36
/**
 * Hybrid memory search: runs a relevance-gated vector lane and a keyword lane
 * over topics, undreamed stream days and references, then fuses both rankings
 * with Reciprocal Rank Fusion.
 *
 * @param query    Text to search for; embedded as a `'query'` and also matched lexically.
 * @param store    Vector store queried by the vector lane.
 * @param agentDir Agent directory the shards, streams and references are loaded from.
 * @param topK     Maximum number of results; a value `<= 0` yields `[]` without any I/O.
 * @param embedFn  Embedding function (defaults to the module embedder).
 * @returns At most `topK` fused results, best first.
 */
export async function hybridSearch(
  query: string,
  store: VectorStore,
  agentDir: string,
  topK: number,
  embedFn: EmbedFn = embed,
): Promise<HybridSearchResult[]> {
  if (topK <= 0) {
    return []
  }

  // The three corpus loads and the query embedding are independent of each
  // other, so they are started together.
  const [shards, streamDays, references, queryEmbeddings] = await Promise.all([
    loadAllShards(agentDir),
    readAllUndreamedStreamDays(agentDir),
    loadAllReferences(agentDir),
    embedFn([query], 'query'),
  ])

  const links = buildParentLinks(shards)
  const index = buildContentIndex(shards, streamDays, references, links.supersededFragmentIds)

  // No embedding came back: the vector lane simply contributes nothing.
  const queryEmbedding = queryEmbeddings[0]
  let vectorRows: VectorRow[] = []
  if (queryEmbedding !== undefined) {
    vectorRows = gatedVectorLane(queryEmbedding, store, topK)
  }

  // The keyword lane over-fetches (2x) so fusion has candidates beyond topK.
  const keywordMatches = keywordLane(query, shards, streamDays, references, topK * 2)

  const ranked = fuseLanes(vectorRows, keywordMatches, index, links.parentSlugsByFragmentId)
  return ranked.slice(0, topK)
}
59
+
60
// The vector lane, gated by per-query relevance. Every row kind is judged
// against ONE query-local no-match band. That band is derived from the scores
// of the "band-defining" rows only — topic AND reference rows, as selected
// below. Sparse stream rows consume the band but never define it, so a
// nearest-neighbour cluster of fragments can't move the bar.
//
// - Band-defining rows (topics, references): the gate keeps the knee above the
//   band, or empties this partition entirely when nothing clears it.
// - Stream rows: admitted one-by-one only when they clear the SAME band by the
//   shared margin. A genuine fresh fragment (well above the band) survives the
//   freshness window; an irrelevant in-band neighbour is dropped, so a no-match
//   query can't leak closest-neighbours-regardless through streams.
//
// Suppression of the band-defining rows uses the floor-gated verdict
// (gateRelevance): below the floor they pass ungated, since a tiny index can't
// form a reliable band and a false negative is cheaper than suppressing the
// only memory. Stream admission uses streamAdmissionBaseline, which tolerates a
// below-floor set — even a few rows give the contrast a vector-only fragment
// match needs, and it returns null (dropping uncorroborated streams) only when
// NO band-defining rows exist at all. A lexically-corroborated fragment still
// reaches RRF via the separate keyword lane. An empty merged lane composes with
// RRF exactly like a lane that found nothing, so a genuine keyword hit survives
// a full no-match.
function gatedVectorLane(queryEmbedding: Float32Array, store: VectorStore, topK: number): VectorRow[] {
  const laneCap = topK * 2
  const scored = store.queryScored(queryEmbedding, EMBEDDING_MODEL_ID)

  const bandDefiningRows = scored.filter((hit) => hit.row.source === 'topic' || hit.row.source === 'reference')
  const bandScores = bandDefiningRows.map((hit) => hit.score)

  // gateRelevance returns how many leading band-defining rows to keep.
  const keptBandCount = gateRelevance(bandScores, laneCap)
  const streamBaseline = streamAdmissionBaseline(bandScores)

  const admittedStreams = scored
    .filter((hit) => hit.row.source === 'stream')
    .filter((hit) => clearsBaseline(hit.score, streamBaseline))
    .slice(0, laneCap)

  // Merge both partitions back into a single best-first ranking.
  const merged = bandDefiningRows.slice(0, keptBandCount).concat(admittedStreams)
  merged.sort((a, b) => b.score - a.score)
  return merged.map((hit) => hit.row)
}
94
+
95
// Keyword lane: try the whole query as a phrase first, then fall back to a
// token-OR search (mirrors `memory_search`). `hybridSearch` always passes the
// entire user prompt, which never appears verbatim in a shard, so a phrase-only
// lane would return nothing on every real turn and RRF would degenerate to the
// vector lane alone. The `searchAllRanked` fallback additionally hands RRF a
// matched-token-count ranking (truncated after ranking) rather than
// alphabetical order.
function keywordLane(
  query: string,
  shards: TopicShard[],
  streamDays: UndreamedStreamDay[],
  references: Reference[],
  maxResults: number,
): MemorySearchMatch[] {
  const matcher = buildMatcher(query, false)
  // A string result from buildMatcher means no usable matcher was built.
  if (typeof matcher === 'string') {
    return []
  }

  const phrase = searchAll(shards, streamDays, matcher, { full: false, maxResults, references })
  if ('matches' in phrase && phrase.matches.length > 0) {
    return phrase.matches
  }

  const tokens = distinctHybridContentTokens(query)
  // A lone token equal to the (normalised) query was already tried as the phrase.
  const repeatsPhraseSearch = tokens.length === 1 && tokens[0] === query.trim().toLowerCase()
  if (tokens.length === 0 || repeatsPhraseSearch) {
    return []
  }

  const ranked = searchAllRanked(shards, streamDays, tokens, {
    full: false,
    maxResults,
    references,
    tokenMatchMode: 'ascii-boundary',
  })
  if ('matches' in ranked) {
    return ranked.matches
  }
  return []
}
124
+
125
// English function words that carry no search signal in a whole-prompt query.
// Membership is tested ONLY against a token's ASCII-alpha core (see
// `distinctHybridContentTokens`), so the list is deliberately English-only —
// non-ASCII tokens never reach it. One row per word class.
const HYBRID_PROMPT_STOPWORDS = new Set(
  [
    // articles and conjunctions
    'a an the and or but',
    // personal and possessive pronouns
    'i me my mine you your yours he him his she her hers it its we us our ours they them their theirs',
    // demonstratives and place adverbs
    'this that these those here there',
    // interrogatives
    'what when where who whom whose why how which',
    // be / do / have
    'is am are was were be been being do does did doing have has had having',
    // modals
    'can could should would will may might must',
    // prepositions and particles
    'about as at by for from in into of on onto to with without over under after before between up down out off',
    // filler
    'say said thing things stuff',
  ].flatMap((row) => row.split(' ')),
)
225
+
226
// Content tokens of a whole-prompt query. A user prompt contains function words
// ('what', 'we', 'the', 'about'); token-OR matching on those alone would make
// the keyword lane non-empty for a low-information prompt and, once the vector
// lane is gated out, inject arbitrary memory instead of no result. The tool
// path keeps using plain `distinctTokens` — there the query is a deliberate
// agent search, not a sentence.
//
// A token is dropped only when its ASCII-alphanumeric-trimmed core is purely
// alphabetic AND a stopword. Consequently non-ASCII tokens ('홍길동'), numerics
// ('#651') and short content words ('pr', 'ci', 'go') all survive. There is no
// length filter, which would wrongly drop CJK and short content.
function distinctHybridContentTokens(query: string): string[] {
  const contentTokens: string[] = []
  for (const token of distinctTokens(query)) {
    // Strip leading/trailing characters that are not lowercase ASCII letters or digits.
    const core = token.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, '')

    let keep: boolean
    if (core.length === 0) {
      // Nothing ASCII-alphanumeric left: keep only genuinely non-ASCII tokens.
      keep = hasNonAscii(token)
    } else if (/^[a-z]+$/.test(core)) {
      keep = !HYBRID_PROMPT_STOPWORDS.has(core)
    } else {
      // Digits or mixed content are never stopwords.
      keep = true
    }

    if (keep) contentTokens.push(token)
  }
  return contentTokens
}
243
+
244
// Reciprocal Rank Fusion over two rankers (vector + keyword). Each lane is
// first collapsed to per-parent scores on its own (MAX over a parent's children
// inside that lane); the two lanes' per-parent scores are then SUMMED. The two
// reductions differ on purpose:
//
// - WITHIN a lane, a topic's children (the fragments citing it) collapse by
//   MAX — summing would over-rank often-revised topics merely for having more
//   historical citations to match (PARADE: max beats sum for concentrated
//   relevance).
// - ACROSS lanes, per-parent contributions SUM — cross-ranker agreement is the
//   very signal RRF exists to capture. A topic found by BOTH lanes must outrank
//   one found by a single lane, so its score is
//   1/(k+rankVector) + 1/(k+rankKeyword). Folding both lanes into one map with
//   MAX (the previous behavior) threw that agreement away, leaving every result
//   with a single lane's reciprocal rank.
//
// Results are ordered by descending fused score, ties broken by key.
function fuseLanes(
  vectorRows: VectorRow[],
  keywordMatches: MemorySearchMatch[],
  index: Map<string, Omit<HybridSearchResult, 'rrfScore'>>,
  parentSlugsByFragmentId: Map<string, Set<string>>,
): HybridSearchResult[] {
  // Position in each lane is zero-based, hence the `+ 1` to get a 1-based rank.
  const vectorScores = collapseLane(
    vectorRows.map(({ source, key }, position) => ({ source, key, score: 1 / (RRF_K + position + 1) })),
    index,
    parentSlugsByFragmentId,
  )
  const keywordScores = collapseLane(
    keywordMatches.map((match, position) => ({
      source: match.source,
      key: matchKey(match),
      score: 1 / (RRF_K + position + 1),
    })),
    index,
    parentSlugsByFragmentId,
  )

  const fused = new Map<string, HybridSearchResult>()
  for (const [fusedKey, entry] of [...vectorScores, ...keywordScores]) {
    const seen = fused.get(fusedKey)
    if (seen === undefined) {
      fused.set(fusedKey, { ...entry.content, rrfScore: entry.score })
    } else {
      seen.rrfScore += entry.score
    }
  }

  const results = Array.from(fused.values())
  results.sort((a, b) => b.rrfScore - a.rrfScore || a.key.localeCompare(b.key))
  return results
}
287
+
288
/** A single ranked hit inside one lane, before it is resolved to its parent(s). */
type LaneHit = { source: 'topic' | 'stream' | 'reference'; key: string; score: number }
/** Best score a lane assigned to one fused result, with that result's content. */
type LaneEntry = { content: Omit<HybridSearchResult, 'rrfScore'>; score: number }

// Collapses one lane to per-parent scores, keeping the MAX score among the hits
// that resolve to the same parent. The lane gets its own map so the caller can
// SUM across lanes; folding straight into a shared map would force a cross-lane
// MAX instead.
function collapseLane(
  hits: LaneHit[],
  index: Map<string, Omit<HybridSearchResult, 'rrfScore'>>,
  parentSlugsByFragmentId: Map<string, Set<string>>,
): Map<string, LaneEntry> {
  const lane = new Map<string, LaneEntry>()
  for (const { source, key, score } of hits) {
    const parents = resolveToParents(source, key, index, parentSlugsByFragmentId)
    for (const parent of parents) {
      const best = lane.get(parent.fusedKey)
      if (best === undefined) {
        lane.set(parent.fusedKey, { content: parent.content, score })
        continue
      }
      best.score = Math.max(best.score, score)
    }
  }
  return lane
}
308
+
309
// Maps a lane hit to the fused result(s) it should score.
//
// - reference: the key is cut at the first '#' and resolved to the reference
//   entry for that slug.
// - stream: a matched fragment collapses to EVERY topic that cites it (a
//   fragment can back more than one belief), contributing its score to each
//   parent present in the index. A fragment with no indexed citing topic
//   resolves to itself.
// - topic: resolves to itself.
//
// A hit whose target is absent from the index resolves to nothing.
function resolveToParents(
  source: 'topic' | 'stream' | 'reference',
  nodeKey: string,
  index: Map<string, Omit<HybridSearchResult, 'rrfScore'>>,
  parentSlugsByFragmentId: Map<string, Set<string>>,
): Array<{ fusedKey: string; content: Omit<HybridSearchResult, 'rrfScore'> }> {
  type Resolved = { fusedKey: string; content: Omit<HybridSearchResult, 'rrfScore'> }

  // Zero-or-one lookup of a single index entry.
  const lookup = (kind: 'topic' | 'stream' | 'reference', key: string): Resolved[] => {
    const fusedKey = laneKey(kind, key)
    const content = index.get(fusedKey)
    return content === undefined ? [] : [{ fusedKey, content }]
  }

  if (source === 'reference') {
    return lookup('reference', referenceSlugFromKey(nodeKey))
  }

  if (source === 'stream') {
    const fragmentId = fragmentIdFromKey(nodeKey)
    const parentSlugs = fragmentId === null ? undefined : parentSlugsByFragmentId.get(fragmentId)
    const parents = [...(parentSlugs ?? [])].flatMap((parentSlug) => lookup('topic', parentSlug))
    if (parents.length > 0) {
      return parents
    }
    // No indexed parent: fall through and treat the fragment as its own result.
  }

  return lookup(source, nodeKey)
}
338
+
339
/**
 * Returns the part of a reference key before its first '#', or the whole key
 * when it contains no '#'.
 */
function referenceSlugFromKey(referenceKey: string): string {
  // With a limit of 1, split yields exactly the text preceding the first '#'.
  const [slug] = referenceKey.split('#', 1)
  return slug ?? referenceKey
}
343
+
344
/**
 * Extracts the fragment id from a `<date>#<id>` stream key.
 *
 * @returns The text after the first '#', or `null` when the key has no '#'
 *          or the id carries the `legacy-` prefix (legacy prose has no
 *          fragment id to look up).
 */
function fragmentIdFromKey(streamKey: string): string | null {
  const hashAt = streamKey.indexOf('#')
  if (hashAt < 0) {
    return null
  }
  const fragmentId = streamKey.substring(hashAt + 1)
  if (fragmentId.startsWith('legacy-')) {
    return null
  }
  return fragmentId
}
350
+
351
// Builds the lookup from lane key (`<source>:<key>`) to displayable content for
// every topic shard, indexable stream event and non-demoted reference.
//
// Superseded fragments are left out of the index entirely, so both lanes drop
// them: the keyword lane may match a superseded body, but resolving it finds no
// active parent link and then no `stream` entry here, so the stale fragment
// never surfaces as a standalone result (mirrors the passage-set exclusion that
// keeps superseded fragments out of the vector lane).
function buildContentIndex(
  shards: TopicShard[],
  streamDays: UndreamedStreamDay[],
  references: Reference[],
  supersededFragmentIds: Set<string>,
): Map<string, Omit<HybridSearchResult, 'rrfScore'>> {
  const index = new Map<string, Omit<HybridSearchResult, 'rrfScore'>>()

  shards.forEach(({ slug, body, frontmatter }) => {
    index.set(laneKey('topic', slug), {
      source: 'topic',
      key: slug,
      heading: frontmatter.heading,
      excerpt: excerpt(body, frontmatter.heading),
    })
  })

  for (const day of streamDays) {
    for (const event of day.events) {
      // null marks events that must not be indexed (watermarks, superseded fragments).
      const item = streamIndexItem(day, event, supersededFragmentIds)
      if (item === null) continue
      index.set(laneKey('stream', item.key), item)
    }
  }

  const activeReferences = references.filter((reference) => !reference.frontmatter.demoted)
  for (const { slug, body, frontmatter } of activeReferences) {
    index.set(laneKey('reference', slug), {
      source: 'reference',
      key: slug,
      heading: frontmatter.title,
      excerpt: excerpt(body, frontmatter.title),
    })
  }

  return index
}
392
+
393
/**
 * Converts one stream event into its content-index entry.
 *
 * - `watermark` events are never indexed.
 * - `fragment` events are keyed `<date>#<id>`, unless the fragment is superseded.
 * - Any other event is treated as legacy prose and keyed
 *   `<date>#legacy-<first 12 hex chars of the SHA-256 of its text>`.
 *
 * @returns The index entry, or `null` when the event must not be indexed.
 */
function streamIndexItem(
  day: UndreamedStreamDay,
  event: StreamEvent,
  supersededFragmentIds: Set<string>,
): Omit<HybridSearchResult, 'rrfScore'> | null {
  switch (event.type) {
    case 'watermark':
      return null
    case 'fragment': {
      if (supersededFragmentIds.has(event.id)) return null
      return {
        source: 'stream',
        key: `${day.date}#${event.id}`,
        heading: event.topic,
        excerpt: excerpt(event.body, event.topic),
      }
    }
    default: {
      const legacyHeading = '[legacy prose from pre-shard migration]'
      const digest = hashContent(event.text).slice(0, 12)
      return {
        source: 'stream',
        key: `${day.date}#legacy-${digest}`,
        heading: legacyHeading,
        excerpt: excerpt(event.text, legacyHeading),
      }
    }
  }
}
415
+
416
/**
 * Key of a keyword match in the same key space the content index uses:
 * the slug for topic and reference matches, the stream key otherwise.
 */
function matchKey(match: MemorySearchMatch): string {
  if (match.source === 'topic' || match.source === 'reference') {
    return match.slug
  }
  return streamMatchKey(match)
}
421
+
422
/**
 * Stream key for a keyword match.
 *
 * A match carrying an `eventId` yields that id with a leading `streams/`
 * removed. Otherwise the match is treated as legacy prose and keyed
 * `<date>#legacy-<first 12 hex chars of the SHA-256 of the excerpt>`.
 *
 * NOTE(review): the content index keys legacy prose by a hash of the full
 * event text, while this hashes `match.excerpt`. The two keys only agree if
 * the search layer's excerpt is the complete text — confirm against the
 * search tool.
 */
function streamMatchKey(match: StreamMatch): string {
  const { eventId } = match
  if (eventId === undefined) {
    const digest = hashContent(match.excerpt).slice(0, 12)
    return `${match.date}#legacy-${digest}`
  }
  return eventId.replace(/^streams\//, '')
}
426
+
427
/** Namespaces a key by its source (`<source>:<key>`) so keys from different corpora cannot collide. */
function laneKey(source: 'topic' | 'stream' | 'reference', key: string): string {
  return [source, key].join(':')
}
430
+
431
/**
 * Short preview of a body: its first seven lines after trimming surrounding
 * whitespace.
 *
 * @param body     Text to preview.
 * @param fallback Returned unchanged when the body is empty or whitespace-only.
 */
function excerpt(body: string, fallback: string): string {
  const text = body.trim()
  if (text === '') {
    return fallback
  }
  const leadingLines = text.split('\n').slice(0, 7)
  return leadingLines.join('\n')
}
436
+
437
/** Lowercase hex SHA-256 digest of `content`. */
function hashContent(content: string): string {
  const hasher = createHash('sha256')
  hasher.update(content)
  return hasher.digest('hex')
}
@@ -0,0 +1,34 @@
1
+ import { fragmentContentHash } from '../fragment-parser'
2
+ import type { FragmentEvent } from '../stream-events'
3
+ import type { FragmentsAppendedContext } from '../stream-io'
4
+ import { embed, EMBEDDING_MODEL_ID } from './embedder'
5
+ import type { EmbedFn } from './hybrid'
6
+ import type { VectorStore } from './store'
7
+
8
/**
 * Creates the hook that embeds newly appended stream fragments into the
 * vector store.
 *
 * For each fragment the hook derives the row id `stream:<date>#<fragment id>`
 * (the date comes from the append context, falling back to the first ten
 * characters of the fragment timestamp). A fragment is skipped when the store
 * already holds a row with the same content hash produced by the current
 * embedding model. Fragments are embedded one at a time, in order.
 *
 * @param store   Vector store receiving the rows.
 * @param embedFn Embedding function (defaults to the module embedder).
 */
export function makeAppendHook(
  store: VectorStore,
  embedFn: EmbedFn = embed,
): (fragments: FragmentEvent[], context: FragmentsAppendedContext) => Promise<void> {
  return async (fragments, context) => {
    for (const fragment of fragments) {
      const date = context.date ?? fragment.ts.slice(0, 10)
      const key = `${date}#${fragment.id}`
      const id = `stream:${key}`
      const contentHash = fragmentContentHash(fragment)

      // Unchanged content embedded by the current model needs no re-embedding.
      const existing = store.getByIds([id])[0]
      const alreadyIndexed = existing?.contentHash === contentHash && existing.model === EMBEDDING_MODEL_ID
      if (alreadyIndexed) continue

      const embeddings = await embedFn([`${fragment.topic}\n${fragment.body}`], 'passage')
      const embedding = embeddings[0]
      if (embedding === undefined) continue

      store.upsert({
        id,
        source: 'stream',
        key,
        model: EMBEDDING_MODEL_ID,
        dims: embedding.length,
        embedding,
        contentHash,
      })
    }
  }
}
@@ -0,0 +1,111 @@
1
+ import { Database } from 'bun:sqlite'
2
+
3
+ import { DIMS, EMBEDDING_MODEL_ID } from './embedder'
4
+
5
+ // Read-only health probe for the vector index DB. Deliberately does NOT go
6
+ // through `VectorStore.open`: that path runs `CREATE TABLE IF NOT EXISTS`,
7
+ // which would silently "heal" a DB whose `vectors` table is missing — turning
8
+ // a corruption signal into a no-op. The doctor must observe state, not mutate
9
+ // it, so we open raw, validate the schema ourselves, and never write.
10
+
11
/** Columns the `vectors` table must expose for the index to be considered intact. */
const EXPECTED_COLUMNS = [
  'id',
  'source',
  'key',
  'model',
  'dims',
  'embedding',
  'content_hash',
  'updated_at',
] as const

/** A reason the index could not be inspected row-by-row. */
export type VectorIndexProblem =
  /** The database file could not be opened. */
  | { kind: 'unreadable'; detail: string }
  /** The integrity check reported problems, or a read failed after opening. */
  | { kind: 'corrupt'; detail: string[] }
  /** The `vectors` table, or some of its expected columns, is absent. */
  | { kind: 'schema-missing'; detail: string }

/** Outcome of `inspectVectorIndex`: a problem, or per-row health of a readable index. */
export type VectorIndexFinding =
  | VectorIndexProblem
  | {
      kind: 'ok'
      /** Total number of rows in `vectors`. */
      rowCount: number
      /** Every row id, ordered by id. */
      rowIds: string[]
      /** Ids whose model or dimensionality differs from the current embedder. */
      modelMismatch: string[]
      /** Ids whose stored embedding byte length disagrees with `dims`. */
      malformed: string[]
    }

/** Row shape of the quick-check query. */
type IntegrityRow = { result: string }
/** Row shape for queries that only read a `name` column. */
type SchemaRow = { name: string }
/** Per-row metadata read while classifying rows. */
type RowMeta = { id: string; model: string; dims: number; embeddingBytes: number }
25
+
26
/**
 * Read-only health probe of the vector index database at `dbPath`.
 *
 * Checks run in order and the first failure is reported: open the file
 * (`unreadable`), integrity quick-check (`corrupt`), presence of the `vectors`
 * table and its expected columns (`schema-missing`), then per-row
 * classification (`ok`). The database is opened read-only and is always
 * closed before returning.
 */
export function inspectVectorIndex(dbPath: string): VectorIndexFinding {
  let db: Database
  try {
    db = new Database(dbPath, { readonly: true })
  } catch (err) {
    return { kind: 'unreadable', detail: messageOf(err) }
  }

  try {
    let finding: VectorIndexFinding
    const corruption = runQuickCheck(db)
    if (corruption !== null) {
      finding = { kind: 'corrupt', detail: corruption }
    } else if (!hasVectorsTable(db)) {
      finding = { kind: 'schema-missing', detail: 'vectors table is absent' }
    } else {
      const missingColumns = missingVectorColumns(db)
      finding =
        missingColumns.length > 0
          ? { kind: 'schema-missing', detail: `vectors table missing columns: ${missingColumns.join(', ')}` }
          : classifyRows(db)
    }
    return finding
  } catch (err) {
    // The file opened fine, so a read that throws afterwards (e.g. a malformed
    // page surfaced mid-scan that quick_check's sampling missed) is reported
    // as corruption rather than as an unreadable file.
    return { kind: 'corrupt', detail: [messageOf(err)] }
  } finally {
    db.close()
  }
}
57
+
58
/**
 * Runs SQLite's `quick_check` on the open database.
 *
 * quick_check is preferred over integrity_check: integrity_check is O(db size)
 * and can blow the 5s doctor budget on a large index, while quick_check skips
 * the expensive UNIQUE/foreign-key scans yet still catches page-level
 * corruption. SQLite names the result column after the pragma, hence
 * `quick_check`, aliased to `result`.
 *
 * @returns `null` when the check yields exactly one row reading "ok",
 *          otherwise every reported message.
 */
function runQuickCheck(db: Database): string[] | null {
  const statement = db.query<IntegrityRow, []>('SELECT quick_check AS result FROM pragma_quick_check')
  const messages = statement.all().map((row) => row.result)
  const healthy = messages.length === 1 && messages[0] === 'ok'
  return healthy ? null : messages
}
68
+
69
/** Reports whether `sqlite_master` lists a table named `vectors`. */
function hasVectorsTable(db: Database): boolean {
  const lookup = db.query<SchemaRow, [string]>("SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?")
  const found = lookup.get('vectors')
  return found !== null
}
75
+
76
/**
 * Lists the expected columns that `PRAGMA table_info(vectors)` does not
 * report, in `EXPECTED_COLUMNS` order. An empty result means the schema is complete.
 */
function missingVectorColumns(db: Database): string[] {
  const present = new Set<string>()
  for (const column of db.query<SchemaRow, []>('PRAGMA table_info(vectors)').all()) {
    present.add(column.name)
  }

  const missing: string[] = []
  for (const expected of EXPECTED_COLUMNS) {
    if (!present.has(expected)) missing.push(expected)
  }
  return missing
}
85
+
86
/**
 * Reads the metadata of every row in `vectors` (ordered by id) and sorts the
 * rows into health buckets.
 *
 * A row whose model or dimensionality differs from the current embedder is a
 * model mismatch and is not checked further. Otherwise the row is malformed
 * when its embedding BLOB is not exactly `dims * 4` bytes.
 */
function classifyRows(db: Database): VectorIndexFinding {
  const statement = db.query<RowMeta, []>(
    'SELECT id, model, dims, length(embedding) AS embeddingBytes FROM vectors ORDER BY id',
  )
  const rows = statement.all()

  const modelMismatch: string[] = []
  const malformed: string[] = []

  for (const { id, model, dims, embeddingBytes } of rows) {
    const matchesCurrentEmbedder = model === EMBEDDING_MODEL_ID && dims === DIMS
    if (!matchesCurrentEmbedder) {
      modelMismatch.push(id)
    } else if (embeddingBytes !== dims * 4) {
      // Float32 → 4 bytes per dim. A BLOB of any other size can't decode to a
      // valid embedding, so cosine against it would be garbage.
      malformed.push(id)
    }
  }

  return {
    kind: 'ok',
    rowCount: rows.length,
    rowIds: rows.map((row) => row.id),
    modelMismatch,
    malformed,
  }
}
108
+
109
/** Human-readable text for a caught value: an `Error`'s message, otherwise its string form. */
function messageOf(err: unknown): string {
  if (err instanceof Error) {
    return err.message
  }
  return String(err)
}