@100xprompt/chitta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +203 -0
- package/assets/rules/claude-md.md +9 -0
- package/assets/skill/SKILL.md +47 -0
- package/package.json +48 -0
- package/src/README.md +124 -0
- package/src/arango-client.ts +67 -0
- package/src/arango-graph-provider.ts +364 -0
- package/src/bin.ts +27 -0
- package/src/config-env.ts +53 -0
- package/src/embedded/authorizer.ts +89 -0
- package/src/embedded/cli.ts +86 -0
- package/src/embedded/code-extractor.ts +9 -0
- package/src/embedded/demo.ts +36 -0
- package/src/embedded/extract.ts +12 -0
- package/src/embedded/extractors/code.ts +308 -0
- package/src/embedded/extractors/deterministic.ts +63 -0
- package/src/embedded/extractors/llm.ts +151 -0
- package/src/embedded/extractors/text-hygiene.ts +54 -0
- package/src/embedded/extractors/types.ts +34 -0
- package/src/embedded/graph/acl-paths.ts +96 -0
- package/src/embedded/graph/adjacency.ts +61 -0
- package/src/embedded/graph/centrality.ts +23 -0
- package/src/embedded/graph/communities.ts +46 -0
- package/src/embedded/graph/cypher.ts +17 -0
- package/src/embedded/graph/impact.ts +24 -0
- package/src/embedded/graph/knowledge-graph.ts +108 -0
- package/src/embedded/graph/pagerank.ts +57 -0
- package/src/embedded/graph/sql-access.ts +13 -0
- package/src/embedded/graph/traversal.ts +73 -0
- package/src/embedded/graph/types.ts +35 -0
- package/src/embedded/graph-query.ts +126 -0
- package/src/embedded/index.ts +171 -0
- package/src/embedded/ingest.ts +262 -0
- package/src/embedded/kgqa/answer-paths.ts +197 -0
- package/src/embedded/kgqa/entity-link.ts +13 -0
- package/src/embedded/kgqa/intent.ts +14 -0
- package/src/embedded/kgqa/predicates.ts +9 -0
- package/src/embedded/kgqa/preference.ts +20 -0
- package/src/embedded/kgqa/select.ts +99 -0
- package/src/embedded/kgqa/text.ts +16 -0
- package/src/embedded/kgqa/types.ts +6 -0
- package/src/embedded/kgqa-service.ts +122 -0
- package/src/embedded/llm-extractor.ts +10 -0
- package/src/embedded/local-embeddings.ts +36 -0
- package/src/embedded/personal.ts +100 -0
- package/src/embedded/reranker.ts +62 -0
- package/src/embedded/retrieval/decay-stage.ts +59 -0
- package/src/embedded/retrieval/diversity.ts +37 -0
- package/src/embedded/retrieval/fuse.ts +52 -0
- package/src/embedded/retrieval/graph-stage.ts +45 -0
- package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
- package/src/embedded/retrieval/keyword-stage.ts +27 -0
- package/src/embedded/retrieval/passage.ts +44 -0
- package/src/embedded/retrieval/rerank-stage.ts +31 -0
- package/src/embedded/retrieval/trace.ts +31 -0
- package/src/embedded/retrieval/vector-stage.ts +15 -0
- package/src/embedded/sqlite-graph-provider.ts +119 -0
- package/src/embedded/sqlite-store.ts +95 -0
- package/src/embedded/sqlite-vec-service.ts +122 -0
- package/src/embedded/store/chunks.ts +61 -0
- package/src/embedded/store/fts.ts +50 -0
- package/src/embedded/store/nodes-edges.ts +112 -0
- package/src/embedded/store/salience.ts +37 -0
- package/src/embedded/store/schema.ts +109 -0
- package/src/embedded/transformers-embeddings.ts +100 -0
- package/src/embeddings.ts +51 -0
- package/src/eval/goldset.ts +46 -0
- package/src/eval/harness.ts +65 -0
- package/src/eval/metrics.ts +38 -0
- package/src/http/server.ts +93 -0
- package/src/index.ts +44 -0
- package/src/install/index.ts +139 -0
- package/src/install/platforms.ts +126 -0
- package/src/install/skill.ts +46 -0
- package/src/install/writers.ts +82 -0
- package/src/mcp/backend.ts +129 -0
- package/src/mcp/server.ts +83 -0
- package/src/mcp/tools/context-about.ts +69 -0
- package/src/mcp/tools/context-graph.ts +23 -0
- package/src/mcp/tools/context-ingest.ts +88 -0
- package/src/mcp/tools/context-rebuild.ts +22 -0
- package/src/mcp/tools/context-relate.ts +88 -0
- package/src/mcp/tools/get-context.ts +52 -0
- package/src/mcp/tools/index.ts +40 -0
- package/src/mcp/tools/types.ts +33 -0
- package/src/permission.ts +72 -0
- package/src/provider.ts +65 -0
- package/src/qdrant-vector.ts +76 -0
- package/src/retrieval.ts +218 -0
- package/src/service.ts +40 -0
- package/src/types.ts +91 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
// Embedded store schema (bun:sqlite). A generic property-graph (nodes + edges)
|
|
2
|
+
// plus a chunks table holding payloads and dense vectors. One file = the whole
|
|
3
|
+
// knowledge base - no servers. This is what makes the single-binary path work.
|
|
4
|
+
//
|
|
5
|
+
// Vector search adapts: if an extension-capable SQLite is available it loads
|
|
6
|
+
// sqlite-vec and maintains a `vec_chunks` ANN index (the zvec-style fast path,
|
|
7
|
+
// in-process, same file); otherwise it transparently falls back to brute-force
|
|
8
|
+
// cosine. Either way the public API and the VectorDBService interface are identical.
|
|
9
|
+
//
|
|
10
|
+
// This file is a thin FACADE: it owns the single bun:sqlite Database and delegates
|
|
11
|
+
// to focused modules under ./store/ (schema/migrations, graph nodes+edges, chunks+
|
|
12
|
+
// vec ANN, FTS5, salience). Pure structural refactor - identical SQL, identical
|
|
13
|
+
// behavior. The public surface of SqliteStore is preserved exactly.
|
|
14
|
+
|
|
15
|
+
import { Database } from "bun:sqlite"
|
|
16
|
+
import { migrate, tryEnableExtensions, tryLoadVec } from "./store/schema"
|
|
17
|
+
import * as graph from "./store/nodes-edges"
|
|
18
|
+
import * as fts from "./store/fts"
|
|
19
|
+
import { ChunkRepo } from "./store/chunks"
|
|
20
|
+
import * as salience from "./store/salience"
|
|
21
|
+
|
|
22
|
+
/** Free-form, string-keyed property bag (serialised with JSON.stringify when stored on a node). */
export type Json = { [key: string]: unknown }
|
|
23
|
+
|
|
24
|
+
/**
 * Thin facade over one bun:sqlite database. It owns the connection and forwards
 * every operation to the focused modules under ./store/ (graph nodes/edges,
 * chunks + vec ANN, FTS5, salience).
 */
export class SqliteStore {
  /** The underlying connection; exposed so collaborating services can run their own queries. */
  readonly db: Database
  /** True when sqlite-vec loaded successfully on this connection. */
  readonly vecEnabled: boolean
  /** True when the FTS5 keyword index could be created/opened. */
  readonly ftsEnabled: boolean
  private readonly chunks: ChunkRepo

  /**
   * Opens (or creates) the database, applies migrations and probes optional features.
   *
   * @param path Database file path; defaults to an in-memory database.
   * @throws Re-throws any error raised by the WAL pragma or the migrations. The
   *   freshly opened handle is closed first so a failed construction does not
   *   leak an open connection.
   */
  constructor(path = ":memory:") {
    // Must run before the first Database is opened (see tryEnableExtensions).
    tryEnableExtensions()
    const db = new Database(path)
    try {
      db.exec("PRAGMA journal_mode = WAL;")
      migrate(db)
      this.vecEnabled = tryLoadVec(db)
      this.ftsEnabled = fts.tryEnableFts(db)
    } catch (err) {
      // Nobody else holds a reference to `db` yet, so release it here.
      db.close()
      throw err
    }
    this.db = db
    this.chunks = new ChunkRepo(db, this.vecEnabled, this.ftsEnabled)
  }

  // ── Graph: nodes & edges ────────────────────────────────────────────────
  /** Insert or replace the node `id` in collection `coll` with the given data. */
  addNode(id: string, coll: string, data: Json = {}): void {
    graph.addNode(this.db, id, coll, data)
  }

  /** Merge-on-upsert an edge `src -[label]-> dst`; see graph.addEdge for the merge rules. */
  addEdge(src: string, dst: string, label: string, opts: { weight?: number; validAt?: number; recordId?: string; confidence?: number } = {}): void {
    graph.addEdge(this.db, src, dst, label, opts)
  }

  /** Remove everything the record `recordId` contributed to the graph (used before re-ingesting it). */
  clearRecordContributions(recordId: string): void {
    graph.clearRecordContributions(this.db, recordId)
  }

  /**
   * Close the validity interval of live `src -[label]->` edges whose target is not `keepDst`.
   * @returns Number of edges that were superseded.
   */
  supersedeEdge(src: string, label: string, keepDst: string, atTime = Date.now()): number {
    return graph.supersedeEdge(this.db, src, label, keepDst, atTime)
  }

  /** Attribute un-provenanced concept edges to the records mentioning both endpoints; returns the number repaired. */
  backfillEdgeProvenance(): number {
    return graph.backfillEdgeProvenance(this.db)
  }

  // ── Salience / decay ────────────────────────────────────────────────────
  /** Look up recency / frequency / importance for the given record ids. */
  recordSalience(recordIds: string[]): Map<string, { lastAccessedAt: number; accessCount: number; importance: number }> {
    return salience.recordSalience(this.db, recordIds)
  }

  /** Mark the given records as just accessed. */
  touchRecords(recordIds: string[]): void {
    salience.touchRecords(this.db, recordIds)
  }

  // ── Chunks + vec0 ANN ───────────────────────────────────────────────────
  /** Persist a chunk and index it in whichever of the vec / FTS indexes are enabled. */
  addChunk(pointId: string, virtualRecordId: string, orgId: string, content: string, embedding: number[]): void {
    this.chunks.addChunk(pointId, virtualRecordId, orgId, content, embedding)
  }

  /** ANN lookup over the vec index; returns [] when the index cannot serve the query. */
  knnSearch(queryVec: number[], k: number): Array<{ rowid: number; distance: number }> {
    return this.chunks.knnSearch(queryVec, k)
  }

  /** Drop the ANN index. */
  resetVec(): void {
    this.chunks.resetVec()
  }

  // ── FTS5 keyword index ──────────────────────────────────────────────────
  /** BM25 keyword search; returns chunk rowids, best first. */
  ftsSearch(query: string, k: number): number[] {
    return fts.ftsSearch(this.db, this.ftsEnabled, query, k)
  }

  /** Drop and recreate the keyword index. */
  resetFts(): void {
    fts.resetFts(this.db, this.ftsEnabled)
  }

  /** Close the underlying database connection. */
  close(): void {
    this.db.close()
  }
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
// VectorDBService over bun:sqlite. Uses the sqlite-vec ANN index when the store
|
|
2
|
+
// has it (fast path), else brute-force cosine - same results, same interface.
|
|
3
|
+
// Honors the must/should filter the retrieval spine builds: a point passes if it
|
|
4
|
+
// matches all `must` AND (no `should` OR matches a `should`). The `should` on
|
|
5
|
+
// virtualRecordId is the ACL restriction to accessible records, applied AFTER the
|
|
6
|
+
// ANN candidates come back (over-fetched) so recall holds under filtering.
|
|
7
|
+
|
|
8
|
+
import type { VectorDBService, VectorPoint, VectorQueryResult } from "../provider"
|
|
9
|
+
import type { SqliteStore } from "./sqlite-store"
|
|
10
|
+
|
|
11
|
+
/**
 * Filter handed to the embedded vector service. The service reads `must.orgId`
 * and `should.virtualRecordId` from it when answering a query.
 */
type EmbeddedFilter = {
  /** Conditions every returned point has to satisfy. */
  must?: Record<string, unknown>
  /** Conditions of which a point has to satisfy one, when present. */
  should?: Record<string, unknown>
}
|
|
15
|
+
|
|
16
|
+
/**
 * Cosine similarity of two vectors, computed over their common prefix
 * (the shorter length wins). Returns 0 when either prefix has zero norm.
 */
function cosine(a: number[], b: number[]): number {
  const len = Math.min(a.length, b.length)
  let dot = 0
  let normA = 0
  let normB = 0
  let i = 0
  while (i < len) {
    const x = a[i]
    const y = b[i]
    dot += x * y
    normA += x * x
    normB += y * y
    i += 1
  }
  // Guard the division: an all-zero (or empty) vector has no direction.
  return normA === 0 || normB === 0 ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB))
}
|
|
29
|
+
|
|
30
|
+
/**
 * VectorDBService backed by the embedded SqliteStore. Answers nearest-point
 * queries from the vec ANN index when it yields results and otherwise scans
 * the `chunks` table and scores rows with cosine similarity.
 */
export class SqliteVecService implements VectorDBService {
  constructor(private readonly store: SqliteStore) {}

  /** Packages the must/should conditions into the filter object later passed back in a request. */
  async filterCollection(args: {
    must?: Record<string, unknown>
    should?: Record<string, unknown>
  }): Promise<EmbeddedFilter> {
    const { must, should } = args
    return { must, should }
  }

  /**
   * Answers each request independently and returns one `{ points }` result per request,
   * in request order. `collectionName` is not consulted by this implementation.
   */
  async queryNearestPoints(args: {
    collectionName: string
    requests: unknown[]
  }): Promise<VectorQueryResult[]> {
    return args.requests.map((raw) => ({ points: this.resolvePoints(raw) }))
  }

  /** Decodes one request, tries the ANN path, and falls back to the scan when ANN yields nothing. */
  private resolvePoints(raw: unknown): VectorPoint[] {
    const req = raw as {
      prefetch?: Array<{ query: unknown; using?: string }>
      filter?: EmbeddedFilter
      limit?: number
    }
    // Prefer the prefetch entry tagged "dense"; otherwise take the first entry's query.
    const dense = (req.prefetch?.find((p) => p.using === "dense")?.query ?? req.prefetch?.[0]?.query) as number[]
    const filter = req.filter ?? {}
    const limit = req.limit ?? 20
    const mustOrg = filter.must?.["orgId"] as string | undefined
    const allowedVids = filter.should?.["virtualRecordId"] as string[] | undefined
    const allowed = allowedVids ? new Set(allowedVids) : undefined

    if (this.store.vecEnabled && dense) {
      const viaIndex = this.annQuery(dense, mustOrg, allowed, limit)
      if (viaIndex.length > 0) return viaIndex
    }
    // Empty ANN answer (index missing / unpopulated / everything filtered out): scan instead,
    // so rows that exist in `chunks` are never missed.
    return this.bruteForce(dense, mustOrg, allowed, limit)
  }

  /** True when a chunk row satisfies the org restriction and the allowed-record restriction. */
  private admits(
    row: { org_id: string; virtual_record_id: string },
    mustOrg: string | undefined,
    allowed: Set<string> | undefined,
  ): boolean {
    if (mustOrg != null && row.org_id !== mustOrg) return false
    if (allowed && !allowed.has(row.virtual_record_id)) return false
    return true
  }

  /** Builds the point returned to callers from a chunk row and its score. */
  private toPoint(
    row: { point_id: string; virtual_record_id: string; org_id: string; content: string },
    score: number,
  ): VectorPoint {
    return {
      id: row.point_id,
      score,
      payload: { page_content: row.content, metadata: { virtualRecordId: row.virtual_record_id, orgId: row.org_id } },
    }
  }

  // Fast path: over-fetch ANN candidates, then apply the org / allowed-record filter.
  private annQuery(
    dense: number[],
    mustOrg: string | undefined,
    allowed: Set<string> | undefined,
    limit: number,
  ): VectorPoint[] {
    const neighbours = this.store.knnSearch(dense, Math.max(limit * 20, 50))
    if (neighbours.length === 0) return []

    const rowids = neighbours.map((n) => n.rowid)
    const distanceOf = new Map<number, number>()
    for (const n of neighbours) distanceOf.set(n.rowid, n.distance)

    const placeholders = rowids.map(() => "?").join(",")
    const rows = this.store.db
      .query(`SELECT rowid, point_id, virtual_record_id, org_id, content FROM chunks WHERE rowid IN (${placeholders})`)
      .all(...rowids) as Array<{
      rowid: number
      point_id: string
      virtual_record_id: string
      org_id: string
      content: string
    }>

    const hits = rows
      .filter((row) => this.admits(row, mustOrg, allowed))
      // cosine distance → similarity; a row with no recorded distance scores 0
      .map((row) => this.toPoint(row, 1 - (distanceOf.get(row.rowid) ?? 1)))
    hits.sort((x, y) => y.score - x.score)
    return hits.slice(0, limit)
  }

  // Fallback: read every chunk and score it in TS (no extension required).
  private bruteForce(
    dense: number[] | undefined,
    mustOrg: string | undefined,
    allowed: Set<string> | undefined,
    limit: number,
  ): VectorPoint[] {
    const rows = this.store.db
      .query("SELECT point_id, virtual_record_id, org_id, content, embedding FROM chunks")
      .all() as Array<{ point_id: string; virtual_record_id: string; org_id: string; content: string; embedding: string }>

    const ranked = rows
      .filter((row) => this.admits(row, mustOrg, allowed))
      // Without a query vector every surviving row scores 0.
      .map((row) => this.toPoint(row, dense ? cosine(dense, JSON.parse(row.embedding) as number[]) : 0))
    ranked.sort((x, y) => y.score - x.score)
    return ranked.slice(0, limit)
  }
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
// Chunk persistence + vec0 ANN index + knnSearch. Pure structural extraction from
|
|
2
|
+
// sqlite-store.ts - identical SQL, identical behavior.
|
|
3
|
+
//
|
|
4
|
+
// The vec0 ANN table is created lazily once we know the embedding dimension, so the
|
|
5
|
+
// mutable `vecDim` state lives in a small repo object owned by the facade.
|
|
6
|
+
|
|
7
|
+
import { Database } from "bun:sqlite"
|
|
8
|
+
import { indexChunkFts } from "./fts"
|
|
9
|
+
|
|
10
|
+
/**
 * Chunk persistence plus maintenance of the optional vec0 ANN index and the
 * FTS5 keyword index. Both secondary indexes are keyed by the chunk's rowid.
 */
export class ChunkRepo {
  /** Dimension the vec0 table was (lazily) created with by this instance; 0 = not created yet. */
  private vecDim = 0

  constructor(
    private readonly db: Database,
    private readonly vecEnabled: boolean,
    private readonly ftsEnabled: boolean,
  ) {}

  // The vec0 ANN table is created lazily once we know the embedding dimension.
  // Only the first call after construction / resetVec() has any effect.
  private ensureVec(dim: number): void {
    if (!this.vecEnabled || this.vecDim) return
    this.db.exec(`CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(embedding float[${dim}] distance_metric=cosine)`)
    this.vecDim = dim
  }

  /**
   * Insert or update the chunk identified by `pointId` and refresh its entries
   * in the enabled secondary indexes.
   *
   * The write is an upsert, not `INSERT OR REPLACE`: REPLACE deletes the old row
   * and inserts a new one under a different rowid, which left the previous
   * rowid's vec_chunks / chunks_fts entries behind as orphans. Updating in place
   * keeps the rowid stable, so the delete-then-insert below always targets the
   * entry that belongs to this chunk.
   *
   * @param pointId Primary key of the chunk.
   * @param virtualRecordId Record the chunk belongs to.
   * @param orgId Owning organisation.
   * @param content Chunk text (also what gets keyword-indexed).
   * @param embedding Dense vector, stored as JSON text.
   */
  addChunk(pointId: string, virtualRecordId: string, orgId: string, content: string, embedding: number[]): void {
    const vector = JSON.stringify(embedding)
    const res = this.db
      .query(
        `INSERT INTO chunks (point_id, virtual_record_id, org_id, content, embedding) VALUES (?, ?, ?, ?, ?)
         ON CONFLICT(point_id) DO UPDATE SET
           virtual_record_id = excluded.virtual_record_id,
           org_id = excluded.org_id,
           content = excluded.content,
           embedding = excluded.embedding`,
      )
      .run(pointId, virtualRecordId, orgId, content, vector)
    // last_insert_rowid is not refreshed when the upsert takes the UPDATE branch,
    // so read the rowid back explicitly.
    const row = this.db.query("SELECT rowid FROM chunks WHERE point_id = ?").get(pointId) as { rowid: number } | null
    const rowid = row?.rowid ?? Number(res.lastInsertRowid)
    if (this.vecEnabled) {
      this.ensureVec(embedding.length)
      this.db.query("DELETE FROM vec_chunks WHERE rowid = ?").run(rowid)
      this.db.query("INSERT INTO vec_chunks(rowid, embedding) VALUES (?, ?)").run(rowid, vector)
    }
    if (this.ftsEnabled) {
      indexChunkFts(this.db, rowid, content)
    }
  }

  /** ANN KNN over the vec0 index → [{rowid, distance}] (cosine distance). Returns
   *  [] if the index isn't present/usable (e.g. data written by a non-vec store, a
   *  query-only process, or a dim mismatch) - the caller then falls back to brute-force. */
  knnSearch(queryVec: number[], k: number): Array<{ rowid: number; distance: number }> {
    if (!this.vecEnabled) return []
    try {
      return this.db
        .query("SELECT rowid, distance FROM vec_chunks WHERE embedding MATCH ? ORDER BY distance LIMIT ?")
        .all(JSON.stringify(queryVec), k) as Array<{ rowid: number; distance: number }>
    } catch {
      return [] // vec_chunks missing or incompatible → brute-force handles it
    }
  }

  /** Drop the ANN index (e.g. before reindexing with a different embedder/dim). */
  resetVec(): void {
    if (!this.vecEnabled) return
    this.db.exec("DROP TABLE IF EXISTS vec_chunks")
    this.vecDim = 0
  }
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// FTS5 build/search/reset. Pure structural extraction from sqlite-store.ts -
|
|
2
|
+
// identical SQL, identical behavior.
|
|
3
|
+
|
|
4
|
+
import { Database } from "bun:sqlite"
|
|
5
|
+
|
|
6
|
+
// BM25 keyword index (SQLite FTS5, built-in - no extension needed). Complements the
|
|
7
|
+
// dense vector index: FTS5 nails exact tokens dense embeddings miss (acronyms "SAP",
|
|
8
|
+
// numbers "£230M", proper nouns). Backfills existing chunks on first open so hybrid
|
|
9
|
+
// search works on prior data without a reindex.
|
|
10
|
+
/**
 * Ensure the `chunks_fts` FTS5 table exists and, when it is empty while `chunks`
 * already holds rows, copy every chunk into it.
 *
 * @returns true when the index is usable; false if any step throws (treated as "FTS5 unavailable").
 */
export function tryEnableFts(db: Database): boolean {
  const countRows = (sql: string): number => (db.query(sql).get() as { c: number }).c
  try {
    db.exec("CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(content)")
    const indexed = countRows("SELECT count(*) c FROM chunks_fts")
    const stored = countRows("SELECT count(*) c FROM chunks")
    const needsBackfill = indexed === 0 && stored > 0
    if (needsBackfill) {
      db.exec("INSERT INTO chunks_fts(rowid, content) SELECT rowid, content FROM chunks")
    }
  } catch {
    // FTS5 unavailable → dense-only, hybrid degrades gracefully
    return false
  }
  return true
}
|
|
22
|
+
|
|
23
|
+
// Upsert a chunk row into the FTS index (called as part of addChunk).
|
|
24
|
+
/**
 * Replace the keyword-index entry for `rowid` with `content`
 * (delete any existing entry, then insert the new one).
 */
export function indexChunkFts(db: Database, rowid: number, content: string): void {
  const steps: Array<[string, Array<string | number>]> = [
    ["DELETE FROM chunks_fts WHERE rowid = ?", [rowid]],
    ["INSERT INTO chunks_fts(rowid, content) VALUES (?, ?)", [rowid, content]],
  ]
  for (const [sql, params] of steps) {
    db.query(sql).run(...params)
  }
}
|
|
28
|
+
|
|
29
|
+
/** BM25 keyword search → chunk rowids in relevance order (best first). Each query
|
|
30
|
+
* token is matched as a quoted literal OR-joined, so punctuation/special chars are
|
|
31
|
+
* safe and ANY matching term contributes (high recall). [] if FTS is unavailable. */
|
|
32
|
+
/**
 * BM25 keyword search over `chunks_fts`.
 *
 * The query is lower-cased and split into tokens made of letters, digits and
 * `£ $ € % . -`. Tokens with fewer than two letters/digits are discarded. The
 * rest are quoted and OR-joined, so any matching term contributes.
 *
 * The "two significant characters" check counts Unicode letters and digits.
 * It previously counted only ASCII `[a-z0-9]`, which silently discarded every
 * non-Latin term (e.g. CJK or Cyrillic) even though the tokenizer had
 * accepted it.
 *
 * @param db Open database containing `chunks_fts`.
 * @param ftsEnabled Whether the FTS index is available; when false the result is [].
 * @param query Free-text user query.
 * @param k Maximum number of rowids to return.
 * @returns Chunk rowids, best match first; [] when there are no usable terms or the query fails.
 */
export function ftsSearch(db: Database, ftsEnabled: boolean, query: string, k: number): number[] {
  if (!ftsEnabled) return []
  const tokens = query.toLowerCase().match(/[\p{L}\p{N}£$€%.\-]+/gu) ?? []
  // Drop tokens that are (almost) pure punctuation, e.g. "-" or "%.".
  const terms = tokens.filter((t) => t.replace(/[^\p{L}\p{N}]/gu, "").length >= 2)
  if (!terms.length) return []
  // Quote each term so FTS5 treats it as a literal rather than query syntax.
  const expr = terms.map((t) => `"${t.replace(/"/g, "")}"`).join(" OR ")
  try {
    const rows = db
      .query("SELECT rowid FROM chunks_fts WHERE chunks_fts MATCH ? ORDER BY bm25(chunks_fts) LIMIT ?")
      .all(expr, k) as Array<{ rowid: number }>
    return rows.map((r) => r.rowid)
  } catch {
    // Best-effort: a failing keyword query degrades to "no keyword hits".
    return []
  }
}
|
|
45
|
+
|
|
46
|
+
/** Drop + recreate the BM25 index (e.g. before a full reindex). */
|
|
47
|
+
/**
 * Drop the `chunks_fts` table and recreate it empty. Does nothing when FTS is disabled.
 */
export function resetFts(db: Database, ftsEnabled: boolean): void {
  if (ftsEnabled) {
    db.exec("DROP TABLE IF EXISTS chunks_fts; CREATE VIRTUAL TABLE chunks_fts USING fts5(content);")
  }
}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
// Graph vertex/edge CRUD + provenance + supersession. Pure structural extraction
|
|
2
|
+
// from sqlite-store.ts - identical SQL, identical behavior.
|
|
3
|
+
|
|
4
|
+
import { Database } from "bun:sqlite"
|
|
5
|
+
import type { Json } from "../sqlite-store"
|
|
6
|
+
|
|
7
|
+
/**
 * Insert the node `id` into collection `coll`, replacing any existing row with
 * the same id. `data` is stored as JSON text.
 */
export function addNode(db: Database, id: string, coll: string, data: Json = {}): void {
  const upsert = db.query("INSERT OR REPLACE INTO nodes (id, coll, data) VALUES (?, ?, ?)")
  const payload = JSON.stringify(data)
  upsert.run(id, coll, payload)
}
|
|
10
|
+
|
|
11
|
+
// Merge-on-upsert (LightRAG): one row per (src,dst,label). Re-asserting an edge
|
|
12
|
+
// ACCUMULATES weight, refreshes its validity (revives a previously superseded
|
|
13
|
+
// edge), and unions provenance - the graph gets denser per write, never duplicated.
|
|
14
|
+
/**
 * Upsert the edge `src -[label]-> dst`.
 *
 * A new edge is inserted live (no invalid_at / expired_at) with empty provenance.
 * When the (src, dst, label) row already exists, the incoming weight is added to
 * the stored one, invalid_at / expired_at are cleared, an already-set valid_at is
 * kept, and the higher confidence wins.
 *
 * @param opts.weight Weight to insert / add (default 1).
 * @param opts.validAt Start of validity; NULL is stored when omitted.
 * @param opts.recordId When given, the record is unioned into the edge's provenance.
 * @param opts.confidence Confidence to store (default 1).
 */
export function addEdge(
  db: Database,
  src: string,
  dst: string,
  label: string,
  opts: { weight?: number; validAt?: number; recordId?: string; confidence?: number } = {},
): void {
  const createdAt = Date.now()
  const upsert = db.query(
    `INSERT INTO edges (src, dst, label, weight, created_at, valid_at, invalid_at, expired_at, provenance, confidence)
     VALUES (?, ?, ?, ?, ?, ?, NULL, NULL, '[]', ?)
     ON CONFLICT(src, dst, label) DO UPDATE SET
       weight = weight + excluded.weight,
       expired_at = NULL,
       invalid_at = NULL,
       valid_at = COALESCE(edges.valid_at, excluded.valid_at),
       confidence = MAX(edges.confidence, excluded.confidence)`,
  )
  const weight = opts.weight ?? 1
  const validAt = opts.validAt ?? null
  const confidence = opts.confidence ?? 1
  upsert.run(src, dst, label, weight, createdAt, validAt, confidence)

  if (!opts.recordId) return
  addProvenance(db, src, dst, label, opts.recordId)
}
|
|
36
|
+
|
|
37
|
+
// Union a source record id into an edge's provenance list (which records asserted it).
|
|
38
|
+
/**
 * Add `recordId` to the JSON provenance list of the edge (src, dst, label).
 * No-op when the edge does not exist or already lists the record.
 */
function addProvenance(db: Database, src: string, dst: string, label: string, recordId: string): void {
  const existing = db.query("SELECT provenance FROM edges WHERE src = ? AND dst = ? AND label = ?").get(src, dst, label) as
    | { provenance: string }
    | undefined
  if (!existing) return

  const sources = new Set<string>(JSON.parse(existing.provenance) as string[])
  const sizeBefore = sources.size
  sources.add(recordId)
  // Size unchanged ⇒ the record was already present, nothing to write.
  if (sources.size === sizeBefore) return

  const serialized = JSON.stringify(Array.from(sources))
  db.query("UPDATE edges SET provenance = ? WHERE src = ? AND dst = ? AND label = ?").run(serialized, src, dst, label)
}
|
|
48
|
+
|
|
49
|
+
// Source-keyed replace-on-reingest (Graphify build_merge): before re-ingesting a
|
|
50
|
+
// record, drop everything IT contributed so facts removed from the source get
|
|
51
|
+
// garbage-collected (an entity-id-keyed upsert alone would orphan them forever).
|
|
52
|
+
// Removes the record's `mentions` edges, and strips the record from every concept
|
|
53
|
+
// edge's provenance - deleting the edge if no other record still asserts it, else
|
|
54
|
+
// lowering its weight to the surviving source count.
|
|
55
|
+
/**
 * Remove what `recordId` contributed to the graph:
 *  1. delete its outgoing `mentions` edges;
 *  2. strip it from the provenance of every edge that lists it, deleting the
 *     edge when no other record remains, otherwise setting the weight to the
 *     number of remaining records.
 */
export function clearRecordContributions(db: Database, recordId: string): void {
  db.query("DELETE FROM edges WHERE src = ? AND label = 'mentions'").run(recordId)

  // Coarse LIKE pre-filter on the JSON text; exact membership is re-checked below.
  const needle = `%${JSON.stringify(recordId).slice(1, -1)}%`
  const candidates = db
    .query("SELECT src, dst, label, provenance FROM edges WHERE provenance LIKE ?")
    .all(needle) as Array<{ src: string; dst: string; label: string; provenance: string }>

  for (const edge of candidates) {
    const before = JSON.parse(edge.provenance) as string[]
    const remaining = before.filter((source) => source !== recordId)
    const listedThisRecord = remaining.length !== before.length
    if (!listedThisRecord) continue // substring hit only, not actually this record

    if (remaining.length > 0) {
      db
        .query("UPDATE edges SET provenance = ?, weight = ? WHERE src = ? AND dst = ? AND label = ?")
        .run(JSON.stringify(remaining), remaining.length, edge.src, edge.dst, edge.label)
    } else {
      db.query("DELETE FROM edges WHERE src = ? AND dst = ? AND label = ?").run(edge.src, edge.dst, edge.label)
    }
  }
}
|
|
72
|
+
|
|
73
|
+
// Non-destructive supersession (Graphiti): close the validity interval on every
|
|
74
|
+
// LIVE edge from `src` with this `label` whose target ISN'T `keepDst`. Used for a
|
|
75
|
+
// FUNCTIONAL relation (single-valued: lives_in, works_at) when a newer fact arrives.
|
|
76
|
+
// The old edge stays in the table with invalid_at/expired_at set - history intact.
|
|
77
|
+
/**
 * Mark as superseded every live edge (expired_at IS NULL) leaving `src` with
 * `label` whose target differs from `keepDst`. Rows are updated, not deleted.
 *
 * @param atTime Value written to invalid_at (defaults to now); expired_at always gets the current time.
 * @returns Number of rows updated.
 */
export function supersedeEdge(db: Database, src: string, label: string, keepDst: string, atTime = Date.now()): number {
  const closeLiveEdges = db.query(
    `UPDATE edges SET invalid_at = ?, expired_at = ?
     WHERE src = ? AND label = ? AND dst != ? AND expired_at IS NULL`,
  )
  const { changes } = closeLiveEdges.run(atTime, Date.now(), src, label, keepDst)
  return Number(changes)
}
|
|
86
|
+
|
|
87
|
+
// Backfill provenance for LEGACY concept edges that predate provenance tracking
|
|
88
|
+
// (migrated/older data has provenance '[]'). With per-edge ACL now fail-closed, an
|
|
89
|
+
// un-provenanced edge would be hidden from everyone - so attribute each to the
|
|
90
|
+
// records that mention BOTH of its endpoints (where the relationship was extracted),
|
|
91
|
+
// restoring it to the correct permission scope. Returns the number repaired.
|
|
92
|
+
/**
 * For every edge whose provenance is still '[]' (excluding the 'mentions',
 * 'permissions', 'belongsTo' and 'inheritPermissions' labels), set the provenance
 * to the distinct records that have a `mentions` edge to BOTH of its endpoints.
 * Edges with no such record are left untouched.
 *
 * @returns Number of edges whose provenance was written.
 */
export function backfillEdgeProvenance(db: Database): number {
  const orphans = db
    .query(
      `SELECT src, dst, label FROM edges
       WHERE provenance = '[]' AND label NOT IN ('mentions','permissions','belongsTo','inheritPermissions')`,
    )
    .all() as Array<{ src: string; dst: string; label: string }>

  // Self-join on `mentions`: records pointing at both endpoints.
  const coMentions = db.query(
    `SELECT m1.src AS r FROM edges m1 JOIN edges m2 ON m1.src = m2.src
     WHERE m1.label = 'mentions' AND m2.label = 'mentions' AND m1.dst = ? AND m2.dst = ?`,
  )

  let fixed = 0
  for (const { src, dst, label } of orphans) {
    const hits = coMentions.all(src, dst) as Array<{ r: string }>
    const records = Array.from(new Set(hits.map((hit) => hit.r)))
    if (records.length === 0) continue
    db.query("UPDATE edges SET provenance = ? WHERE src = ? AND dst = ? AND label = ?").run(JSON.stringify(records), src, dst, label)
    fixed += 1
  }
  return fixed
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// Memory salience / decay ops. Pure structural extraction from sqlite-store.ts -
|
|
2
|
+
// identical SQL, identical behavior.
|
|
3
|
+
|
|
4
|
+
import { Database } from "bun:sqlite"
|
|
5
|
+
import { ph } from "./schema"
|
|
6
|
+
|
|
7
|
+
// Memory salience (Generative-Agents / ACT-R): per-record access recency, frequency,
|
|
8
|
+
// and importance - used to gently re-weight retrieval so fresh/important/often-used
|
|
9
|
+
// memories surface over stale ones. NEVER deletes; only dampens retrieval strength.
|
|
10
|
+
/**
 * Read salience fields for the given record nodes from their JSON `data`.
 *
 * Defaults applied in SQL: lastAccessedAt falls back to `createdAt`, then 0;
 * accessCount falls back to 0; importance falls back to 1. Ids without a
 * matching node in the 'records' collection are absent from the result.
 *
 * @returns Map keyed by record id; empty when `recordIds` is empty (no query is run).
 */
export function recordSalience(
  db: Database,
  recordIds: string[],
): Map<string, { lastAccessedAt: number; accessCount: number; importance: number }> {
  type Salience = { lastAccessedAt: number; accessCount: number; importance: number }
  if (recordIds.length === 0) return new Map<string, Salience>()

  const sql = `SELECT id,
       COALESCE(json_extract(data,'$.lastAccessedAt'), json_extract(data,'$.createdAt'), 0) la,
       COALESCE(json_extract(data,'$.accessCount'), 0) ac,
       COALESCE(json_extract(data,'$.importance'), 1) imp
     FROM nodes WHERE coll = 'records' AND id IN (${ph(recordIds.length)})`
  const rows = db.query(sql).all(...recordIds) as Array<{ id: string; la: number; ac: number; imp: number }>

  const entries = rows.map(
    (row): [string, Salience] => [row.id, { lastAccessedAt: row.la, accessCount: row.ac, importance: row.imp }],
  )
  return new Map<string, Salience>(entries)
}
|
|
28
|
+
|
|
29
|
+
/** Mark records as just-accessed (bump recency + frequency) - called on retrieval. */
|
|
30
|
+
/**
 * Stamp each listed record node with the current time as `lastAccessedAt` and
 * increment its `accessCount` (a missing count is treated as 0). The same
 * timestamp is used for all ids of one call.
 */
export function touchRecords(db: Database, recordIds: string[]): void {
  const accessedAt = Date.now()
  const bump = db.query(
    `UPDATE nodes SET data = json_set(json_set(data,'$.lastAccessedAt', ?), '$.accessCount',
       COALESCE(json_extract(data,'$.accessCount'),0) + 1) WHERE id = ? AND coll = 'records'`,
  )
  recordIds.forEach((recordId) => {
    bump.run(accessedAt, recordId)
  })
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
// Schema DDL + migrations + sqlite-vec load detection for the embedded store.
|
|
2
|
+
// Pure structural extraction from sqlite-store.ts - identical SQL, identical behavior.
|
|
3
|
+
|
|
4
|
+
import { Database } from "bun:sqlite"
|
|
5
|
+
import fs from "node:fs"
|
|
6
|
+
import * as sqliteVec from "sqlite-vec"
|
|
7
|
+
|
|
8
|
+
// setCustomSQLite must be called once, before any Database is opened. We point bun
|
|
9
|
+
// at an extension-capable SQLite (Homebrew / system) so sqlite-vec can load.
|
|
10
|
+
/** Set once the probe below has run, so it is attempted at most once per process. */
let triedCustomSqlite = false

/**
 * Point bun at the first SQLite shared library found among a few well-known
 * Homebrew / Linux locations. Does nothing on repeat calls, when none of the
 * paths exists, or when `setCustomSQLite` throws.
 */
export function tryEnableExtensions(): void {
  if (triedCustomSqlite) return
  triedCustomSqlite = true

  const searchPaths = [
    "/opt/homebrew/opt/sqlite/lib/libsqlite3.dylib",
    "/usr/local/opt/sqlite/lib/libsqlite3.dylib",
    "/usr/lib/x86_64-linux-gnu/libsqlite3.so",
    "/usr/lib/aarch64-linux-gnu/libsqlite3.so",
  ]
  for (const candidate of searchPaths) {
    if (!fs.existsSync(candidate)) continue
    try {
      ;(Database as unknown as { setCustomSQLite(p: string): void }).setCustomSQLite(candidate)
    } catch {
      /* already opened a DB, or unsupported - fall back to brute-force */
    }
    // Only the first existing candidate is ever tried.
    return
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Try to load the sqlite-vec extension into `db`.
 *
 * @returns true when `sqliteVec.load` succeeds, false when it throws.
 */
export function tryLoadVec(db: Database): boolean {
  let loaded = false
  try {
    sqliteVec.load(db)
    loaded = true
  } catch {
    // no extension support → brute-force path
  }
  return loaded
}
|
|
37
|
+
|
|
38
|
+
/**
 * Bring the database schema up to date. Every step is idempotent:
 * creates `nodes` and `chunks` (with their indexes) if missing, runs the edges
 * migration, then adds the `confidence` column to `edges` when it is absent.
 */
export function migrate(db: Database): void {
  const baseSchema = `
    CREATE TABLE IF NOT EXISTS nodes (
      id   TEXT PRIMARY KEY,
      coll TEXT NOT NULL,
      data TEXT NOT NULL DEFAULT '{}'
    );
    CREATE INDEX IF NOT EXISTS idx_nodes_coll ON nodes(coll);
    CREATE TABLE IF NOT EXISTS chunks (
      point_id          TEXT PRIMARY KEY,
      virtual_record_id TEXT,
      org_id            TEXT,
      content           TEXT,
      embedding         TEXT
    );
    CREATE INDEX IF NOT EXISTS idx_chunks_org ON chunks(org_id);
  `
  db.exec(baseSchema)
  migrateEdges(db)

  // Confidence tier (Graphify EXTRACTED=1.0 / INFERRED≈0.8 / AMBIGUOUS≈0.5) - how
  // sure we are that the relationship is real. Added only when missing so existing DBs upgrade.
  const edgeColumns = db.query("PRAGMA table_info(edges)").all() as Array<{ name: string }>
  const hasConfidence = edgeColumns.some((col) => col.name === "confidence")
  if (!hasConfidence) {
    db.exec("ALTER TABLE edges ADD COLUMN confidence REAL NOT NULL DEFAULT 1")
  }
}
|
|
61
|
+
|
|
62
|
+
// The edges table is a property-graph relation store shared by ACL (permissions/
|
|
63
|
+
// belongsTo), structure (mentions), and the concept graph (relates_to / typed verbs).
|
|
64
|
+
// It is bi-temporal + merge-on-upsert: one row per (src,dst,label), with `weight`
|
|
65
|
+
// accumulating across mentions (frequency≈confidence, LightRAG), `created_at` the
|
|
66
|
+
// ingest time, and (valid_at, invalid_at, expired_at) the Graphiti validity axes -
|
|
67
|
+
// a "live" edge has expired_at IS NULL. Contradictions INVALIDATE (close the
|
|
68
|
+
// interval); they never delete, so history is never lost.
|
|
69
|
+
/**
 * Create the `edges` table, or rebuild a legacy one that lacks `weight`.
 *
 * Returns immediately when `edges` already has a `weight` column. Otherwise a
 * new table is built as `edges_new`, legacy rows (if any) are folded into one
 * row per (src, dst, label) with the duplicate count as starting weight, the
 * legacy table is dropped, and `edges_new` is renamed to `edges` and indexed.
 *
 * The whole rebuild runs inside one transaction. Previously each statement
 * auto-committed, so an interruption after the copy but before the legacy
 * table was dropped left a populated `edges_new`; the next run then failed on
 * a primary-key conflict while copying again. For the same reason a leftover
 * `edges_new` is discarded when the legacy table still exists - in that state
 * it can only be a partial copy of `edges`. When there is no `edges` table, an
 * existing `edges_new` is kept and simply renamed.
 */
export function migrateEdges(db: Database): void {
  const cols = db.query("PRAGMA table_info(edges)").all() as Array<{ name: string }>
  const hasEdges = cols.length > 0
  const hasWeight = cols.some((c) => c.name === "weight")
  if (hasEdges && hasWeight) return // already current

  const rebuild = db.transaction(() => {
    // Stale leftover from an interrupted attempt; everything in it is re-derived below.
    if (hasEdges) db.exec("DROP TABLE IF EXISTS edges_new;")

    db.exec(`
      CREATE TABLE IF NOT EXISTS edges_new (
        src        TEXT NOT NULL,
        dst        TEXT NOT NULL,
        label      TEXT NOT NULL,
        weight     REAL NOT NULL DEFAULT 1,
        created_at INTEGER NOT NULL DEFAULT 0,
        valid_at   INTEGER,
        invalid_at INTEGER,
        expired_at INTEGER,
        provenance TEXT NOT NULL DEFAULT '[]',
        PRIMARY KEY (src, dst, label)
      );
    `)
    if (hasEdges) {
      // Fold any duplicate (src,dst,label) rows from the old schema into one,
      // carrying the duplicate COUNT forward as the starting weight.
      db.exec(`
        INSERT INTO edges_new (src, dst, label, weight, created_at)
          SELECT src, dst, label, COUNT(*), 0 FROM edges GROUP BY src, dst, label;
        DROP TABLE edges;
      `)
    }
    db.exec(`
      ALTER TABLE edges_new RENAME TO edges;
      CREATE INDEX IF NOT EXISTS idx_edges_src  ON edges(src, label);
      CREATE INDEX IF NOT EXISTS idx_edges_dst  ON edges(dst, label);
      CREATE INDEX IF NOT EXISTS idx_edges_live ON edges(label, expired_at);
    `)
  })
  rebuild()
}
|
|
105
|
+
|
|
106
|
+
// Small shared placeholder helper for IN (...) clauses.
|
|
107
|
+
/** Build `n` comma-separated `?` placeholders for an SQL `IN (...)` list ("" when n is 0). */
export function ph(n: number): string {
  const marks = Array.from({ length: n }).map(() => "?")
  return marks.join(",")
}
|