@100xprompt/chitta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +203 -0
  3. package/assets/rules/claude-md.md +9 -0
  4. package/assets/skill/SKILL.md +47 -0
  5. package/package.json +48 -0
  6. package/src/README.md +124 -0
  7. package/src/arango-client.ts +67 -0
  8. package/src/arango-graph-provider.ts +364 -0
  9. package/src/bin.ts +27 -0
  10. package/src/config-env.ts +53 -0
  11. package/src/embedded/authorizer.ts +89 -0
  12. package/src/embedded/cli.ts +86 -0
  13. package/src/embedded/code-extractor.ts +9 -0
  14. package/src/embedded/demo.ts +36 -0
  15. package/src/embedded/extract.ts +12 -0
  16. package/src/embedded/extractors/code.ts +308 -0
  17. package/src/embedded/extractors/deterministic.ts +63 -0
  18. package/src/embedded/extractors/llm.ts +151 -0
  19. package/src/embedded/extractors/text-hygiene.ts +54 -0
  20. package/src/embedded/extractors/types.ts +34 -0
  21. package/src/embedded/graph/acl-paths.ts +96 -0
  22. package/src/embedded/graph/adjacency.ts +61 -0
  23. package/src/embedded/graph/centrality.ts +23 -0
  24. package/src/embedded/graph/communities.ts +46 -0
  25. package/src/embedded/graph/cypher.ts +17 -0
  26. package/src/embedded/graph/impact.ts +24 -0
  27. package/src/embedded/graph/knowledge-graph.ts +108 -0
  28. package/src/embedded/graph/pagerank.ts +57 -0
  29. package/src/embedded/graph/sql-access.ts +13 -0
  30. package/src/embedded/graph/traversal.ts +73 -0
  31. package/src/embedded/graph/types.ts +35 -0
  32. package/src/embedded/graph-query.ts +126 -0
  33. package/src/embedded/index.ts +171 -0
  34. package/src/embedded/ingest.ts +262 -0
  35. package/src/embedded/kgqa/answer-paths.ts +197 -0
  36. package/src/embedded/kgqa/entity-link.ts +13 -0
  37. package/src/embedded/kgqa/intent.ts +14 -0
  38. package/src/embedded/kgqa/predicates.ts +9 -0
  39. package/src/embedded/kgqa/preference.ts +20 -0
  40. package/src/embedded/kgqa/select.ts +99 -0
  41. package/src/embedded/kgqa/text.ts +16 -0
  42. package/src/embedded/kgqa/types.ts +6 -0
  43. package/src/embedded/kgqa-service.ts +122 -0
  44. package/src/embedded/llm-extractor.ts +10 -0
  45. package/src/embedded/local-embeddings.ts +36 -0
  46. package/src/embedded/personal.ts +100 -0
  47. package/src/embedded/reranker.ts +62 -0
  48. package/src/embedded/retrieval/decay-stage.ts +59 -0
  49. package/src/embedded/retrieval/diversity.ts +37 -0
  50. package/src/embedded/retrieval/fuse.ts +52 -0
  51. package/src/embedded/retrieval/graph-stage.ts +45 -0
  52. package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
  53. package/src/embedded/retrieval/keyword-stage.ts +27 -0
  54. package/src/embedded/retrieval/passage.ts +44 -0
  55. package/src/embedded/retrieval/rerank-stage.ts +31 -0
  56. package/src/embedded/retrieval/trace.ts +31 -0
  57. package/src/embedded/retrieval/vector-stage.ts +15 -0
  58. package/src/embedded/sqlite-graph-provider.ts +119 -0
  59. package/src/embedded/sqlite-store.ts +95 -0
  60. package/src/embedded/sqlite-vec-service.ts +122 -0
  61. package/src/embedded/store/chunks.ts +61 -0
  62. package/src/embedded/store/fts.ts +50 -0
  63. package/src/embedded/store/nodes-edges.ts +112 -0
  64. package/src/embedded/store/salience.ts +37 -0
  65. package/src/embedded/store/schema.ts +109 -0
  66. package/src/embedded/transformers-embeddings.ts +100 -0
  67. package/src/embeddings.ts +51 -0
  68. package/src/eval/goldset.ts +46 -0
  69. package/src/eval/harness.ts +65 -0
  70. package/src/eval/metrics.ts +38 -0
  71. package/src/http/server.ts +93 -0
  72. package/src/index.ts +44 -0
  73. package/src/install/index.ts +139 -0
  74. package/src/install/platforms.ts +126 -0
  75. package/src/install/skill.ts +46 -0
  76. package/src/install/writers.ts +82 -0
  77. package/src/mcp/backend.ts +129 -0
  78. package/src/mcp/server.ts +83 -0
  79. package/src/mcp/tools/context-about.ts +69 -0
  80. package/src/mcp/tools/context-graph.ts +23 -0
  81. package/src/mcp/tools/context-ingest.ts +88 -0
  82. package/src/mcp/tools/context-rebuild.ts +22 -0
  83. package/src/mcp/tools/context-relate.ts +88 -0
  84. package/src/mcp/tools/get-context.ts +52 -0
  85. package/src/mcp/tools/index.ts +40 -0
  86. package/src/mcp/tools/types.ts +33 -0
  87. package/src/permission.ts +72 -0
  88. package/src/provider.ts +65 -0
  89. package/src/qdrant-vector.ts +76 -0
  90. package/src/retrieval.ts +218 -0
  91. package/src/service.ts +40 -0
  92. package/src/types.ts +91 -0
@@ -0,0 +1,62 @@
1
+ // Cross-encoder reranker - the final, highest-precision retrieval stage. After RRF
2
+ // fuses BM25+dense+graph into a high-RECALL candidate pool, a cross-encoder fixes the
3
+ // ORDERING: it jointly attends over (query, passage) and scores true relevance -
4
+ // exactly the dimension rank-only RRF is weakest at (+5-15 pts ranking precision in
5
+ // 2026 benchmarks). We use the distilled ms-marco-MiniLM-L-6-v2 (~22M params) via
6
+ // transformers.js/ONNX, int8 - same in-process, no-server footprint as our embedder.
7
+ //
8
+ // OPTIONAL (like @huggingface/transformers): if the model can't load, rank() returns
9
+ // null and the caller keeps the RRF order. Never throws, never blocks retrieval.
10
+
11
/** Contract for the optional final-stage relevance scorer. */
export interface Reranker {
  /**
   * Score each doc's relevance to the query (higher = better).
   * Resolves to one score per doc, in input order, or to null when the reranker
   * is unavailable so the caller can keep its existing ordering.
   */
  rank(query: string, docs: string[]): Promise<number[] | null>
}

// `||` rather than `??`: an empty CONTEXT_RERANK_MODEL also falls back to the default.
const DEFAULT_MODEL = process.env.CONTEXT_RERANK_MODEL || "Xenova/ms-marco-MiniLM-L-6-v2"

/**
 * Cross-encoder reranker backed by a lazily loaded @huggingface/transformers
 * sequence-classification model. Loading is attempted once; any failure flips
 * `failed` and every later `rank()` resolves to null instead of throwing.
 */
export class CrossEncoderReranker implements Reranker {
  private model: unknown | null = null
  private tokenizer: unknown | null = null
  private failed = false
  // Shared in-flight load so concurrent rank() calls trigger a single import/download.
  private loading: Promise<void> | null = null
  constructor(private readonly modelId: string = DEFAULT_MODEL) {}

  /** Load tokenizer + model on first use. Resolves to true only when both are ready. */
  private async ensure(): Promise<boolean> {
    if (this.failed) return false
    if (this.model && this.tokenizer) return true
    if (!this.loading) {
      this.loading = (async () => {
        try {
          const t: any = await import("@huggingface/transformers")
          this.tokenizer = await t.AutoTokenizer.from_pretrained(this.modelId)
          // NOTE(review): `quantized` is the transformers.js v2 option name; newer
          // releases select precision via `dtype` - confirm against the installed version.
          this.model = await t.AutoModelForSequenceClassification.from_pretrained(this.modelId, { quantized: true })
        } catch {
          this.failed = true // model unavailable (not downloaded / offline) → graceful no-op
        }
      })()
    }
    await this.loading
    return !this.failed && !!this.model && !!this.tokenizer
  }

  /**
   * Score every doc against the query.
   * @returns `[]` for no docs; one number per doc on success; null when the model is
   *   unavailable, inference throws, or the logits do not cover every doc with a
   *   usable number (so the caller never sorts on undefined/NaN).
   */
  async rank(query: string, docs: string[]): Promise<number[] | null> {
    if (docs.length === 0) return []
    if (!(await this.ensure())) return null
    try {
      const tok = this.tokenizer as any
      const model = this.model as any
      // cross-encoder: each example is (query, doc) via text_pair
      const inputs = tok(new Array(docs.length).fill(query), { text_pair: docs, padding: true, truncation: true })
      const { logits } = await model(inputs)
      const data = Array.from(logits.data as Iterable<number>)
      const cols = logits.dims[logits.dims.length - 1] as number
      // Reject malformed output instead of reading past the end of the tensor.
      if (!Number.isInteger(cols) || cols < 1 || data.length < docs.length * cols) return null
      // relevance logit = last column: the only value for a single-logit head,
      // the positive class for a 2-logit head.
      const out: number[] = []
      for (let i = 0; i < docs.length; i++) {
        const score = data[i * cols + (cols - 1)]
        if (typeof score !== "number" || Number.isNaN(score)) return null
        out.push(score)
      }
      return out
    } catch {
      return null
    }
  }
}
@@ -0,0 +1,59 @@
1
+ // Re-rank stage: PERSONAL BOOST + MEMORY DECAY / SALIENCE.
2
+ //
3
+ // Personal boost: a multiplier on the RRF score for records the user owns.
4
+ // NB: no magnet penalty by RECORD SIZE - that wrongly punishes a record for being
5
+ // thoroughly chunked (a 20-fact news page is not a "magnet"). Flooding is handled
6
+ // structurally by the DIVERSITY CAP (≤ maxPerRecord per record in results), and BM25
7
+ // already favors the term-matching chunk over generic ones.
8
+ //
9
+ // MEMORY DECAY / SALIENCE (Generative-Agents / ACT-R): gently re-weight by recency
10
+ // × access-frequency × importance so fresh/important/often-used memories surface
11
+ // over stale ones - NEVER deletes. Records with no timestamp (legacy) stay neutral.
12
+ import type { SqliteStore } from "../sqlite-store"
13
+ import type { FusedResult } from "./fuse"
14
+
15
/** Tunables for the personal-boost / decay / diversity part of the pipeline. */
export interface DecayConfig {
  /** Multiplier applied to the fused score of records owned by the querying user. */
  personalBoost: number
  /** Diversity cap: maximum results kept per record. */
  maxPerRecord: number
  /** Master switch for salience re-weighting and access tracking. */
  decayOn: boolean
  /** Exponential decay rate per day, i.e. ln(2) / half-life-in-days. */
  lambda: number
  /** Lower bound of the recency factor: floor + (1 - floor) * recency. */
  decayFloor: number
  /** Weight of the log-scaled access count in the access boost. */
  accessW: number
}

// Read a numeric env var; unset, blank or non-finite values yield `fallback`.
// (Plain `Number(env ?? d)` maps "" to 0 and garbage to NaN, which would zero
// out or NaN-poison every score it multiplies.)
function envNumber(name: string, fallback: number): number {
  const raw = process.env[name]
  if (raw === undefined || raw.trim() === "") return fallback
  const parsed = Number(raw)
  return Number.isFinite(parsed) ? parsed : fallback
}

/**
 * Build the decay configuration from CONTEXT_* environment variables.
 * Decay is on unless CONTEXT_DECAY is `0`, `false` or `off` (case-insensitive).
 * The half-life is clamped to at least one day before being turned into `lambda`.
 */
export function decayConfig(): DecayConfig {
  const decayOn = !/^(0|false|off)$/i.test(process.env.CONTEXT_DECAY ?? "1")
  return {
    personalBoost: envNumber("CONTEXT_PERSONAL_BOOST", 1.2),
    maxPerRecord: envNumber("CONTEXT_MAX_PER_RECORD", 2),
    decayOn,
    lambda: Math.LN2 / Math.max(1, envNumber("CONTEXT_DECAY_HALFLIFE_DAYS", 60)),
    decayFloor: envNumber("CONTEXT_DECAY_FLOOR", 0.5),
    accessW: envNumber("CONTEXT_DECAY_ACCESS_W", 0.15),
  }
}
35
+
36
/**
 * Re-weight fused results in place, then order them best-first.
 *
 * Each item's `rrf` is multiplied by `cfg.personalBoost` when the record's ownerId
 * equals `userId`, and - when decay is on and the record has a positive
 * lastAccessedAt - by importance × floored recency × access boost. Records without
 * salience data are left untouched. Finally `merged` is sorted descending by `rrf`.
 */
export function decayStage(store: SqliteStore, merged: FusedResult[], userId: string, cfg: DecayConfig): void {
  const MS_PER_DAY = 86_400_000

  // Distinct, truthy record ids in first-seen order.
  const distinct = new Set<string>()
  for (const item of merged) {
    const id = item.metadata.recordId
    if (id) distinct.add(id as string)
  }
  const recIds = [...distinct]

  // recordId → ownerId, only for records that actually carry an owner.
  const owners = new Map<string, string>()
  if (recIds.length > 0) {
    const placeholders = recIds.map(() => "?").join(",")
    const ownerRows = store.db
      .query(`SELECT id, json_extract(data,'$.ownerId') o FROM nodes WHERE id IN (${placeholders})`)
      .all(...recIds) as Array<{ id: string; o: string | null }>
    for (const { id, o } of ownerRows) {
      if (o) owners.set(id, o)
    }
  }

  const now = Date.now()
  const salience = cfg.decayOn ? store.recordSalience(recIds) : null

  for (const item of merged) {
    const recordId = item.metadata.recordId as string
    if (owners.get(recordId) === userId) item.rrf *= cfg.personalBoost

    const stats = salience?.get(recordId)
    if (!stats || !(stats.lastAccessedAt > 0)) continue // no timestamp ⇒ stay neutral

    const ageDays = Math.max(0, (now - stats.lastAccessedAt) / MS_PER_DAY)
    const recency = Math.exp(-cfg.lambda * ageDays)
    const accessBoost = 1 + cfg.accessW * Math.log1p(stats.accessCount)
    item.rrf *= (stats.importance || 1) * (cfg.decayFloor + (1 - cfg.decayFloor) * recency) * accessBoost
  }

  merged.sort((left, right) => right.rrf - left.rrf)
}
@@ -0,0 +1,37 @@
1
+ // Diversity cap + passage extraction - no more than maxPerRecord chunks from any
2
+ // single record, with each surviving chunk reduced to its best matching passage.
3
+ import type { SqliteStore } from "../sqlite-store"
4
+ import type { SearchResult } from "../../types"
5
+ import type { FusedResult } from "./fuse"
6
+ import { bestPassage, queryTokens } from "./passage"
7
+
8
/**
 * Turn the ordered candidates into the final result list.
 *
 * Walks `ordered` best-first, skipping anything below `cutoff`, reduces each chunk
 * to its best matching passage (dropping chunks that yield none), keeps at most
 * `maxPerRecord` results per record, and stops once `topk` results are collected.
 * When `decayOn` is set, the records actually returned are reported to
 * `store.touchRecords`.
 */
export function diversityStage(
  store: SqliteStore,
  ordered: FusedResult[],
  query: string,
  cutoff: number,
  maxPerRecord: number,
  topk: number,
  decayOn: boolean,
): SearchResult[] {
  const terms = queryTokens(query)
  const taken = new Map<string, number>()
  const picked: SearchResult[] = []

  for (const cand of ordered) {
    if (cand.rrf < cutoff) continue

    // Empty passage ⇒ nothing but boilerplate in this chunk ⇒ skip it.
    const passage = bestPassage(cand.content, terms)
    if (!passage) continue

    // Bucket by record; fall back to the virtual record, then to a chunk-level key.
    const meta = cand.metadata
    const bucket =
      (meta.recordId as string) ??
      (meta.virtualRecordId as string) ??
      (meta.point_id as string) ??
      `${meta.virtualRecordId ?? ""}|${cand.content.slice(0, 80)}`

    const used = taken.get(bucket) ?? 0
    if (used >= maxPerRecord) continue
    taken.set(bucket, used + 1)

    picked.push({ score: cand.score, citationType: cand.citationType, content: passage, metadata: cand.metadata })
    if (picked.length >= topk) break
  }

  // Record the access (recency + frequency) of what is actually returned.
  if (decayOn) {
    const touched = new Set(picked.map((hit) => hit.metadata.recordId as string).filter(Boolean))
    store.touchRecords([...touched])
  }
  return picked
}
@@ -0,0 +1,52 @@
1
+ // Reciprocal Rank Fusion (RRF) - fuse the dense/sparse/graph signal lists into one
2
+ // ranked list with no score-scale calibration. RRF score = Σ 1/(k + rank) across the
3
+ // lists a chunk appears in (k=60), so a chunk strong in ANY signal surfaces, and one
4
+ // strong in several rises to the top.
5
+ import type { SqliteStore } from "../sqlite-store"
6
+ import type { AccessibleMap, SearchResult } from "../../types"
7
+
8
/** A search result plus its fused RRF score and the set of signals that found it. */
export type FusedResult = SearchResult & { rrf: number; legs: Set<string> }

/**
 * Fuse the three signal lists with Reciprocal Rank Fusion.
 *
 * Every occurrence of a chunk at zero-based position `i` adds `1 / (K + i + 1)` to
 * its score. Chunks are identified by `point_id`, falling back to
 * `virtualRecordId|first 80 chars of content`. The first occurrence supplies the
 * result fields; later occurrences only add score, tag their leg, and fill in
 * metadata when the stored entry has no `recordName` yet.
 *
 * @returns fused entries in first-seen order (not sorted).
 */
export function rrfFuse(
  dense: SearchResult[],
  bm25: SearchResult[],
  graphList: SearchResult[],
  K: number,
): FusedResult[] {
  const byKey = new Map<string, FusedResult>()

  const absorb = (leg: string, list: SearchResult[]): void => {
    list.forEach((hit, position) => {
      const weight = 1 / (K + position + 1)
      const key = (hit.metadata.point_id as string) ?? `${hit.metadata.virtualRecordId ?? ""}|${hit.content.slice(0, 80)}`
      const known = byKey.get(key)
      if (!known) {
        byKey.set(key, { ...hit, rrf: weight, legs: new Set([leg]) })
        return
      }
      known.rrf += weight
      known.legs.add(leg)
      if (!known.metadata.recordName && hit.metadata.recordName) {
        known.metadata = { ...known.metadata, ...hit.metadata }
      }
    })
  }

  // Leg names feed the retrieval trace, so the order and labels are fixed.
  absorb("vector", dense)
  absorb("keyword", bm25)
  absorb("graph", graphList)

  return Array.from(byKey.values())
}
35
+
36
/**
 * Fill in `recordId` / `recordName` for fused items that have a virtualRecordId
 * but no recordName (e.g. keyword-only hits), so their citations resolve.
 *
 * The record id comes from `accMap[virtualRecordId]`; names are fetched from the
 * `nodes` table in a single query. Items whose virtual record is not in `accMap`
 * are left untouched. Mutates the `metadata` of the affected items in `merged`.
 */
export function backfillMeta(store: SqliteStore, merged: FusedResult[], accMap: AccessibleMap): void {
  const pending = merged.filter((item) => !item.metadata.recordName && item.metadata.virtualRecordId)
  if (pending.length === 0) return

  const resolve = (item: FusedResult) => accMap[item.metadata.virtualRecordId as string]

  // Distinct record ids worth looking up, in first-seen order.
  const wanted = [...new Set(pending.map(resolve).filter(Boolean) as string[])]

  const names = new Map<string, string>()
  if (wanted.length > 0) {
    const rows = store.db
      .query(`SELECT id, json_extract(data,'$.recordName') n FROM nodes WHERE id IN (${wanted.map(() => "?").join(",")})`)
      .all(...wanted) as Array<{ id: string; n: string | null }>
    rows.forEach(({ id, n }) => names.set(id, n ?? ""))
  }

  for (const item of pending) {
    const rid = resolve(item)
    if (!rid) continue
    // An empty looked-up name falls through to whatever the item already had.
    item.metadata = { ...item.metadata, recordId: rid, recordName: names.get(rid) || item.metadata.recordName }
  }
}
@@ -0,0 +1,45 @@
1
+ // Signal 3: GRAPH expansion (GraphRAG) - chunks reachable through related concepts.
2
+ import type { SqliteGraphProvider } from "../sqlite-graph-provider"
3
+ import type { SqliteStore } from "../sqlite-store"
4
+ import { embedQueryWith, type EmbeddingProvider } from "../../provider"
5
+ import type { AccessibleMap, SearchResult } from "../../types"
6
+ import { cosine } from "./passage"
7
+
8
/**
 * Graph-expansion signal: for records related (via the graph) to the dense hits,
 * return each record's chunk most similar to the query.
 *
 * Seeds are the distinct recordIds of `dense`; related records are limited to the
 * record ids present in `accMap` (5 requested). Records whose virtual record is
 * already among the dense hits are skipped. Similarity is cosine between the query
 * embedding and each chunk's stored embedding (parsed from JSON).
 *
 * @returns one "graph|related" result per related record that has chunks, or `[]`.
 */
export async function graphStage(
  graph: SqliteGraphProvider,
  store: SqliteStore,
  embeddings: EmbeddingProvider,
  query: string,
  orgId: string,
  dense: SearchResult[],
  accMap: AccessibleMap,
): Promise<SearchResult[]> {
  const seedIds = [...new Set(dense.map((hit) => hit.metadata.recordId).filter(Boolean) as string[])]
  if (seedIds.length === 0) return []

  const accessibleRecordIds = [...new Set(Object.values(accMap))]
  const relatedIds = graph.getRelatedRecordIds(seedIds, accessibleRecordIds, 5)
  if (relatedIds.length === 0) return []

  const queryVec = await embedQueryWith(embeddings, query)
  const alreadyDense = new Set(dense.map((hit) => hit.metadata.virtualRecordId))
  const records = await graph.getRecordsByRecordIds(relatedIds, orgId)

  const found: SearchResult[] = []
  for (const rec of records) {
    const vid = (rec.virtualRecordId as string) ?? rec._key
    if (alreadyDense.has(vid)) continue

    const chunkRows = store.db.query("SELECT content, embedding FROM chunks WHERE virtual_record_id = ?").all(vid) as Array<{
      content: string
      embedding: string
    }>

    // Keep the first chunk with the strictly highest similarity.
    let top: { content: string; score: number } | null = null
    for (let i = 0; i < chunkRows.length; i++) {
      const similarity = cosine(queryVec, JSON.parse(chunkRows[i].embedding) as number[])
      if (top === null || similarity > top.score) top = { content: chunkRows[i].content, score: similarity }
    }
    if (top === null) continue // record has no chunks

    found.push({
      score: top.score,
      citationType: "graph|related",
      content: top.content,
      metadata: {
        recordName: rec.recordName,
        recordId: rec._key,
        virtualRecordId: vid,
        orgId,
        origin: rec.origin,
        mimeType: rec.mimeType,
      },
    })
  }
  return found
}
@@ -0,0 +1,80 @@
1
+ // HYBRID retrieval orchestrator - three complementary signals fused with Reciprocal
2
+ // Rank Fusion (the 2026 production default), then re-ranked. Signals:
3
+ // • DENSE (vector + ACL) - semantic similarity (paraphrase, meaning).
4
+ // • SPARSE (BM25 / FTS5) - exact tokens dense misses (acronyms "SAP", "£230M").
5
+ // • GRAPH (GraphRAG) - chunks reachable through related concepts.
6
+ // RRF score = Σ 1/(k + rank) across the lists a chunk appears in (k=60), so a chunk
7
+ // strong in ANY signal surfaces, and one strong in several rises to the top - with no
8
+ // score-scale calibration between cosine and BM25. Then: personal boost (ownership),
9
+ // memory decay/salience, cross-encoder rerank, passage extraction, per-record diversity cap.
10
+ // All tunable via CONTEXT_* env.
11
+ import type { RetrievalService } from "../../retrieval"
12
+ import type { SqliteStore } from "../sqlite-store"
13
+ import type { SqliteGraphProvider } from "../sqlite-graph-provider"
14
+ import type { Reranker } from "../reranker"
15
+ import type { EmbeddingProvider } from "../../provider"
16
+ import type { RetrievalResponse } from "../../types"
17
+ import { vectorStage } from "./vector-stage"
18
+ import { keywordStage } from "./keyword-stage"
19
+ import { graphStage } from "./graph-stage"
20
+ import { rrfFuse, backfillMeta } from "./fuse"
21
+ import { decayConfig, decayStage } from "./decay-stage"
22
+ import { rerankStage } from "./rerank-stage"
23
+ import { diversityStage } from "./diversity"
24
+ import { populateTrace, type SearchTrace } from "./trace"
25
+
26
/** Collaborators the hybrid search pipeline needs. */
export interface HybridDeps {
  /** Dense (vector) search service. */
  retrieval: RetrievalService
  /** SQLite store used for keyword search, chunk lookup and salience. */
  store: SqliteStore
  /** Graph provider used for ACL resolution and graph expansion. */
  graph: SqliteGraphProvider
  /** Embedding provider used to embed the query for graph expansion. */
  embeddings: EmbeddingProvider
  /** Optional cross-encoder; when absent the fused order is kept. */
  reranker?: Reranker
}

/**
 * Run the full hybrid pipeline for one query.
 *
 * Stages, in order: ACL map → dense → keyword → graph expansion → RRF fusion →
 * metadata backfill → personal boost + decay (sorts) → optional rerank →
 * passage extraction + diversity cap. The order matters: the decay and rerank
 * stages rewrite `rrf` in place and later stages read it.
 *
 * @param trace when provided, filled with per-signal counts and the top items.
 * @returns the dense stage's response with `searchResults` replaced by the final list.
 */
export async function hybridSearch(
  deps: HybridDeps,
  query: string,
  userId: string,
  orgId: string,
  trace?: SearchTrace,
): Promise<RetrievalResponse> {
  const { retrieval, store, graph, embeddings, reranker } = deps
  const perSignalLimit = Number(process.env.CONTEXT_RETRIEVE_LIMIT ?? 20)
  const accMap = await graph.getAccessibleVirtualRecordIds({ userId, orgId })
  const allowedVids = new Set(Object.keys(accMap))

  // Signal 1: dense (vector + ACL).
  const vector = await vectorStage(retrieval, query, userId, orgId, perSignalLimit)

  // Signal 2: sparse (BM25).
  const keywordHits = keywordStage(store, query, orgId, allowedVids, perSignalLimit)

  // Signal 3: graph expansion seeded by the dense hits.
  const graphHits = await graphStage(graph, store, embeddings, query, orgId, vector.dense, accMap)

  // Reciprocal Rank Fusion + citation metadata.
  const rrfK = Number(process.env.CONTEXT_RRF_K ?? 60)
  const fused = rrfFuse(vector.dense, keywordHits, graphHits, rrfK)
  backfillMeta(store, fused, accMap)

  // Personal boost + memory decay; leaves `fused` sorted best-first.
  const cfg = decayConfig()
  decayStage(store, fused, userId, cfg)

  const topk = Number(process.env.CONTEXT_TOPK ?? 6)
  const ratio = Number(process.env.CONTEXT_RRF_RATIO ?? 0.3) // relative cutoff on fused score
  const relativeCutoff = (fused[0]?.rrf ?? 0) * ratio

  // Optional cross-encoder rerank; may replace the cutoff.
  const reranked = await rerankStage(reranker, query, fused, relativeCutoff)

  // Passage extraction + diversity cap + topk (+ access touch).
  const finalResults = diversityStage(store, reranked.ordered, query, reranked.cutoff, cfg.maxPerRecord, topk, cfg.decayOn)

  // Retrieval trace for the UI: counts per signal + which legs found each top item.
  if (trace) populateTrace(trace, vector.dense, keywordHits, graphHits, fused, reranked.ordered, reranked.rerankerUsed)

  return { ...vector.res, searchResults: finalResults }
}
79
+
80
+ export type { SearchTrace } from "./trace"
@@ -0,0 +1,27 @@
1
+ // Signal 2: SPARSE (BM25 / FTS5) - exact tokens dense misses (acronyms "SAP", "£230M").
2
+ // ACL-filtered to accessible records, fts rank order kept.
3
+ import type { SqliteStore } from "../sqlite-store"
4
+ import type { SearchResult } from "../../types"
5
+
6
/**
 * Keyword signal: full-text hits from `store.ftsSearch`, in the order it returned
 * them, restricted to chunks of `orgId` whose virtual record is in `accessibleVids`.
 *
 * The ACL filter runs after the full-text limit, so fewer than `retrieveLimit`
 * results may come back. Every hit carries `score: 0`; only its rank is used.
 */
export function keywordStage(
  store: SqliteStore,
  query: string,
  orgId: string,
  accessibleVids: Set<string>,
  retrieveLimit: number,
): SearchResult[] {
  const rowids = store.ftsSearch(query, retrieveLimit)
  if (rowids.length === 0) return []

  type ChunkRow = { rowid: number; point_id: string; v: string; o: string; content: string }
  const placeholders = rowids.map(() => "?").join(",")
  const fetched = store.db
    .query(`SELECT rowid, point_id, virtual_record_id v, org_id o, content FROM chunks WHERE rowid IN (${placeholders})`)
    .all(...rowids) as ChunkRow[]

  const chunkByRowid = new Map<number, ChunkRow>()
  for (const row of fetched) chunkByRowid.set(row.rowid, row)

  // Iterate the full-text order, not the SQL order, to preserve ranking.
  const hits: SearchResult[] = []
  for (const rowid of rowids) {
    const chunk = chunkByRowid.get(rowid)
    const permitted = !!chunk && chunk.o === orgId && accessibleVids.has(chunk.v) // ACL
    if (!permitted) continue
    hits.push({
      score: 0,
      citationType: "bm25",
      content: chunk.content,
      metadata: { virtualRecordId: chunk.v, orgId, point_id: chunk.point_id },
    })
  }
  return hits
}
@@ -0,0 +1,44 @@
1
+ // Passage-extraction and similarity helpers - shared across the graph, rerank and diversity stages.
2
+ import { cleanLine, isBoilerplate } from "../extract"
3
+
4
/**
 * Cosine similarity of two vectors, compared over their common prefix
 * (the shorter length). Returns 0 when either prefix has zero magnitude.
 */
export function cosine(a: number[], b: number[]): number {
  const len = Math.min(a.length, b.length)
  let dotProduct = 0
  let sumSqA = 0
  let sumSqB = 0
  let idx = 0
  while (idx < len) {
    dotProduct += a[idx] * b[idx]
    sumSqA += a[idx] * a[idx]
    sumSqB += b[idx] * b[idx]
    idx++
  }
  if (!sumSqA || !sumSqB) return 0
  return dotProduct / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB))
}
16
+
17
/**
 * Tokenize a query for passage scoring: lower-case it, take runs of ASCII
 * letters/digits, keep those of length ≥ 3, and de-duplicate in first-seen order.
 */
export function queryTokens(query: string): string[] {
  const words = query.toLowerCase().match(/[a-z0-9]+/g)
  if (words === null) return []
  const unique = new Set<string>()
  for (const word of words) {
    if (word.length >= 3) unique.add(word)
  }
  return Array.from(unique)
}
21
+
22
/**
 * Reduce a chunk to the single line that best matches the query terms.
 *
 * The content is split on newlines and on whitespace following `.`, `!` or `?`;
 * each piece is cleaned, and empty or boilerplate pieces are dropped. A line's
 * score is the summed length of the terms it contains (case-insensitive
 * substring match), so longer terms weigh more.
 *
 * @returns "" when nothing survives filtering (caller drops the chunk); the only
 *   line when one survives; otherwise the first highest-scoring line, or the
 *   first line when no term matches anywhere.
 */
export function bestPassage(content: string, terms: string[]): string {
  const pieces = content.trim().split(/\n|(?<=[.!?])\s+/)
  const cleaned = pieces.map((piece) => cleanLine(piece))
  const lines = cleaned.filter((line) => line.length > 0 && !isBoilerplate(line))

  if (lines.length === 0) return ""
  if (lines.length === 1) return lines[0]

  // Start from "no match": only a strictly higher, positive score displaces the pick.
  let pick = lines[0]
  let pickScore = 0
  for (const line of lines) {
    const haystack = line.toLowerCase()
    const lineScore = terms.reduce((sum, term) => (haystack.includes(term) ? sum + term.length : sum), 0)
    if (lineScore > pickScore) {
      pick = line
      pickScore = lineScore
    }
  }
  return pick
}
@@ -0,0 +1,31 @@
1
+ // Final stage: CROSS-ENCODER RERANK (optional).
2
+ // RRF maximizes recall; the cross-encoder fixes ORDERING by jointly scoring
3
+ // (query, passage). Rerank only the top-K candidates (cost is linear), reorder, and
4
+ // drop the rrf-relative cutoff (rerank logits aren't on the rrf scale).
5
+ import type { Reranker } from "../reranker"
6
+ import type { FusedResult } from "./fuse"
7
+ import { bestPassage, queryTokens } from "./passage"
8
+
9
/**
 * Optional cross-encoder rerank of the top candidates.
 *
 * The first `CONTEXT_RERANK_K` (default 20) items of `merged` are scored by the
 * reranker on their best passage (or full content when no passage survives),
 * their `rrf` is overwritten with that score, and they are re-sorted ahead of the
 * untouched tail. On success the cutoff becomes -Infinity because reranker
 * scores are not on the rrf scale.
 *
 * The stage is a no-op (original order, original cutoff, `rerankerUsed: false`)
 * when there is no reranker, fewer than two candidates, a rerank K below 1, or
 * the reranker returns null, the wrong number of scores, or a non-numeric score.
 *
 * @returns the ordering to use, the cutoff to apply, and whether reranking happened.
 */
export async function rerankStage(
  reranker: Reranker | undefined,
  query: string,
  merged: FusedResult[],
  initialCutoff: number,
): Promise<{ ordered: FusedResult[]; cutoff: number; rerankerUsed: boolean }> {
  const unchanged = { ordered: merged, cutoff: initialCutoff, rerankerUsed: false }
  if (!reranker || merged.length <= 1) return unchanged

  // Blank or unparsable values fall back to the default instead of becoming 0 / NaN.
  const rawK = process.env.CONTEXT_RERANK_K
  const parsedK = rawK === undefined || rawK.trim() === "" ? 20 : Number(rawK)
  const rerankK = Number.isNaN(parsedK) ? 20 : parsedK
  // Nothing to rerank: do not claim a rerank happened or drop the cutoff.
  if (!(rerankK >= 1)) return unchanged

  const cand = merged.slice(0, rerankK)
  const terms = queryTokens(query) // loop-invariant: tokenize once, not per candidate
  const scores = await reranker.rank(
    query,
    cand.map((c) => bestPassage(c.content, terms) || c.content),
  )
  const usable =
    !!scores && scores.length === cand.length && scores.every((s) => typeof s === "number" && !Number.isNaN(s))
  if (!scores || !usable) return unchanged

  cand.forEach((c, i) => (c.rrf = scores[i]))
  cand.sort((a, b) => b.rrf - a.rrf)
  return {
    ordered: [...cand, ...merged.slice(rerankK)],
    cutoff: -Infinity, // reranker decided relevance; keep its order, no rrf cutoff
    rerankerUsed: true,
  }
}
@@ -0,0 +1,31 @@
1
+ // Retrieval trace - how a query flowed through the pipeline, for the UI's "how it
2
+ // retrieved" panel. counts = items each signal returned; items = the top fused/reranked
3
+ // results tagged with WHICH signals (vector / keyword / graph) found them.
4
+ import type { SearchResult } from "../../types"
5
+ import type { FusedResult } from "./fuse"
6
+
7
/** How one query flowed through the pipeline, for display. */
export interface SearchTrace {
  /** Number of items each signal returned, plus the size of the fused list. */
  counts: { vector: number; keyword: number; graph: number; fused: number }
  /** Whether the cross-encoder rerank was applied. */
  reranked: boolean
  /** Top results with the signals that found them; `rank` is zero-based. */
  items: Array<{ label: string; recordId?: string; legs: string[]; rrf: number; rank: number }>
}

/**
 * Fill `trace` in place: per-signal counts, the rerank flag, and the first eight
 * entries of `ordered`. An item's label is its recordName, else its recordId,
 * else "?".
 */
export function populateTrace(
  trace: SearchTrace,
  dense: SearchResult[],
  bm25: SearchResult[],
  graphList: SearchResult[],
  merged: FusedResult[],
  ordered: FusedResult[],
  rerankerUsed: boolean,
): void {
  const MAX_ITEMS = 8

  trace.counts = {
    vector: dense.length,
    keyword: bm25.length,
    graph: graphList.length,
    fused: merged.length,
  }
  trace.reranked = rerankerUsed

  const top = ordered.slice(0, MAX_ITEMS)
  const items: SearchTrace["items"] = []
  for (const [rank, entry] of top.entries()) {
    const { metadata, legs, rrf } = entry
    const label = (metadata.recordName as string) ?? (metadata.recordId as string) ?? "?"
    items.push({ label, recordId: metadata.recordId as string, legs: Array.from(legs), rrf, rank })
  }
  trace.items = items
}
@@ -0,0 +1,15 @@
1
+ // Signal 1: DENSE (vector + ACL) - semantic similarity (paraphrase, meaning).
2
+ import type { RetrievalService } from "../../retrieval"
3
+ import type { SearchResult } from "../../types"
4
+
5
/**
 * Dense signal: run a single-query filtered search for the user and org.
 *
 * @returns `dense`, a shallow copy of the hits (so later stages never alias the
 *   service's array), and `res`, the untouched service response.
 */
export async function vectorStage(
  retrieval: RetrievalService,
  query: string,
  userId: string,
  orgId: string,
  retrieveLimit: number,
): Promise<{ dense: SearchResult[]; res: Awaited<ReturnType<RetrievalService["searchWithFilters"]>> }> {
  const response = await retrieval.searchWithFilters({
    queries: [query],
    userId,
    orgId,
    limit: retrieveLimit,
  })
  const hits: SearchResult[] = Array.from(response.searchResults)
  return { dense: hits, res: response }
}
@@ -0,0 +1,119 @@
1
+ // GraphProvider over bun:sqlite - the ACL traversal ported from AQL to SQL.
2
+ //
3
+ // The Arango version ran eight permission paths; here they collapse to the same
4
+ // access semantics expressed over the generic node/edge tables:
5
+ // • principals = the user + every group/role/org/team they belong to or are
6
+ // permissioned to (one hop).
7
+ // • directRecords = records permissioned to any principal.
8
+ // • recordGroups = record-groups permissioned to any principal, then both:
9
+ // - inheritedRecords (recursive descent over inheritPermissions), and
10
+ // - kbRecords (records that belong to those groups, origin=UPLOAD).
11
+ // • anyoneRecords = org-wide shared records.
12
+ // All unioned, then deduped first-writer-wins → { virtualRecordId: recordId }.
13
+ // Same invariant as the Arango port; only the query language differs.
14
+ //
15
+ // This file is the ORCHESTRATOR: it owns the bun:sqlite handle and the row/
16
+ // placeholder helpers, and delegates the permission-path SQL to ./graph/acl-paths
17
+ // and the entity/edge assembly to ./graph/knowledge-graph (where the per-edge
18
+ // provenance leak-guard lives).
19
+
20
+ import type { GraphProvider } from "../provider"
21
+ import type { AccessibleMap, RecordDoc, RetrievalFilters, UserDoc } from "../types"
22
+ import * as acl from "./graph/acl-paths"
23
+ import * as kg from "./graph/knowledge-graph"
24
+ import type { Pair, SqlAccess } from "./graph/sql-access"
25
+ import type { SqliteStore } from "./sqlite-store"
26
+
27
/**
 * GraphProvider backed by the SQLite store's generic `nodes` / `edges` tables.
 * Owns the row and placeholder helpers and delegates the permission-path queries
 * to `./graph/acl-paths` and the entity/relation queries to `./graph/knowledge-graph`.
 */
export class SqliteGraphProvider implements GraphProvider {
  constructor(private readonly store: SqliteStore) {}

  private get db() {
    return this.store.db
  }

  /** Run `sql` with positional `params` and return every row. */
  private rows<T = any>(sql: string, params: unknown[]): T[] {
    return this.db.query(sql).all(...(params as any[])) as T[]
  }

  /** `n` comma-separated `?` placeholders, for `IN (...)` lists. */
  private ph(n: number): string {
    return Array.from({ length: n }, () => "?").join(",")
  }

  // The SQL-access seam handed to the decomposed query modules.
  private get sql(): SqlAccess {
    return { rows: this.rows.bind(this), ph: this.ph.bind(this) }
  }

  /**
   * Resolve everything the user may read as `{ virtualRecordId: recordId }`.
   *
   * Unions directly permissioned records, records inheriting from permissioned
   * record groups, knowledge-base records of those groups, and org-wide shared
   * records. When a virtual record appears more than once the first pair wins.
   * An unknown user gets an empty map (deny by default).
   */
  async getAccessibleVirtualRecordIds(args: {
    userId: string
    orgId: string
    filters?: RetrievalFilters
  }): Promise<AccessibleMap> {
    const user = acl.userRow(this.sql, args.userId)
    if (!user) return {} // deny by default
    const f = args.filters ?? {}

    const principals = acl.principalIds(this.sql, user.id)
    const recordGroups = acl.recordGroupsPermissionedTo(this.sql, principals, f.kb)

    const all: Pair[] = [
      ...acl.recordsPermissionedTo(this.sql, principals, f.apps),
      ...acl.recordsInheritingFrom(this.sql, recordGroups),
      ...acl.kbRecords(this.sql, recordGroups),
      ...acl.anyoneRecords(this.sql, args.orgId),
    ]

    const map: AccessibleMap = {}
    for (const { vid, rid } of all) {
      if (!vid || !rid) continue
      // Own-property check: `vid in map` would also match inherited keys such as
      // "constructor" or "toString" and silently drop a record with that id.
      if (!Object.prototype.hasOwnProperty.call(map, vid)) map[vid] = rid
    }
    return map
  }

  /** Fetch the `records` nodes with the given ids that belong to `orgId`. */
  async getRecordsByRecordIds(recordIds: string[], orgId: string): Promise<RecordDoc[]> {
    if (recordIds.length === 0) return []
    const rows = this.rows<{ id: string; data: string }>(
      `SELECT id, data FROM nodes WHERE coll = 'records' AND id IN (${this.ph(recordIds.length)})
         AND json_extract(data,'$.orgId') = ?`,
      [...recordIds, orgId],
    )
    // The node id always wins over any `_key` stored inside the JSON payload.
    return rows.map((r) => ({ ...(JSON.parse(r.data) as RecordDoc), _key: r.id }))
  }

  /** Look up a user row by user id; null when there is none. */
  async getUserByUserId(userId: string): Promise<UserDoc | null> {
    return acl.userRow(this.sql, userId)
  }

  /** Apps reachable from `userKey` over a `permissions` edge. */
  async getUserApps(userKey: string): Promise<Array<{ _key?: string; id?: string }>> {
    return this.rows<{ id: string }>(
      `SELECT n.id AS id FROM edges e JOIN nodes n ON n.id = e.dst AND n.coll = 'apps'
         WHERE e.src = ? AND e.label = 'permissions'`,
      [userKey],
    ).map((r) => ({ _key: r.id, id: r.id }))
  }

  /** Record names (within the accessible set) that mention any of the given entities. */
  recordsMentioning(entityIds: string[], accessibleRecordIds: string[]): string[] {
    return kg.recordsMentioning(this.sql, entityIds, accessibleRecordIds)
  }

  /** Fetch a single node by id within `collection`; null when absent. */
  async getDocument(recordId: string, collection: string): Promise<RecordDoc | null> {
    const r = this.rows<{ id: string; data: string }>("SELECT id, data FROM nodes WHERE id = ? AND coll = ? LIMIT 1", [
      recordId,
      collection,
    ])[0]
    return r ? { ...(JSON.parse(r.data) as RecordDoc), _key: r.id } : null
  }

  /** GraphRAG hop: records connected to the seeds through shared/related concepts.
   *  seed records → their entities → relates_to neighbors → other records that
   *  mention those neighbors. Constrained to `accessibleRecordIds` (ACL-safe) and
   *  excluding the seeds themselves. */
  getRelatedRecordIds(seedRecordIds: string[], accessibleRecordIds: string[], limit = 5): string[] {
    return kg.getRelatedRecordIds(this.sql, seedRecordIds, accessibleRecordIds, limit)
  }

  /** The knowledge graph the given (already ACL-filtered) records expose:
   *  entities those records mention + relationships among them. ACL-safe because
   *  the caller passes only recordIds the user may access. */
  getKnowledgeGraph(recordIds: string[]): {
    entities: Array<{ id: string; label: string; type: string }>
    relations: Array<{ from: string; to: string; type: string; weight: number }>
  } {
    return kg.getKnowledgeGraph(this.sql, recordIds)
  }
}