@100xprompt/chitta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +203 -0
- package/assets/rules/claude-md.md +9 -0
- package/assets/skill/SKILL.md +47 -0
- package/package.json +48 -0
- package/src/README.md +124 -0
- package/src/arango-client.ts +67 -0
- package/src/arango-graph-provider.ts +364 -0
- package/src/bin.ts +27 -0
- package/src/config-env.ts +53 -0
- package/src/embedded/authorizer.ts +89 -0
- package/src/embedded/cli.ts +86 -0
- package/src/embedded/code-extractor.ts +9 -0
- package/src/embedded/demo.ts +36 -0
- package/src/embedded/extract.ts +12 -0
- package/src/embedded/extractors/code.ts +308 -0
- package/src/embedded/extractors/deterministic.ts +63 -0
- package/src/embedded/extractors/llm.ts +151 -0
- package/src/embedded/extractors/text-hygiene.ts +54 -0
- package/src/embedded/extractors/types.ts +34 -0
- package/src/embedded/graph/acl-paths.ts +96 -0
- package/src/embedded/graph/adjacency.ts +61 -0
- package/src/embedded/graph/centrality.ts +23 -0
- package/src/embedded/graph/communities.ts +46 -0
- package/src/embedded/graph/cypher.ts +17 -0
- package/src/embedded/graph/impact.ts +24 -0
- package/src/embedded/graph/knowledge-graph.ts +108 -0
- package/src/embedded/graph/pagerank.ts +57 -0
- package/src/embedded/graph/sql-access.ts +13 -0
- package/src/embedded/graph/traversal.ts +73 -0
- package/src/embedded/graph/types.ts +35 -0
- package/src/embedded/graph-query.ts +126 -0
- package/src/embedded/index.ts +171 -0
- package/src/embedded/ingest.ts +262 -0
- package/src/embedded/kgqa/answer-paths.ts +197 -0
- package/src/embedded/kgqa/entity-link.ts +13 -0
- package/src/embedded/kgqa/intent.ts +14 -0
- package/src/embedded/kgqa/predicates.ts +9 -0
- package/src/embedded/kgqa/preference.ts +20 -0
- package/src/embedded/kgqa/select.ts +99 -0
- package/src/embedded/kgqa/text.ts +16 -0
- package/src/embedded/kgqa/types.ts +6 -0
- package/src/embedded/kgqa-service.ts +122 -0
- package/src/embedded/llm-extractor.ts +10 -0
- package/src/embedded/local-embeddings.ts +36 -0
- package/src/embedded/personal.ts +100 -0
- package/src/embedded/reranker.ts +62 -0
- package/src/embedded/retrieval/decay-stage.ts +59 -0
- package/src/embedded/retrieval/diversity.ts +37 -0
- package/src/embedded/retrieval/fuse.ts +52 -0
- package/src/embedded/retrieval/graph-stage.ts +45 -0
- package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
- package/src/embedded/retrieval/keyword-stage.ts +27 -0
- package/src/embedded/retrieval/passage.ts +44 -0
- package/src/embedded/retrieval/rerank-stage.ts +31 -0
- package/src/embedded/retrieval/trace.ts +31 -0
- package/src/embedded/retrieval/vector-stage.ts +15 -0
- package/src/embedded/sqlite-graph-provider.ts +119 -0
- package/src/embedded/sqlite-store.ts +95 -0
- package/src/embedded/sqlite-vec-service.ts +122 -0
- package/src/embedded/store/chunks.ts +61 -0
- package/src/embedded/store/fts.ts +50 -0
- package/src/embedded/store/nodes-edges.ts +112 -0
- package/src/embedded/store/salience.ts +37 -0
- package/src/embedded/store/schema.ts +109 -0
- package/src/embedded/transformers-embeddings.ts +100 -0
- package/src/embeddings.ts +51 -0
- package/src/eval/goldset.ts +46 -0
- package/src/eval/harness.ts +65 -0
- package/src/eval/metrics.ts +38 -0
- package/src/http/server.ts +93 -0
- package/src/index.ts +44 -0
- package/src/install/index.ts +139 -0
- package/src/install/platforms.ts +126 -0
- package/src/install/skill.ts +46 -0
- package/src/install/writers.ts +82 -0
- package/src/mcp/backend.ts +129 -0
- package/src/mcp/server.ts +83 -0
- package/src/mcp/tools/context-about.ts +69 -0
- package/src/mcp/tools/context-graph.ts +23 -0
- package/src/mcp/tools/context-ingest.ts +88 -0
- package/src/mcp/tools/context-rebuild.ts +22 -0
- package/src/mcp/tools/context-relate.ts +88 -0
- package/src/mcp/tools/get-context.ts +52 -0
- package/src/mcp/tools/index.ts +40 -0
- package/src/mcp/tools/types.ts +33 -0
- package/src/permission.ts +72 -0
- package/src/provider.ts +65 -0
- package/src/qdrant-vector.ts +76 -0
- package/src/retrieval.ts +218 -0
- package/src/service.ts +40 -0
- package/src/types.ts +91 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
// Graph-query layer - turns the entity graph into something you can QUERY as a
|
|
2
|
+
// graph, not just semantically search. Ported from Graphify's serve.py query
|
|
3
|
+
// ergonomics (neighbors / shortest_path / impact / central) but, unlike Graphify:
|
|
4
|
+
// • every traversal is ACL-FILTERED per caller (we only ever build the subgraph of
|
|
5
|
+
// entities mentioned by records the user may access), and
|
|
6
|
+
// • seeds are resolved against entity labels (and can be vector-seeded by the caller),
|
|
7
|
+
// not Graphify's lexical IDF scoring.
|
|
8
|
+
// Pure in-memory traversal over the already-ACL-scoped {entities, relations} the
|
|
9
|
+
// provider returns - instant at personal/enterprise note scale, and ACL-safe by
|
|
10
|
+
// construction because the subgraph never contains an inaccessible entity.
|
|
11
|
+
//
|
|
12
|
+
// This file is the ORCHESTRATOR: scope() builds the ACL-scoped subgraph and each
|
|
13
|
+
// public method delegates to a pure algorithm module under ./graph/.
|
|
14
|
+
|
|
15
|
+
import { buildAdjacency, resolveIds } from "./graph/adjacency"
|
|
16
|
+
import { centralEntities } from "./graph/centrality"
|
|
17
|
+
import { detectCommunities } from "./graph/communities"
|
|
18
|
+
import { toCypher as renderCypher } from "./graph/cypher"
|
|
19
|
+
import { connectedEntities } from "./graph/impact"
|
|
20
|
+
import { personalizedPageRank } from "./graph/pagerank"
|
|
21
|
+
import { neighborsOf, shortestPath } from "./graph/traversal"
|
|
22
|
+
import type { ScopedGraph } from "./graph/types"
|
|
23
|
+
import type { SqliteGraphProvider } from "./sqlite-graph-provider"
|
|
24
|
+
|
|
25
|
+
/** Shape returned by `GraphQueryService.neighbors` for a resolved entity. */
export interface NeighborResult {
  /** The entity the query resolved to (presumably its label - confirm in ./graph/traversal). */
  entity: string
  /** One entry per adjacent edge of that entity. */
  neighbors: {
    /** Name of the entity on the other end of the edge. */
    label: string
    /** Relation (edge) type. */
    relation: string
    /** Whether the edge leaves ("out") or enters ("in") the queried entity. */
    direction: "out" | "in"
    /** Edge weight. */
    weight: number
  }[]
}
|
|
29
|
+
/** Shape returned by `GraphQueryService.pathBetween`. */
export interface PathResult {
  /** True when a path between the two entities exists. */
  found: boolean
  /** Length of the path (presumably equal to `steps.length` - confirm in ./graph/traversal). */
  hops: number
  /** The relation chain, one `from --relation--> to` triple per step. */
  steps: {
    from: string
    relation: string
    to: string
  }[]
}
|
|
34
|
+
/** Shape returned by `GraphQueryService.impactOf`. */
export interface ImpactResult {
  /** Label of the first resolved entity (falls back to its id when no label is known). */
  entity: string
  /** Accessible records that mention the entity, de-duplicated. */
  records: string[]
  /** Entities the queried entity is connected to, with the connecting relation. */
  connectedEntities: {
    label: string
    relation: string
  }[]
}
|
|
39
|
+
|
|
40
|
+
/**
 * Orchestrates graph queries for one caller. Every public method first builds the
 * caller's scoped subgraph via `scope()` and then hands it to an algorithm module
 * under ./graph/; no method touches the provider's graph outside that scope.
 */
export class GraphQueryService {
  constructor(private readonly graph: SqliteGraphProvider) {}

  /**
   * Build the subgraph this user may see: ask the provider which records are
   * accessible, load the entities/relations for exactly those records, and index
   * them (id lookup + adjacency) for the traversal helpers.
   */
  private async scope(userId: string, orgId: string): Promise<ScopedGraph> {
    const accessibleMap = await this.graph.getAccessibleVirtualRecordIds({ userId, orgId })
    // Several keys may map to the same record id; keep each id once.
    const recordIds = Array.from(new Set(Object.values(accessibleMap)))
    const knowledge = this.graph.getKnowledgeGraph(recordIds)
    const index = buildAdjacency(knowledge.entities, knowledge.relations)
    return {
      entities: knowledge.entities,
      relations: knowledge.relations,
      byId: index.byId,
      adj: index.adj,
      recordIds,
    }
  }

  /**
   * Direct neighbors of the named entity, optionally restricted to one relation.
   * @returns null when the name resolves to no entity in the caller's subgraph.
   */
  async neighbors(name: string, userId: string, orgId: string, relation?: string): Promise<NeighborResult | null> {
    const scoped = await this.scope(userId, orgId)
    const matchedIds = resolveIds(name, scoped.entities)
    return matchedIds.length > 0 ? neighborsOf(matchedIds, scoped.byId, scoped.adj, relation) : null
  }

  /**
   * Shortest relation chain between two named entities ("how are X and Y related?").
   * The search itself lives in ./graph/traversal (described by the original author
   * as an undirected, hub-avoiding BFS).
   */
  async pathBetween(a: string, b: string, userId: string, orgId: string): Promise<PathResult> {
    const scoped = await this.scope(userId, orgId)
    const starts = resolveIds(a, scoped.entities)
    const goals = new Set(resolveIds(b, scoped.entities))
    return shortestPath(starts, goals, scoped.byId, scoped.adj)
  }

  /**
   * Reverse-reference lookup: which accessible records mention the entity and which
   * entities it is connected to.
   * @returns null when the name resolves to no entity in the caller's subgraph.
   */
  async impactOf(name: string, userId: string, orgId: string): Promise<ImpactResult | null> {
    const scoped = await this.scope(userId, orgId)
    const matchedIds = resolveIds(name, scoped.entities)
    if (matchedIds.length === 0) {
      return null
    }
    // Only records inside the caller's scope are consulted; the Set removes repeats.
    const mentioning = new Set<string>()
    for (const recordName of this.graph.recordsMentioning(matchedIds, scoped.recordIds)) {
      mentioning.add(recordName)
    }
    const linked = connectedEntities(matchedIds, scoped.byId, scoped.adj)
    const primaryId = matchedIds[0]
    return {
      entity: scoped.byId.get(primaryId)?.label ?? primaryId,
      records: Array.from(mentioning),
      connectedEntities: linked,
    }
  }

  /** Hub entities of the accessible graph, at most `limit` of them (default 10). */
  async central(userId: string, orgId: string, limit = 10): Promise<Array<{ label: string; degree: number; strength: number }>> {
    const scoped = await this.scope(userId, orgId)
    return centralEntities(scoped.byId, scoped.adj, limit)
  }

  /**
   * Personalized PageRank walk seeded on the given entity names. Defaults:
   * alpha 0.85, 30 iterations, 15 results. Returns an empty list when the scoped
   * graph has no entities or none of the seed names resolve.
   */
  async walk(
    seedNames: string[],
    userId: string,
    orgId: string,
    opts: { alpha?: number; iters?: number; limit?: number } = {},
  ): Promise<Array<{ label: string; score: number; type: string }>> {
    const settings = {
      alpha: opts.alpha ?? 0.85,
      iters: opts.iters ?? 30,
      limit: opts.limit ?? 15,
    }
    const scoped = await this.scope(userId, orgId)
    if (scoped.entities.length === 0) {
      return []
    }
    const seedIds = new Set<string>()
    for (const seed of seedNames) {
      for (const resolved of resolveIds(seed, scoped.entities)) {
        seedIds.add(resolved)
      }
    }
    if (seedIds.size === 0) {
      return []
    }
    return personalizedPageRank(scoped.entities, scoped.byId, scoped.adj, seedIds, settings)
  }

  /** Clusters of related entities with at least `minSize` members (default 2). */
  async communities(userId: string, orgId: string, minSize = 2): Promise<Array<{ size: number; hub: string; members: string[] }>> {
    const scoped = await this.scope(userId, orgId)
    return detectCommunities(scoped.entities, scoped.relations, scoped.byId, scoped.adj, minSize)
  }

  /** Render the caller's accessible graph as Cypher text (see ./graph/cypher). */
  async toCypher(userId: string, orgId: string): Promise<string> {
    const scoped = await this.scope(userId, orgId)
    return renderCypher(scoped.entities, scoped.relations, scoped.byId)
  }
}
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
// Embedded context stack: one SQLite file, in-process embeddings, zero servers.
|
|
2
|
+
// Wires the same RetrievalService (the moat) over embedded adapters - this is the
|
|
3
|
+
// single-binary path. `bun build --compile` over a CLI that calls this yields one
|
|
4
|
+
// self-contained executable.
|
|
5
|
+
|
|
6
|
+
import { RetrievalService } from "../retrieval"
|
|
7
|
+
import { SqliteStore } from "./sqlite-store"
|
|
8
|
+
import { SqliteGraphProvider } from "./sqlite-graph-provider"
|
|
9
|
+
import { SqliteVecService } from "./sqlite-vec-service"
|
|
10
|
+
import { LocalHashEmbeddings } from "./local-embeddings"
|
|
11
|
+
import { Ingestor, type IngestDoc } from "./ingest"
|
|
12
|
+
import { DeterministicExtractor, type KnowledgeExtractor } from "./extract"
|
|
13
|
+
import { Authorizer } from "./authorizer"
|
|
14
|
+
import { KgqaService } from "./kgqa-service"
|
|
15
|
+
import { GraphQueryService } from "./graph-query"
|
|
16
|
+
import type { Reranker } from "./reranker"
|
|
17
|
+
import type { LlmExtractor } from "./llm-extractor"
|
|
18
|
+
import type { EmbeddingProvider } from "../provider"
|
|
19
|
+
import type { RetrievalResponse } from "../types"
|
|
20
|
+
import { hybridSearch } from "./retrieval/hybrid-retriever"
|
|
21
|
+
import type { SearchTrace } from "./retrieval/trace"
|
|
22
|
+
|
|
23
|
+
export { SqliteStore } from "./sqlite-store"
|
|
24
|
+
export { SqliteGraphProvider } from "./sqlite-graph-provider"
|
|
25
|
+
export { SqliteVecService } from "./sqlite-vec-service"
|
|
26
|
+
export { LocalHashEmbeddings } from "./local-embeddings"
|
|
27
|
+
export { TransformersEmbeddings } from "./transformers-embeddings"
|
|
28
|
+
export { Ingestor, chunkText, type IngestDoc } from "./ingest"
|
|
29
|
+
export { DeterministicExtractor, type KnowledgeExtractor } from "./extract"
|
|
30
|
+
export { LlmExtractor, HybridExtractor } from "./llm-extractor"
|
|
31
|
+
export { Authorizer, AuthorizationError, type Role } from "./authorizer"
|
|
32
|
+
|
|
33
|
+
/** Options for `buildEmbeddedContext`. Every field is optional. */
export interface EmbeddedOptions {
  /** SQLite database location; `buildEmbeddedContext` falls back to ":memory:". */
  path?: string
  /** Collection name handed to the retrieval service; falls back to "records". */
  collectionName?: string
  /** Embedding provider; falls back to `LocalHashEmbeddings`. */
  embeddings?: EmbeddingProvider
  /** Knowledge-graph extractor used at ingest; falls back to `DeterministicExtractor`. */
  extractor?: KnowledgeExtractor
  /** Enables LLM-based KGQA intent parsing. */
  llm?: LlmExtractor
  /** Optional cross-encoder final stage (highest-precision reorder). */
  reranker?: Reranker
  /** Logger forwarded to the retrieval service. */
  log?: {
    info: (m: string) => void
    debug: (m: string) => void
    error: (m: string) => void
  }
}
|
|
42
|
+
|
|
43
|
+
// Retrieval trace - how a query flowed through the pipeline, for the UI's "how it
|
|
44
|
+
// retrieved" panel. Defined in retrieval/trace.ts; re-exported here as part of the
|
|
45
|
+
// public API.
|
|
46
|
+
export type { SearchTrace } from "./retrieval/trace"
|
|
47
|
+
|
|
48
|
+
/**
 * Wire the embedded stack (SQLite store, graph/vector adapters, embedder, extractor,
 * retrieval, ingestion, authorization, KGQA, graph queries) and return the pieces
 * together with a handful of convenience functions that share that state.
 *
 * @param opts Optional overrides; see `EmbeddedOptions` for the defaults applied here.
 * @returns The adapters/services plus `ask`, `authorizedIngest`, `deleteRecord`,
 *          `searchWithGraph`, `searchTraced`, `reindex` and `rebuildGraph`.
 */
export function buildEmbeddedContext(opts: EmbeddedOptions = {}) {
  const store = new SqliteStore(opts.path ?? ":memory:")
  const graph = new SqliteGraphProvider(store)
  const vector = new SqliteVecService(store)
  const embeddings = opts.embeddings ?? new LocalHashEmbeddings()
  const extractor = opts.extractor ?? new DeterministicExtractor()
  const retrieval = new RetrievalService({
    graph,
    vector,
    embeddings,
    collectionName: opts.collectionName ?? "records",
    log: opts.log,
  })
  const ingestor = new Ingestor(store, embeddings, extractor)
  const authorizer = new Authorizer(store)
  const kgqa = new KgqaService(graph, store, embeddings, opts.llm)
  const graphQuery = new GraphQueryService(graph)
  const reranker = opts.reranker // optional cross-encoder final stage

  // Chunk point ids are written by the ingestor as `${recordId}#${index}`. A record's
  // chunks are selected by comparing that prefix EXACTLY instead of with
  // `LIKE '<id>#%'`: LIKE treats `%` / `_` inside the id as wildcards and is
  // case-insensitive for ASCII by default, so ids such as "doc_1" vs "docX1" or
  // "Doc" vs "doc" would match each other's chunks.
  // NOTE(review): an id that itself contains "#" (record "a" vs record "a#b") still
  // shares a prefix; ruling that out needs a dedicated record-id column on chunks.
  const chunkPrefix = (recordId: string): string => `${recordId}#`

  // Exact-answer first: try to answer the question precisely from the typed graph;
  // returns null when it can't (caller then falls back to ranked retrieval).
  async function ask(question: string, userId: string, orgId: string) {
    return kgqa.answer(question, userId, orgId)
  }

  // Authorized write path: checks the acting user MAY create + may grant the
  // requested sharing, stamps ownership, then ingests. Throws AuthorizationError.
  async function authorizedIngest(actingUserId: string, doc: IngestDoc) {
    authorizer.assertCanCreate(actingUserId, doc.orgId, doc.permittedPrincipals ?? [], doc.shareWithOrg)
    const principals = [...new Set([...(doc.permittedPrincipals ?? []), actingUserId])] // owner can always read
    return ingestor.ingest({ ...doc, ownerId: actingUserId, permittedPrincipals: principals })
  }

  // Authorized delete: only the owner or an admin may remove a record. Removes the
  // record node, its org-wide share node, its edges and its chunk rows.
  // NOTE(review): the statements are not wrapped in a transaction, and any ANN/FTS
  // rows are only removed if the store cascades from `chunks` - confirm in the schema.
  function deleteRecord(actingUserId: string, recordId: string): void {
    authorizer.assertCanModify(actingUserId, recordId)
    // Read the virtual record id BEFORE the node is deleted; it may be absent (NULL).
    const vids = store.db
      .query("SELECT json_extract(data,'$.virtualRecordId') v FROM nodes WHERE id = ?")
      .all(recordId) as Array<{ v: string | null }>
    store.db.query("DELETE FROM nodes WHERE id = ?").run(recordId)
    // The ingestor writes an `anyone:<recordId>` node when a doc is shared with an
    // org. Drop it too, otherwise a later record reusing this id would inherit the
    // stale org-wide visibility.
    store.db.query("DELETE FROM nodes WHERE id = ? AND coll = 'anyone'").run(`anyone:${recordId}`)
    store.db.query("DELETE FROM edges WHERE src = ? OR dst = ?").run(recordId, recordId)
    const prefix = chunkPrefix(recordId)
    store.db.query("DELETE FROM chunks WHERE substr(point_id, 1, length(?)) = ?").run(prefix, prefix)
    for (const { v } of vids) if (v) store.db.query("DELETE FROM chunks WHERE virtual_record_id = ?").run(v)
  }

  // HYBRID retrieval - three complementary signals fused with Reciprocal Rank Fusion
  // (the 2026 production default), then re-ranked. Signals:
  //   • DENSE (vector + ACL) - semantic similarity (paraphrase, meaning).
  //   • SPARSE (BM25 / FTS5) - exact tokens dense misses (acronyms "SAP", "£230M").
  //   • GRAPH (GraphRAG) - chunks reachable through related concepts.
  // RRF score = Σ 1/(k + rank) across the lists a chunk appears in (k=60), so a chunk
  // strong in ANY signal surfaces, and one strong in several rises to the top - with no
  // score-scale calibration between cosine and BM25. Then: personal boost (ownership),
  // memory decay/salience, cross-encoder rerank, passage extraction, diversity cap (MMR).
  // The pipeline lives in ./retrieval/* - this is a thin wrapper that threads the
  // shared embedded state into the orchestrator.
  async function searchWithGraph(query: string, userId: string, orgId: string, trace?: SearchTrace): Promise<RetrievalResponse> {
    return hybridSearch({ retrieval, store, graph, embeddings, reranker }, query, userId, orgId, trace)
  }

  // Same retrieval, but also returns the pipeline TRACE (for the UI's explainability).
  async function searchTraced(query: string, userId: string, orgId: string) {
    const trace: SearchTrace = { counts: { vector: 0, keyword: 0, graph: 0, fused: 0 }, reranked: false, items: [] }
    const response = await searchWithGraph(query, userId, orgId, trace)
    return { response, trace }
  }

  // Re-embed every stored chunk with the current embedder and rebuild the ANN
  // index. Needed when switching embedders (e.g. hash → transformers, different dim).
  // Returns the number of chunks re-embedded.
  async function reindex(): Promise<number> {
    store.resetVec()
    store.resetFts()
    const rows = store.db.query("SELECT point_id, virtual_record_id, org_id, content FROM chunks").all() as Array<{
      point_id: string
      virtual_record_id: string
      org_id: string
      content: string
    }>
    for (const r of rows) {
      const emb = await embeddings.embedDense(r.content)
      store.addChunk(r.point_id, r.virtual_record_id, r.org_id, r.content, emb)
    }
    return rows.length
  }

  // Re-extract the knowledge graph for EVERY existing record (e.g. after switching
  // to an LLM extractor, or for data ingested before extraction existed). Clears
  // the concept layer (entities + mentions + relates_to), keeps records/ACL/vectors.
  async function rebuildGraph(): Promise<{ records: number; entities: number }> {
    store.db.exec("DELETE FROM nodes WHERE coll = 'entities'")
    store.db.exec("DELETE FROM edges WHERE label IN ('mentions','relates_to')")
    const records = store.db.query("SELECT id, data FROM nodes WHERE coll = 'records'").all() as Array<{ id: string; data: string }>
    let entities = 0
    for (const rec of records) {
      // Exact prefix match so one record's graph is never extracted from another
      // record's chunk text (see the note on chunkPrefix above).
      const prefix = chunkPrefix(rec.id)
      const chunks = store.db
        .query("SELECT content FROM chunks WHERE substr(point_id, 1, length(?)) = ? ORDER BY rowid")
        .all(prefix, prefix) as Array<{ content: string }>
      const text = chunks.map((c) => c.content).join("\n\n")
      const name = (JSON.parse(rec.data) as { recordName?: string }).recordName
      if (text) entities += await ingestor.writeGraphFor(rec.id, text, name)
    }
    return { records: records.length, entities }
  }

  return {
    store,
    graph,
    vector,
    embeddings,
    retrieval,
    ingestor,
    authorizer,
    kgqa,
    graphQuery,
    ask,
    authorizedIngest,
    deleteRecord,
    searchWithGraph,
    searchTraced,
    reindex,
    rebuildGraph,
  }
}
|
|
170
|
+
|
|
171
|
+
/** The object returned by `buildEmbeddedContext`: the wired adapters/services plus its helper functions. */
export type EmbeddedContext = ReturnType<typeof buildEmbeddedContext>
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
// Ingestion - the WRITE side that CREATES the graph + vectors from real input.
|
|
2
|
+
// Phase 1 of the lifecycle: parse → chunk → embed → write record node + permission
|
|
3
|
+
// edges + chunk vectors. After this runs, retrieval (Phase 2) can resolve the doc.
|
|
4
|
+
|
|
5
|
+
import type { EmbeddingProvider } from "../provider"
|
|
6
|
+
import type { SqliteStore, Json } from "./sqlite-store"
|
|
7
|
+
import { DeterministicExtractor, stripBoilerplate, slugify, entityId, type KnowledgeExtractor } from "./extract"
|
|
8
|
+
import { CodeExtractor } from "./code-extractor"
|
|
9
|
+
|
|
10
|
+
/** One document handed to `Ingestor.ingest`. */
export interface IngestDoc {
  /** Id of the record node; chunk point ids are derived from it. */
  recordId: string
  /** Raw document text to chunk, embed and (optionally) extract a graph from. */
  text: string
  /** Organization the record belongs to. */
  orgId: string
  /** Display name of the record; also used to detect code files. */
  recordName: string
  /** Virtual record id; the ingestor falls back to `recordId` when omitted. */
  virtualRecordId?: string
  /** MIME type; the ingestor falls back to "text/plain". */
  mimeType?: string
  /** Source connector id; the ingestor falls back to "upload". */
  connectorId?: string
  /** Where the document came from; the ingestor falls back to "UPLOAD". */
  origin?: "CONNECTOR" | "UPLOAD"
  /** Principal ids (users/groups) that may see this doc → permission edges. */
  permittedPrincipals?: string[]
  /** If set, the doc is visible to anyone in this org (the "anyone" path). */
  shareWithOrg?: string
  /** Extract a knowledge graph (entities + relations) at ingest. Default true. */
  extractGraph?: boolean
  /** The creator/owner of the record (set by the authorized write path). */
  ownerId?: string
  /**
   * Pre-extracted TYPED graph supplied by the calling model. When `entities` or
   * `relations` is non-empty it is stored directly INSTEAD of running the built-in
   * extractor, so no separate LLM endpoint is needed.
   */
  entities?: {
    name: string
    type?: string
  }[]
  /** Typed relations of the pre-extracted graph (see `entities`). */
  relations?: {
    from: string
    to: string
    type: string
    confidence?: number
  }[]
}
|
|
34
|
+
|
|
35
|
+
/**
 * Structure-aware chunker. The old greedy-merge packed many DISTINCT facts into one
 * chunk → "embedding dilution": a 20-headline chunk has a muddy average vector that
 * matches no specific query. FACT-LIST documents (news headlines, bullet lists) want
 * per-FACT granularity, while flowing prose wants sentences grouped up to the target.
 * So, per blank-line-separated block:
 *   • list-like block (≥4 non-empty lines, median line length < 120) → one chunk PER
 *     LINE (no cross-fact merging).
 *   • prose block → split into sentences, pack up to `size` characters.
 *   • a short heading (no terminal punctuation) carries forward and is prepended to
 *     the next chunk (so "PRICING TIERS:" isn't a useless standalone).
 * Oversized units are hard-split at `size` UTF-16 code units, never between the two
 * halves of a surrogate pair. No overlap between chunks.
 *
 * @param text     Input text.
 * @param size     Maximum chunk length in UTF-16 code units (default 512).
 * @param minChunk Blocks shorter than this without terminal punctuation are treated
 *                 as headings and carried into the next chunk (default 80).
 * @returns The chunks in document order; an empty array for blank input.
 * @throws RangeError when `size` is below 1 (the hard-split could never make progress).
 */
export function chunkText(text: string, size = 512, minChunk = 80): string[] {
  // A size below 1 makes `slice(0, size)` empty, so the split loop would spin forever.
  if (size < 1) throw new RangeError(`chunkText: size must be >= 1 (got ${size})`)

  const blocks = text.split(/\n\s*\n/).map((b) => b.trim()).filter(Boolean)
  const out: string[] = []
  let carry = "" // a short heading held to prepend to the following content
  const median = (ns: number[]): number => {
    if (!ns.length) return 0
    const s = [...ns].sort((a, b) => a - b)
    return s[s.length >> 1]
  }
  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff
  // Emit `s` (with any pending heading prepended), hard-splitting when it is too long.
  const flush = (s: string) => {
    let t = (carry ? `${carry} ` : "") + s.trim()
    carry = ""
    while (t.length > size) {
      let cut = Math.floor(size)
      // Never cut between the halves of a surrogate pair: step back one unit, or,
      // when the budget is a single unit, keep the pair whole.
      if (isHighSurrogate(t.charCodeAt(cut - 1)) && isLowSurrogate(t.charCodeAt(cut))) {
        cut = cut > 1 ? cut - 1 : cut + 1
      }
      const head = t.slice(0, cut).trim()
      if (head) out.push(head) // a whitespace-only slice is not a chunk
      t = t.slice(cut)
    }
    const rest = t.trim()
    if (rest) out.push(rest)
  }
  for (const b of blocks) {
    const lines = b.split("\n").map((l) => l.trim()).filter(Boolean)
    const listLike = lines.length >= 4 && median(lines.map((l) => l.length)) < 120
    if (listLike) {
      for (const l of lines) {
        if (l.length < 30 && !/[.!?]$/.test(l)) carry = (carry ? `${carry} ` : "") + l // tiny fragment → heading
        else flush(l) // each list item / headline becomes its own focused chunk
      }
    } else if (b.length < minChunk && !/[.!?]$/.test(b)) {
      carry = (carry ? `${carry} ` : "") + b // short heading → attach to next block
    } else {
      const sents = b.split(/(?<=[.!?])\s+/).map((s) => s.trim()).filter(Boolean)
      let cur = ""
      for (const s of sents) {
        if (cur && cur.length + s.length + 1 > size) {
          flush(cur)
          cur = s
        } else cur = cur ? `${cur} ${s}` : s
      }
      if (cur) flush(cur)
    }
  }
  // A heading with nothing after it still becomes a chunk rather than being dropped.
  if (carry) out.push(carry)
  return out.length ? out : [text.trim()].filter(Boolean)
}
|
|
90
|
+
|
|
91
|
+
// FUNCTIONAL (single-valued) relations: a subject has at most ONE current value, so
|
|
92
|
+
// a newer fact SUPERSEDES the old (user moved cities; company changed CEO). Anything
|
|
93
|
+
// not here is multi-valued (partners_with, mentions, knows…) and simply accumulates.
|
|
94
|
+
// Only the typed/LLM path emits these predicates; the deterministic path uses
|
|
95
|
+
// "relates_to" (symmetric, never functional), so supersession is a no-op there.
|
|
96
|
+
// Single-valued relation predicates, one per line, grouped by theme.
const FUNCTIONAL_PREDICATES = new Set([
  // where a subject is
  "lives_in",
  "located_in",
  "based_in",
  // employment / leadership
  "works_at",
  "employed_by",
  "ceo_of",
  "led_by",
  // personal facts, role and status
  "born_in",
  "current_role",
  "role_is",
  "status_is",
  // ownership and family
  "owns",
  "owned_by",
  "married_to",
  // organizational structure and geography
  "reports_to",
  "headquartered_in",
  "capital_of",
  "member_of",
])
|
|
101
|
+
|
|
102
|
+
// Entity ids are namespaced (`entity:`) so a slug of free text can never collide with a
|
|
103
|
+
// principal (user/org/group) or record id and corrupt the ACL graph via INSERT OR
|
|
104
|
+
// REPLACE. The scheme + `entityId()` helper are defined once in extractors/text-hygiene
|
|
105
|
+
// (imported above) so every writer here and every resolver (kgqa/entity-link,
|
|
106
|
+
// graph/adjacency) agree on it.
|
|
107
|
+
|
|
108
|
+
export class Ingestor {
|
|
109
|
+
// Code files (detected by extension) are parsed with tree-sitter into a code graph;
|
|
110
|
+
// everything else goes through the configured text/LLM extractor.
|
|
111
|
+
private readonly codeExtractor = new CodeExtractor()
|
|
112
|
+
constructor(
|
|
113
|
+
private readonly store: SqliteStore,
|
|
114
|
+
private readonly embeddings: EmbeddingProvider,
|
|
115
|
+
private readonly extractor: KnowledgeExtractor = new DeterministicExtractor(),
|
|
116
|
+
) {}
|
|
117
|
+
|
|
118
|
+
// --- identity surface (normally fed by an IdP/SCIM sync) ---
|
|
119
|
+
registerOrg(orgId: string, data: Json = {}): void {
|
|
120
|
+
this.store.addNode(orgId, "organizations", data)
|
|
121
|
+
}
|
|
122
|
+
registerUser(userId: string, orgId: string, email?: string, role: "admin" | "editor" | "viewer" = "editor"): void {
|
|
123
|
+
this.store.addNode(userId, "users", { userId, email, role })
|
|
124
|
+
this.store.addNode(orgId, "organizations", {}) // idempotent (INSERT OR REPLACE)
|
|
125
|
+
this.store.addEdge(userId, orgId, "belongsTo")
|
|
126
|
+
}
|
|
127
|
+
registerGroup(groupId: string): void {
|
|
128
|
+
this.store.addNode(groupId, "groups", {})
|
|
129
|
+
}
|
|
130
|
+
addMembership(userId: string, groupId: string): void {
|
|
131
|
+
this.store.addEdge(userId, groupId, "belongsTo")
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
// --- the document ingestion pipeline ---
|
|
135
|
+
async ingest(doc: IngestDoc): Promise<{ recordId: string; chunks: number; entities: number }> {
|
|
136
|
+
const vid = doc.virtualRecordId ?? doc.recordId
|
|
137
|
+
|
|
138
|
+
// (1) GRAPH: the record node.
|
|
139
|
+
this.store.addNode(doc.recordId, "records", {
|
|
140
|
+
virtualRecordId: vid,
|
|
141
|
+
orgId: doc.orgId,
|
|
142
|
+
recordName: doc.recordName,
|
|
143
|
+
mimeType: doc.mimeType ?? "text/plain",
|
|
144
|
+
connectorId: doc.connectorId ?? "upload",
|
|
145
|
+
connectorName: doc.connectorId ?? "upload",
|
|
146
|
+
origin: doc.origin ?? "UPLOAD",
|
|
147
|
+
indexingStatus: "COMPLETED",
|
|
148
|
+
ownerId: doc.ownerId, // creator - used for write/delete authorization
|
|
149
|
+
createdAt: Date.now(), // memory recency baseline (decay/salience re-ranking)
|
|
150
|
+
})
|
|
151
|
+
|
|
152
|
+
// (2) GRAPH: permission edges (the ACL) - captured from the source at ingest.
|
|
153
|
+
for (const principal of doc.permittedPrincipals ?? []) {
|
|
154
|
+
this.store.addEdge(principal, doc.recordId, "permissions")
|
|
155
|
+
}
|
|
156
|
+
if (doc.shareWithOrg) {
|
|
157
|
+
this.store.addNode(`anyone:${doc.recordId}`, "anyone", {
|
|
158
|
+
organization: doc.shareWithOrg,
|
|
159
|
+
file_key: doc.recordId,
|
|
160
|
+
})
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// Drop web boilerplate (cookie banners, nav, subscribe CTAs) from PROSE before
|
|
164
|
+
// chunking/extraction so it never becomes a noisy chunk or junk entity. Code is
|
|
165
|
+
// left untouched (a line like "accept" can be real source).
|
|
166
|
+
const isCode = !!CodeExtractor.detectLanguage(doc.recordName)
|
|
167
|
+
const cleanText = isCode ? doc.text : stripBoilerplate(doc.text)
|
|
168
|
+
|
|
169
|
+
// (3) VECTORS: chunk → embed → store.
|
|
170
|
+
const chunks = chunkText(cleanText)
|
|
171
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
172
|
+
const embedding = await this.embeddings.embedDense(chunks[i])
|
|
173
|
+
this.store.addChunk(`${doc.recordId}#${i}`, vid, doc.orgId, chunks[i], embedding)
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// (4) KNOWLEDGE GRAPH: extract concepts → entity nodes + relationship edges.
|
|
177
|
+
// record --mentions--> entity, entity --relates_to--> entity. Entity ids are
|
|
178
|
+
// shared across docs, so two records that mention "Pro" link through it.
|
|
179
|
+
let entities = 0
|
|
180
|
+
if (doc.extractGraph !== false) {
|
|
181
|
+
// If the calling (frontier) model supplied a typed graph, store THAT - precise
|
|
182
|
+
// triples, no built-in extractor, no separate LLM. Otherwise fall back to the
|
|
183
|
+
// text/code extractor.
|
|
184
|
+
entities =
|
|
185
|
+
doc.entities?.length || doc.relations?.length
|
|
186
|
+
? this.writeProvidedGraph(doc.recordId, doc.entities ?? [], doc.relations ?? [])
|
|
187
|
+
: await this.writeGraphFor(doc.recordId, cleanText, doc.recordName)
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
return { recordId: doc.recordId, chunks: chunks.length, entities }
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
/**
 * Store a TYPED graph supplied by the caller (entities + relations already
 * extracted upstream) instead of running a built-in extractor.
 *
 * Steps, in order:
 *  1. Clears this record's previous graph contributions via
 *     `store.clearRecordContributions`.
 *  2. Adds one "entities" node plus a record --mentions--> entity edge for every
 *     distinct entity slug (entities listed in `ents`, then any relation endpoint
 *     not already seen; endpoints added this way get the default type "ENTITY").
 *  3. Adds one edge per relation, stamped with `recordId`, `validAt` and the
 *     caller-supplied `confidence`; predicates found in `FUNCTIONAL_PREDICATES`
 *     are additionally passed to `store.supersedeEdge`.
 *
 * @param recordId id of the record the graph belongs to (provenance on every edge).
 * @param ents     entities to write; `type` defaults to "ENTITY".
 * @param rels     relations between entity NAMES (not ids); `type` is normalized to
 *                 lower_snake_case and falls back to "relates_to" when blank.
 * @returns the number of distinct entity slugs written for this record.
 */
writeProvidedGraph(
  recordId: string,
  ents: Array<{ name: string; type?: string }>,
  rels: Array<{ from: string; to: string; type: string; confidence?: number }>,
): number {
  this.store.clearRecordContributions(recordId)

  // Slugs already written during THIS call - dedupes nodes/mentions and is the
  // source of the returned count.
  const added = new Set<string>()

  // Returns the entity id for `name`, creating the node + mentions edge on first
  // sight. A name that slugifies to nothing yields the falsy slug itself, which
  // the relation loop below treats as "skip".
  const addEntity = (name: string, type?: string) => {
    const slug = slugify(name)
    if (!slug) return slug
    if (added.has(slug)) return entityId(slug)
    added.add(slug)
    const id = entityId(slug)
    this.store.addNode(id, "entities", { label: name.trim(), type: type ?? "ENTITY" })
    this.store.addEdge(recordId, id, "mentions", { recordId })
    return id
  }

  for (const e of ents) addEntity(e.name, e.type)

  const now = Date.now()
  for (const r of rels) {
    const from = addEntity(r.from) // ensure endpoint nodes exist + are mentioned
    const to = addEntity(r.to)
    // Drop relations with an unusable endpoint and self-loops.
    if (!from || !to || from === to) continue

    // Normalize FIRST, then fall back: a whitespace-only `type` is truthy, so
    // applying the fallback before trimming would produce an empty edge label.
    const label = (r.type || "").trim().toLowerCase().replace(/\s+/g, "_") || "relates_to"

    this.store.addEdge(from, to, label, { recordId, validAt: now, confidence: r.confidence })
    // Single-valued (functional) predicates: hand the new value to the store so it
    // can supersede the prior one.
    if (FUNCTIONAL_PREDICATES.has(label)) this.store.supersedeEdge(from, label, to, now)
  }

  return added.size
}
|
|
226
|
+
|
|
227
|
+
/**
 * Extract concepts from a record's content and attach them to the graph.
 *
 * The extractor is chosen from `name`: when `CodeExtractor.detectLanguage(name)`
 * reports a language the code extractor is used, otherwise the configured
 * text extractor. Each extracted entity becomes an "entities" node with a
 * record --mentions--> entity edge; each extracted relation becomes an edge
 * labelled with its typed predicate ("relates_to" when the predicate is empty).
 *
 * @param recordId id of the record being (re)processed; recorded as provenance.
 * @param text     content handed to the extractor.
 * @param name     optional record name, used only for language detection and
 *                 passed through to the extractor.
 * @returns the number of entities the extractor produced.
 */
async writeGraphFor(recordId: string, text: string, name?: string): Promise<number> {
  // Source-keyed replace: wipe whatever this record contributed on a previous
  // ingest before writing again, so a re-ingest replaces rather than adds to it.
  this.store.clearRecordContributions(recordId)

  // Route by detected language: code goes to the code extractor, prose to the
  // text extractor.
  const language = CodeExtractor.detectLanguage(name)
  const chosen = language ? this.codeExtractor : this.extractor
  const extracted = await chosen.extract(text, { name, language: language ?? undefined })

  // Nodes + mentions first, so relation endpoints exist before edges are added.
  for (const entity of extracted.entities) {
    const nodeId = entityId(entity.id)
    this.store.addNode(nodeId, "entities", { label: entity.label, type: entity.type })
    this.store.addEdge(recordId, nodeId, "mentions", { recordId })
  }

  // One timestamp for the whole batch; every edge carries the source record,
  // validAt and the extractor's confidence as provenance.
  const stampedAt = Date.now()
  for (const relation of extracted.relations) {
    const predicate = relation.type || "relates_to"
    const source = entityId(relation.from)
    const target = entityId(relation.to)
    this.store.addEdge(source, target, predicate, {
      recordId,
      validAt: stampedAt,
      confidence: relation.confidence,
    })

    // Only single-valued (functional) predicates go through supersession.
    if (!FUNCTIONAL_PREDICATES.has(predicate)) continue
    this.store.supersedeEdge(source, predicate, target, stampedAt)
  }

  return extracted.entities.length
}
|
|
262
|
+
}
|