@100xprompt/chitta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +203 -0
  3. package/assets/rules/claude-md.md +9 -0
  4. package/assets/skill/SKILL.md +47 -0
  5. package/package.json +48 -0
  6. package/src/README.md +124 -0
  7. package/src/arango-client.ts +67 -0
  8. package/src/arango-graph-provider.ts +364 -0
  9. package/src/bin.ts +27 -0
  10. package/src/config-env.ts +53 -0
  11. package/src/embedded/authorizer.ts +89 -0
  12. package/src/embedded/cli.ts +86 -0
  13. package/src/embedded/code-extractor.ts +9 -0
  14. package/src/embedded/demo.ts +36 -0
  15. package/src/embedded/extract.ts +12 -0
  16. package/src/embedded/extractors/code.ts +308 -0
  17. package/src/embedded/extractors/deterministic.ts +63 -0
  18. package/src/embedded/extractors/llm.ts +151 -0
  19. package/src/embedded/extractors/text-hygiene.ts +54 -0
  20. package/src/embedded/extractors/types.ts +34 -0
  21. package/src/embedded/graph/acl-paths.ts +96 -0
  22. package/src/embedded/graph/adjacency.ts +61 -0
  23. package/src/embedded/graph/centrality.ts +23 -0
  24. package/src/embedded/graph/communities.ts +46 -0
  25. package/src/embedded/graph/cypher.ts +17 -0
  26. package/src/embedded/graph/impact.ts +24 -0
  27. package/src/embedded/graph/knowledge-graph.ts +108 -0
  28. package/src/embedded/graph/pagerank.ts +57 -0
  29. package/src/embedded/graph/sql-access.ts +13 -0
  30. package/src/embedded/graph/traversal.ts +73 -0
  31. package/src/embedded/graph/types.ts +35 -0
  32. package/src/embedded/graph-query.ts +126 -0
  33. package/src/embedded/index.ts +171 -0
  34. package/src/embedded/ingest.ts +262 -0
  35. package/src/embedded/kgqa/answer-paths.ts +197 -0
  36. package/src/embedded/kgqa/entity-link.ts +13 -0
  37. package/src/embedded/kgqa/intent.ts +14 -0
  38. package/src/embedded/kgqa/predicates.ts +9 -0
  39. package/src/embedded/kgqa/preference.ts +20 -0
  40. package/src/embedded/kgqa/select.ts +99 -0
  41. package/src/embedded/kgqa/text.ts +16 -0
  42. package/src/embedded/kgqa/types.ts +6 -0
  43. package/src/embedded/kgqa-service.ts +122 -0
  44. package/src/embedded/llm-extractor.ts +10 -0
  45. package/src/embedded/local-embeddings.ts +36 -0
  46. package/src/embedded/personal.ts +100 -0
  47. package/src/embedded/reranker.ts +62 -0
  48. package/src/embedded/retrieval/decay-stage.ts +59 -0
  49. package/src/embedded/retrieval/diversity.ts +37 -0
  50. package/src/embedded/retrieval/fuse.ts +52 -0
  51. package/src/embedded/retrieval/graph-stage.ts +45 -0
  52. package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
  53. package/src/embedded/retrieval/keyword-stage.ts +27 -0
  54. package/src/embedded/retrieval/passage.ts +44 -0
  55. package/src/embedded/retrieval/rerank-stage.ts +31 -0
  56. package/src/embedded/retrieval/trace.ts +31 -0
  57. package/src/embedded/retrieval/vector-stage.ts +15 -0
  58. package/src/embedded/sqlite-graph-provider.ts +119 -0
  59. package/src/embedded/sqlite-store.ts +95 -0
  60. package/src/embedded/sqlite-vec-service.ts +122 -0
  61. package/src/embedded/store/chunks.ts +61 -0
  62. package/src/embedded/store/fts.ts +50 -0
  63. package/src/embedded/store/nodes-edges.ts +112 -0
  64. package/src/embedded/store/salience.ts +37 -0
  65. package/src/embedded/store/schema.ts +109 -0
  66. package/src/embedded/transformers-embeddings.ts +100 -0
  67. package/src/embeddings.ts +51 -0
  68. package/src/eval/goldset.ts +46 -0
  69. package/src/eval/harness.ts +65 -0
  70. package/src/eval/metrics.ts +38 -0
  71. package/src/http/server.ts +93 -0
  72. package/src/index.ts +44 -0
  73. package/src/install/index.ts +139 -0
  74. package/src/install/platforms.ts +126 -0
  75. package/src/install/skill.ts +46 -0
  76. package/src/install/writers.ts +82 -0
  77. package/src/mcp/backend.ts +129 -0
  78. package/src/mcp/server.ts +83 -0
  79. package/src/mcp/tools/context-about.ts +69 -0
  80. package/src/mcp/tools/context-graph.ts +23 -0
  81. package/src/mcp/tools/context-ingest.ts +88 -0
  82. package/src/mcp/tools/context-rebuild.ts +22 -0
  83. package/src/mcp/tools/context-relate.ts +88 -0
  84. package/src/mcp/tools/get-context.ts +52 -0
  85. package/src/mcp/tools/index.ts +40 -0
  86. package/src/mcp/tools/types.ts +33 -0
  87. package/src/permission.ts +72 -0
  88. package/src/provider.ts +65 -0
  89. package/src/qdrant-vector.ts +76 -0
  90. package/src/retrieval.ts +218 -0
  91. package/src/service.ts +40 -0
  92. package/src/types.ts +91 -0
@@ -0,0 +1,197 @@
1
+ // The per-path answer resolvers. Each takes the providers it needs explicitly so the
2
+ // KgqaService can orchestrate by composing them. Logic, confidences, and outputs are
3
+ // identical to the original monolithic service - this is a pure structural split.
4
+
5
+ import type { SqliteGraphProvider } from "../sqlite-graph-provider"
6
+ import type { SqliteStore } from "../sqlite-store"
7
+ import type { EmbeddingProvider } from "../../provider"
8
+ import type { KgqaResult } from "../kgqa-service"
9
+ import type { Graph } from "./types"
10
+ import { stem } from "./text"
11
+ import { PREFERENCE_PREDICATES } from "./preference"
12
+ import { predMatch } from "./predicates"
13
+ import { QUERY_STOP, narrow, linesMentioningAny } from "./select"
14
+
15
/**
 * Record names that mention any of the given entities, restricted to the
 * records the caller is allowed to see. At most three names are returned.
 *
 * `_userId` and `_orgId` are accepted but not read here; the scoping comes
 * entirely from `accessibleRecordIds`.
 *
 * @returns an empty list when there are no entities or no accessible records.
 */
export function cite(
  graph: SqliteGraphProvider,
  entityIds: string[],
  _userId: string,
  _orgId: string,
  accessibleRecordIds: string[],
): string[] {
  const nothingToLookUp = entityIds.length === 0
  const nothingVisible = accessibleRecordIds.length === 0
  if (nothingToLookUp || nothingVisible) return []

  const supporting = graph.recordsMentioning(entityIds, accessibleRecordIds)
  return supporting.slice(0, 3)
}
26
+
27
/**
 * Build the result for a resolved relation query.
 *
 * `anchor` is the known end of the relation and `answerIds` are the entities
 * found at the other end. With `reverse` false the anchor is the subject
 * (anchor, predicate, answers); with `reverse` true it is the object.
 * Ids without a label in `labelOf` are shown as the raw id.
 *
 * One fact string is produced per distinct answer so multi-valued answers can
 * be listed individually instead of as a single comma-run.
 */
export function compose(
  graph: SqliteGraphProvider,
  anchor: string,
  predicate: string,
  answerIds: string[],
  labelOf: Map<string, string>,
  userId: string,
  orgId: string,
  recordIds: string[],
  reverse = false,
): KgqaResult {
  const nameOf = (id: string): string => labelOf.get(id) ?? id

  // Duplicate answer ids collapse to one label; first-seen order is kept.
  const answerLabels = Array.from(new Set(answerIds), (id) => nameOf(id))
  const anchorLabel = nameOf(anchor)
  const joined = answerLabels.join(", ")
  // Edge types use underscores; facts are rendered with spaces.
  const verb = predicate.replace(/_/g, " ")

  let triple: { subject: string; predicate: string; object: string }
  let facts: string[]
  if (reverse) {
    triple = { subject: joined, predicate, object: anchorLabel }
    facts = answerLabels.map((l) => `${l} ${verb} ${anchorLabel}`)
  } else {
    triple = { subject: anchorLabel, predicate, object: joined }
    facts = answerLabels.map((l) => `${anchorLabel} ${verb} ${l}`)
  }

  const citations = cite(graph, [anchor, ...answerIds], userId, orgId, recordIds)
  return { answer: joined, facts, triple, citations, confidence: 0.9 }
}
55
+
56
/**
 * Yes/no resolver: is there an edge from `subj` to `obj` whose type matches
 * `predStem`?
 *
 * A hit answers "Yes." with confidence 0.9 and citations for both entities;
 * a miss answers "No (not found in your knowledge graph)." with confidence
 * 0.5 and no citations. `predicate` is only echoed into the returned triple
 * (empty string when undefined).
 */
export function binaryAnswer(
  graph: SqliteGraphProvider,
  g: Graph,
  subj: string,
  obj: string,
  predStem: string,
  predicate: string | undefined,
  labelOf: Map<string, string>,
  userId: string,
  orgId: string,
  recordIds: string[],
): KgqaResult {
  let holds = false
  for (const rel of g.relations) {
    // Direction matters: only subj → obj edges count.
    if (rel.from !== subj || rel.to !== obj) continue
    if (predMatch(rel.type, predStem)) {
      holds = true
      break
    }
  }

  const verdict = holds ? "Yes." : "No (not found in your knowledge graph)."
  const nameOf = (id: string): string => labelOf.get(id) ?? id

  return {
    answer: verdict,
    facts: [verdict],
    triple: { subject: nameOf(subj), predicate: predicate ?? "", object: nameOf(obj) },
    citations: holds ? cite(graph, [subj, obj], userId, orgId, recordIds) : [],
    confidence: holds ? 0.9 : 0.5,
  }
}
81
+
82
/**
 * Self / preference resolver: collects the graph's preference edges
 * (loves / likes / ...) so abstract self-queries can be answered from the
 * graph, leaving the final filtering to the consuming LLM.
 *
 * An edge qualifies when its type, or the stem of its type, is listed in
 * PREFERENCE_PREDICATES. Note that every qualifying edge in the supplied
 * graph is returned; edges are not filtered by their subject.
 *
 * @returns null when the graph holds no preference edge.
 */
export function preferenceAnswer(
  graph: SqliteGraphProvider,
  g: Graph,
  userId: string,
  orgId: string,
  recordIds: string[],
): KgqaResult | null {
  const prefEdges = g.relations.filter(
    (rel) => PREFERENCE_PREDICATES.has(rel.type) || PREFERENCE_PREDICATES.has(stem(rel.type)),
  )
  if (prefEdges.length === 0) return null

  const labelOf = new Map<string, string>()
  for (const entity of g.entities) labelOf.set(entity.id, entity.label)
  const nameOf = (id: string): string => labelOf.get(id) ?? id

  const facts: string[] = []
  const objectLabels = new Set<string>()
  const touchedIds = new Set<string>()
  for (const rel of prefEdges) {
    facts.push(`${nameOf(rel.from)} ${rel.type.replace(/_/g, " ")} ${nameOf(rel.to)}`)
    objectLabels.add(nameOf(rel.to))
    touchedIds.add(rel.from)
    touchedIds.add(rel.to)
  }

  return {
    answer: facts.join("\n"),
    facts,
    triple: { subject: "you", predicate: "prefer", object: [...objectLabels].join(", ") },
    citations: cite(graph, [...touchedIds], userId, orgId, recordIds),
    confidence: 0.85,
  }
}
107
+
108
/**
 * Predicate-anchored resolver: for a query that names a RELATION but no entity
 * ("what partnerships exist"), return every edge whose predicate matches.
 *
 * Matching compares the stem of each predicate's first `_`-separated segment
 * (its "head", e.g. "partners_with" → "partners") against the stems of the
 * query words, accepting equality or containment in either direction.
 * QUERY_STOP is deliberately not applied: relational words are the signal.
 * The generic "relates_to" predicate is ignored.
 *
 * Both the head and the query stems must be at least four characters long
 * AFTER stemming. Filtering only the raw words is not enough: "does" stems to
 * "do", which would substring-match unrelated heads such as "endors".
 *
 * @returns null when no predicate matches the query.
 */
export function predicateAnswer(
  graph: SqliteGraphProvider,
  question: string,
  g: Graph,
  userId: string,
  orgId: string,
  recordIds: string[],
): KgqaResult | null {
  const MIN_LEN = 4

  const preds = [...new Set(g.relations.map((r) => r.type).filter((t) => t !== "relates_to"))]
  if (!preds.length) return null

  const qStems = [
    ...new Set(
      question
        .toLowerCase()
        .split(/[^a-z]+/)
        .filter((w) => w.length >= MIN_LEN)
        .map(stem)
        // Stemming can shrink a word below the threshold; drop those so they
        // cannot match as incidental substrings of a predicate head.
        .filter((s) => s.length >= MIN_LEN),
    ),
  ]

  const hit = preds.filter((p) => {
    const head = stem(p.split("_")[0] ?? p)
    return head.length >= MIN_LEN && qStems.some((qs) => qs === head || qs.includes(head) || head.includes(qs))
  })
  if (!hit.length) return null

  const wanted = new Set(hit)
  const edges = g.relations.filter((r) => wanted.has(r.type))
  if (!edges.length) return null

  const labelOf = new Map<string, string>(g.entities.map((e): [string, string] => [e.id, e.label]))
  const nameOf = (id: string): string => labelOf.get(id) ?? id
  const facts = edges.map((r) => `${nameOf(r.from)} ${r.type.replace(/_/g, " ")} ${nameOf(r.to)}`)
  const ids = [...new Set(edges.flatMap((r) => [r.from, r.to]))]

  return {
    answer: facts.join("\n"),
    facts,
    // No single subject/object: the triple only names the matched predicate(s).
    triple: { subject: "", predicate: hit.join(" / "), object: "" },
    citations: cite(graph, ids, userId, orgId, recordIds),
    confidence: 0.8,
  }
}
144
+
145
/**
 * Entity-anchored resolver (no LLM needed): when the query names a known
 * entity, return the facts or lines about it that match the query. A specific
 * question gets the specific fact; a bare entity name gets everything.
 *
 * Anchoring is on WORDS, not on one entity node: every query word (3+
 * characters, not in QUERY_STOP) that also occurs in some entity label becomes
 * an anchor, so "Google" gathers all Google-related entities and the full
 * query then decides which lines to keep.
 *
 * Resolution order:
 *  1. typed edges (anything but "relates_to") touching a matched entity;
 *  2. otherwise the stored lines that mention an anchor.
 *
 * `_userId` and `_orgId` are accepted but not read here.
 *
 * @returns null when the query names no known entity, when nothing mentions
 *   the anchors, or when narrowing leaves no line at all.
 */
export async function entityLookup(
  graph: SqliteGraphProvider,
  store: SqliteStore,
  embeddings: EmbeddingProvider,
  question: string,
  g: Graph,
  accessibleVids: string[],
  recordIds: string[],
  _userId: string,
  _orgId: string,
): Promise<KgqaResult | null> {
  const wordsOf = (text: string): string[] => text.toLowerCase().split(/[^a-z0-9]+/)

  const entityWords = new Set<string>()
  for (const e of g.entities) {
    for (const w of wordsOf(e.label)) if (w.length >= 3) entityWords.add(w)
  }

  // De-duplicated so a repeated query word ("google vs google cloud") does not
  // produce a repeated subject. Length 3 keeps acronyms (SAP, IBM, UCP).
  const anchors = [
    ...new Set(wordsOf(question).filter((w) => w.length >= 3 && !QUERY_STOP.has(w) && entityWords.has(w))),
  ]
  if (anchors.length === 0) return null
  const anchorSet = new Set(anchors)

  const matchedIds = g.entities.filter((e) => wordsOf(e.label).some((w) => anchorSet.has(w))).map((e) => e.id)
  const cites = graph.recordsMentioning(matchedIds, recordIds).slice(0, 3)
  const subject = anchors.join(", ")

  // 1) Typed facts about the matched entities (present when predicates were extracted).
  const labelOf = new Map<string, string>(g.entities.map((e): [string, string] => [e.id, e.label]))
  const nameOf = (id: string): string => labelOf.get(id) ?? id
  const matched = new Set(matchedIds)
  const factLines = g.relations
    .filter((r) => (matched.has(r.from) || matched.has(r.to)) && r.type !== "relates_to")
    .map((r) => `${nameOf(r.from)} ${r.type.replace(/_/g, " ")} ${nameOf(r.to)}`)
  if (factLines.length) {
    const chosen = await narrow(embeddings, question, anchors, anchorSet, factLines)
    // An empty selection is not an answer; let the caller fall back.
    if (chosen.length === 0) return null
    return {
      answer: chosen.join("\n"),
      facts: chosen,
      triple: { subject, predicate: "facts", object: `${chosen.length}` },
      citations: cites,
      confidence: 0.85,
    }
  }

  // 2) Otherwise the exact line(s) mentioning an anchor, filtered by the query.
  const all = linesMentioningAny(store, anchors, accessibleVids)
  if (all.length === 0) return null
  const lines = await narrow(embeddings, question, anchors, anchorSet, all)
  const first = lines[0]
  if (first === undefined) return null
  return {
    answer: lines.join("\n"),
    facts: lines,
    triple: { subject, predicate: "info", object: lines.length > 1 ? `${lines.length} facts` : first },
    citations: cites,
    confidence: 0.78,
  }
}
@@ -0,0 +1,13 @@
1
+ // Entity linking - resolve a mention from parsed intent to a graph entity id,
2
+ // by slug first then exact label match.
3
+
4
+ import { slugify, entityId } from "../extract"
5
+ import type { Graph } from "./types"
6
+
7
/**
 * Resolve a mention to an entity id in `g`.
 *
 * An entity whose id equals the id derived from the slugified mention wins,
 * wherever it sits in the list. Failing that, the first entity whose label
 * equals the mention case-insensitively is used.
 *
 * @returns the entity id, or null when neither lookup matches.
 */
export function link(mention: string, g: Graph): string | null {
  const slugId = entityId(slugify(mention))
  const wanted = mention.toLowerCase()

  let firstLabelHit: string | null = null
  for (const entity of g.entities) {
    if (entity.id === slugId) return slugId
    if (firstLabelHit === null && entity.label.toLowerCase() === wanted) firstLabelHit = entity.id
  }
  return firstLabelHit
}
@@ -0,0 +1,14 @@
1
+ // Intent parsing - the no-LLM heuristic for simple "who/what do I <verb>" questions.
2
+ // (The LLM path is preferred; this covers the offline case.)
3
+
4
+ import type { QuestionIntent } from "../extract"
5
+
6
/**
 * Offline intent parser for questions shaped like
 * "who/what do/does/did <subject> <verb>".
 *
 * The question is lower-cased first. First- and second-person subjects
 * (i, me, my, we, you) are mapped to the subject "user"; any other word is
 * passed through as-is. The word after the subject becomes the predicate.
 *
 * @returns a forward relation query, or null when the pattern does not match.
 */
export function heuristicIntent(q: string): QuestionIntent | null {
  const pattern = /\b(who|what)\b\s+(?:do|does|did)\s+([a-z]+)\s+([a-z]+)/
  const found = pattern.exec(q.toLowerCase())
  if (found === null) return null

  const [, , who, verb] = found
  const selfWords = ["i", "me", "my", "we", "you"]
  const subject = selfWords.includes(who) ? "user" : who
  return { type: "relation_query", subject, predicate: verb }
}
@@ -0,0 +1,9 @@
1
+ // Predicate stem-matching: loosely match an edge type against a stemmed predicate
2
+ // from the parsed intent ("partnered_with" vs "partner").
3
+
4
+ import { stem } from "./text"
5
+
6
/**
 * Loosely match an edge type against an already-stemmed predicate
 * ("partnered_with" vs "partner").
 *
 * The edge type is stemmed, then the two strings match when they are equal or
 * when either contains the other.
 *
 * An empty stem never matches by containment: `x.includes("")` is true for
 * every string, so without this guard an edge type that stems to "" (for
 * example "ed" or "s") would match every predicate, and vice versa.
 */
export function predMatch(edgeType: string, predStem: string): boolean {
  const edgeStem = stem(edgeType)
  if (edgeStem === predStem) return true
  if (edgeStem.length === 0 || predStem.length === 0) return false
  return edgeStem.includes(predStem) || predStem.includes(edgeStem)
}
@@ -0,0 +1,20 @@
1
+ // HIGH-LEVEL (thematic) routing - LightRAG-style dual-level retrieval.
2
+ // A query about the USER'S OWN preferences should be answered from the graph's
3
+ // preference edges (loves/likes/…), NEVER the vector index - so abstract self-queries
4
+ // ("do I like anything logical?") route through the graph, and the frontier LLM then
5
+ // filters. A preference NOUN (preferences/interests/hobbies) is self-evidently about
6
+ // the user; a preference VERB (like/love) needs a self pronoun so we don't hijack a
7
+ // relational query like "does Google love AI".
8
+
9
// Preference nouns: on their own they mark a query as being about the user.
const PREF_NOUN = /\b(prefer(?:ence)?s?|interests?|hobb(?:y|ies)|favou?rites?|passions?|tastes?)\b/i
// Preference verbs: only count when the query also refers to the speaker.
const PREF_VERB = /\b(likes?|loves?|loving|enjoys?|enjoying|prefers?|fond|keen|into)\b/i
// First-person references.
const SELF_REF = /\b(i|me|my|mine|myself|im|i'm)\b/i

/** Edge types treated as preference edges. */
export const PREFERENCE_PREDICATES = new Set<string>([
  "loves",
  "love",
  "likes",
  "like",
  "enjoys",
  "enjoy",
  "prefers",
  "prefer",
  "favors",
  "favours",
  "interested_in",
  "fond_of",
  "passionate_about",
  "fan_of",
  "keen_on",
  "into",
])

/**
 * Is the query about the user's own preferences?
 *
 * True when it contains a preference noun, or when it contains both a
 * preference verb and a first-person reference. The verb alone is not enough,
 * so a relational query such as "does Google love AI" is not captured.
 */
export function isSelfPreference(q: string): boolean {
  if (PREF_NOUN.test(q)) return true
  if (!PREF_VERB.test(q)) return false
  return SELF_REF.test(q)
}
@@ -0,0 +1,99 @@
1
+ // Relevance filtering + line gathering. Decides whether a query is BROAD (return
2
+ // everything about an entity) or SPECIFIC (semantically filter the candidate lines),
3
+ // and pulls exact lines from accessible chunks.
4
+
5
+ import { embedQueryWith, type EmbeddingProvider } from "../../provider"
6
+ import type { SqliteStore } from "../sqlite-store"
7
+ import { cleanLine, isBoilerplate } from "../extract"
8
+ import { cosine } from "./text"
9
+
10
// Generic question / filler words. They say nothing about WHICH aspect of an
// entity is being asked about.
const GENERIC_QUESTION_WORDS = [
  "about", "info", "information", "news", "tell", "me", "what", "whats", "is", "are", "do", "does", "did",
  "the", "a", "an", "of", "on", "for", "and", "company", "companies", "details", "detail", "give", "show", "all", "any",
  "please", "know", "regarding", "related", "to", "with", "recent", "latest", "update", "updates", "who", "which",
]

// Generic RELATIONAL words. They are non-discriminating, so "X partnerships"
// returns all of X's relationships rather than only the lines that literally
// say "partnership".
const GENERIC_RELATION_WORDS = [
  "partner", "partners", "partnered", "partnering", "partnership", "partnerships",
  "relationship", "relationships", "deal", "deals", "collaboration", "collaborations", "collaborate",
  "connection", "connections", "work", "works", "working", "involved", "between",
]

// Comparison / full-coverage signals: these mean "give me everything", not a filter.
const COVERAGE_WORDS = [
  "compare", "comparison", "versus", "vs", "both", "each", "every", "everything", "anything", "list", "summary",
]

/**
 * Words stripped from a query when deciding whether it is SPECIFIC (asks about
 * a particular aspect) or BROAD (just names the entity, so return everything).
 */
export const QUERY_STOP = new Set<string>([
  ...GENERIC_QUESTION_WORDS,
  ...GENERIC_RELATION_WORDS,
  ...COVERAGE_WORDS,
])

/**
 * Matches query words signalling that the user wants COMPREHENSIVE coverage
 * (the union of all named entities' facts), not the single connecting line.
 */
export const WANTS_ALL = /\b(compare|comparison|versus|vs|both|each|every|everything|all|list|summary)\b/
28
+
29
/**
 * Narrow candidate lines to the query.
 *
 * With more than one anchor, lines mentioning ALL anchors are preferred
 * ("SAP" + "Google" keeps the shared line, not every Google line). That step
 * is skipped when the query is a comparison / coverage request (WANTS_ALL),
 * and it is ignored when no line mentions every anchor. Whatever remains goes
 * through the broad/specific filter in `selectByQuery`.
 *
 * Zero or one input line is returned untouched.
 */
export async function narrow(
  embeddings: EmbeddingProvider,
  question: string,
  anchors: string[],
  anchorSet: Set<string>,
  lines: string[],
): Promise<string[]> {
  if (lines.length <= 1) return lines

  let pool = lines
  if (anchors.length > 1 && !WANTS_ALL.test(question.toLowerCase())) {
    const shared: string[] = []
    for (const line of lines) {
      const lowered = line.toLowerCase()
      if (anchors.every((anchor) => lowered.includes(anchor))) shared.push(line)
    }
    // Only narrow when the intersection is non-empty.
    if (shared.length > 0) pool = shared
  }

  return selectByQuery(embeddings, question, anchorSet, pool)
}
52
+
53
/**
 * Broad vs specific line selection.
 *
 * A query is BROAD when, after removing stop words and anchor terms, nothing
 * is left: all lines are returned. Otherwise it is SPECIFIC: the full question
 * and every line are embedded, and only lines whose cosine score lies within
 * a margin of the best score are kept, best first.
 *
 * The margin is read from the CONTEXT_LINE_MARGIN environment variable and
 * defaults to 0.08. A value that does not parse to a finite number also falls
 * back to 0.08; previously it became NaN, the `>=` comparison was false for
 * every line, and the function returned nothing at all.
 *
 * Zero or one input line is returned untouched.
 */
export async function selectByQuery(
  embeddings: EmbeddingProvider,
  question: string,
  anchorSet: Set<string>,
  lines: string[],
): Promise<string[]> {
  if (lines.length <= 1) return lines

  const residual = question
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((w) => w.length > 1 && !QUERY_STOP.has(w) && !anchorSet.has(w))
  if (residual.length === 0) return lines // broad → everything

  const q = await embedQueryWith(embeddings, question)
  const scored: Array<{ line: string; s: number }> = []
  // Lines are embedded one at a time, in input order.
  for (const line of lines) scored.push({ line, s: cosine(q, await embeddings.embedDense(line)) })
  scored.sort((a, b) => b.s - a.s)

  const best = scored[0]
  if (best === undefined) return lines // unreachable: lines.length > 1 here

  const DEFAULT_MARGIN = 0.08
  const parsed = Number(process.env.CONTEXT_LINE_MARGIN ?? DEFAULT_MARGIN)
  const margin = Number.isFinite(parsed) ? parsed : DEFAULT_MARGIN

  return scored.filter((x) => x.s >= best.s - margin).map((x) => x.line)
}
76
+
77
/**
 * Union of `linesMentioning` over several terms, de-duplicated and kept in
 * first-seen order.
 */
export function linesMentioningAny(store: SqliteStore, terms: string[], accessibleVids: string[]): string[] {
  const perTerm = terms.flatMap((term) => linesMentioning(store, term, accessibleVids))
  return Array.from(new Set(perTerm))
}
82
+
83
/**
 * Exact lines / sentences, taken from the accessible chunks, that mention
 * `label`.
 *
 * Chunks are pre-selected in SQL (restricted to `accessibleVids`, content
 * LIKE %label%), then split on newlines and on whitespace following `.`, `!`
 * or `?`. Each fragment is cleaned with `cleanLine`, and kept only when it is
 * non-empty, is not boilerplate, and contains the label case-insensitively.
 * Results are de-duplicated in first-seen order.
 *
 * @returns an empty list when no virtual record id is accessible.
 */
export function linesMentioning(store: SqliteStore, label: string, accessibleVids: string[]): string[] {
  if (accessibleVids.length === 0) return []

  // One bound parameter per accessible id, plus one for the LIKE pattern.
  const placeholders = Array.from(accessibleVids, () => "?").join(",")
  const sql = `SELECT content FROM chunks WHERE virtual_record_id IN (${placeholders}) AND content LIKE ?`
  const rows = store.db.query(sql).all(...accessibleVids, `%${label}%`) as Array<{ content: string }>

  const needle = label.toLowerCase()
  const kept = new Set<string>()
  for (const { content } of rows) {
    for (const fragment of content.split(/\n|(?<=[.!?])\s+/)) {
      const line = cleanLine(fragment)
      if (!line) continue
      if (isBoilerplate(line)) continue
      if (!line.toLowerCase().includes(needle)) continue
      kept.add(line)
    }
  }
  return Array.from(kept)
}
@@ -0,0 +1,16 @@
1
+ // Low-level text/vector helpers shared across the KGQA paths.
2
+
3
/**
 * Cosine similarity of two vectors.
 *
 * Only the first `min(a.length, b.length)` components are compared. Returns 0
 * when either vector has zero magnitude over that range (including when the
 * range is empty).
 */
export function cosine(a: number[], b: number[]): number {
  const len = Math.min(a.length, b.length)

  let dot = 0
  let sqA = 0
  let sqB = 0
  let i = 0
  while (i < len) {
    const x = a[i]
    const y = b[i]
    dot += x * y
    sqA += x * x
    sqB += y * y
    i += 1
  }

  if (!sqA || !sqB) return 0
  return dot / (Math.sqrt(sqA) * Math.sqrt(sqB))
}
15
+
16
/**
 * Crude stemmer: lower-cases the input and strips one trailing
 * "ing", "ed", "es" or "s". The earliest-starting suffix wins, so
 * "likes" becomes "lik" rather than "like".
 */
export const stem = (s: string): string => {
  const lowered = s.toLowerCase()
  return lowered.replace(/(ing|ed|es|s)$/, "")
}
@@ -0,0 +1,6 @@
1
+ // Shared structural types for the KGQA subsystem.
2
+
3
/** A graph node: its id, its display label and its type tag. */
type GraphEntity = { id: string; label: string; type: string }

/** A directed edge between two entity ids; `type` is the predicate. */
type GraphRelation = { from: string; to: string; type: string }

/** The entity/relation snapshot the KGQA resolvers operate on. */
export type Graph = {
  entities: GraphEntity[]
  relations: GraphRelation[]
}
@@ -0,0 +1,122 @@
1
+ // KGQA - answer a question with the EXACT fact from the typed graph, not a ranked
2
+ // list. "who do I love" → resolve (user, loves, ?) → "Lavanya", cited. Falls back
3
+ // to null (→ vector retrieval) when it can't answer confidently.
4
+ //
5
+ // Works best when the graph has TYPED predicate edges (from the LLM triple
6
+ // extractor). Intent parsing prefers the LLM; a small heuristic covers the
7
+ // no-LLM case for simple "who/what do I <verb>" questions.
8
+ //
9
+ // This module is the ORCHESTRATOR: the actual resolvers live under ./kgqa/* and are
10
+ // composed here. Public API (KgqaResult, KgqaService) is unchanged.
11
+
12
+ import type { SqliteGraphProvider } from "./sqlite-graph-provider"
13
+ import type { SqliteStore } from "./sqlite-store"
14
+ import type { LlmExtractor } from "./llm-extractor"
15
+ import type { EmbeddingProvider } from "../provider"
16
+ import type { QuestionIntent } from "./extract"
17
+ import { stem } from "./kgqa/text"
18
+ import type { Graph } from "./kgqa/types"
19
+ import { heuristicIntent } from "./kgqa/intent"
20
+ import { isSelfPreference } from "./kgqa/preference"
21
+ import { predMatch } from "./kgqa/predicates"
22
+ import { link } from "./kgqa/entity-link"
23
+ import {
24
+ compose,
25
+ binaryAnswer,
26
+ preferenceAnswer,
27
+ predicateAnswer,
28
+ entityLookup,
29
+ } from "./kgqa/answer-paths"
30
+
31
/** What a KGQA resolver returns when it can answer from the graph. */
export interface KgqaResult {
  /** The answer text. */
  answer: string
  /**
   * The individual facts that make up the answer. A query can match SEVERAL
   * typed facts (e.g. "Google limits Meta" AND "Meta uses Gemini"); each is
   * its own item so callers/UI can list them instead of running them together.
   */
  facts: string[]
  /** The (subject, predicate, object) the answer was resolved against. */
  triple: {
    subject: string
    predicate: string
    object: string
  }
  /** Record names supporting the answer. */
  citations: string[]
  /** Confidence score assigned by the resolver that produced the answer. */
  confidence: number
}
41
+
42
/**
 * KGQA orchestrator: answers a question with the exact fact from the typed
 * graph, composing the resolvers from ./kgqa/*. Returns null when the graph
 * cannot answer, so the caller can fall back to other retrieval.
 */
export class KgqaService {
  constructor(
    private readonly graph: SqliteGraphProvider,
    private readonly store: SqliteStore,
    private readonly embeddings: EmbeddingProvider,
    private readonly llm?: LlmExtractor,
  ) {}

  /**
   * Answer `question` on behalf of a user.
   *
   * The graph is ACL-scoped: only entities/relations from records this user
   * may see are considered. The parsed intent is tried, in order, as a forward
   * relation (subject, predicate, ?), a reverse relation (?, predicate,
   * object) and a binary check; anything unresolved goes to the graph
   * fallback.
   *
   * @returns the resolved answer, or null when the visible graph is empty or
   *   no path resolves.
   */
  async answer(question: string, userId: string, orgId: string): Promise<KgqaResult | null> {
    const accessible = await this.graph.getAccessibleVirtualRecordIds({ userId, orgId })
    const accessibleVids = Object.keys(accessible)
    const recordIds = [...new Set(Object.values(accessible))]
    const g = this.graph.getKnowledgeGraph(recordIds) as Graph
    if (g.entities.length === 0) return null

    const fallback = (): Promise<KgqaResult | null> =>
      this.graphFallback(question, g, accessibleVids, recordIds, userId, orgId)

    // Single source of truth for intent parsing (LLM first, heuristic second).
    const intent = await this.parseQuestionIntent(question)
    // No relational intent: go straight to the graph fallback.
    if (!intent) return fallback()

    const labelOf = new Map<string, string>(g.entities.map((e): [string, string] => [e.id, e.label]))
    const subj = intent.subject ? link(intent.subject, g) : null
    const obj = intent.object ? link(intent.object, g) : null
    // Narrowed once here so the branches below need no non-null assertions.
    const predicate = intent.predicate
    const predStem = predicate ? stem(predicate.replace(/\s+/g, "_")) : null

    if (intent.type === "relation_query" && predicate && predStem) {
      if (subj && !obj) {
        // Forward relation: (subject, predicate, ?)
        const objs = g.relations.filter((r) => r.from === subj && predMatch(r.type, predStem)).map((r) => r.to)
        if (objs.length) return compose(this.graph, subj, predicate, objs, labelOf, userId, orgId, recordIds)
      } else if (obj && !subj) {
        // Reverse relation: (?, predicate, object)
        const subs = g.relations.filter((r) => r.to === obj && predMatch(r.type, predStem)).map((r) => r.from)
        if (subs.length) return compose(this.graph, obj, predicate, subs, labelOf, userId, orgId, recordIds, true)
      }
    }

    // Binary: does (subject, predicate, object) hold?
    if (intent.type === "binary_relation" && subj && obj && predStem) {
      return binaryAnswer(this.graph, g, subj, obj, predStem, intent.predicate, labelOf, userId, orgId, recordIds)
    }

    // Relational paths did not resolve.
    return fallback()
  }

  /**
   * Graph routing used when no relational path resolves: self/preference
   * theme, then entity anchor, then predicate anchor. The first resolver that
   * produces a result wins; null means none did.
   */
  private async graphFallback(
    question: string,
    g: Graph,
    accessibleVids: string[],
    recordIds: string[],
    userId: string,
    orgId: string,
  ): Promise<KgqaResult | null> {
    if (isSelfPreference(question)) {
      const preference = preferenceAnswer(this.graph, g, userId, orgId, recordIds)
      if (preference) return preference
    }
    const byEntity = await this.entityLookup(question, g, accessibleVids, recordIds, userId, orgId)
    if (byEntity) return byEntity
    return predicateAnswer(this.graph, question, g, userId, orgId, recordIds)
  }

  /**
   * Entity-anchored answer (no LLM needed). Delegates to the resolver of the
   * same name in ./kgqa/answer-paths, supplying this service's graph, store
   * and embedding provider.
   */
  entityLookup(
    question: string,
    g: Graph,
    accessibleVids: string[],
    recordIds: string[],
    userId: string,
    orgId: string,
  ): Promise<KgqaResult | null> {
    return entityLookup(this.graph, this.store, this.embeddings, question, g, accessibleVids, recordIds, userId, orgId)
  }

  /**
   * Parse a question into a typed intent. The LLM is asked first when one is
   * configured; if it is absent or yields null/undefined, the offline
   * heuristic is used.
   */
  async parseQuestionIntent(question: string): Promise<QuestionIntent | null> {
    return (await this.llm?.parseQuestionIntent(question)) ?? heuristicIntent(question)
  }
}
@@ -0,0 +1,10 @@
1
+ // LLM-backed knowledge extraction. Calls an OpenAI-compatible chat endpoint -
2
+ // point it at a LOCAL/sovereign model (vLLM/SGLang/Ollama) so nothing leaves the
3
+ // building. Unlike the deterministic extractor, it handles casual lowercase text
4
+ // ("i love lavanya" → Lavanya[PERSON], user -loves→ Lavanya).
5
+ //
6
+ // Thin facade: the implementation lives in ./extractors/llm and is re-exported here
7
+ // so existing imports keep resolving unchanged. Public API is preserved exactly.
8
+
9
+ export type { LlmExtractorConfig } from "./extractors/llm"
10
+ export { LlmExtractor, HybridExtractor } from "./extractors/llm"
@@ -0,0 +1,36 @@
1
+ // In-process embeddings. This deterministic hashing embedder is dependency-free
2
+ // so the embedded stack runs and tests with zero downloads. For real semantic
3
+ // quality in the single binary, swap in transformers.js / fastembed (ONNX bge-*)
4
+ // - it implements the same EmbeddingProvider interface, so nothing above changes.
5
+
6
+ import type { EmbeddingProvider } from "../provider"
7
+
8
// Width of the hashed embedding vector.
const DIM = 64

/** Lower-case the text and return its runs of ASCII letters/digits. */
function tokens(text: string): string[] {
  const found = text.toLowerCase().match(/[a-z0-9]+/g)
  return found === null ? [] : found
}

/**
 * Map a token to a slot in [0, DIM) with a 32-bit FNV-1a style hash over its
 * UTF-16 code units (xor the unit in, then multiply by the prime).
 */
function bucket(token: string): number {
  let hash = 2166136261
  let i = 0
  while (i < token.length) {
    hash = Math.imul(hash ^ token.charCodeAt(i), 16777619)
    i += 1
  }
  // Math.imul yields a signed 32-bit value; take the magnitude before the modulo.
  return Math.abs(hash) % DIM
}
22
+
23
/**
 * Dependency-free embedder: tokens are hashed into DIM buckets and counted.
 */
export class LocalHashEmbeddings implements EmbeddingProvider {
  /**
   * Dense vector of length DIM holding per-bucket token counts, scaled to unit
   * length. Text with no tokens yields the all-zero vector.
   */
  async embedDense(query: string): Promise<number[]> {
    const counts = new Array<number>(DIM).fill(0)
    for (const tok of tokens(query)) counts[bucket(tok)] += 1

    let sumOfSquares = 0
    for (const c of counts) sumOfSquares += c * c
    // Guard against dividing by zero for empty input.
    const norm = Math.sqrt(sumOfSquares) || 1

    return counts.map((c) => c / norm)
  }

  /**
   * Sparse form: the occupied bucket indices and their raw token counts, in
   * first-seen order. No normalisation is applied.
   */
  async embedSparse(query: string): Promise<{ indices: number[]; values: number[] }> {
    const counts = new Map<number, number>()
    for (const tok of tokens(query)) {
      const slot = bucket(tok)
      counts.set(slot, (counts.get(slot) ?? 0) + 1)
    }
    return { indices: Array.from(counts.keys()), values: Array.from(counts.values()) }
  }
}
@@ -0,0 +1,100 @@
1
+ // Shared, persistent embedded context for the CLI agent - a single local
2
+ // knowledge graph + vector store the `context_ingest` and `get_context` tools both
3
+ // use. Single local user (no ACL friction for personal use); zero servers, zero
4
+ // config. The DB persists at CONTEXT_DB or the app data dir.
5
+
6
+ import path from "node:path"
7
+ import os from "node:os"
8
+ import fs from "node:fs"
9
+ import { buildEmbeddedContext, type EmbeddedContext } from "./index"
10
+ import { LocalHashEmbeddings } from "./local-embeddings"
11
+ import { TransformersEmbeddings, AutoEmbeddings } from "./transformers-embeddings"
12
+ import { DeterministicExtractor, HybridExtractor, type KnowledgeExtractor } from "./index"
13
+ import { LlmExtractor } from "./llm-extractor"
14
+ import { CrossEncoderReranker } from "./reranker"
15
+ import type { Reranker } from "./reranker"
16
+ import type { Role } from "./authorizer"
17
+ import type { EmbeddingProvider } from "../provider"
18
+
19
/**
 * Choose the reranker from the environment.
 *
 * The cross-encoder reranker is ON by default (measured +40% MRR / +27% nDCG,
 * recall unchanged). It downloads a small (~22M) model on first use and
 * degrades gracefully to RRF order if unavailable.
 *
 * Setting CONTEXT_RERANK to `0`, `false` or `off` (any letter case) disables
 * it, in which case undefined is returned.
 */
function pickReranker(): Reranker | undefined {
  const setting = process.env.CONTEXT_RERANK ?? ""
  const disabled = /^(0|false|off)$/i.test(setting)
  if (disabled) return undefined
  return new CrossEncoderReranker()
}
25
+
26
// Default ids for the single-user case. They must differ from each other
// because the nodes table is keyed by id.
export const LOCAL_USER = "local-user"
export const LOCAL_ORG = "local-org"

/**
 * WHO is asking. There is one shared graph (the DB at CONTEXT_DB), but each
 * process carries its own identity from the environment, so several users can
 * hit the same graph and each sees only their ACL slice.
 */
export interface Identity {
  userId: string
  orgId: string
  role: Role
  groups: string[]
}

/**
 * Read the caller's identity from the environment.
 *
 * - CONTEXT_USER_ID → `userId` (default LOCAL_USER)
 * - CONTEXT_ORG_ID → `orgId` (default LOCAL_ORG)
 * - CONTEXT_USER_ROLE → `role`; when unset, an explicit user id defaults to
 *   the least-privilege "editor" and the personal default to "admin"
 * - CONTEXT_USER_GROUPS → comma-separated `groups`, trimmed, blanks dropped
 *
 * Empty values are treated as unset. The role string is taken as given.
 */
export function identity(): Identity {
  const env = process.env
  const explicitUser = env.CONTEXT_USER_ID
  const fallbackRole = (explicitUser ? "editor" : "admin") as Role

  const groups: string[] = []
  for (const part of (env.CONTEXT_USER_GROUPS || "").split(",")) {
    const name = part.trim()
    if (name) groups.push(name)
  }

  return {
    userId: explicitUser || LOCAL_USER,
    orgId: env.CONTEXT_ORG_ID || LOCAL_ORG,
    role: (env.CONTEXT_USER_ROLE as Role) || fallbackRole,
    groups,
  }
}
49
+
50
// Process-wide memo for the personal context: the embedded context plus the
// ids of the user it was provisioned for. Stays null until first built.
type CachedPersonalContext = EmbeddedContext & { userId: string; orgId: string }
let cached: CachedPersonalContext | null = null
51
+
52
/**
 * Location of the personal context database: the CONTEXT_DB environment
 * variable when it is set and non-empty, otherwise
 * `<home>/.local/share/100xprompt/context.db`.
 */
export function personalContextPath(): string {
  const override = process.env.CONTEXT_DB
  if (override) return override
  return path.join(os.homedir(), ".local", "share", "100xprompt", "context.db")
}
57
+
58
/**
 * Pick the embedder from CONTEXT_EMBEDDINGS (case-insensitive):
 *
 * - `hash` → the offline keyword-hash embedder
 * - `transformers` → the transformers-based embedder
 * - anything else, or unset (treated as `auto`) → the auto embedder
 *   (real model, hash fallback)
 *
 * CONTEXT_EMBED_MODEL, when non-empty, is passed as the model to the
 * non-hash embedders.
 */
export function pickEmbeddings(): EmbeddingProvider {
  const mode = (process.env.CONTEXT_EMBEDDINGS ?? "auto").toLowerCase()
  switch (mode) {
    case "hash":
      return new LocalHashEmbeddings()
    case "transformers":
      return new TransformersEmbeddings(process.env.CONTEXT_EMBED_MODEL || undefined)
    default:
      return new AutoEmbeddings(process.env.CONTEXT_EMBED_MODEL || undefined)
  }
}
66
+
67
/**
 * Build the sovereign/local LLM client when CONTEXT_LLM_URL is set (it is used
 * for both typed-triple extraction and KGQA question parsing).
 *
 * CONTEXT_LLM_MODEL selects the model (default "default"); CONTEXT_LLM_KEY is
 * forwarded as the API key.
 *
 * @returns undefined when no URL is configured.
 */
export function pickLlm(): LlmExtractor | undefined {
  const {
    CONTEXT_LLM_URL: endpoint,
    CONTEXT_LLM_MODEL: model,
    CONTEXT_LLM_KEY: apiKey,
  } = process.env
  if (!endpoint) return undefined
  return new LlmExtractor({ endpoint, model: model || "default", apiKey })
}
78
+
79
/**
 * Deterministic extractor by default; when an LLM client is supplied, a hybrid
 * extractor that combines the deterministic one with the LLM.
 */
export function pickExtractor(llm?: LlmExtractor): KnowledgeExtractor {
  const deterministic = new DeterministicExtractor()
  if (!llm) return deterministic
  return new HybridExtractor(deterministic, llm)
}
83
+
84
/**
 * The process-wide personal context, built on first use and memoised.
 *
 * Building it creates the database directory if needed, assembles the
 * embedded context from the env-selected embedder, extractor, LLM and
 * reranker, and provisions the asking user into the shared graph: the user is
 * registered with their role, and each group is registered and joined. The
 * original note describes this provisioning as idempotent (INSERT OR REPLACE).
 *
 * @returns the embedded context extended with the caller's `userId` / `orgId`.
 */
export function personalContext() {
  if (cached !== null) return cached

  const dbPath = personalContextPath()
  fs.mkdirSync(path.dirname(dbPath), { recursive: true })

  const llm = pickLlm()
  const embeddings = pickEmbeddings()
  const extractor = pickExtractor(llm)
  const reranker = pickReranker()
  const ctx = buildEmbeddedContext({ path: dbPath, embeddings, extractor, llm, reranker })

  // Role and group memberships drive what this user can create and access.
  const who = identity()
  ctx.ingestor.registerUser(who.userId, who.orgId, undefined, who.role)
  for (const group of who.groups) {
    ctx.ingestor.registerGroup(group)
    ctx.ingestor.addMembership(who.userId, group)
  }

  cached = Object.assign(ctx, { userId: who.userId, orgId: who.orgId })
  return cached
}