@100xprompt/chitta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +203 -0
  3. package/assets/rules/claude-md.md +9 -0
  4. package/assets/skill/SKILL.md +47 -0
  5. package/package.json +48 -0
  6. package/src/README.md +124 -0
  7. package/src/arango-client.ts +67 -0
  8. package/src/arango-graph-provider.ts +364 -0
  9. package/src/bin.ts +27 -0
  10. package/src/config-env.ts +53 -0
  11. package/src/embedded/authorizer.ts +89 -0
  12. package/src/embedded/cli.ts +86 -0
  13. package/src/embedded/code-extractor.ts +9 -0
  14. package/src/embedded/demo.ts +36 -0
  15. package/src/embedded/extract.ts +12 -0
  16. package/src/embedded/extractors/code.ts +308 -0
  17. package/src/embedded/extractors/deterministic.ts +63 -0
  18. package/src/embedded/extractors/llm.ts +151 -0
  19. package/src/embedded/extractors/text-hygiene.ts +54 -0
  20. package/src/embedded/extractors/types.ts +34 -0
  21. package/src/embedded/graph/acl-paths.ts +96 -0
  22. package/src/embedded/graph/adjacency.ts +61 -0
  23. package/src/embedded/graph/centrality.ts +23 -0
  24. package/src/embedded/graph/communities.ts +46 -0
  25. package/src/embedded/graph/cypher.ts +17 -0
  26. package/src/embedded/graph/impact.ts +24 -0
  27. package/src/embedded/graph/knowledge-graph.ts +108 -0
  28. package/src/embedded/graph/pagerank.ts +57 -0
  29. package/src/embedded/graph/sql-access.ts +13 -0
  30. package/src/embedded/graph/traversal.ts +73 -0
  31. package/src/embedded/graph/types.ts +35 -0
  32. package/src/embedded/graph-query.ts +126 -0
  33. package/src/embedded/index.ts +171 -0
  34. package/src/embedded/ingest.ts +262 -0
  35. package/src/embedded/kgqa/answer-paths.ts +197 -0
  36. package/src/embedded/kgqa/entity-link.ts +13 -0
  37. package/src/embedded/kgqa/intent.ts +14 -0
  38. package/src/embedded/kgqa/predicates.ts +9 -0
  39. package/src/embedded/kgqa/preference.ts +20 -0
  40. package/src/embedded/kgqa/select.ts +99 -0
  41. package/src/embedded/kgqa/text.ts +16 -0
  42. package/src/embedded/kgqa/types.ts +6 -0
  43. package/src/embedded/kgqa-service.ts +122 -0
  44. package/src/embedded/llm-extractor.ts +10 -0
  45. package/src/embedded/local-embeddings.ts +36 -0
  46. package/src/embedded/personal.ts +100 -0
  47. package/src/embedded/reranker.ts +62 -0
  48. package/src/embedded/retrieval/decay-stage.ts +59 -0
  49. package/src/embedded/retrieval/diversity.ts +37 -0
  50. package/src/embedded/retrieval/fuse.ts +52 -0
  51. package/src/embedded/retrieval/graph-stage.ts +45 -0
  52. package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
  53. package/src/embedded/retrieval/keyword-stage.ts +27 -0
  54. package/src/embedded/retrieval/passage.ts +44 -0
  55. package/src/embedded/retrieval/rerank-stage.ts +31 -0
  56. package/src/embedded/retrieval/trace.ts +31 -0
  57. package/src/embedded/retrieval/vector-stage.ts +15 -0
  58. package/src/embedded/sqlite-graph-provider.ts +119 -0
  59. package/src/embedded/sqlite-store.ts +95 -0
  60. package/src/embedded/sqlite-vec-service.ts +122 -0
  61. package/src/embedded/store/chunks.ts +61 -0
  62. package/src/embedded/store/fts.ts +50 -0
  63. package/src/embedded/store/nodes-edges.ts +112 -0
  64. package/src/embedded/store/salience.ts +37 -0
  65. package/src/embedded/store/schema.ts +109 -0
  66. package/src/embedded/transformers-embeddings.ts +100 -0
  67. package/src/embeddings.ts +51 -0
  68. package/src/eval/goldset.ts +46 -0
  69. package/src/eval/harness.ts +65 -0
  70. package/src/eval/metrics.ts +38 -0
  71. package/src/http/server.ts +93 -0
  72. package/src/index.ts +44 -0
  73. package/src/install/index.ts +139 -0
  74. package/src/install/platforms.ts +126 -0
  75. package/src/install/skill.ts +46 -0
  76. package/src/install/writers.ts +82 -0
  77. package/src/mcp/backend.ts +129 -0
  78. package/src/mcp/server.ts +83 -0
  79. package/src/mcp/tools/context-about.ts +69 -0
  80. package/src/mcp/tools/context-graph.ts +23 -0
  81. package/src/mcp/tools/context-ingest.ts +88 -0
  82. package/src/mcp/tools/context-rebuild.ts +22 -0
  83. package/src/mcp/tools/context-relate.ts +88 -0
  84. package/src/mcp/tools/get-context.ts +52 -0
  85. package/src/mcp/tools/index.ts +40 -0
  86. package/src/mcp/tools/types.ts +33 -0
  87. package/src/permission.ts +72 -0
  88. package/src/provider.ts +65 -0
  89. package/src/qdrant-vector.ts +76 -0
  90. package/src/retrieval.ts +218 -0
  91. package/src/service.ts +40 -0
  92. package/src/types.ts +91 -0
@@ -0,0 +1,12 @@
1
+ // Knowledge extraction - turns raw text into entity nodes + relationship edges so
2
+ // the store becomes a real knowledge graph, not one opaque record. Deterministic
3
+ // (no LLM, no deps) so it runs offline; swap in an LLM extractor later behind the
4
+ // same `Extraction` shape for higher recall.
5
+ //
6
+ // This file is now a thin facade: the implementations live in ./extractors/* and
7
+ // are re-exported here so existing imports (`import { slugify } from "./extract"`)
8
+ // keep resolving unchanged. Public API is preserved exactly.
9
+
10
+ export type { ExtractedEntity, ExtractedRelation, QuestionIntent, Extraction, KnowledgeExtractor } from "./extractors/types"
11
+ export { slugify, entityId, ENTITY_PREFIX, cleanLine, isBoilerplate, stripBoilerplate } from "./extractors/text-hygiene"
12
+ export { DeterministicExtractor, extractKnowledge } from "./extractors/deterministic"
@@ -0,0 +1,308 @@
1
+ // Code → graph extractor (the Graphify capability, ported TS-native). Parses source
2
+ // with tree-sitter (WASM grammars, no Python, no servers) into the SAME entity/edge
3
+ // shape every other extractor produces - so the moment code nodes land, all the rest
4
+ // (ACL, vector recall, bi-temporal edges, context_relate / path / impact / central)
5
+ // works on them for free. This is what makes us a STRICT SUPERSET of Graphify: code
6
+ // graph + NL graph + permissions + vectors + temporal, in one embedded store.
7
+ //
8
+ // ALL 36 tree-sitter grammars are supported. Rather than hand-list node types per
9
+ // grammar (brittle across 36), we CLASSIFY nodes generically: tree-sitter follows
10
+ // strong conventions (`*_declaration`/`*_definition`/`*_item` for defs,
11
+ // `call_expression`/`*_invocation`/`command` for calls, `import|use|include|...` for
12
+ // imports), with a small OUTLIERS table for the few grammars that don't (Ruby,
13
+ // Elixir, Lua, Elm, …). Static AST is the RIGHT tool here (unlike prose): code has a
14
+ // formal grammar, so extraction is exact. Grammars are an optionalDependency; if they
15
+ // fail to load we degrade to an empty extraction (never crash).
16
+ // Re-exported from code-extractor.ts to preserve original import paths.
17
+
18
+ import { fileURLToPath } from "node:url"
19
+ import type { Extraction, ExtractedEntity, ExtractedRelation, KnowledgeExtractor } from "./types"
20
+ import { slugify } from "./text-hygiene"
21
+
22
+ // NB: this module lives one directory deeper than the original code-extractor.ts, so
23
+ // the relative path to node_modules gains one extra `../` to resolve identically.
24
+ const WASM_DIR = fileURLToPath(new URL("../../../node_modules/tree-sitter-wasms/out/", import.meta.url))
25
+
26
+ // All 36 grammars shipped by tree-sitter-wasms (grammar name → wasm file stem).
27
+ const GRAMMARS = [
28
+ "bash", "c", "c_sharp", "cpp", "css", "dart", "elisp", "elixir", "elm", "embedded_template",
29
+ "go", "html", "java", "javascript", "json", "kotlin", "lua", "objc", "ocaml", "php",
30
+ "python", "ql", "rescript", "ruby", "rust", "scala", "solidity", "swift", "systemrdl",
31
+ "tlaplus", "toml", "tsx", "typescript", "vue", "yaml", "zig",
32
+ ] as const
33
+ type Lang = (typeof GRAMMARS)[number]
34
+ const GRAMMAR_SET = new Set<string>(GRAMMARS)
35
+
36
+ // File extension → grammar. Covers every supported language.
37
+ const EXT_TO_LANG: Record<string, Lang> = {
38
+ sh: "bash", bash: "bash", zsh: "bash",
39
+ c: "c", h: "c",
40
+ cs: "c_sharp",
41
+ cpp: "cpp", cc: "cpp", cxx: "cpp", hpp: "cpp", hxx: "cpp", hh: "cpp",
42
+ css: "css", scss: "css",
43
+ dart: "dart",
44
+ el: "elisp", emacs: "elisp",
45
+ ex: "elixir", exs: "elixir",
46
+ elm: "elm",
47
+ erb: "embedded_template", ejs: "embedded_template",
48
+ go: "go",
49
+ html: "html", htm: "html",
50
+ java: "java",
51
+ js: "javascript", mjs: "javascript", cjs: "javascript", jsx: "javascript",
52
+ json: "json",
53
+ kt: "kotlin", kts: "kotlin",
54
+ lua: "lua",
55
+ m: "objc", mm: "objc",
56
+ ml: "ocaml", mli: "ocaml",
57
+ php: "php",
58
+ py: "python", pyi: "python",
59
+ ql: "ql",
60
+ res: "rescript",
61
+ rb: "ruby",
62
+ rs: "rust",
63
+ scala: "scala", sc: "scala",
64
+ sol: "solidity",
65
+ swift: "swift",
66
+ rdl: "systemrdl",
67
+ tla: "tlaplus",
68
+ toml: "toml",
69
+ tsx: "tsx",
70
+ ts: "typescript", mts: "typescript", cts: "typescript",
71
+ vue: "vue",
72
+ yaml: "yaml", yml: "yaml",
73
+ zig: "zig",
74
+ }
75
+
76
/** What a tree-sitter node contributes to the code graph. */
type Kind = "func" | "class" | "call" | "import"

// Generic, convention-based classification of a tree-sitter node type. Tree-sitter
// grammars vary (`function_declaration`, `function_item`, `function_definition_statement`,
// …) so we match a KEYWORD plus a DEF-SUFFIX rather than enumerate every combination.
const IMPORT_RE = /^(import|include|use|using|require|package|open|with|load)(_|$)/
const CALL_RE = /(^|_)(call|invocation|command)(_expression|_statement)?$/
const CLASS_KEY = /(^|_)(class|struct|interface|trait|enum|contract|object|protocol|module|impl|type|record|union|actor|mixin|component)(_|$)/
const FUNC_KEY = /(^|_)(function|method|func|fn|constructor|subroutine|procedure|def|getter|setter)(_|$)/
const DEF_SUFFIX = /(declaration|definition|specifier|item|signature|spec|statement|binding)/

// Node types that START with an import keyword but are not imports: context-manager
// / scoping constructs (`with open(f) as fh:`, JS `with (obj) {}`, C# `using (...) {}`)
// and C# record `with` expressions. Without this exclusion IMPORT_RE turns them into
// bogus MODULE entities such as `module:open`.
const NOT_IMPORT = new Set(["with_statement", "with_clause", "with_item", "with_expression", "using_statement"])

// Outliers whose node names carry no DEF-SUFFIX keyword. NB: bare "module" is Ruby's
// `module M` namespace - but it's also Python's FILE ROOT node type, so we must never
// classify the root (handled by starting the walk at the root's children).
const EXTRA_CLASS = new Set(["class", "module", "singleton_class", "object_declaration", "protocol_declaration"])
const EXTRA_FUNC = new Set(["method", "singleton_method", "let_binding", "value_declaration", "function_declaration_left", "defun", "macro_definition"])
const EXTRA_CALL = new Set(["function_call", "member_call_expression", "scoped_call_expression", "command_call", "method_call"])

/**
 * Classify a tree-sitter node type name.
 *
 * Checks run in priority order: import, call, class, func. A class/func match needs
 * both a keyword and a definition suffix, or membership in the matching EXTRA_* set.
 *
 * @param t tree-sitter node type, e.g. `function_declaration`
 * @returns the node's kind, or null when the node is not graph-relevant
 */
function classify(t: string): Kind | null {
  if (t === "preproc_include") return "import"
  if (!NOT_IMPORT.has(t) && IMPORT_RE.test(t)) return "import"
  if (CALL_RE.test(t) || EXTRA_CALL.has(t)) return "call"
  if ((CLASS_KEY.test(t) && DEF_SUFFIX.test(t)) || EXTRA_CLASS.has(t)) return "class"
  if ((FUNC_KEY.test(t) && DEF_SUFFIX.test(t)) || EXTRA_FUNC.has(t)) return "func"
  return null
}
102
+
103
+ // Elixir (and similar Lisp-y grammars) express definitions AS macro calls - `def`,
104
+ // `defmodule`, etc. are `call` nodes whose head identifier is the macro name.
105
+ const DEF_MACROS = new Set(["def", "defp", "defmacro", "defmacrop", "defmodule", "defprotocol", "defimpl", "defstruct", "defn", "defun", "defmethod", "defclass"])
106
+ const MODULE_MACROS = new Set(["defmodule", "defprotocol", "defimpl", "defclass"])
107
+
108
+ const C_EXTRACTED = 1.0 // call resolves to a symbol we defined here
109
+ const C_INFERRED = 0.7 // call to an unknown/external symbol
110
+
111
+ let parserInit: Promise<void> | null = null
112
+ const langCache = new Map<string, unknown>()
113
+
114
/**
 * Resolve the tree-sitter Language object for a grammar name.
 *
 * The runtime is initialised once (shared `parserInit` promise) and every loaded
 * grammar is memoised in `langCache`. Any failure - missing package, missing wasm
 * file, unknown grammar - resolves to null so callers can degrade to an empty
 * extraction instead of throwing.
 */
async function loadLang(lang: string): Promise<unknown | null> {
  try {
    const TS = (await import("web-tree-sitter")).default as any
    if (parserInit === null) {
      parserInit = TS.init()
    }
    await parserInit

    if (langCache.has(lang)) return langCache.get(lang)
    if (!GRAMMAR_SET.has(lang)) return null

    const wasmPath = WASM_DIR + `tree-sitter-${lang}.wasm`
    const loaded = await TS.Language.load(wasmPath)
    langCache.set(lang, loaded)
    return loaded
  } catch {
    // grammars unavailable (e.g. compiled binary) → graceful no-op
    return null
  }
}
128
+
129
/**
 * Tree-sitter backed extractor: turns one source file into FILE / CLASS / FUNCTION /
 * METHOD / MODULE entities plus `defines`, `has_method`, `imports` and `calls`
 * relations, using the generic node classification from `classify`.
 */
export class CodeExtractor implements KnowledgeExtractor {
  /** All languages this extractor can parse. */
  static languages(): readonly string[] {
    return GRAMMARS
  }

  /** Map a filename to a supported language, or null. */
  static detectLanguage(name?: string): string | null {
    if (!name) return null
    const ext = name.toLowerCase().split(".").pop() ?? ""
    // EXT_TO_LANG is a plain object: without an own-property check an "extension"
    // such as `constructor` or `__proto__` resolves to an inherited Object.prototype
    // member instead of null.
    if (!Object.prototype.hasOwnProperty.call(EXT_TO_LANG, ext)) return null
    return EXT_TO_LANG[ext] ?? null
  }

  /** Implements the generic extractor interface. Uses `meta.language` if given, else
   * detects from `meta.name` (the filename). Returns empty for non-code / unknown. */
  async extract(text: string, meta?: { name?: string; language?: string }): Promise<Extraction> {
    const lang = meta?.language ?? CodeExtractor.detectLanguage(meta?.name)
    if (!lang) return { entities: [], relations: [] }
    return this.extractCode(text, lang, meta?.name ?? "source")
  }

  /** Parse one source file into code entities + relations. Returns empty (never
   * throws) if the grammar can't load, its ABI is incompatible with the runtime, or
   * the parser produces no tree. */
  async extractCode(code: string, lang: string, fileName = "source"): Promise<Extraction> {
    const Language = await loadLang(lang)
    if (!Language) return { entities: [], relations: [] }

    let parser: any = null
    let tree: any = null
    try {
      try {
        const TS = (await import("web-tree-sitter")).default as any
        parser = new TS()
        parser.setLanguage(Language) // throws on ABI mismatch → caught → graceful empty
        tree = parser.parse(code)
      } catch {
        return { entities: [], relations: [] }
      }
      if (!tree) return { entities: [], relations: [] }
      return CodeExtractor.buildGraph(tree.rootNode, fileName)
    } finally {
      // Parsers and trees live on the WASM heap and are not garbage-collected;
      // release them explicitly or every extraction leaks memory.
      tree?.delete?.()
      parser?.delete?.()
    }
  }

  /** Walk a parsed tree and collect the entities / relations for one file. */
  private static buildGraph(root: any, fileName: string): Extraction {
    const entities = new Map<string, ExtractedEntity>()
    const relations = new Map<string, ExtractedRelation>()
    const defined = new Set<string>() // symbol ids we actually define here

    const fileId = `file:${slugify(fileName)}`
    entities.set(fileId, { id: fileId, label: fileName, type: "FILE" })

    const symId = (name: string) => `sym:${slugify(name)}`
    // Placeholder entity (e.g. a call target): never overwrites an existing entry.
    const addEnt = (id: string, label: string, type: string) => {
      if (!entities.has(id)) entities.set(id, { id, label, type })
    }
    // Definition: authoritative. Replaces a placeholder created by an earlier CALL to
    // the same symbol (which would otherwise pin the type to "FUNCTION"); among
    // several definitions of one name the first still wins.
    const define = (id: string, label: string, type: string) => {
      if (!defined.has(id)) entities.set(id, { id, label, type })
      defined.add(id)
    }
    const addRel = (from: string, to: string, type: string, confidence: number) => {
      const key = `${from}|${to}|${type}`
      if (!relations.has(key)) relations.set(key, { from, to, type, confidence })
    }

    const IDENT_RE = /(identifier|name|word|constant|type_identifier|field_identifier)/
    const NAME_SKIP_RE = /parameter|argument|body|block/
    const IMPORT_KEYWORD_RE = /^[^A-Za-z0-9_./@]*(import|from|use|using|require|include|package|open|with|load)\s+/i

    // Find a definition's name: prefer the `name` field, else a bounded breadth-first
    // search for the first identifier-like token (skipping parameter lists / bodies),
    // so it works across grammars.
    const nameOf = (node: any): string | null => {
      const named = node.childForFieldName?.("name")
      if (named?.text) return named.text
      const queue: any[] = [node]
      let visited = 0
      while (queue.length && visited < 400) {
        const cur = queue.shift()
        visited++
        for (let i = 0; i < cur.namedChildCount; i++) {
          const c = cur.namedChild(i)
          if (NAME_SKIP_RE.test(c.type)) continue
          if (IDENT_RE.test(c.type)) return c.text
          queue.push(c)
        }
      }
      return null
    }
    // Last path segment of a qualified name (`a.b.c`, `a::b`, `a->b`).
    const lastSegment = (text: string): string | null => {
      const parts = text.split(/[.:>-]+/).filter(Boolean)
      return parts[parts.length - 1] || null
    }
    // The first identifier-like token in preorder, skipping argument subtrees - for a
    // call node that's the callee; for an Elixir def-macro that's the macro name.
    const firstIdent = (node: any): string | null => {
      for (let i = 0; i < node.namedChildCount; i++) {
        const c = node.namedChild(i)
        if (/argument/.test(c.type)) continue
        if (IDENT_RE.test(c.type)) return c.text
        const deep = firstIdent(c)
        if (deep) return deep
      }
      return null
    }
    const calleeOf = (node: any): string | null => {
      const f = node.childForFieldName?.("function") || node.childForFieldName?.("name") || node.childForFieldName?.("method")
      if (f?.text) return lastSegment(f.text)
      const id = firstIdent(node)
      return id ? lastSegment(id) : null
    }
    // Elixir: name defined by a def-macro call - in its arguments, the first module
    // alias or the head identifier of the inner call (e.g. `def greet(n)` → greet).
    const elixirDefName = (node: any): string | null => {
      let scope = node
      for (let i = 0; i < node.namedChildCount; i++) if (node.namedChild(i).type === "arguments") scope = node.namedChild(i)
      for (let i = 0; i < scope.namedChildCount; i++) {
        const c = scope.namedChild(i)
        if (c.type === "alias") return c.text
        if (c.type === "call") return firstIdent(c)
        if (IDENT_RE.test(c.type)) return c.text
      }
      return null
    }

    const walk = (node: any, enclosing: string, enclosingClass: string | null): void => {
      let nextEnclosing = enclosing
      let nextClass = enclosingClass
      let descend = true
      const kind = classify(node.type)

      if (kind === "class") {
        const nm = nameOf(node)
        if (nm) {
          const id = symId(nm)
          define(id, nm, "CLASS")
          addRel(fileId, id, "defines", C_EXTRACTED)
          nextEnclosing = id
          nextClass = id
        }
      } else if (kind === "func") {
        const nm = nameOf(node)
        if (nm) {
          const id = symId(nm)
          define(id, nm, enclosingClass ? "METHOD" : "FUNCTION")
          if (enclosingClass) addRel(enclosingClass, id, "has_method", C_EXTRACTED)
          else addRel(fileId, id, "defines", C_EXTRACTED)
          nextEnclosing = id
        }
      } else if (kind === "import") {
        // Prefer the grammar's explicit module field (JS/TS/Solidity `source`, Python
        // `module_name`): the first token after the keyword is the imported BINDING in
        // `import { a } from "m"`, not the module. Fall back to the textual heuristic.
        const source = node.childForFieldName?.("source") ?? node.childForFieldName?.("module_name")
        const raw: string = source?.text ? source.text : node.text.replace(IMPORT_KEYWORD_RE, "")
        const mod = raw.split(/[\s;(){}]/)[0]?.replace(/["'`<>]/g, "")
        if (mod && mod.length > 1) {
          const id = `module:${slugify(mod)}`
          addEnt(id, mod, "MODULE")
          addRel(fileId, id, "imports", C_EXTRACTED)
        }
        // The module is resolved; child nodes such as `import_specifier` would only
        // add phantom modules named after the imported symbols.
        if (source?.text) descend = false
      } else if (kind === "call") {
        // Elixir-style: a call to a def-macro is actually a DEFINITION.
        const head = firstIdent(node)
        if (head && DEF_MACROS.has(head)) {
          const nm = elixirDefName(node)
          if (nm && nm.length > 1) {
            const id = symId(nm)
            const isMod = MODULE_MACROS.has(head)
            define(id, nm, isMod ? "CLASS" : enclosingClass ? "METHOD" : "FUNCTION")
            if (isMod) {
              addRel(fileId, id, "defines", C_EXTRACTED)
              nextClass = id
            } else if (enclosingClass) addRel(enclosingClass, id, "has_method", C_EXTRACTED)
            else addRel(fileId, id, "defines", C_EXTRACTED)
            nextEnclosing = id
          }
        } else {
          const callee = calleeOf(node)
          if (callee && callee.length > 1) {
            const id = symId(callee)
            addEnt(id, callee, "FUNCTION")
            addRel(enclosing, id, "calls", defined.has(id) ? C_EXTRACTED : C_INFERRED)
          }
        }
      }

      if (!descend) return
      for (let i = 0; i < node.namedChildCount; i++) walk(node.namedChild(i), nextEnclosing, nextClass)
    }

    // Start at the root's CHILDREN - never classify the root itself (e.g. Python's
    // root node type is "module", which would otherwise look like a class).
    for (let i = 0; i < root.namedChildCount; i++) walk(root.namedChild(i), fileId, null)

    // Second pass: a call to a symbol that turned out to be locally defined (possibly
    // seen before its definition) is promoted to EXTRACTED confidence.
    for (const r of relations.values()) if (r.type === "calls" && defined.has(r.to)) r.confidence = C_EXTRACTED

    return { entities: [...entities.values()], relations: [...relations.values()] }
  }
}
@@ -0,0 +1,63 @@
1
+ // Deterministic (no-LLM, no-deps) knowledge extractor. Pulls capitalized phrases
2
+ // and acronyms as entities and relates entities that co-occur in a sentence-ish
3
+ // unit. Runs fully offline. Re-exported from extract.ts to preserve import paths.
4
+
5
+ import type { Extraction, ExtractedEntity, ExtractedRelation, KnowledgeExtractor } from "./types"
6
+
7
/** `KnowledgeExtractor` adapter over the synchronous `extractKnowledge` function. */
export class DeterministicExtractor implements KnowledgeExtractor {
  /** Extract entities and co-occurrence relations from `text`; no I/O is performed. */
  async extract(text: string): Promise<Extraction> {
    const extraction = extractKnowledge(text)
    return extraction
  }
}
12
+
13
// Capitalized function words that are never entities on their own and must not be
// glued onto the front of one when they merely start a sentence ("The Acme Corp").
const STOP = new Set([
  "The", "A", "An", "This", "That", "These", "Those", "It", "Our", "Your", "Their", "We", "You",
  "I", "He", "She", "They", "And", "Or", "But", "For", "With", "From", "To", "Of", "In", "On",
  "At", "By", "As", "Is", "Are", "Was", "Were", "Be", "Will", "Each", "All", "Layer", "Step",
  "Both", "Also", "Use", "Used", "Using", "Here", "There", "When", "Then", "Now",
])

/** Lower-case, collapse every non-alphanumeric run to "-", trim edge dashes. */
function slug(s: string): string {
  return s.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "")
}

// Capitalized phrases (e.g. "100X Prompt Pro", "Fine-Tuned") and acronyms ("SOC-II", "FP8").
// The acronym alternative allows trailing digits: without `[0-9]*` the closing `\b`
// fails between "P" and "8", so "FP8" / "GPT4" could never match.
const PHRASE = /\b(?:[0-9]+[A-Za-z]+|[A-Z][a-z0-9]+|[A-Z]{2,}[0-9]*(?:-[A-Z0-9]+)?)(?:[ -](?:[A-Z][A-Za-z0-9]+|[A-Z]{2,}(?:-[A-Z0-9]+)?|[0-9]+))*\b/g

/** Drop space-separated STOP words from the front of a matched phrase. */
function stripLeadingStops(label: string): string {
  let rest = label
  for (;;) {
    const sp = rest.indexOf(" ")
    if (sp < 0 || !STOP.has(rest.slice(0, sp))) return rest
    rest = rest.slice(sp + 1)
  }
}

/**
 * Entity candidates in one sentence-ish unit, de-duplicated by slug id, in order of
 * first appearance.
 *
 * @param line a single line / sentence of text
 * @returns entities typed ACRONYM (all caps / digits / dashes) or CONCEPT
 */
function candidates(line: string): ExtractedEntity[] {
  const out: ExtractedEntity[] = []
  const seen = new Set<string>()
  for (const m of line.matchAll(PHRASE)) {
    // A sentence-initial stop word would otherwise give the same concept a different
    // id ("the-acme-corp" vs "acme-corp"), so the two mentions would never merge.
    const label = stripLeadingStops(m[0].trim())
    if (label.length < 2) continue
    if (STOP.has(label)) continue
    // only reachable after stripping: what remains is a bare number ("Step 100")
    if (!/[A-Za-z]/.test(label)) continue
    // single short common capitalized word at sentence start → skip noise
    if (!label.includes(" ") && !label.includes("-") && label.length < 3) continue
    const id = slug(label)
    if (!id || seen.has(id)) continue
    seen.add(id)
    out.push({ id, label, type: /^[A-Z0-9-]+$/.test(label) && label === label.toUpperCase() ? "ACRONYM" : "CONCEPT" })
  }
  return out
}
43
+
44
/**
 * Deterministic extraction: split `text` into sentence-ish units (on newlines, `.`
 * and `;`), collect the entity candidates of each unit, and link entities that
 * co-occur in a unit with an undirected `relates_to` edge (endpoints sorted by id).
 * Each entity is paired with at most the next five in its unit to bound the edge count.
 */
export function extractKnowledge(text: string): Extraction {
  const entityById = new Map<string, ExtractedEntity>()
  const relationByPair = new Map<string, ExtractedRelation>()
  const WINDOW = 6 // an entity pairs with followers at offsets 1..5

  for (const rawUnit of text.split(/[\n.;]+/)) {
    const unit = rawUnit.trim()
    if (!unit) continue

    const found = candidates(unit)
    found.forEach((entity) => {
      if (!entityById.has(entity.id)) entityById.set(entity.id, entity)
    })

    found.forEach((left, i) => {
      for (const right of found.slice(i + 1, i + WINDOW)) {
        const [a, b] = [left.id, right.id].sort()
        if (a === b) continue
        relationByPair.set(`${a}|${b}`, { from: a, to: b, type: "relates_to" })
      }
    })
  }

  return { entities: [...entityById.values()], relations: [...relationByPair.values()] }
}
@@ -0,0 +1,151 @@
1
+ // LLM-backed knowledge extraction. Calls an OpenAI-compatible chat endpoint -
2
+ // point it at a LOCAL/sovereign model (vLLM/SGLang/Ollama) so nothing leaves the
3
+ // building. Unlike the deterministic extractor, it handles casual lowercase text
4
+ // ("i love lavanya" → Lavanya[PERSON], user -loves→ Lavanya).
5
+ // Re-exported from llm-extractor.ts to preserve original import paths.
6
+
7
+ import type { Extraction, ExtractedEntity, ExtractedRelation, KnowledgeExtractor, QuestionIntent } from "./types"
8
+ import { slugify } from "./text-hygiene"
9
+
10
+ export interface LlmExtractorConfig {
11
+ endpoint: string // OpenAI-compatible base, e.g. http://localhost:8000
12
+ model: string
13
+ apiKey?: string
14
+ fetchImpl?: typeof fetch
15
+ }
16
+
17
+ // Typed-triple extraction: the "code-like" structure - subject/object carry types,
18
+ // the predicate is a real verb, plus a confidence. (Per research: enhancing the LLM
19
+ // extractor beats REBEL/GLiNER/etc. for our TS stack - typed, no deps, no big models.)
20
+ const SYSTEM = [
21
+ "You extract knowledge-graph TRIPLES from the user's text.",
22
+ "Return ONLY a JSON object, no prose:",
23
+ '{"triples":[{"subject":"<entity>","subjectType":"PERSON|ORG|PLACE|PRODUCT|CONCEPT|OTHER",',
24
+ '"predicate":"<short verb>","object":"<entity>","objectType":"PERSON|ORG|PLACE|PRODUCT|CONCEPT|OTHER","confidence":0.0-1.0}]}',
25
+ "Cover people, orgs, places, products, key concepts - INCLUDING casual lowercase text.",
26
+ 'Example: "i love lavanya" → {"subject":"user","subjectType":"PERSON","predicate":"loves","object":"Lavanya","objectType":"PERSON","confidence":0.95}.',
27
+ "Be concise; only meaningful triples.",
28
+ ].join(" ")
29
+
30
+ const INTENT_SYSTEM = [
31
+ "Parse the user's QUESTION into a structured intent. Return ONLY JSON:",
32
+ '{"type":"entity_lookup|relation_query|binary_relation","subject":"<noun or null>","predicate":"<verb or null>","object":"<noun or null>"}',
33
+ 'Examples: "who do I love?" → {"type":"relation_query","subject":"user","predicate":"loves","object":null};',
34
+ '"does Lavanya work at Acme?" → {"type":"binary_relation","subject":"Lavanya","predicate":"works_at","object":"Acme"};',
35
+ '"what is Pro?" → {"type":"entity_lookup","subject":null,"predicate":null,"object":"Pro"}.',
36
+ 'Map first-person ("I","me","my") to subject "user".',
37
+ ].join(" ")
38
+
39
/**
 * Parse a model reply as JSON. Tries the whole string first; if that fails, parses
 * the span from the first `{` to the last `}` to tolerate code fences / stray text.
 *
 * @throws Error("LLM did not return JSON") when no `{...}` span exists
 * @throws SyntaxError when the extracted span is not valid JSON
 */
function parseJsonBlock(s: string): any {
  try {
    return JSON.parse(s)
  } catch {
    // not pure JSON - fall through to the embedded-object attempt
  }
  const embedded = /\{[\s\S]*\}/.exec(s)
  if (embedded === null) throw new Error("LLM did not return JSON")
  return JSON.parse(embedded[0])
}
48
+
49
/**
 * Knowledge extractor backed by an OpenAI-compatible `/v1/chat/completions` endpoint.
 * Model output is untrusted: every field is type-checked before use, so a malformed
 * reply degrades to fewer triples instead of a TypeError.
 */
export class LlmExtractor implements KnowledgeExtractor {
  private readonly fetch: typeof fetch

  constructor(private readonly cfg: LlmExtractorConfig) {
    const impl = cfg.fetchImpl ?? fetch
    // Wrap instead of storing the function directly: `this.fetch(...)` would call the
    // global fetch with this extractor as receiver, which throws "Illegal invocation"
    // on runtimes that check the receiver (browsers, edge workers).
    this.fetch = ((...args: Parameters<typeof fetch>) => impl(...args)) as typeof fetch
  }

  /** Send one system + user exchange; resolves to the reply text, or "" if absent. */
  private async chat(system: string, user: string): Promise<string> {
    const res = await this.fetch(`${this.cfg.endpoint.replace(/\/$/, "")}/v1/chat/completions`, {
      method: "POST",
      headers: {
        "content-type": "application/json",
        ...(this.cfg.apiKey ? { authorization: `Bearer ${this.cfg.apiKey}` } : {}),
      },
      body: JSON.stringify({
        model: this.cfg.model,
        temperature: 0,
        messages: [
          { role: "system", content: system },
          { role: "user", content: user },
        ],
      }),
    })
    const body = (await res.json()) as { choices?: Array<{ message?: { content?: unknown } }> } | null
    const content = body?.choices?.[0]?.message?.content
    return typeof content === "string" ? content : ""
  }

  /**
   * Extract typed triples from `text`.
   *
   * @returns entities de-duplicated by slug id (first label / type wins) and one
   *   relation per usable triple; empty when the model returns no content
   * @throws when the request fails or the reply contains no parseable JSON
   */
  async extract(text: string): Promise<Extraction> {
    const content = await this.chat(SYSTEM, text)
    if (!content) return { entities: [], relations: [] }

    const parsed: unknown = parseJsonBlock(content)
    const rawTriples = typeof parsed === "object" && parsed !== null ? (parsed as { triples?: unknown }).triples : undefined
    const triples: Array<Record<string, unknown>> = Array.isArray(rawTriples)
      ? rawTriples.filter((t): t is Record<string, unknown> => typeof t === "object" && t !== null)
      : []

    // Models occasionally emit numbers (years, counts) where a string is expected.
    const str = (v: unknown): string => (typeof v === "string" ? v.trim() : typeof v === "number" ? String(v) : "")

    const entities: ExtractedEntity[] = []
    const seen = new Set<string>()
    const addEntity = (label: unknown, type: unknown) => {
      const l = str(label)
      if (!l) return
      const id = slugify(l)
      if (!id || seen.has(id)) return
      seen.add(id)
      entities.push({ id, label: l, type: (str(type) || "CONCEPT").toUpperCase() })
    }
    for (const t of triples) {
      addEntity(t.subject, t.subjectType)
      addEntity(t.object, t.objectType)
    }

    const relations: ExtractedRelation[] = []
    for (const t of triples) {
      const from = slugify(str(t.subject))
      const to = slugify(str(t.object))
      if (!from || !to || from === to || !seen.has(from) || !seen.has(to)) continue
      const type = (str(t.predicate) || "relates_to").toLowerCase().replace(/\s+/g, "_")
      // keep only a finite confidence, clamped into the documented 0..1 range
      const confidence =
        typeof t.confidence === "number" && Number.isFinite(t.confidence) ? Math.min(1, Math.max(0, t.confidence)) : undefined
      relations.push({ from, to, type, confidence })
    }
    return { entities, relations }
  }

  /**
   * Parse a natural-language QUESTION into a graph-query intent (for KGQA).
   *
   * @returns the intent, or null when the model returns nothing, non-JSON, or an
   *   unknown `type`. JSON `null` / blank fields are returned as undefined so the
   *   result matches the `QuestionIntent` type.
   */
  async parseQuestionIntent(question: string): Promise<QuestionIntent | null> {
    const content = await this.chat(INTENT_SYSTEM, question)
    if (!content) return null
    try {
      const p: unknown = parseJsonBlock(content)
      if (typeof p !== "object" || p === null) return null
      const rec = p as Record<string, unknown>
      const type = typeof rec.type === "string" ? rec.type.trim().toLowerCase() : ""
      if (type !== "entity_lookup" && type !== "relation_query" && type !== "binary_relation") return null
      const field = (v: unknown): string | undefined => (typeof v === "string" && v.trim() ? v.trim() : undefined)
      return { type, subject: field(rec.subject), predicate: field(rec.predicate), object: field(rec.object) }
    } catch {
      return null
    }
  }
}
129
+
130
/**
 * Run two extractors and merge their output. Used to combine deterministic + LLM
 * for best recall.
 *
 * - Entities are de-duplicated by id; the first extractor's version wins.
 * - Relations are de-duplicated by unordered endpoint pair. The first relation seen
 *   wins, EXCEPT that a generic `relates_to` co-occurrence edge is replaced by a
 *   typed edge for the same pair - otherwise the deterministic extractor's generic
 *   edge would shadow the LLM's verb and direction.
 * - An extractor that rejects is ignored; the other one's result is still returned.
 */
export class HybridExtractor implements KnowledgeExtractor {
  constructor(
    private readonly a: KnowledgeExtractor,
    private readonly b: KnowledgeExtractor,
  ) {}

  /**
   * @param text raw text to extract from
   * @param meta optional filename / language hint, forwarded unchanged to both
   *   extractors (text extractors ignore it)
   */
  async extract(text: string, meta?: { name?: string; language?: string }): Promise<Extraction> {
    const GENERIC = "relates_to"
    const results = await Promise.allSettled([this.a.extract(text, meta), this.b.extract(text, meta)])
    const entities = new Map<string, ExtractedEntity>()
    const relations = new Map<string, ExtractedRelation>()
    for (const r of results) {
      if (r.status !== "fulfilled") continue
      for (const e of r.value.entities) if (!entities.has(e.id)) entities.set(e.id, e)
      for (const rel of r.value.relations) {
        const key = [rel.from, rel.to].sort().join("|")
        const existing = relations.get(key)
        if (!existing || (existing.type === GENERIC && rel.type !== GENERIC)) relations.set(key, rel)
      }
    }
    return { entities: [...entities.values()], relations: [...relations.values()] }
  }
}
@@ -0,0 +1,54 @@
1
+ // Text-hygiene utilities: slug/clean-line normalization and web-boilerplate
2
+ // stripping. Used by the extractors and (via re-export from extract.ts) across
3
+ // the codebase. Re-exported from extract.ts to preserve original import paths.
4
+
5
/**
 * Normalize free text to a slug: lower-case it, turn every run of characters outside
 * `a-z0-9` into a single "-", then strip dashes from both ends.
 */
export function slugify(s: string): string {
  const lowered = s.toLowerCase()
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-")
  return dashed.replace(/^-+|-+$/g, "")
}
8
+
9
+ // Knowledge-graph entities share the `nodes` table with principals (users/orgs/groups)
10
+ // and records. Their ids are slugs of free text, so without isolation an ingested
11
+ // document that merely MENTIONS a word matching a principal id would overwrite that
12
+ // principal's node (INSERT OR REPLACE) and silently corrupt the ACL graph. Entity ids
13
+ // live in their own namespace so the collision is impossible - and every writer AND
14
+ // resolver must agree on the scheme, so it is defined ONCE here.
15
/** Namespace prefix carried by every knowledge-graph entity id. */
export const ENTITY_PREFIX = "entity:"

/** Build the namespaced node id for an entity slug. */
export const entityId = (slug: string): string => `${ENTITY_PREFIX}${slug}`
17
+
18
+ // Strip markdown emphasis / heading / blockquote / bullet noise from a line of
19
+ // (often scraped) text, so stored + returned snippets are clean prose - no `**`,
20
+ // `#`, `>` or leftover bullet markers leaking into answers.
21
/**
 * Strip markdown noise from one line and collapse its whitespace.
 *
 * Removes `**` / `__` / backtick markers anywhere, then a leading heading marker,
 * a leading blockquote marker, and leading list markers. A list marker is only
 * stripped when it really is one: `-` / `*` followed by whitespace (or ending the
 * line), a `•▪◦` glyph, or digits followed by `.` / `)` and whitespace. Leading
 * numbers that are content ("100X Prompt Pro", "3.5 GHz", "2024 was …", "-5 °C")
 * are preserved.
 */
export function cleanLine(s: string): string {
  return s
    .replace(/\*\*|__|`+/g, "") // bold / code markers
    .replace(/^\s*#{1,6}\s+/, "") // heading hashes
    .replace(/^\s*>+\s*/, "") // blockquote
    .replace(/^\s*(?:[-*]+(?:\s+|$)|[•▪◦]+\s*|\d+[.)](?:\s+|$))+/, "") // leading bullets / numbering
    .replace(/\s+/g, " ")
    .trim()
}
30
+
31
+ // Web boilerplate (cookie banners, nav menus, subscribe CTAs) that pollutes scraped
32
+ // pages and would otherwise become junk entities / noisy chunks. Strong multi-word
33
+ // phrases always drop; short EXACT nav tokens drop; real sentences are preserved.
34
// Multi-word boilerplate phrases: a line containing any of them, anywhere, is dropped.
const STRONG_BOILER =
  /(manage cookie consent|cookie consent|consent banner|cookie policy|privacy policy|gdpr.?compliant|opt.?out|skip to content|view preferences|subscribe now|consenting to these technologies|withdrawing consent|adversely affect certain|join our community|all our premium content|delivered straight to your inbox|post a press release|reach our audience|editorial opportunities)/i
// Short navigation tokens: dropped only when they are the ENTIRE (trimmed) line.
const NAV_TOKENS =
  /^(accept|deny|subscribe|search|search\.\.\.|view preferences|view all|view all latest|see all|click here|sign in|log in|menu|home|contact us|newsletter|categories|events|resources|more|explore|explore all|explore more|applications|industries|news|search\b)$/i

/**
 * True when `line` should be discarded: it is blank, contains a strong boilerplate
 * phrase, or consists solely of a navigation token. Matching is case-insensitive.
 */
export function isBoilerplate(line: string): boolean {
  const trimmed = line.trim()
  return trimmed === "" || STRONG_BOILER.test(trimmed) || NAV_TOKENS.test(trimmed)
}
46
+
47
/**
 * Drop boilerplate lines from a block of (scraped) text before chunking/extraction.
 *
 * Blank lines are kept as paragraph separators - `isBoilerplate` reports them as
 * boilerplate, so filtering on it alone would erase every paragraph break and make
 * the blank-run collapse below unreachable. Runs of blank lines left behind by
 * removed boilerplate are collapsed to a single empty line.
 */
export function stripBoilerplate(text: string): string {
  return text
    .split(/\r?\n/)
    .filter((l) => l.trim() === "" || !isBoilerplate(l))
    .map((l) => (l.trim() === "" ? "" : l)) // whitespace-only lines count as blank
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
}
@@ -0,0 +1,34 @@
1
+ // Shared knowledge-extraction types. Re-exported from extract.ts to preserve the
2
+ // original import paths (`import { ExtractedEntity } from "../extract"`).
3
+
4
+ export interface ExtractedEntity {
5
+ id: string // normalized slug (stable across docs → same concept merges)
6
+ label: string // surface form as first seen
7
+ type: string // ACRONYM | CONCEPT | PERSON | ORG | PLACE | PRODUCT | …
8
+ }
9
+ export interface ExtractedRelation {
10
+ from: string // entity id
11
+ to: string // entity id
12
+ type: string // relates_to, or a verb from the LLM (loves, builds, …)
13
+ confidence?: number // 0..1 (LLM-provided); deterministic omits it
14
+ }
15
+
16
+ /** Parsed intent of a question, for graph QA. */
17
+ export interface QuestionIntent {
18
+ type: "entity_lookup" | "relation_query" | "binary_relation"
19
+ subject?: string
20
+ predicate?: string
21
+ object?: string
22
+ }
23
+ export interface Extraction {
24
+ entities: ExtractedEntity[]
25
+ relations: ExtractedRelation[]
26
+ }
27
+
28
+ /** Pluggable extractor - deterministic (offline), LLM-backed (casual prose), or
29
+ * tree-sitter (code). Same Extraction shape, so ingest/rebuild don't care which.
30
+ * `meta` is an optional hint (filename / language) used by the code extractor;
31
+ * text extractors ignore it. */
32
+ export interface KnowledgeExtractor {
33
+ extract(text: string, meta?: { name?: string; language?: string }): Promise<Extraction>
34
+ }