@100xprompt/chitta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +203 -0
- package/assets/rules/claude-md.md +9 -0
- package/assets/skill/SKILL.md +47 -0
- package/package.json +48 -0
- package/src/README.md +124 -0
- package/src/arango-client.ts +67 -0
- package/src/arango-graph-provider.ts +364 -0
- package/src/bin.ts +27 -0
- package/src/config-env.ts +53 -0
- package/src/embedded/authorizer.ts +89 -0
- package/src/embedded/cli.ts +86 -0
- package/src/embedded/code-extractor.ts +9 -0
- package/src/embedded/demo.ts +36 -0
- package/src/embedded/extract.ts +12 -0
- package/src/embedded/extractors/code.ts +308 -0
- package/src/embedded/extractors/deterministic.ts +63 -0
- package/src/embedded/extractors/llm.ts +151 -0
- package/src/embedded/extractors/text-hygiene.ts +54 -0
- package/src/embedded/extractors/types.ts +34 -0
- package/src/embedded/graph/acl-paths.ts +96 -0
- package/src/embedded/graph/adjacency.ts +61 -0
- package/src/embedded/graph/centrality.ts +23 -0
- package/src/embedded/graph/communities.ts +46 -0
- package/src/embedded/graph/cypher.ts +17 -0
- package/src/embedded/graph/impact.ts +24 -0
- package/src/embedded/graph/knowledge-graph.ts +108 -0
- package/src/embedded/graph/pagerank.ts +57 -0
- package/src/embedded/graph/sql-access.ts +13 -0
- package/src/embedded/graph/traversal.ts +73 -0
- package/src/embedded/graph/types.ts +35 -0
- package/src/embedded/graph-query.ts +126 -0
- package/src/embedded/index.ts +171 -0
- package/src/embedded/ingest.ts +262 -0
- package/src/embedded/kgqa/answer-paths.ts +197 -0
- package/src/embedded/kgqa/entity-link.ts +13 -0
- package/src/embedded/kgqa/intent.ts +14 -0
- package/src/embedded/kgqa/predicates.ts +9 -0
- package/src/embedded/kgqa/preference.ts +20 -0
- package/src/embedded/kgqa/select.ts +99 -0
- package/src/embedded/kgqa/text.ts +16 -0
- package/src/embedded/kgqa/types.ts +6 -0
- package/src/embedded/kgqa-service.ts +122 -0
- package/src/embedded/llm-extractor.ts +10 -0
- package/src/embedded/local-embeddings.ts +36 -0
- package/src/embedded/personal.ts +100 -0
- package/src/embedded/reranker.ts +62 -0
- package/src/embedded/retrieval/decay-stage.ts +59 -0
- package/src/embedded/retrieval/diversity.ts +37 -0
- package/src/embedded/retrieval/fuse.ts +52 -0
- package/src/embedded/retrieval/graph-stage.ts +45 -0
- package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
- package/src/embedded/retrieval/keyword-stage.ts +27 -0
- package/src/embedded/retrieval/passage.ts +44 -0
- package/src/embedded/retrieval/rerank-stage.ts +31 -0
- package/src/embedded/retrieval/trace.ts +31 -0
- package/src/embedded/retrieval/vector-stage.ts +15 -0
- package/src/embedded/sqlite-graph-provider.ts +119 -0
- package/src/embedded/sqlite-store.ts +95 -0
- package/src/embedded/sqlite-vec-service.ts +122 -0
- package/src/embedded/store/chunks.ts +61 -0
- package/src/embedded/store/fts.ts +50 -0
- package/src/embedded/store/nodes-edges.ts +112 -0
- package/src/embedded/store/salience.ts +37 -0
- package/src/embedded/store/schema.ts +109 -0
- package/src/embedded/transformers-embeddings.ts +100 -0
- package/src/embeddings.ts +51 -0
- package/src/eval/goldset.ts +46 -0
- package/src/eval/harness.ts +65 -0
- package/src/eval/metrics.ts +38 -0
- package/src/http/server.ts +93 -0
- package/src/index.ts +44 -0
- package/src/install/index.ts +139 -0
- package/src/install/platforms.ts +126 -0
- package/src/install/skill.ts +46 -0
- package/src/install/writers.ts +82 -0
- package/src/mcp/backend.ts +129 -0
- package/src/mcp/server.ts +83 -0
- package/src/mcp/tools/context-about.ts +69 -0
- package/src/mcp/tools/context-graph.ts +23 -0
- package/src/mcp/tools/context-ingest.ts +88 -0
- package/src/mcp/tools/context-rebuild.ts +22 -0
- package/src/mcp/tools/context-relate.ts +88 -0
- package/src/mcp/tools/get-context.ts +52 -0
- package/src/mcp/tools/index.ts +40 -0
- package/src/mcp/tools/types.ts +33 -0
- package/src/permission.ts +72 -0
- package/src/provider.ts +65 -0
- package/src/qdrant-vector.ts +76 -0
- package/src/retrieval.ts +218 -0
- package/src/service.ts +40 -0
- package/src/types.ts +91 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Knowledge extraction - turns raw text into entity nodes + relationship edges so
|
|
2
|
+
// the store becomes a real knowledge graph, not one opaque record. Deterministic
|
|
3
|
+
// (no LLM, no deps) so it runs offline; swap in an LLM extractor later behind the
|
|
4
|
+
// same `Extraction` shape for higher recall.
|
|
5
|
+
//
|
|
6
|
+
// This file is now a thin facade: the implementations live in ./extractors/* and
|
|
7
|
+
// are re-exported here so existing imports (`import { slugify } from "./extract"`)
|
|
8
|
+
// keep resolving unchanged. Public API is preserved exactly.
|
|
9
|
+
|
|
10
|
+
export type { ExtractedEntity, ExtractedRelation, QuestionIntent, Extraction, KnowledgeExtractor } from "./extractors/types"
|
|
11
|
+
export { slugify, entityId, ENTITY_PREFIX, cleanLine, isBoilerplate, stripBoilerplate } from "./extractors/text-hygiene"
|
|
12
|
+
export { DeterministicExtractor, extractKnowledge } from "./extractors/deterministic"
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
// Code → graph extractor (the Graphify capability, ported TS-native). Parses source
|
|
2
|
+
// with tree-sitter (WASM grammars, no Python, no servers) into the SAME entity/edge
|
|
3
|
+
// shape every other extractor produces - so the moment code nodes land, all the rest
|
|
4
|
+
// (ACL, vector recall, bi-temporal edges, context_relate / path / impact / central)
|
|
5
|
+
// works on them for free. This is what makes us a STRICT SUPERSET of Graphify: code
|
|
6
|
+
// graph + NL graph + permissions + vectors + temporal, in one embedded store.
|
|
7
|
+
//
|
|
8
|
+
// ALL 36 tree-sitter grammars are supported. Rather than hand-list node types per
|
|
9
|
+
// grammar (brittle across 36), we CLASSIFY nodes generically: tree-sitter follows
|
|
10
|
+
// strong conventions (`*_declaration`/`*_definition`/`*_item` for defs,
|
|
11
|
+
// `call_expression`/`*_invocation`/`command` for calls, `import|use|include|...` for
|
|
12
|
+
// imports), with a small OUTLIERS table for the few grammars that don't (Ruby,
|
|
13
|
+
// Elixir, Lua, Elm, …). Static AST is the RIGHT tool here (unlike prose): code has a
|
|
14
|
+
// formal grammar, so extraction is exact. Grammars are an optionalDependency; if they
|
|
15
|
+
// fail to load we degrade to an empty extraction (never crash).
|
|
16
|
+
// Re-exported from code-extractor.ts to preserve original import paths.
|
|
17
|
+
|
|
18
|
+
import { fileURLToPath } from "node:url"
|
|
19
|
+
import type { Extraction, ExtractedEntity, ExtractedRelation, KnowledgeExtractor } from "./types"
|
|
20
|
+
import { slugify } from "./text-hygiene"
|
|
21
|
+
|
|
22
|
+
// NB: this module lives one directory deeper than the original code-extractor.ts, so
|
|
23
|
+
// the relative path to node_modules gains one extra `../` to resolve identically.
|
|
24
|
+
const WASM_DIR = fileURLToPath(new URL("../../../node_modules/tree-sitter-wasms/out/", import.meta.url))
|
|
25
|
+
|
|
26
|
+
// All 36 grammars shipped by tree-sitter-wasms (grammar name → wasm file stem).
|
|
27
|
+
const GRAMMARS = [
|
|
28
|
+
"bash", "c", "c_sharp", "cpp", "css", "dart", "elisp", "elixir", "elm", "embedded_template",
|
|
29
|
+
"go", "html", "java", "javascript", "json", "kotlin", "lua", "objc", "ocaml", "php",
|
|
30
|
+
"python", "ql", "rescript", "ruby", "rust", "scala", "solidity", "swift", "systemrdl",
|
|
31
|
+
"tlaplus", "toml", "tsx", "typescript", "vue", "yaml", "zig",
|
|
32
|
+
] as const
|
|
33
|
+
type Lang = (typeof GRAMMARS)[number]
|
|
34
|
+
const GRAMMAR_SET = new Set<string>(GRAMMARS)
|
|
35
|
+
|
|
36
|
+
// File extension → grammar. Covers every supported language.
|
|
37
|
+
const EXT_TO_LANG: Record<string, Lang> = {
|
|
38
|
+
sh: "bash", bash: "bash", zsh: "bash",
|
|
39
|
+
c: "c", h: "c",
|
|
40
|
+
cs: "c_sharp",
|
|
41
|
+
cpp: "cpp", cc: "cpp", cxx: "cpp", hpp: "cpp", hxx: "cpp", hh: "cpp",
|
|
42
|
+
css: "css", scss: "css",
|
|
43
|
+
dart: "dart",
|
|
44
|
+
el: "elisp", emacs: "elisp",
|
|
45
|
+
ex: "elixir", exs: "elixir",
|
|
46
|
+
elm: "elm",
|
|
47
|
+
erb: "embedded_template", ejs: "embedded_template",
|
|
48
|
+
go: "go",
|
|
49
|
+
html: "html", htm: "html",
|
|
50
|
+
java: "java",
|
|
51
|
+
js: "javascript", mjs: "javascript", cjs: "javascript", jsx: "javascript",
|
|
52
|
+
json: "json",
|
|
53
|
+
kt: "kotlin", kts: "kotlin",
|
|
54
|
+
lua: "lua",
|
|
55
|
+
m: "objc", mm: "objc",
|
|
56
|
+
ml: "ocaml", mli: "ocaml",
|
|
57
|
+
php: "php",
|
|
58
|
+
py: "python", pyi: "python",
|
|
59
|
+
ql: "ql",
|
|
60
|
+
res: "rescript",
|
|
61
|
+
rb: "ruby",
|
|
62
|
+
rs: "rust",
|
|
63
|
+
scala: "scala", sc: "scala",
|
|
64
|
+
sol: "solidity",
|
|
65
|
+
swift: "swift",
|
|
66
|
+
rdl: "systemrdl",
|
|
67
|
+
tla: "tlaplus",
|
|
68
|
+
toml: "toml",
|
|
69
|
+
tsx: "tsx",
|
|
70
|
+
ts: "typescript", mts: "typescript", cts: "typescript",
|
|
71
|
+
vue: "vue",
|
|
72
|
+
yaml: "yaml", yml: "yaml",
|
|
73
|
+
zig: "zig",
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
type Kind = "func" | "class" | "call" | "import"
|
|
77
|
+
|
|
78
|
+
// Generic, convention-based classification of a tree-sitter node type. Tree-sitter
|
|
79
|
+
// grammars vary (`function_declaration`, `function_item`, `function_definition_statement`,
|
|
80
|
+
// …) so we match a KEYWORD plus a DEF-SUFFIX rather than enumerate every combination.
|
|
81
|
+
const IMPORT_RE = /^(import|include|use|using|require|package|open|with|load)(_|$)/
|
|
82
|
+
const CALL_RE = /(^|_)(call|invocation|command)(_expression|_statement)?$/
|
|
83
|
+
const CLASS_KEY = /(^|_)(class|struct|interface|trait|enum|contract|object|protocol|module|impl|type|record|union|actor|mixin|component)(_|$)/
|
|
84
|
+
const FUNC_KEY = /(^|_)(function|method|func|fn|constructor|subroutine|procedure|def|getter|setter)(_|$)/
|
|
85
|
+
const DEF_SUFFIX = /(declaration|definition|specifier|item|signature|spec|statement|binding)/
|
|
86
|
+
|
|
87
|
+
// Outliers whose node names carry no DEF-SUFFIX keyword. NB: bare "module" is Ruby's
|
|
88
|
+
// `module M` namespace - but it's also Python's FILE ROOT node type, so we must never
|
|
89
|
+
// classify the root (handled by starting the walk at the root's children).
|
|
90
|
+
const EXTRA_CLASS = new Set(["class", "module", "singleton_class", "object_declaration", "protocol_declaration"])
|
|
91
|
+
const EXTRA_FUNC = new Set(["method", "singleton_method", "let_binding", "value_declaration", "function_declaration_left", "defun", "macro_definition"])
|
|
92
|
+
const EXTRA_CALL = new Set(["function_call", "member_call_expression", "scoped_call_expression", "command_call", "method_call"])
|
|
93
|
+
|
|
94
|
+
// Node types that match IMPORT_RE by prefix but are resource-scoping constructs rather
// than imports: the `with` statement family (Python, JavaScript) and C#'s `using (...)`
// statement. Without this exclusion `with open(path) as f:` is recorded as importing a
// module called "open".
const NOT_IMPORT = new Set(["with_statement", "with_clause", "with_item", "using_statement"])

/**
 * Classify a tree-sitter node type by naming convention.
 *
 * Checks run in priority order: import, call, class-like definition, function-like
 * definition. A definition needs both a KEYWORD match and a DEF-SUFFIX match, or an
 * explicit entry in the outlier sets.
 *
 * @param t tree-sitter node type, e.g. `function_declaration`
 * @returns the node's kind, or null when the node is of no interest to the extractor
 */
function classify(t: string): Kind | null {
  if (t === "preproc_include") return "import"
  if (IMPORT_RE.test(t) && !NOT_IMPORT.has(t)) return "import"
  if (CALL_RE.test(t) || EXTRA_CALL.has(t)) return "call"
  if ((CLASS_KEY.test(t) && DEF_SUFFIX.test(t)) || EXTRA_CLASS.has(t)) return "class"
  if ((FUNC_KEY.test(t) && DEF_SUFFIX.test(t)) || EXTRA_FUNC.has(t)) return "func"
  return null
}
|
|
102
|
+
|
|
103
|
+
// Elixir (and similar Lisp-y grammars) express definitions AS macro calls - `def`,
|
|
104
|
+
// `defmodule`, etc. are `call` nodes whose head identifier is the macro name.
|
|
105
|
+
const DEF_MACROS = new Set(["def", "defp", "defmacro", "defmacrop", "defmodule", "defprotocol", "defimpl", "defstruct", "defn", "defun", "defmethod", "defclass"])
|
|
106
|
+
const MODULE_MACROS = new Set(["defmodule", "defprotocol", "defimpl", "defclass"])
|
|
107
|
+
|
|
108
|
+
const C_EXTRACTED = 1.0 // call resolves to a symbol we defined here
|
|
109
|
+
const C_INFERRED = 0.7 // call to an unknown/external symbol
|
|
110
|
+
|
|
111
|
+
let parserInit: Promise<void> | null = null
|
|
112
|
+
const langCache = new Map<string, unknown>()
|
|
113
|
+
|
|
114
|
+
/**
 * Load (and memoise) the tree-sitter grammar for `lang`.
 *
 * The grammar name is validated BEFORE anything is imported, so an unsupported language
 * costs nothing and can never reach the filesystem path below. The in-flight load itself
 * is cached, so concurrent requests for the same grammar share one WASM load instead of
 * each compiling their own copy. A failed load is removed from the cache so a later call
 * can retry.
 *
 * @param lang grammar name; must be one of GRAMMARS
 * @returns the loaded Language object, or null when the grammar is unknown or cannot be
 *          loaded (never throws)
 */
async function loadLang(lang: string): Promise<unknown | null> {
  if (!GRAMMAR_SET.has(lang)) return null

  const cached = langCache.get(lang)
  if (cached !== undefined) {
    try {
      return await cached // either a resolved Language or a load still in flight
    } catch {
      return null // the load we were waiting on failed; its owner clears the entry
    }
  }

  const pending: Promise<unknown> = (async () => {
    const TS = (await import("web-tree-sitter")).default as any
    if (!parserInit) parserInit = TS.init()
    await parserInit
    return TS.Language.load(WASM_DIR + `tree-sitter-${lang}.wasm`)
  })()
  langCache.set(lang, pending)

  try {
    return await pending
  } catch {
    langCache.delete(lang)
    return null // grammars unavailable (e.g. compiled binary) → graceful no-op
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Tree-sitter backed extractor: turns one source file into FILE / CLASS / METHOD /
 * FUNCTION / MODULE entities plus `defines`, `has_method`, `imports` and `calls` relations.
 */
export class CodeExtractor implements KnowledgeExtractor {
  /** All languages this extractor can parse. */
  static languages(): readonly string[] {
    return GRAMMARS
  }

  /**
   * Map a filename to a supported language, or null.
   *
   * The text after the last "." (lower-cased) is looked up in EXT_TO_LANG; a name without
   * a dot is looked up as-is.
   */
  static detectLanguage(name?: string): string | null {
    if (!name) return null
    const ext = name.toLowerCase().split(".").pop() ?? ""
    // Own-property check: EXT_TO_LANG is a plain object, so an unguarded lookup of
    // "constructor" or "__proto__" would return an inherited Object member, not a language.
    if (!Object.prototype.hasOwnProperty.call(EXT_TO_LANG, ext)) return null
    return EXT_TO_LANG[ext] ?? null
  }

  /** Implements the generic extractor interface. Uses `meta.language` if given, else
   *  detects from `meta.name` (the filename). Returns empty for non-code / unknown. */
  async extract(text: string, meta?: { name?: string; language?: string }): Promise<Extraction> {
    const lang = meta?.language ?? CodeExtractor.detectLanguage(meta?.name)
    if (!lang) return { entities: [], relations: [] }
    return this.extractCode(text, lang, meta?.name ?? "source")
  }

  /**
   * Parse one source file into code entities + relations.
   *
   * Returns empty (never throws) if the grammar can't load, its ABI is incompatible with
   * the runtime, or the parser produces no tree.
   *
   * @param code     source text
   * @param lang     grammar name (see `languages()`)
   * @param fileName label for the FILE entity; also the source of its id
   */
  async extractCode(code: string, lang: string, fileName = "source"): Promise<Extraction> {
    const Language = await loadLang(lang)
    if (!Language) return { entities: [], relations: [] }

    let parser: any
    let tree: any
    try {
      try {
        const TS = (await import("web-tree-sitter")).default as any
        parser = new TS()
        parser.setLanguage(Language) // throws on ABI mismatch → caught → graceful empty
        tree = parser.parse(code)
      } catch {
        return { entities: [], relations: [] }
      }
      if (!tree?.rootNode) return { entities: [], relations: [] }
      return this.collect(tree.rootNode, fileName)
    } finally {
      // Parsers and trees live in WASM memory that the JS garbage collector does not
      // reclaim; release them explicitly. Cleanup is best-effort and must not mask a result.
      try {
        tree?.delete?.()
      } catch {
        /* already released */
      }
      try {
        parser?.delete?.()
      } catch {
        /* already released */
      }
    }
  }

  /** Walk the syntax tree under `root` and assemble the extraction for `fileName`. */
  private collect(root: any, fileName: string): Extraction {
    const entities = new Map<string, ExtractedEntity>()
    const relations = new Map<string, ExtractedRelation>()
    const defined = new Set<string>() // symbol ids we actually define here

    const fileId = `file:${slugify(fileName)}`
    entities.set(fileId, { id: fileId, label: fileName, type: "FILE" })

    const symId = (name: string) => `sym:${slugify(name)}`
    const addEnt = (id: string, label: string, type: string) => {
      if (!entities.has(id)) entities.set(id, { id, label, type })
    }
    // Record a definition. It replaces the FUNCTION placeholder that a call seen BEFORE the
    // definition may have created (so a forward-called method is typed METHOD, not
    // FUNCTION); among several definitions of the same name the first one still wins.
    const define = (id: string, label: string, type: string) => {
      if (!defined.has(id)) entities.set(id, { id, label, type })
      defined.add(id)
    }
    const addRel = (from: string, to: string, type: string, confidence: number) => {
      const key = `${from}|${to}|${type}`
      if (!relations.has(key)) relations.set(key, { from, to, type, confidence })
    }

    const IDENT_RE = /(identifier|name|word|constant|type_identifier|field_identifier)/
    const NAME_SKIP_RE = /parameter|argument|body|block/
    const ARG_RE = /argument/
    const IMPORT_PREFIX_RE = /^[^A-Za-z0-9_./@]*(import|from|use|using|require|include|package|open|with|load)\s+/i

    // Find a definition's name: prefer the `name` field, else a bounded breadth-first
    // search (at most 400 nodes) for the first identifier-like token, skipping parameter
    // lists and bodies, so it works across grammars.
    const nameOf = (node: any): string | null => {
      const n = node.childForFieldName?.("name")
      if (n?.text) return n.text
      const queue: any[] = [node]
      for (let head = 0; head < queue.length && head < 400; head++) {
        const cur = queue[head]
        for (let i = 0; i < cur.namedChildCount; i++) {
          const c = cur.namedChild(i)
          if (NAME_SKIP_RE.test(c.type)) continue
          if (IDENT_RE.test(c.type)) return c.text
          queue.push(c)
        }
      }
      return null
    }
    const lastSegment = (text: string): string | null => {
      const parts = text.split(/[.:>-]+/).filter(Boolean)
      return parts[parts.length - 1] || null
    }
    // The first identifier-like token in preorder, skipping argument subtrees - for a
    // call node that's the callee; for an Elixir def-macro that's the macro name.
    // Iterative (explicit stack) so long call chains cannot overflow the JS stack.
    const firstIdent = (node: any): string | null => {
      const stack: any[] = []
      const pushChildren = (parent: any) => {
        for (let i = parent.namedChildCount - 1; i >= 0; i--) stack.push(parent.namedChild(i))
      }
      pushChildren(node)
      while (stack.length) {
        const c = stack.pop()
        if (ARG_RE.test(c.type)) continue
        if (IDENT_RE.test(c.type)) return c.text
        pushChildren(c)
      }
      return null
    }
    const calleeOf = (node: any): string | null => {
      const f = node.childForFieldName?.("function") || node.childForFieldName?.("name") || node.childForFieldName?.("method")
      if (f?.text) return lastSegment(f.text)
      const id = firstIdent(node)
      return id ? lastSegment(id) : null
    }
    // Elixir: name defined by a def-macro call - in its arguments, the first module
    // alias or the head identifier of the inner call (e.g. `def greet(n)` → greet).
    const elixirDefName = (node: any): string | null => {
      let scope = node
      for (let i = 0; i < node.namedChildCount; i++) if (node.namedChild(i).type === "arguments") scope = node.namedChild(i)
      for (let i = 0; i < scope.namedChildCount; i++) {
        const c = scope.namedChild(i)
        if (c.type === "alias") return c.text
        if (c.type === "call") return firstIdent(c)
        if (IDENT_RE.test(c.type)) return c.text
      }
      return null
    }
    // Module named by an import node. Grammars that expose the module as a `source` field
    // (JavaScript/TypeScript `import … from "m"`) are read from that field, because the
    // text heuristic stops at the first `{` and would lose the module of a named import.
    // Everything else falls back to stripping the leading keyword from the node text.
    const importTarget = (node: any): string | undefined => {
      const source = node.childForFieldName?.("source")
      const fromField = source?.text ? source.text.replace(/["'`<>]/g, "").trim() : ""
      if (fromField) return fromField
      return node.text.replace(IMPORT_PREFIX_RE, "").split(/[\s;(){}]/)[0]?.replace(/["'`<>]/g, "")
    }

    // Process one node; returns the [enclosing symbol, enclosing class] its children see.
    const visit = (node: any, enclosing: string, enclosingClass: string | null): [string, string | null] => {
      let nextEnclosing = enclosing
      let nextClass = enclosingClass
      const kind = classify(node.type)

      if (kind === "class") {
        const nm = nameOf(node)
        if (nm) {
          const id = symId(nm)
          define(id, nm, "CLASS")
          addRel(fileId, id, "defines", C_EXTRACTED)
          nextEnclosing = id
          nextClass = id
        }
      } else if (kind === "func") {
        const nm = nameOf(node)
        if (nm) {
          const id = symId(nm)
          define(id, nm, enclosingClass ? "METHOD" : "FUNCTION")
          if (enclosingClass) addRel(enclosingClass, id, "has_method", C_EXTRACTED)
          else addRel(fileId, id, "defines", C_EXTRACTED)
          nextEnclosing = id
        }
      } else if (kind === "import") {
        const mod = importTarget(node)
        // Skip targets that slugify to nothing (e.g. "..") - they would all share the id "module:".
        const slug = mod && mod.length > 1 ? slugify(mod) : ""
        if (mod && slug) {
          const id = `module:${slug}`
          addEnt(id, mod, "MODULE")
          addRel(fileId, id, "imports", C_EXTRACTED)
        }
      } else if (kind === "call") {
        // Elixir-style: a call to a def-macro is actually a DEFINITION.
        const head = firstIdent(node)
        if (head && DEF_MACROS.has(head)) {
          const nm = elixirDefName(node)
          if (nm && nm.length > 1) {
            const id = symId(nm)
            const isMod = MODULE_MACROS.has(head)
            define(id, nm, isMod ? "CLASS" : enclosingClass ? "METHOD" : "FUNCTION")
            if (isMod) {
              addRel(fileId, id, "defines", C_EXTRACTED)
              nextClass = id
            } else if (enclosingClass) addRel(enclosingClass, id, "has_method", C_EXTRACTED)
            else addRel(fileId, id, "defines", C_EXTRACTED)
            nextEnclosing = id
          }
        } else {
          const callee = calleeOf(node)
          if (callee && callee.length > 1) {
            const id = symId(callee)
            addEnt(id, callee, "FUNCTION") // placeholder until (unless) a definition is seen
            addRel(enclosing, id, "calls", defined.has(id) ? C_EXTRACTED : C_INFERRED)
          }
        }
      }
      return [nextEnclosing, nextClass]
    }

    // Preorder traversal with an explicit stack (children pushed in reverse so they pop in
    // source order): deeply nested syntax trees cannot exhaust the JS call stack.
    // Start at the root's CHILDREN - never classify the root itself (e.g. Python's
    // root node type is "module", which would otherwise look like a class).
    const pending: Array<{ node: any; enclosing: string; enclosingClass: string | null }> = []
    const schedule = (parent: any, enclosing: string, enclosingClass: string | null) => {
      for (let i = parent.namedChildCount - 1; i >= 0; i--) pending.push({ node: parent.namedChild(i), enclosing, enclosingClass })
    }
    schedule(root, fileId, null)
    for (let frame = pending.pop(); frame; frame = pending.pop()) {
      const [nextEnclosing, nextClass] = visit(frame.node, frame.enclosing, frame.enclosingClass)
      schedule(frame.node, nextEnclosing, nextClass)
    }

    // Second pass: a call to a symbol that turned out to be locally defined (possibly
    // seen before its definition) is promoted to EXTRACTED confidence.
    for (const r of relations.values()) if (r.type === "calls" && defined.has(r.to)) r.confidence = C_EXTRACTED

    return { entities: [...entities.values()], relations: [...relations.values()] }
  }
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// Deterministic (no-LLM, no-deps) knowledge extractor. Pulls capitalized phrases
|
|
2
|
+
// and acronyms as entities and relates entities that co-occur in a sentence-ish
|
|
3
|
+
// unit. Runs fully offline. Re-exported from extract.ts to preserve import paths.
|
|
4
|
+
|
|
5
|
+
import type { Extraction, ExtractedEntity, ExtractedRelation, KnowledgeExtractor } from "./types"
|
|
6
|
+
|
|
7
|
+
/** Adapter exposing the synchronous `extractKnowledge` through the async extractor interface. */
export class DeterministicExtractor implements KnowledgeExtractor {
  /** Extract entities and co-occurrence relations from `text`. */
  async extract(text: string): Promise<Extraction> {
    const extraction = extractKnowledge(text)
    return extraction
  }
}
|
|
12
|
+
|
|
13
|
+
// Capitalized words that are never entities on their own (articles, pronouns,
// conjunctions, prepositions, auxiliaries and a few document-structure words).
const STOP = new Set([
  "The", "A", "An", "This", "That", "These", "Those", "It", "Our", "Your", "Their", "We", "You",
  "I", "He", "She", "They", "And", "Or", "But", "For", "With", "From", "To", "Of", "In", "On",
  "At", "By", "As", "Is", "Are", "Was", "Were", "Be", "Will", "Each", "All", "Layer", "Step",
  "Both", "Also", "Use", "Used", "Using", "Here", "There", "When", "Then", "Now",
])
|
|
19
|
+
|
|
20
|
+
/** Lower-case `s`, collapse every run of non `[a-z0-9]` characters into one "-", and trim dashes from both ends. */
function slug(s: string): string {
  const lowered = s.toLowerCase()
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-")
  return dashed.replace(/^-+|-+$/g, "")
}
|
|
23
|
+
|
|
24
|
+
// Capitalized phrases (e.g. "100X Prompt Pro", "Fine-Tuned") and acronyms ("SOC-II", "FP8").
// The leading-acronym alternative accepts trailing digits (`[0-9]*`): without it an
// acronym such as "FP8" or "GPT4" fails the closing `\b` and is never matched on its own.
const PHRASE = /\b(?:[0-9]+[A-Za-z]+|[A-Z][a-z0-9]+|[A-Z]{2,}[0-9]*(?:-[A-Z0-9]+)?)(?:[ -](?:[A-Z][A-Za-z0-9]+|[A-Z]{2,}(?:-[A-Z0-9]+)?|[0-9]+))*\b/g
|
|
26
|
+
|
|
27
|
+
/**
 * Collect the entity candidates found in one sentence-ish unit.
 *
 * Every PHRASE match is trimmed and then rejected when it is a stop word, shorter than two
 * characters, or a single token (no space, no hyphen) shorter than three. Survivors are
 * de-duplicated by slug, first occurrence winning, and typed ACRONYM when they consist
 * solely of upper-case letters, digits and hyphens, otherwise CONCEPT.
 */
function candidates(line: string): ExtractedEntity[] {
  const byId = new Map<string, ExtractedEntity>()
  const labels = Array.from(line.matchAll(PHRASE), (hit) => hit[0].trim())

  for (const label of labels) {
    const singleToken = !(label.includes(" ") || label.includes("-"))
    const tooShort = label.length < 2 || (singleToken && label.length < 3)
    if (tooShort || STOP.has(label)) continue

    const id = slug(label)
    if (id === "" || byId.has(id)) continue

    const acronym = /^[A-Z0-9-]+$/.test(label) && label === label.toUpperCase()
    byId.set(id, { id, label, type: acronym ? "ACRONYM" : "CONCEPT" })
  }

  return [...byId.values()]
}
|
|
43
|
+
|
|
44
|
+
/**
 * Deterministic extraction: entities are the candidates of each unit of `text`; two
 * entities are linked by an undirected `relates_to` edge when they occur in the same unit.
 *
 * Units are produced by splitting on newlines, periods and semicolons. Each entity is
 * paired with at most the five candidates that follow it in its unit, which bounds the
 * number of edges a long unit can produce. Edge endpoints are stored in sorted order so a
 * pair is recorded once regardless of which entity appeared first.
 */
export function extractKnowledge(text: string): Extraction {
  const entityIndex = new Map<string, ExtractedEntity>()
  const edgeIndex = new Map<string, ExtractedRelation>()

  for (const rawUnit of text.split(/[\n.;]+/)) {
    const unit = rawUnit.trim()
    if (!unit) continue

    const found = candidates(unit)
    found.forEach((entity) => {
      if (!entityIndex.has(entity.id)) entityIndex.set(entity.id, entity)
    })

    found.forEach((left, i) => {
      for (const right of found.slice(i + 1, i + 6)) {
        const [from, to] = left.id < right.id ? [left.id, right.id] : [right.id, left.id]
        if (from === to) continue
        edgeIndex.set(`${from}|${to}`, { from, to, type: "relates_to" })
      }
    })
  }

  return { entities: Array.from(entityIndex.values()), relations: Array.from(edgeIndex.values()) }
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
// LLM-backed knowledge extraction. Calls an OpenAI-compatible chat endpoint -
|
|
2
|
+
// point it at a LOCAL/sovereign model (vLLM/SGLang/Ollama) so nothing leaves the
|
|
3
|
+
// building. Unlike the deterministic extractor, it handles casual lowercase text
|
|
4
|
+
// ("i love lavanya" → Lavanya[PERSON], user -loves→ Lavanya).
|
|
5
|
+
// Re-exported from llm-extractor.ts to preserve original import paths.
|
|
6
|
+
|
|
7
|
+
import type { Extraction, ExtractedEntity, ExtractedRelation, KnowledgeExtractor, QuestionIntent } from "./types"
|
|
8
|
+
import { slugify } from "./text-hygiene"
|
|
9
|
+
|
|
10
|
+
export interface LlmExtractorConfig {
|
|
11
|
+
endpoint: string // OpenAI-compatible base, e.g. http://localhost:8000
|
|
12
|
+
model: string
|
|
13
|
+
apiKey?: string
|
|
14
|
+
fetchImpl?: typeof fetch
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// Typed-triple extraction: the "code-like" structure - subject/object carry types,
|
|
18
|
+
// the predicate is a real verb, plus a confidence. (Per research: enhancing the LLM
|
|
19
|
+
// extractor beats REBEL/GLiNER/etc. for our TS stack - typed, no deps, no big models.)
|
|
20
|
+
const SYSTEM = [
|
|
21
|
+
"You extract knowledge-graph TRIPLES from the user's text.",
|
|
22
|
+
"Return ONLY a JSON object, no prose:",
|
|
23
|
+
'{"triples":[{"subject":"<entity>","subjectType":"PERSON|ORG|PLACE|PRODUCT|CONCEPT|OTHER",',
|
|
24
|
+
'"predicate":"<short verb>","object":"<entity>","objectType":"PERSON|ORG|PLACE|PRODUCT|CONCEPT|OTHER","confidence":0.0-1.0}]}',
|
|
25
|
+
"Cover people, orgs, places, products, key concepts - INCLUDING casual lowercase text.",
|
|
26
|
+
'Example: "i love lavanya" → {"subject":"user","subjectType":"PERSON","predicate":"loves","object":"Lavanya","objectType":"PERSON","confidence":0.95}.',
|
|
27
|
+
"Be concise; only meaningful triples.",
|
|
28
|
+
].join(" ")
|
|
29
|
+
|
|
30
|
+
const INTENT_SYSTEM = [
|
|
31
|
+
"Parse the user's QUESTION into a structured intent. Return ONLY JSON:",
|
|
32
|
+
'{"type":"entity_lookup|relation_query|binary_relation","subject":"<noun or null>","predicate":"<verb or null>","object":"<noun or null>"}',
|
|
33
|
+
'Examples: "who do I love?" → {"type":"relation_query","subject":"user","predicate":"loves","object":null};',
|
|
34
|
+
'"does Lavanya work at Acme?" → {"type":"binary_relation","subject":"Lavanya","predicate":"works_at","object":"Acme"};',
|
|
35
|
+
'"what is Pro?" → {"type":"entity_lookup","subject":null,"predicate":null,"object":"Pro"}.',
|
|
36
|
+
'Map first-person ("I","me","my") to subject "user".',
|
|
37
|
+
].join(" ")
|
|
38
|
+
|
|
39
|
+
/**
 * Parse the JSON object contained in an LLM reply.
 *
 * The whole string is tried first; if that fails, the span from the first `{` to the last
 * `}` is tried, which tolerates code fences and stray prose around the object.
 *
 * @throws Error "LLM did not return JSON" whenever no JSON can be recovered - including
 *         when the brace-delimited span exists but is itself malformed, so callers see
 *         one error rather than a raw SyntaxError in that case.
 */
function parseJsonBlock(s: string): any {
  try {
    return JSON.parse(s)
  } catch {
    // not pure JSON - fall through to the embedded-object attempt
  }
  const m = s.match(/\{[\s\S]*\}/) // tolerate code fences / stray text
  if (m) {
    try {
      return JSON.parse(m[0])
    } catch {
      // braces present but not valid JSON - reported uniformly below
    }
  }
  throw new Error("LLM did not return JSON")
}
|
|
48
|
+
|
|
49
|
+
/**
 * Knowledge extractor backed by an OpenAI-compatible chat-completions endpoint.
 *
 * Model output is treated as untrusted: every field is type-checked before use, so a
 * malformed reply yields fewer triples rather than a TypeError.
 */
export class LlmExtractor implements KnowledgeExtractor {
  private readonly fetch: typeof fetch

  constructor(private readonly cfg: LlmExtractorConfig) {
    // The global `fetch` must not be stored and then called as `this.fetch(...)`: that
    // rebinds its receiver, which browsers and edge runtimes reject with "Illegal
    // invocation". Wrap it so the global is always invoked as a plain function.
    this.fetch = cfg.fetchImpl ?? ((input, init) => fetch(input, init))
  }

  /**
   * Send one system + user exchange and return the assistant's text ("" if absent).
   *
   * @throws Error when the server answers with a non-success HTTP status, so a bad key or
   *         wrong endpoint surfaces instead of looking like "nothing to extract".
   *         Transport failures and unparseable bodies reject as well.
   */
  private async chat(system: string, user: string): Promise<string> {
    const url = `${this.cfg.endpoint.replace(/\/$/, "")}/v1/chat/completions`
    const res = await this.fetch(url, {
      method: "POST",
      headers: {
        "content-type": "application/json",
        ...(this.cfg.apiKey ? { authorization: `Bearer ${this.cfg.apiKey}` } : {}),
      },
      body: JSON.stringify({
        model: this.cfg.model,
        temperature: 0,
        messages: [
          { role: "system", content: system },
          { role: "user", content: user },
        ],
      }),
    })
    // Strict comparison keeps minimal `fetchImpl` stubs (no `ok` field) working.
    if (res.ok === false) throw new Error(`LLM request failed: ${res.status} ${res.statusText}`)
    const body = (await res.json()) as { choices?: Array<{ message?: { content?: unknown } }> } | null
    const content = body?.choices?.[0]?.message?.content
    return typeof content === "string" ? content : ""
  }

  /**
   * Extract typed triples from `text`.
   *
   * Entities are de-duplicated by slug (first label/type wins); relations by
   * from|to|predicate. A triple contributes a relation only when both ends slugify to
   * distinct, non-empty ids. Missing or blank types default to CONCEPT, a missing or blank
   * predicate to `relates_to`; confidence is clamped to [0, 1] or left undefined when it
   * is not numeric.
   *
   * @throws when the request fails or the reply contains no JSON
   */
  async extract(text: string): Promise<Extraction> {
    const content = await this.chat(SYSTEM, text)
    if (!content) return { entities: [], relations: [] }

    const parsed: unknown = parseJsonBlock(content)
    const rawTriples: unknown = typeof parsed === "object" && parsed !== null ? (parsed as { triples?: unknown }).triples : undefined
    const triples: unknown[] = Array.isArray(rawTriples) ? rawTriples : []

    const asText = (v: unknown): string => (typeof v === "string" ? v.trim() : "")
    const asConfidence = (v: unknown): number | undefined => {
      const n = typeof v === "number" ? v : typeof v === "string" && v.trim() !== "" ? Number(v) : NaN
      return Number.isFinite(n) ? Math.min(1, Math.max(0, n)) : undefined
    }

    const entities = new Map<string, ExtractedEntity>()
    const relations = new Map<string, ExtractedRelation>()
    // Registers the entity if new and returns its id ("" when the label is unusable).
    const addEntity = (label: string, type: unknown): string => {
      const id = label ? slugify(label) : ""
      if (id && !entities.has(id)) entities.set(id, { id, label, type: (asText(type) || "CONCEPT").toUpperCase() })
      return id
    }

    for (const raw of triples) {
      if (typeof raw !== "object" || raw === null) continue
      const t = raw as Record<string, unknown>
      const from = addEntity(asText(t.subject), t.subjectType)
      const to = addEntity(asText(t.object), t.objectType)
      if (!from || !to || from === to) continue
      const type = (asText(t.predicate) || "relates_to").toLowerCase().replace(/\s+/g, "_")
      const key = `${from}|${to}|${type}`
      if (!relations.has(key)) relations.set(key, { from, to, type, confidence: asConfidence(t.confidence) })
    }
    return { entities: [...entities.values()], relations: [...relations.values()] }
  }

  /**
   * Parse a natural-language QUESTION into a graph-query intent (for KGQA).
   *
   * @returns the intent, or null when the model returned nothing, no JSON, or JSON
   *          without a `type`
   */
  async parseQuestionIntent(question: string): Promise<QuestionIntent | null> {
    const content = await this.chat(INTENT_SYSTEM, question)
    if (!content) return null
    try {
      const p = parseJsonBlock(content) as QuestionIntent
      return p?.type ? p : null
    } catch {
      return null
    }
  }
}
|
|
129
|
+
|
|
130
|
+
/**
 * Runs two extractors over the same text and merges their output.
 *
 * Both are started together and a failure of either one is ignored. Entities are
 * de-duplicated by id and relations by their unordered endpoint pair; in both cases the
 * first occurrence wins, so the extractor passed as `a` takes precedence over `b`.
 */
export class HybridExtractor implements KnowledgeExtractor {
  constructor(
    private readonly a: KnowledgeExtractor,
    private readonly b: KnowledgeExtractor,
  ) {}

  async extract(text: string): Promise<Extraction> {
    const outcomes = await Promise.allSettled([this.a, this.b].map((extractor) => extractor.extract(text)))

    const entityById = new Map<string, ExtractedEntity>()
    const relationByPair = new Map<string, ExtractedRelation>()

    outcomes.forEach((outcome) => {
      if (outcome.status === "rejected") return
      const { entities, relations } = outcome.value

      for (const entity of entities) {
        if (entityById.has(entity.id)) continue
        entityById.set(entity.id, entity)
      }
      for (const relation of relations) {
        const pairKey = [relation.from, relation.to].sort().join("|")
        if (relationByPair.has(pairKey)) continue
        relationByPair.set(pairKey, relation)
      }
    })

    return { entities: Array.from(entityById.values()), relations: Array.from(relationByPair.values()) }
  }
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
// Text-hygiene utilities: slug/clean-line normalization and web-boilerplate
|
|
2
|
+
// stripping. Used by the extractors and (via re-export from extract.ts) across
|
|
3
|
+
// the codebase. Re-exported from extract.ts to preserve original import paths.
|
|
4
|
+
|
|
5
|
+
/**
 * Normalize free text into a lowercase ASCII slug: runs of characters outside
 * `a-z0-9` become a single `-`, and leading/trailing dashes are removed.
 *
 * Accented and compatibility characters are folded to their base form first
 * (NFKD + removal of combining marks), so "Café" yields "cafe" instead of
 * "caf". Output for plain ASCII input is unchanged.
 *
 * @param s Text to slugify.
 * @returns The slug; empty when `s` contains no ASCII-foldable letter or digit.
 */
export function slugify(s: string): string {
  return s
    .normalize("NFKD") // split base letters from their diacritics
    .replace(/[\u0300-\u036f]/g, "") // drop the combining marks left behind
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
}
|
|
8
|
+
|
|
9
|
+
// Entity ids are slugs of free text, and knowledge-graph entities share the
// `nodes` table with principals (users/orgs/groups) and records. Without
// isolation, a document that merely MENTIONS a word equal to a principal id
// would overwrite that principal's node (INSERT OR REPLACE) and silently
// corrupt the ACL graph. Entity ids therefore live in their own namespace.
// Every writer AND resolver must agree on the scheme, so it is defined ONCE here.
export const ENTITY_PREFIX = "entity:"
export const entityId = (slug: string): string => `${ENTITY_PREFIX}${slug}`
|
|
17
|
+
|
|
18
|
+
// Strip markdown emphasis / heading / blockquote / bullet noise from a line of
// (often scraped) text, so stored + returned snippets are clean prose - no `**`,
// `#`, `>` or leftover bullet markers leaking into answers.
//
// Only genuine list markers are removed from the start of the line: unicode
// bullets, `-` / `*` followed by whitespace (or standing alone, e.g. `---`),
// and `1.` / `1)` numbering followed by whitespace. Leading numbers that are
// part of the content ("2024 was ...", "3.5 GHz", "-5 degrees") are preserved.
export function cleanLine(s: string): string {
  return s
    .replace(/\*\*|__|`+/g, "") // bold / italic / code markers
    .replace(/^\s*#{1,6}\s+/, "") // heading hashes
    .replace(/^\s*>+\s*/, "") // blockquote
    .replace(/^\s*(?:[•▪◦]+\s*|[-*]+(?:\s+|$)|\d+[.)](?:\s+|$))+/, "") // leading bullets / numbering
    .replace(/\s+/g, " ") // collapse whitespace runs
    .trim()
}
|
|
30
|
+
|
|
31
|
+
// Web boilerplate (cookie banners, nav menus, subscribe CTAs) that pollutes scraped
// pages and would otherwise become junk entities / noisy chunks. A line containing
// a strong multi-word phrase always drops; a line that is EXACTLY a short nav token
// drops; everything else is kept.
const STRONG_BOILER =
  /(manage cookie consent|cookie consent|consent banner|cookie policy|privacy policy|gdpr.?compliant|opt.?out|skip to content|view preferences|subscribe now|consenting to these technologies|withdrawing consent|adversely affect certain|join our community|all our premium content|delivered straight to your inbox|post a press release|reach our audience|editorial opportunities)/i
const NAV_TOKENS =
  /^(accept|deny|subscribe|search|search\.\.\.|view preferences|view all|view all latest|see all|click here|sign in|log in|menu|home|contact us|newsletter|categories|events|resources|more|explore|explore all|explore more|applications|industries|news|search\b)$/i

/**
 * Decide whether a single line is web boilerplate.
 *
 * @param line One line of text; surrounding whitespace is ignored.
 * @returns `true` for blank lines, lines containing a `STRONG_BOILER` phrase,
 *   and lines that are exactly one of the `NAV_TOKENS`; `false` otherwise.
 */
export function isBoilerplate(line: string): boolean {
  const t = line.trim()
  if (!t) return true
  if (STRONG_BOILER.test(t)) return true
  if (NAV_TOKENS.test(t)) return true
  return false
}

/**
 * Drop boilerplate lines from a block of (scraped) text before chunking/extraction.
 *
 * Blank lines are kept as paragraph separators instead of being filtered out
 * with the boilerplate; runs of blank lines left behind by removed lines are
 * collapsed to a single empty line, and leading/trailing blank lines are trimmed.
 *
 * @param text Multi-line text using `\n` or `\r\n` line endings.
 * @returns The text without boilerplate lines, joined with `\n`.
 */
export function stripBoilerplate(text: string): string {
  const kept: string[] = []
  for (const line of text.split(/\r?\n/)) {
    if (line.trim() === "") {
      // Normalize whitespace-only lines so the collapse below sees plain newlines.
      kept.push("")
    } else if (!isBoilerplate(line)) {
      kept.push(line)
    }
  }
  return kept
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .replace(/^\n+|\n+$/g, "")
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// Shared knowledge-extraction types. Re-exported from extract.ts to preserve the
|
|
2
|
+
// original import paths (`import { ExtractedEntity } from "../extract"`).
|
|
3
|
+
|
|
4
|
+
/** An entity produced by knowledge extraction. */
export interface ExtractedEntity {
  /** Normalized slug; stable across docs, so the same concept merges. */
  id: string
  /** Surface form as first seen. */
  label: string
  /** ACRONYM | CONCEPT | PERSON | ORG | PLACE | PRODUCT | … */
  type: string
}
|
|
9
|
+
/** An edge between two extracted entities. */
export interface ExtractedRelation {
  /** Entity id of the source endpoint. */
  from: string
  /** Entity id of the target endpoint. */
  to: string
  /** `relates_to`, or a verb from the LLM (loves, builds, …). */
  type: string
  /** 0..1 (LLM-provided); the deterministic extractor omits it. */
  confidence?: number
}
|
|
15
|
+
|
|
16
|
+
/**
 * Parsed intent of a question, for graph QA.
 *
 * `type` selects the kind of query; `subject`, `predicate` and `object` are
 * optional slots (which ones are populated per intent type is not specified here).
 */
export interface QuestionIntent {
  type: "entity_lookup" | "relation_query" | "binary_relation"
  subject?: string
  predicate?: string
  object?: string
}
|
|
23
|
+
/** Result of one extraction pass: the entities found and the relations between them. */
export interface Extraction {
  entities: ExtractedEntity[]
  relations: ExtractedRelation[]
}
|
|
27
|
+
|
|
28
|
+
/**
 * Pluggable extractor: deterministic (offline), LLM-backed (casual prose), or
 * tree-sitter (code). All return the same `Extraction` shape, so ingest/rebuild
 * don't care which one is in use.
 *
 * `meta` is an optional hint (filename / language) used by the code extractor;
 * text extractors ignore it.
 */
export interface KnowledgeExtractor {
  extract(text: string, meta?: { name?: string; language?: string }): Promise<Extraction>
}
|