@100xprompt/chitta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +203 -0
- package/assets/rules/claude-md.md +9 -0
- package/assets/skill/SKILL.md +47 -0
- package/package.json +48 -0
- package/src/README.md +124 -0
- package/src/arango-client.ts +67 -0
- package/src/arango-graph-provider.ts +364 -0
- package/src/bin.ts +27 -0
- package/src/config-env.ts +53 -0
- package/src/embedded/authorizer.ts +89 -0
- package/src/embedded/cli.ts +86 -0
- package/src/embedded/code-extractor.ts +9 -0
- package/src/embedded/demo.ts +36 -0
- package/src/embedded/extract.ts +12 -0
- package/src/embedded/extractors/code.ts +308 -0
- package/src/embedded/extractors/deterministic.ts +63 -0
- package/src/embedded/extractors/llm.ts +151 -0
- package/src/embedded/extractors/text-hygiene.ts +54 -0
- package/src/embedded/extractors/types.ts +34 -0
- package/src/embedded/graph/acl-paths.ts +96 -0
- package/src/embedded/graph/adjacency.ts +61 -0
- package/src/embedded/graph/centrality.ts +23 -0
- package/src/embedded/graph/communities.ts +46 -0
- package/src/embedded/graph/cypher.ts +17 -0
- package/src/embedded/graph/impact.ts +24 -0
- package/src/embedded/graph/knowledge-graph.ts +108 -0
- package/src/embedded/graph/pagerank.ts +57 -0
- package/src/embedded/graph/sql-access.ts +13 -0
- package/src/embedded/graph/traversal.ts +73 -0
- package/src/embedded/graph/types.ts +35 -0
- package/src/embedded/graph-query.ts +126 -0
- package/src/embedded/index.ts +171 -0
- package/src/embedded/ingest.ts +262 -0
- package/src/embedded/kgqa/answer-paths.ts +197 -0
- package/src/embedded/kgqa/entity-link.ts +13 -0
- package/src/embedded/kgqa/intent.ts +14 -0
- package/src/embedded/kgqa/predicates.ts +9 -0
- package/src/embedded/kgqa/preference.ts +20 -0
- package/src/embedded/kgqa/select.ts +99 -0
- package/src/embedded/kgqa/text.ts +16 -0
- package/src/embedded/kgqa/types.ts +6 -0
- package/src/embedded/kgqa-service.ts +122 -0
- package/src/embedded/llm-extractor.ts +10 -0
- package/src/embedded/local-embeddings.ts +36 -0
- package/src/embedded/personal.ts +100 -0
- package/src/embedded/reranker.ts +62 -0
- package/src/embedded/retrieval/decay-stage.ts +59 -0
- package/src/embedded/retrieval/diversity.ts +37 -0
- package/src/embedded/retrieval/fuse.ts +52 -0
- package/src/embedded/retrieval/graph-stage.ts +45 -0
- package/src/embedded/retrieval/hybrid-retriever.ts +80 -0
- package/src/embedded/retrieval/keyword-stage.ts +27 -0
- package/src/embedded/retrieval/passage.ts +44 -0
- package/src/embedded/retrieval/rerank-stage.ts +31 -0
- package/src/embedded/retrieval/trace.ts +31 -0
- package/src/embedded/retrieval/vector-stage.ts +15 -0
- package/src/embedded/sqlite-graph-provider.ts +119 -0
- package/src/embedded/sqlite-store.ts +95 -0
- package/src/embedded/sqlite-vec-service.ts +122 -0
- package/src/embedded/store/chunks.ts +61 -0
- package/src/embedded/store/fts.ts +50 -0
- package/src/embedded/store/nodes-edges.ts +112 -0
- package/src/embedded/store/salience.ts +37 -0
- package/src/embedded/store/schema.ts +109 -0
- package/src/embedded/transformers-embeddings.ts +100 -0
- package/src/embeddings.ts +51 -0
- package/src/eval/goldset.ts +46 -0
- package/src/eval/harness.ts +65 -0
- package/src/eval/metrics.ts +38 -0
- package/src/http/server.ts +93 -0
- package/src/index.ts +44 -0
- package/src/install/index.ts +139 -0
- package/src/install/platforms.ts +126 -0
- package/src/install/skill.ts +46 -0
- package/src/install/writers.ts +82 -0
- package/src/mcp/backend.ts +129 -0
- package/src/mcp/server.ts +83 -0
- package/src/mcp/tools/context-about.ts +69 -0
- package/src/mcp/tools/context-graph.ts +23 -0
- package/src/mcp/tools/context-ingest.ts +88 -0
- package/src/mcp/tools/context-rebuild.ts +22 -0
- package/src/mcp/tools/context-relate.ts +88 -0
- package/src/mcp/tools/get-context.ts +52 -0
- package/src/mcp/tools/index.ts +40 -0
- package/src/mcp/tools/types.ts +33 -0
- package/src/permission.ts +72 -0
- package/src/provider.ts +65 -0
- package/src/qdrant-vector.ts +76 -0
- package/src/retrieval.ts +218 -0
- package/src/service.ts +40 -0
- package/src/types.ts +91 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ContextBackend } from "../backend"
|
|
2
|
+
import type { ToolModule, ToolResult } from "./types"
|
|
3
|
+
|
|
4
|
+
/** ListTools entry for `context_rebuild`; the tool takes no arguments. */
const schema = {
  name: "context_rebuild",
  // Built from fragments so each source line stays short; joined with no separator.
  description: [
    "Re-extract the knowledge graph over ALL stored records using the current extractor. Run this after ",
    "connecting a local LLM (CONTEXT_LLM_URL) to upgrade older data from untyped to TYPED triples ",
    "(e.g. 'Google partners_with HSBC'), so questions resolve to exact facts. May take a while on large stores.",
  ].join(""),
  inputSchema: {
    type: "object" as const,
    properties: {},
  },
}
|
|
12
|
+
|
|
13
|
+
/**
 * Handle a `context_rebuild` call: ask the backend to rebuild and report the counts it returns.
 *
 * @param _args   Ignored; the tool's input schema declares no properties.
 * @param backend Active backend. `rebuild` is an optional capability.
 * @returns A single text block with the record / entity counts, or an `isError`
 *          result when the backend has no `rebuild` capability.
 */
async function handler(_args: Record<string, unknown>, backend: ContextBackend): Promise<ToolResult> {
  // The tool is normally gated by `available`, but guard anyway so a direct
  // dispatch yields a structured tool error instead of a raw TypeError.
  if (!backend.rebuild) {
    return {
      content: [{ type: "text", text: "This backend does not support rebuilding the knowledge graph." }],
      isError: true,
    }
  }
  const r = await backend.rebuild()
  return {
    content: [
      { type: "text", text: `Rebuilt the knowledge graph: ${r.records} record(s) → ${r.entities} concept-mention(s) re-extracted with the current model.` },
    ],
  }
}
|
|
21
|
+
|
|
22
|
+
/** Tool module for `context_rebuild`; reported as available only when the backend has `rebuild`. */
export const contextRebuildTool: ToolModule = {
  schema,
  handler,
  available: (backend) => Boolean(backend.rebuild),
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import type { ContextBackend } from "../backend"
|
|
2
|
+
import type { ToolModule, ToolResult } from "./types"
|
|
3
|
+
|
|
4
|
+
/**
 * ListTools entry for `context_relate`.
 *
 * The description is what the calling model reads, so the mode count and the
 * per-argument descriptions must agree with the `mode` enum below (seven modes).
 */
const schema = {
  name: "context_relate",
  description:
    "Query the knowledge graph AS A GRAPH (not text search). Seven modes:\n" +
    "• neighbors - what's directly connected to an entity (optionally by relation).\n" +
    "• path - HOW two entities are related (shortest relation chain between them).\n" +
    "• impact - which records reference an entity + what it connects to ('what depends on X').\n" +
    "• central - the hub entities (most-connected concepts the user knows about).\n" +
    "• communities - clusters of related entities (the natural groupings in the graph).\n" +
    "• walk - multi-hop relevance (Personalized PageRank) from one or more seed entities: what's\n" +
    " most related across the WHOLE graph, not just direct neighbors. Seeds = `entity` (comma-separated).\n" +
    "• cypher - export the (permission-filtered) graph as Neo4j Cypher for interop.\n" +
    "Works for CODE graphs too (functions/classes/calls/imports) and prose. " +
    "USE WHEN the user asks 'how are X and Y connected', 'what's related to X', 'what calls/references X', " +
    "'what are the main things here', or 'what clusters exist'. Permission-filtered. DON'T USE to fetch document text (get_context).",
  inputSchema: {
    type: "object" as const,
    properties: {
      mode: { type: "string", enum: ["neighbors", "path", "impact", "central", "communities", "cypher", "walk"], description: "which graph query" },
      entity: {
        type: "string",
        description: "the entity (for neighbors/impact), first entity (for path), or comma-separated seed entities (for walk)",
      },
      to: { type: "string", description: "the second entity (path mode only)" },
      relation: { type: "string", description: "optional relation filter (neighbors mode)" },
    },
    required: ["mode"],
  },
}
|
|
30
|
+
|
|
31
|
+
/**
 * Handle a `context_relate` call by dispatching on `mode` to the backend's graph-query capability.
 *
 * Validation order: capability present, then mode is known, then per-mode arguments.
 * An unknown mode is therefore always reported as such, even when `entity` is missing.
 *
 * @param args    Raw tool arguments (`mode`, optional `entity`, `to`, `relation`).
 *                Non-string values are treated as absent; strings are trimmed.
 * @param backend Active backend. `graphQuery` is an optional capability.
 * @returns A single text block. `isError` is set for a missing capability, an unknown
 *          mode, or missing required arguments; "nothing found" is a normal result.
 */
async function handler(args: Record<string, unknown>, backend: ContextBackend): Promise<ToolResult> {
  const ok = (text: string): ToolResult => ({ content: [{ type: "text", text }] })
  const fail = (text: string): ToolResult => ({ content: [{ type: "text", text }], isError: true })
  const str = (v: unknown): string => (typeof v === "string" ? v.trim() : "")

  // Normally gated by `available`; guard so a direct dispatch can't throw a TypeError.
  const gq = backend.graphQuery
  if (!gq) return fail("Graph queries are not available on this backend.")

  const mode = typeof args.mode === "string" ? args.mode : String(args.mode)
  const knownModes = ["neighbors", "path", "impact", "central", "communities", "cypher", "walk"]
  if (!knownModes.includes(mode)) return fail(`unknown mode: ${mode}`)

  const entity = str(args.entity)
  const to = str(args.to)
  const relation = str(args.relation) || undefined

  // Modes that need no entity.
  if (mode === "central") {
    const hubs = (await gq.central(15)) as Array<{ label: string; degree: number; strength: number }>
    return ok(
      hubs.length
        ? "Central concepts (most-connected):\n" + hubs.map((h) => ` ${h.label} - ${h.degree} link(s), strength ${h.strength}`).join("\n")
        : "The knowledge graph has no relationships yet.",
    )
  }
  if (mode === "communities") {
    const cs = (await gq.communities()) as Array<{ size: number; hub: string; members: string[] }>
    return ok(
      cs.length
        ? "Communities (clusters of related entities):\n" +
            cs
              .slice(0, 20) // cap the listing at 20 clusters, 8 members each
              .map((c, i) => ` [${i + 1}] ${c.hub} +${c.size - 1} more - ${c.members.slice(0, 8).join(", ")}${c.members.length > 8 ? "…" : ""}`)
              .join("\n")
        : "No clusters yet (graph has no relationships).",
    )
  }
  if (mode === "cypher") {
    const cy = await gq.cypher()
    return ok(cy || "// empty graph")
  }

  if (mode === "walk") {
    // Seeds are separated by commas or the word "and"; blank fragments are dropped.
    const seeds = entity
      .split(/\s*,\s*|\s+and\s+/i)
      .map((s) => s.trim())
      .filter(Boolean)
    if (seeds.length === 0) return fail("Provide `entity` (one or more comma-separated seeds).")
    const r = (await gq.walk(seeds)) as Array<{ label: string; score: number; type: string }>
    return ok(
      r.length
        ? `Most related to ${seeds.join(", ")} (multi-hop PageRank):\n` + r.map((x) => ` ${x.label}${x.type ? ` (${x.type})` : ""} - ${x.score.toFixed(4)}`).join("\n")
        : `No entity matching "${entity}" in accessible knowledge.`,
    )
  }

  // Remaining modes all need `entity`.
  if (!entity) return fail("Provide `entity`.")

  if (mode === "neighbors") {
    const r = (await gq.neighbors(entity, relation)) as {
      entity: string
      neighbors: Array<{ label: string; relation: string; direction: string; weight: number }>
    } | null
    if (!r) return ok(`No entity matching "${entity}" in accessible knowledge.`)
    return ok(
      r.neighbors.length
        ? `${r.entity} is connected to:\n` + r.neighbors.map((n) => ` ${n.direction === "out" ? "→" : "←"} ${n.label} (${n.relation}, ×${n.weight})`).join("\n")
        : `${r.entity} has no recorded relationships.`,
    )
  }

  if (mode === "impact") {
    const r = (await gq.impact(entity)) as {
      entity: string
      records: string[]
      connectedEntities: Array<{ label: string; relation: string }>
    } | null
    if (!r) return ok(`No entity matching "${entity}" in accessible knowledge.`)
    const recs = r.records.length ? `Referenced by: ${r.records.join(", ")}.` : "Not referenced by any record."
    const conn = r.connectedEntities.length ? "\nConnects to: " + r.connectedEntities.map((c) => `${c.label} (${c.relation})`).join(", ") : ""
    return ok(`${r.entity}\n${recs}${conn}`)
  }

  // mode === "path" (the only known mode left)
  if (!to) return fail("Path mode needs `entity` and `to`.")
  const r = (await gq.path(entity, to)) as { found: boolean; hops: number; steps: Array<{ from: string; relation: string; to: string }> }
  if (!r.found) return ok(`No path found between "${entity}" and "${to}" (or one isn't in accessible knowledge).`)
  const chain = r.steps.map((s) => `${s.from} -${s.relation}→ ${s.to}`).join("\n then ")
  return ok(`Connected in ${r.hops} hop(s):\n ${chain}`)
}
|
|
87
|
+
|
|
88
|
+
/** Tool module for `context_relate`; reported as available only when the backend has `graphQuery`. */
export const contextRelateTool: ToolModule = {
  schema,
  handler,
  available: (backend) => Boolean(backend.graphQuery),
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { RetrievalStatus, type SearchResult } from "../../types"
|
|
2
|
+
import type { ContextBackend } from "../backend"
|
|
3
|
+
import type { ToolModule, ToolResult } from "./types"
|
|
4
|
+
|
|
5
|
+
/**
 * Format search hits as numbered blocks: `[n] <record name>` on one line, the hit's
 * content on the next, with a blank line between hits. A hit whose `recordName`
 * is null/undefined is titled "untitled".
 */
function render(results: SearchResult[]): string {
  const blocks: string[] = []
  results.forEach((hit, position) => {
    const title = hit.metadata.recordName ?? "untitled"
    blocks.push(`[${position + 1}] ${title}\n${hit.content}`)
  })
  return blocks.join("\n\n")
}
|
|
8
|
+
|
|
9
|
+
/** ListTools entry for `get_context`; requires a single string `query`. */
const schema = {
  name: "get_context",
  // Built from fragments so each source line stays short; joined with no separator.
  description: [
    "Recall stored knowledge. USE WHEN: answering anything that could touch the user's own notes, people, ",
    "projects, org knowledge, or past statements ('who/what did I…', 'what do we know about…', 'remind me…'). ",
    "Call this BEFORE answering from your own assumptions. Returns ranked, cited, permission-filtered snippets ",
    "(graph ACL → semantic vector search → GraphRAG expansion). DON'T USE for general world knowledge.",
  ].join(""),
  inputSchema: {
    type: "object" as const,
    properties: {
      query: {
        type: "string",
        description: "what to recall - phrase it as the information need",
      },
    },
    required: ["query"],
  },
}
|
|
22
|
+
|
|
23
|
+
/**
 * Handle a `get_context` call.
 *
 * 1. If the backend has `ask` and it returns an answer with confidence >= 0.7,
 *    reply with that exact fact (or a bullet list of facts), plus citations.
 * 2. Otherwise run `backend.query` and render the ranked hits, or a short
 *    explanation when nothing is accessible / relevant.
 *
 * @param args    Raw tool arguments; `query` is required by the input schema.
 * @param backend Active backend (`query` always, `ask` optionally).
 * @returns A single text block; `isError` is set when `query` is missing or blank.
 */
async function handler(args: Record<string, unknown>, backend: ContextBackend): Promise<ToolResult> {
  const query = String(args.query ?? "")
  // `query` is required by the schema; don't spend a KGQA + retrieval round trip on nothing.
  if (query.trim() === "") {
    return { content: [{ type: "text", text: "Provide `query`." }], isError: true }
  }

  // KGQA first: if the question maps to an exact fact in the typed graph,
  // answer precisely (with citation) instead of returning a ranked list.
  if (backend.ask) {
    const exact = await backend.ask(query)
    if (exact && exact.confidence >= 0.7) {
      const cite = exact.citations.length ? ` (source: ${exact.citations.join(", ")})` : ""
      // Multiple facts → list them as bullets (a query can match several typed facts);
      // a single fact stays inline.
      const facts = exact.facts?.length ? exact.facts : [exact.answer]
      const body = facts.length > 1 ? facts.map((f) => `• ${f}`).join("\n") : facts[0]
      // Only show the triple bracket for a SINGLE genuine relational fact (a real verb).
      // NOTE(review): `triple`'s optionality isn't visible from this file; the `?.` is defensive.
      const t = exact.triple
      const nonRelational = ["info", "facts", "mentioned_as", "prefer"]
      const isRelational = facts.length === 1 && Boolean(t?.predicate) && !nonRelational.includes(t.predicate)
      const tripleLine = isRelational ? `\n[${t.subject} -${t.predicate}→ ${t.object}]` : ""
      return { content: [{ type: "text", text: `${body}${cite}${tripleLine}` }] }
    }
  }

  const res = await backend.query(query)
  let text: string
  if (res.status === RetrievalStatus.SUCCESS && res.searchResults.length) {
    text = render(res.searchResults)
  } else if (res.status === RetrievalStatus.ACCESSIBLE_RECORDS_NOT_FOUND) {
    text = "The knowledge graph is empty or you have no access yet."
  } else {
    text = "No relevant context found."
  }
  return { content: [{ type: "text", text }] }
}
|
|
51
|
+
|
|
52
|
+
/** Tool module for `get_context`; no capability gate, so it is always listed. */
export const getContextTool: ToolModule = {
  schema,
  handler,
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// Aggregates the per-tool modules into the ListTools schema array and a
|
|
2
|
+
// name→handler dispatch map, applying each tool's capability gate against the
|
|
3
|
+
// active backend. Order matches the original inline TOOLS array exactly so the
|
|
4
|
+
// ListTools response and the context_about "## Tools" section are unchanged.
|
|
5
|
+
|
|
6
|
+
import type { ContextBackend } from "../backend"
|
|
7
|
+
import { getContextTool } from "./get-context"
|
|
8
|
+
import { contextIngestTool } from "./context-ingest"
|
|
9
|
+
import { contextGraphTool } from "./context-graph"
|
|
10
|
+
import { contextRebuildTool } from "./context-rebuild"
|
|
11
|
+
import { contextRelateTool } from "./context-relate"
|
|
12
|
+
import { contextAboutTool, setAboutToolList } from "./context-about"
|
|
13
|
+
import type { ToolModule, ToolResult, ToolSchema } from "./types"
|
|
14
|
+
|
|
15
|
+
/**
 * Every tool module, before capability gating. The array order is the order
 * tools appear in the resolved schema list, so do not re-sort it.
 */
const ALL: ToolModule[] = [
  getContextTool,
  contextIngestTool,
  contextGraphTool,
  contextRebuildTool,
  contextRelateTool,
  contextAboutTool,
]
|
|
23
|
+
|
|
24
|
+
/** Result of gating the tool modules against one backend. */
export interface ResolvedTools {
  /** Capability-gated schemas, for ListTools (and the about report). */
  schemas: ToolSchema[]

  /** name → handler for CallTool dispatch (only available tools are present). */
  dispatch: Map<string, ToolModule["handler"]>
}
|
|
30
|
+
|
|
31
|
+
/**
 * Gate every tool module against `backend` and build the ListTools schema list
 * plus the name → handler dispatch map.
 *
 * Side effect: passes the resulting schema list to `setAboutToolList`.
 *
 * @param backend Backend whose capabilities decide which tools are exposed.
 * @returns Schemas and dispatch map containing only the available tools, in `ALL` order.
 */
export function resolveTools(backend: ContextBackend): ResolvedTools {
  const schemas: ToolSchema[] = []
  const dispatch = new Map<string, ToolModule["handler"]>()

  for (const tool of ALL) {
    // A module without a gate is always available.
    if (tool.available && !tool.available(backend)) continue
    schemas.push(tool.schema)
    dispatch.set(tool.schema.name, tool.handler)
  }

  // Keep context_about's "## Tools" listing in sync with what's actually exposed.
  setAboutToolList(schemas)
  return { schemas, dispatch }
}
|
|
39
|
+
|
|
40
|
+
export type { ToolModule, ToolResult, ToolSchema }
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
// Shared shapes for the modular MCP tools. Each tool module exports its name,
|
|
2
|
+
// its JSON input schema (the entry that used to live in server.ts's TOOLS array),
|
|
3
|
+
// and an async handler `(args, backend) => result`. A tool may be GATED behind a
|
|
4
|
+
// backend capability via `available(backend)` - when it returns false the tool is
|
|
5
|
+
// not listed and not dispatchable (identical to the old `...(backend.x ? [...] : [])`).
|
|
6
|
+
|
|
7
|
+
import type { ContextBackend } from "../backend"
|
|
8
|
+
|
|
9
|
+
/** What a tool handler resolves to: text blocks plus an optional error flag. */
export interface ToolResult {
  /** Text blocks returned to the caller. */
  content: { type: "text"; text: string }[]

  /** Set to true when the call failed; omitted on success. */
  isError?: boolean

  // The MCP SDK's ServerResult union accepts an open record; this index signature
  // lets ToolResult satisfy that branch (so handlers don't need the task-result fields).
  [key: string]: unknown
}
|
|
16
|
+
|
|
17
|
+
/** One ListTools entry: the tool's name, description and JSON input schema. */
export interface ToolSchema {
  /** Tool name; also used as the dispatch key. */
  name: string

  /** Free-text description of the tool. */
  description: string

  /** JSON-schema-shaped object describing the tool's arguments. */
  inputSchema: {
    type: "object"
    properties: Record<string, unknown>
    required?: string[]
  }
}
|
|
22
|
+
|
|
23
|
+
/** A self-contained tool: its schema, its handler, and an optional capability gate. */
export interface ToolModule {
  /** The ListTools entry for this tool. */
  schema: ToolSchema

  /** Runs the tool with the raw arguments against the given backend. */
  handler: (args: Record<string, unknown>, backend: ContextBackend) => Promise<ToolResult>

  /** Optional capability gate; defaults to always-available when omitted. */
  available?: (backend: ContextBackend) => boolean
}
|
|
29
|
+
|
|
30
|
+
/**
 * Slugify a title into a record-id prefix.
 *
 * Lower-cases the input, collapses every run of characters outside `[a-z0-9]`
 * into a single hyphen, strips leading/trailing hyphens, and caps the result at
 * 40 characters. Hyphens are stripped again after the cap, because cutting at
 * 40 can land right after a separator and leave a dangling "-".
 *
 * @param s Title to slugify.
 * @returns The slug (at most 40 chars, never starting or ending with "-"),
 *          or "note" when nothing usable remains (e.g. empty or all-symbol input).
 */
export function slug(s: string): string {
  const maxLength = 40
  const hyphenated = s
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
  // Truncation can expose an interior hyphen as the last character; drop it.
  const capped = hyphenated.slice(0, maxLength).replace(/-+$/, "")
  return capped || "note"
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// Ported from PipesHub `backend/python/app/models/permission.py`.
|
|
2
|
+
// The ACL primitive: a permission is an edge from a principal (user/group/role/
|
|
3
|
+
// domain/org/team/anyone) to a record, carrying a role. Mirrors the sharing
|
|
4
|
+
// semantics of the source systems (Drive/Slack), so enforcement at query time is
|
|
5
|
+
// just a graph traversal over these edges.
|
|
6
|
+
|
|
7
|
+
/**
 * Role carried by a permission edge. Member names are the short verbs; the
 * string values are the role names stored on the edge.
 */
export enum PermissionType {
  READ = "READER",
  WRITE = "WRITER",
  OWNER = "OWNER",
  COMMENT = "COMMENTER",
  OTHER = "OTHERS",
}
|
|
14
|
+
|
|
15
|
+
/** Kind of principal a permission is granted to. Each value equals its member name. */
export enum EntityType {
  USER = "USER",
  GROUP = "GROUP",
  ROLE = "ROLE",
  DOMAIN = "DOMAIN",
  ORG = "ORG",
  TEAM = "TEAM",
  ANYONE = "ANYONE",
  ANYONE_WITH_LINK = "ANYONE_WITH_LINK",
}
|
|
25
|
+
|
|
26
|
+
/** A permission grant as ingested, before it is turned into a graph edge. */
export interface Permission {
  /** Optional external identifier. NOTE(review): presumably the principal's id in the source system - confirm. */
  externalId?: string
  /** Optional email address. */
  email?: string
  /** Role granted (becomes the edge's `role`). */
  type: PermissionType
  /** Kind of principal (becomes the edge's `type`). */
  entityType: EntityType
  /** Creation timestamp (numeric; unit not visible here). */
  createdAt: number
  /** Last-update timestamp (numeric; unit not visible here). */
  updatedAt: number
}
|
|
34
|
+
|
|
35
|
+
/** Generic permission-edge shape consumed by the graph store. */
export interface PermissionEdge {
  /** Id and collection of the edge's source node. */
  from_id: string
  from_collection: string

  /** Id and collection of the edge's target node. */
  to_id: string
  to_collection: string

  /** Role granted by this edge. */
  role: PermissionType
  /** Kind of principal the edge belongs to. */
  type: EntityType

  /** Creation / last-update timestamps copied from the permission. */
  createdAtTimestamp: number
  updatedAtTimestamp: number
}
|
|
46
|
+
|
|
47
|
+
/**
 * Turn a `Permission` into the edge shape stored in the graph.
 *
 * @param perm           Permission supplying role, principal kind and timestamps.
 * @param fromId         Id of the edge's source node.
 * @param fromCollection Collection of the source node.
 * @param toId           Id of the edge's target node.
 * @param toCollection   Collection of the target node.
 * @returns A new edge object; `perm` is not modified.
 */
export function toGraphPermission(
  perm: Permission,
  fromId: string,
  fromCollection: string,
  toId: string,
  toCollection: string,
): PermissionEdge {
  const { type: role, entityType, createdAt, updatedAt } = perm
  const edge: PermissionEdge = {
    from_id: fromId,
    from_collection: fromCollection,
    to_id: toId,
    to_collection: toCollection,
    role,
    type: entityType,
    createdAtTimestamp: createdAt,
    updatedAtTimestamp: updatedAt,
  }
  return edge
}
|
|
65
|
+
|
|
66
|
+
/**
 * Flattened access-control summary: string lists per access level plus a link-sharing flag.
 * NOTE(review): what the strings identify (emails, ids, ...) is not visible here - confirm.
 */
export interface AccessControl {
  owners: string[]
  editors: string[]
  viewers: string[]
  domains: string[]
  /** Whether link-sharing applies. */
  anyoneWithLink: boolean
}
|
package/src/provider.ts
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// The seams. The ACL + retrieval *logic* is native TS (graph-provider.ts /
|
|
2
|
+
// retrieval.ts); the actual datastores plug in behind these interfaces so the
|
|
3
|
+
// moat stays runtime-agnostic (ArangoDB today, anything tomorrow).
|
|
4
|
+
|
|
5
|
+
import type { AccessibleMap, RecordDoc, RetrievalFilters, UserDoc } from "./types"
|
|
6
|
+
|
|
7
|
+
/** Thin transport to ArangoDB. Implement with arangojs or the org's HTTP client. */
export interface ArangoClient {
  /**
   * Run one AQL query with its bind variables.
   * @returns The result rows (untyped).
   */
  executeAql(
    query: string,
    bindVars: Record<string, unknown>,
  ): Promise<any[]>
}
|
|
11
|
+
|
|
12
|
+
/** Graph store: identity, the ACL traversal, and record fetches. */
export interface GraphProvider {
  /** THE moat: virtualRecordId -> recordId for everything the user may access. */
  getAccessibleVirtualRecordIds(args: {
    userId: string
    orgId: string
    filters?: RetrievalFilters
  }): Promise<AccessibleMap>

  /** Fetch the records with the given ids for one org. */
  getRecordsByRecordIds(recordIds: string[], orgId: string): Promise<RecordDoc[]>

  /** Look up a user; resolves to null when there is none. */
  getUserByUserId(userId: string): Promise<UserDoc | null>

  /** List the apps for a user key; each entry may carry `_key` and/or `id`. */
  getUserApps(userKey: string): Promise<{ _key?: string; id?: string }[]>

  /** Fetch one document from a named collection; resolves to null when there is none. */
  getDocument(recordId: string, collection: string): Promise<RecordDoc | null>
}
|
|
26
|
+
|
|
27
|
+
/** Vector store contract types - hybrid (dense + sparse, RRF) search over Qdrant-shaped
 * payloads. A `VectorPoint` is one scored hit. `filterCollection` (on `VectorDBService`
 * below) builds the must/should filter restricting the search to ACL-approved
 * virtualRecordIds. */
|
|
30
|
+
export interface VectorPoint {
  /** Point identifier (string or numeric). */
  id: string | number
  /** Score attached to the hit by the vector store. */
  score: number
  /** Stored payload: optional text content plus optional free-form metadata. */
  payload: {
    page_content?: string
    metadata?: Record<string, unknown>
  }
}
|
|
35
|
+
|
|
36
|
+
/** The hits returned for one search request. */
export interface VectorQueryResult {
  points: VectorPoint[]
}
|
|
39
|
+
|
|
40
|
+
/** Vector store operations the retrieval code depends on. */
export interface VectorDBService {
  /**
   * Build a store-specific filter from `must` / `should` specs.
   * The result is opaque to callers (`unknown`).
   */
  filterCollection(args: {
    must?: Record<string, unknown>
    should?: Record<string, unknown>
  }): Promise<unknown>

  /**
   * Run a batch of search requests against one collection.
   * Resolves to an array of per-request results.
   */
  queryNearestPoints(args: {
    collectionName: string
    requests: unknown[]
  }): Promise<VectorQueryResult[]>
}
|
|
51
|
+
|
|
52
|
+
/**
 * Embedding provider - dense + sparse (BM25) for hybrid retrieval.
 *
 * `embedDense` embeds DOCUMENTS (chunks); `embedQuery` embeds the QUERY. The two
 * differ for asymmetric models (e.g. EmbeddingGemma's task prefixes); symmetric
 * models leave `embedQuery` undefined and callers fall back to `embedDense`.
 */
export interface EmbeddingProvider {
  /** Dense vector for a document/chunk. */
  embedDense(query: string): Promise<number[]>

  /** Dense vector for a query; optional (see above). */
  embedQuery?(query: string): Promise<number[]>

  /** Sparse vector as parallel `indices` / `values` arrays. */
  embedSparse(query: string): Promise<{ indices: number[]; values: number[] }>
}
|
|
61
|
+
|
|
62
|
+
/**
 * Embed a QUERY: use the provider's `embedQuery` when it has one, otherwise
 * fall back to `embedDense`.
 *
 * @param emb  Embedding provider.
 * @param text Query text to embed.
 * @returns The dense vector produced by whichever method was used.
 */
export function embedQueryWith(emb: EmbeddingProvider, text: string): Promise<number[]> {
  if (emb.embedQuery) {
    return emb.embedQuery(text)
  }
  return emb.embedDense(text)
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// VectorDBService adapter over Qdrant's HTTP API (no SDK dependency).
|
|
2
|
+
// `filterCollection` builds a Qdrant filter; `queryNearestPoints` runs the hybrid
|
|
3
|
+
// (dense + sparse, RRF) batch query the retrieval spine assembles.
|
|
4
|
+
|
|
5
|
+
import type { VectorDBService, VectorQueryResult } from "./provider"
|
|
6
|
+
|
|
7
|
+
/** Connection settings for the Qdrant HTTP adapter. */
export interface QdrantConfig {
  /** e.g. http://localhost:6333 */
  url: string

  /** Optional API key. */
  apiKey?: string

  /** payload key prefix for metadata fields (Qdrant stores them under `metadata`). */
  metadataPrefix?: string

  /** Optional `fetch` replacement. */
  fetchImpl?: typeof fetch
}
|
|
15
|
+
|
|
16
|
+
/**
 * Qdrant filter condition on one payload key: either match any of a list of
 * values (`any`) or match a single value (`value`).
 */
interface QdrantCondition {
  key: string
  match:
    | { any: unknown[] }
    | { value: unknown }
}
|
|
21
|
+
/** Qdrant filter object; each clause is present only when it has conditions. */
interface QdrantFilter {
  must?: QdrantCondition[]
  should?: QdrantCondition[]
}
|
|
25
|
+
|
|
26
|
+
/**
 * `VectorDBService` adapter that talks to Qdrant over plain HTTP.
 *
 * - `filterCollection` turns `{ key: value | values[] }` specs into Qdrant
 *   `must` / `should` conditions, prefixing every key with the metadata prefix.
 * - `queryNearestPoints` POSTs the given requests to
 *   `/collections/<name>/points/query/batch` and returns one result per request.
 */
export class QdrantVectorService implements VectorDBService {
  private readonly fetch: typeof fetch
  private readonly prefix: string

  /**
   * @param cfg Connection settings. `fetchImpl` defaults to the global `fetch`;
   *            `metadataPrefix` defaults to `"metadata."`.
   */
  constructor(private readonly cfg: QdrantConfig) {
    this.fetch = cfg.fetchImpl ?? fetch
    this.prefix = cfg.metadataPrefix ?? "metadata."
  }

  /** JSON content type, plus the `api-key` header when an API key is configured. */
  private headers(): Record<string, string> {
    const h: Record<string, string> = { "content-type": "application/json" }
    if (this.cfg.apiKey) h["api-key"] = this.cfg.apiKey
    return h
  }

  /** One condition per spec entry: array values become `match.any`, anything else `match.value`. */
  private conditions(spec?: Record<string, unknown>): QdrantCondition[] {
    if (!spec) return []
    return Object.entries(spec).map(([key, val]) => ({
      key: `${this.prefix}${key}`,
      match: Array.isArray(val) ? { any: val } : { value: val },
    }))
  }

  /**
   * Build a Qdrant filter from `must` / `should` specs.
   * Empty clauses are omitted, so no specs yields `{}`.
   */
  async filterCollection(args: {
    must?: Record<string, unknown>
    should?: Record<string, unknown>
  }): Promise<QdrantFilter> {
    const filter: QdrantFilter = {}
    const must = this.conditions(args.must)
    const should = this.conditions(args.should)
    if (must.length) filter.must = must
    if (should.length) filter.should = should
    return filter
  }

  /**
   * Run a batch query against one collection.
   *
   * @param args.collectionName Collection to query (URL-encoded into the path).
   * @param args.requests       Search requests, sent as the `searches` array.
   * @returns One `{ points }` entry per result returned; missing `points` become `[]`.
   * @throws Error `qdrant query failed: <status> <detail>` on a non-2xx response.
   *         `<detail>` is the JSON body's `status` field when the body is JSON,
   *         otherwise the HTTP status text.
   */
  async queryNearestPoints(args: {
    collectionName: string
    requests: unknown[]
  }): Promise<VectorQueryResult[]> {
    // Tolerate any number of trailing slashes on the configured base URL.
    const base = this.cfg.url.replace(/\/+$/, "")
    const url = `${base}/collections/${encodeURIComponent(args.collectionName)}/points/query/batch`

    // Call through a detached reference: some runtimes reject a platform `fetch`
    // invoked with a foreign `this` (as `this.fetch(...)` would do).
    const send = this.fetch
    const res = await send(url, {
      method: "POST",
      headers: this.headers(),
      body: JSON.stringify({ searches: args.requests }),
    })

    // Check the HTTP status BEFORE trusting the body: an error response may not
    // be JSON at all, and a parse failure must not hide the status code.
    if (!res.ok) {
      let errStatus: unknown
      try {
        errStatus = ((await res.json()) as { status?: unknown } | null)?.status
      } catch {
        // non-JSON error body; fall back to the status text below
      }
      const detail = errStatus === undefined ? res.statusText || "" : JSON.stringify(errStatus)
      throw new Error(`qdrant query failed: ${res.status} ${detail}`.trimEnd())
    }

    const body = (await res.json()) as { result?: Array<{ points?: VectorQueryResult["points"] }> } | null
    return (body?.result ?? []).map((r) => ({ points: r.points ?? [] }))
  }
}
|
package/src/retrieval.ts
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
// Ported from PipesHub `modules/retrieval/retrieval_service.py::search_with_filters`.
|
|
2
|
+
//
|
|
3
|
+
// The security-critical invariant, preserved exactly:
|
|
4
|
+
// 1. Compute the user's accessible {virtualRecordId -> recordId} map from the
|
|
5
|
+
// GRAPH first (ACL). 2. Restrict the vector search to those virtual ids.
|
|
6
|
+
// 3. For every hit, resolve the recordId from the ACCESSIBLE MAP - never from
|
|
7
|
+
// the vector payload. That step is the cross-connector leak guard: a shared
|
|
8
|
+
// virtualRecordId only ever resolves to the record THIS user may see.
|
|
9
|
+
//
|
|
10
|
+
// The cosmetic webUrl/mime fallback enrichment for file/mail records
|
|
11
|
+
// (retrieval_service.py:462-532) is intentionally omitted here - it is
|
|
12
|
+
// presentation, not access control. Port it later if you need source links.
|
|
13
|
+
|
|
14
|
+
import { embedQueryWith, type EmbeddingProvider, type GraphProvider, type VectorDBService, type VectorPoint } from "./provider"
|
|
15
|
+
import {
|
|
16
|
+
ACCESSIBLE_RECORDS_NOT_FOUND_MESSAGE,
|
|
17
|
+
RetrievalStatus,
|
|
18
|
+
type RecordDoc,
|
|
19
|
+
type RetrievalFilters,
|
|
20
|
+
type RetrievalResponse,
|
|
21
|
+
type SearchResult,
|
|
22
|
+
type UserDoc,
|
|
23
|
+
} from "./types"
|
|
24
|
+
|
|
25
|
+
// Names of record metadata fields, frozen as a readonly tuple of literals.
// NOTE(review): not referenced in the visible part of this file - confirm where (or whether) it is enforced.
const REQUIRED_FIELDS = ["origin", "recordName", "recordId", "mimeType", "orgId"] as const
|
|
26
|
+
|
|
27
|
+
/** Everything `RetrievalService` needs injected. */
export interface RetrievalDeps {
  /** Graph store used for the ACL lookup, user lookup and record fetches. */
  graph: GraphProvider
  /** Vector store used to build the filter and run searches. */
  vector: VectorDBService
  /** Embedding provider for the queries. */
  embeddings: EmbeddingProvider
  /** Name of the vector collection. */
  collectionName: string
  /** Optional logger; the service substitutes no-op methods when omitted. */
  log?: {
    info: (m: string) => void
    debug: (m: string) => void
    error: (m: string) => void
  }
}
|
|
34
|
+
|
|
35
|
+
export class RetrievalService {
|
|
36
|
+
constructor(private readonly deps: RetrievalDeps) {}
|
|
37
|
+
private get log() {
|
|
38
|
+
return this.deps.log ?? { info() {}, debug() {}, error() {} }
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/**
 * Runs a permission-aware search for `queries` on behalf of one user.
 *
 * Pipeline:
 *  1. Load, in parallel, the user's accessible `virtualRecordId -> recordId`
 *     map and the user document.
 *  2. Build a vector-store filter and run all queries through
 *     `executeParallelSearches`.
 *  3. Resolve record ids ONLY through the accessible map, fetch those records,
 *     and drop every hit whose `virtualRecordId` is not an own key of the map.
 *  4. Enrich the surviving hits with record metadata, sort by score
 *     (descending), and drop hits lacking content or any of `REQUIRED_FIELDS`.
 *
 * Errors raised inside the pipeline are caught, logged, and reported as a
 * `RetrievalStatus.ERROR` response instead of being rethrown.
 *
 * @param args.queries Query strings to search for.
 * @param args.userId User whose permissions bound the result set.
 * @param args.orgId Organisation used in the vector filter and record fetch.
 * @param args.filterGroups Optional filters forwarded to the ACL lookup; a
 *   non-empty `kb` list is echoed back in `appliedFilters`.
 * @param args.limit Maximum hits requested per query (defaults to 20).
 * @param args.virtualRecordIdsFromTool When provided, the vector filter matches
 *   these ids instead of the accessible-id list.
 * @returns A populated response on success, otherwise an empty response whose
 *   status and message describe why nothing was returned.
 */
async searchWithFilters(args: {
  queries: string[]
  userId: string
  orgId: string
  filterGroups?: RetrievalFilters
  limit?: number
  virtualRecordIdsFromTool?: string[]
}): Promise<RetrievalResponse> {
  const { queries, userId, orgId } = args
  const limit = args.limit ?? 20
  const filterGroups = args.filterGroups ?? {}
  const kbIds = filterGroups.kb

  try {
    // (1) ACL + user, in parallel.
    const [accessibleMap, user] = await Promise.all([
      this.deps.graph.getAccessibleVirtualRecordIds({ userId, orgId, filters: filterGroups }),
      this.deps.graph.getUserByUserId(userId),
    ])

    if (Object.keys(accessibleMap).length === 0) {
      this.log.error(`No accessible documents for user ${userId} org ${orgId}`)
      return this.empty(ACCESSIBLE_RECORDS_NOT_FOUND_MESSAGE, RetrievalStatus.ACCESSIBLE_RECORDS_NOT_FOUND)
    }

    // Membership test for the permission map. It must be an OWN-property check:
    // the `in` operator also matches keys inherited from Object.prototype
    // ("constructor", "toString", "__proto__", ...), which would let a hit
    // carrying such a virtualRecordId slip past the guard.
    const isAccessible = (vid: PropertyKey): boolean =>
      Object.prototype.hasOwnProperty.call(accessibleMap, vid)

    // (2) Vector filter. Without tool-supplied ids it is restricted to the
    // ACL-approved virtual ids. With tool-supplied ids the filter uses those ids
    // as given, so access control for that branch rests on the guard in (3).
    const filter = args.virtualRecordIdsFromTool
      ? await this.deps.vector.filterCollection({
          must: { orgId, virtualRecordId: args.virtualRecordIdsFromTool },
        })
      : await this.deps.vector.filterCollection({
          must: { orgId },
          should: { virtualRecordId: Object.keys(accessibleMap) },
        })

    const searchResults = await this.executeParallelSearches(queries, filter, limit)
    if (searchResults.length === 0)
      return this.empty(
        "No relevant documents found for your search query. Try different keywords.",
        RetrievalStatus.EMPTY_RESPONSE,
      )

    const returnedVirtualIds = [
      ...new Set(
        searchResults
          .map((r) => r.metadata?.virtualRecordId)
          .filter((v): v is string => v != null),
      ),
    ]
    if (returnedVirtualIds.length === 0)
      return this.empty(ACCESSIBLE_RECORDS_NOT_FOUND_MESSAGE, RetrievalStatus.ACCESSIBLE_RECORDS_NOT_FOUND)

    // (3) THE leak guard: resolve recordIds ONLY through the accessible map.
    const recordIdsToFetch = [
      ...new Set(returnedVirtualIds.filter((v) => isAccessible(v)).map((v) => accessibleMap[v])),
    ]

    const fetched = await this.deps.graph.getRecordsByRecordIds(recordIdsToFetch, orgId)
    if (!fetched || fetched.length === 0)
      return this.empty(ACCESSIBLE_RECORDS_NOT_FOUND_MESSAGE, RetrievalStatus.ACCESSIBLE_RECORDS_NOT_FOUND)

    const recordById = new Map<string, RecordDoc>()
    for (const r of fetched) if (r?._key) recordById.set(r._key, r)

    // Loop-invariant: only needed to fill the Gmail URL placeholder below.
    const userEmail = (user as UserDoc | null)?.email

    // Enrich each result with its (permission-verified) record metadata.
    const enriched: SearchResult[] = []
    for (const result of searchResults) {
      const vid = result.metadata?.virtualRecordId
      if (vid == null || !isAccessible(vid)) continue // not permitted → drop
      const recordId = accessibleMap[vid]
      const record = recordById.get(recordId)
      result.metadata.recordId = recordId
      if (record) {
        result.metadata.origin = record.origin
        result.metadata.connector = record.connectorName ?? null
        result.metadata.connectorId = record.connectorId ?? null
        result.metadata.kbId = record.kbId ?? null
        result.metadata.recordName = record.recordName
        result.metadata.orgId = orgId
        let weburl = record.webUrl
        // Gmail links carry a "{user.email}" placeholder; fill it in when the
        // requesting user's email is known.
        if (weburl?.startsWith("https://mail.google.com/mail?authuser=") && userEmail)
          weburl = weburl.replace("{user.email}", userEmail)
        if (weburl) result.metadata.webUrl = weburl
        if (record.mimeType) result.metadata.mimeType = record.mimeType
      }
      enriched.push(result)
    }

    // Highest score first; a missing score sorts as 0.
    enriched.sort((a, b) => (b.score ?? 0) - (a.score ?? 0))

    // Drop results missing fields citation validation requires.
    const complete = enriched.filter((r) => {
      if (!r.content) return false
      return REQUIRED_FIELDS.every((f) => (r.metadata as Record<string, unknown>)[f] != null)
    })

    const records = recordIdsToFetch.map((id) => recordById.get(id)).filter((r): r is RecordDoc => Boolean(r))

    if (complete.length === 0 && records.length === 0)
      return this.empty(
        "No relevant documents found for your search query.",
        RetrievalStatus.EMPTY_RESPONSE,
      )

    const resp: RetrievalResponse = {
      searchResults: complete,
      records,
      status: RetrievalStatus.SUCCESS,
      statusCode: 200,
      message: "Query processed successfully. Relevant records retrieved.",
    }
    if (kbIds?.length) resp.appliedFilters = { kb: kbIds, kb_count: kbIds.length }
    return resp
  } catch (e) {
    this.log.error(`Filtered search failed: ${String(e)}`)
    return this.empty("Unexpected server error during search.", RetrievalStatus.ERROR)
  }
}
|
|
159
|
+
|
|
160
|
+
// Hybrid dense+sparse retrieval with RRF fusion. The request shape is
|
|
161
|
+
// Qdrant-flavored; the VectorDBService adapter forwards it to the client.
|
|
162
|
+
/**
 * Embeds every query (dense and sparse, concurrently), sends all resulting
 * requests to the vector store in one `queryNearestPoints` call, and flattens
 * the returned points into `SearchResult`s.
 *
 * A point id that appears more than once is kept only the first time it is
 * seen, in response order.
 *
 * @param queries Query strings; one request is built per entry.
 * @param filter Filter attached to both prefetch stages and to the fused query.
 * @param limit Final hit count per request; each prefetch stage asks for twice as many.
 * @returns De-duplicated hits in first-seen order.
 */
private async executeParallelSearches(
  queries: string[],
  filter: unknown,
  limit: number,
): Promise<SearchResult[]> {
  const prefetchLimit = limit * 2

  // One fused request per query: dense + sparse candidates merged with RRF.
  const buildRequest = async (text: string) => {
    const [denseVector, sparseVector] = await Promise.all([
      embedQueryWith(this.deps.embeddings, text), // QUERY embedding (asymmetric-aware)
      this.deps.embeddings.embedSparse(text),
    ])
    return {
      prefetch: [
        { query: denseVector, using: "dense", limit: prefetchLimit, filter },
        { query: sparseVector, using: "sparse", limit: prefetchLimit, filter },
      ],
      query: { fusion: "RRF" },
      with_payload: true,
      limit,
      filter,
    }
  }

  const requests = await Promise.all(queries.map((text) => buildRequest(text)))

  const batches = await this.deps.vector.queryNearestPoints({
    collectionName: this.deps.collectionName,
    requests,
  })

  // Map keeps insertion order, so the first occurrence of each point id wins
  // and the output order matches the order points were first encountered.
  const hitsById = new Map<string | number, SearchResult>()
  for (const batch of batches) {
    for (const point of batch.points as VectorPoint[]) {
      if (hitsById.has(point.id)) continue
      const metadata = { ...(point.payload.metadata ?? {}), point_id: point.id }
      hitsById.set(point.id, {
        score: point.score,
        citationType: "vectordb|document",
        metadata,
        content: point.payload.page_content ?? "",
      })
    }
  }
  return [...hitsById.values()]
}
|
|
207
|
+
|
|
208
|
+
/**
 * Builds a response with no search results and no records.
 *
 * @param message Human-readable explanation placed in the response.
 * @param status Outcome to report; also selects the numeric `statusCode`.
 * @returns The empty response; `statusCode` is 500 for any status without an
 *   explicit mapping.
 */
private empty(message: string, status: RetrievalStatus): RetrievalResponse {
  let statusCode: number
  switch (status) {
    case RetrievalStatus.SUCCESS:
    case RetrievalStatus.EMPTY_RESPONSE:
      statusCode = 200
      break
    case RetrievalStatus.ACCESSIBLE_RECORDS_NOT_FOUND:
      statusCode = 404
      break
    case RetrievalStatus.VECTOR_DB_EMPTY:
      statusCode = 503
      break
    case RetrievalStatus.ERROR:
    default:
      statusCode = 500
  }
  return { searchResults: [], records: [], status, statusCode, message }
}
|
|
218
|
+
}
|