@199-bio/engram 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ColBERT bridge for Engram
4
+ Uses RAGatouille for state-of-the-art retrieval
5
+
6
+ Run as subprocess from Node.js, communicates via JSON over stdin/stdout.
7
+ """
8
+
9
+ import sys
10
+ import json
11
+ import os
12
+ from pathlib import Path
13
+
14
# Turn off tokenizer parallelism before any tokenizer library is imported;
# this is what suppresses the warnings mentioned in the original comment.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
16
+
17
def lazy_load_ragatouille():
    """Import RAGatouille on demand and return its ``RAGPretrainedModel`` class.

    The import happens at call time rather than module import time so that
    starting the bridge does not pay for it.

    Returns:
        The ``RAGPretrainedModel`` class, or ``None`` when importing it raises
        ``ImportError``.
    """
    try:
        from ragatouille import RAGPretrainedModel as model_cls
    except ImportError:
        model_cls = None
    return model_cls
24
+
25
class ColBERTBridge:
    """Index, search and rerank documents with ColBERT through RAGatouille.

    Two attributes hold the lazily created state:

    * ``model`` - the pretrained ColBERT checkpoint, loaded on first use.
    * ``index`` - a RAGatouille model object bound to the on-disk index. It is
      either restored from disk with ``from_index`` or is ``model`` itself
      right after ``index_documents`` has built the index.

    Indexes are kept beneath ``<index_path>/.ragatouille`` so that the location
    written by ``index_documents`` is the same one ``_ensure_index`` reads.
    """

    def __init__(self, index_path: str):
        """Create the storage directory if needed; nothing heavy is loaded yet.

        Args:
            index_path: Directory under which the index is stored.
        """
        self.index_path = Path(index_path)
        self.index_path.mkdir(parents=True, exist_ok=True)
        self.model = None
        self.index = None
        self.index_name = "engram_index"

    def _index_root(self) -> Path:
        """Return the directory RAGatouille keeps its indexes in."""
        return self.index_path / ".ragatouille"

    def _index_dir(self) -> Path:
        """Return the directory of this bridge's named index."""
        return self._index_root() / "colbert" / "indexes" / self.index_name

    def _ensure_model(self):
        """Load the pretrained model if it is not already loaded.

        Raises:
            RuntimeError: If RAGatouille cannot be imported.
        """
        if self.model is None:
            RAGPretrainedModel = lazy_load_ragatouille()
            if RAGPretrainedModel is None:
                raise RuntimeError("RAGatouille not installed. Run: pip install ragatouille")
            # Pin the index root explicitly; the library default is relative
            # to the current working directory, which is not where
            # _ensure_index looks.
            self.model = RAGPretrainedModel.from_pretrained(
                "colbert-ir/colbertv2.0",
                index_root=str(self._index_root()),
            )

    def _ensure_index(self):
        """Load the existing on-disk index, if there is one.

        Best effort: when the index is missing, RAGatouille is unavailable, or
        loading fails, ``self.index`` simply stays ``None``.
        """
        if self.index is not None:
            return

        index_dir = self._index_dir()
        if not index_dir.exists():
            return

        RAGPretrainedModel = lazy_load_ragatouille()
        if RAGPretrainedModel is None:
            return

        try:
            self.index = RAGPretrainedModel.from_index(str(index_dir))
        except Exception as e:
            # Still best effort (callers fall back to rebuilding), but report
            # the failure on stderr instead of hiding it completely.
            print(
                f"[colbert-bridge] Could not load index at {index_dir}: {e}",
                file=sys.stderr,
                flush=True,
            )

    def index_documents(self, documents: list[dict]) -> dict:
        """Build the index from scratch.

        Args:
            documents: ``[{"id": "...", "content": "..."}]``

        Returns:
            ``{"success": True, "count": <number of documents>}``

        Raises:
            RuntimeError: If RAGatouille is not installed (non-empty input only).
            KeyError: If a document lacks ``"id"`` or ``"content"``.
        """
        # Nothing to index: answer without loading the (expensive) model.
        if not documents:
            return {"success": True, "count": 0}

        self._ensure_model()

        doc_ids = [d["id"] for d in documents]
        doc_contents = [d["content"] for d in documents]

        self.model.index(
            collection=doc_contents,
            document_ids=doc_ids,
            index_name=self.index_name,
            max_document_length=512,
            split_documents=True,
        )
        # index() returns the index *path*, not a searchable object. After
        # indexing, the model itself is bound to the new index, so it is what
        # search/add/delete must be called on.
        self.index = self.model

        return {"success": True, "count": len(documents)}

    def add_documents(self, documents: list[dict]) -> dict:
        """Add documents to the existing index, creating it if there is none.

        Args:
            documents: ``[{"id": "...", "content": "..."}]``

        Returns:
            ``{"success": True, "count": n}`` on success, otherwise
            ``{"success": False, "error": "<message>"}``.
        """
        if not documents:
            return {"success": True, "count": 0}

        self._ensure_index()

        if self.index is None:
            # No existing index, create new
            return self.index_documents(documents)

        doc_ids = [d["id"] for d in documents]
        doc_contents = [d["content"] for d in documents]

        try:
            self.index.add_to_index(
                new_collection=doc_contents,
                new_document_ids=doc_ids,
            )
            return {"success": True, "count": len(documents)}
        except Exception as e:
            # Report the failure; deciding whether to rebuild the whole index
            # is left to the caller.
            return {"success": False, "error": str(e)}

    def search(self, query: str, k: int = 10) -> dict:
        """Search the index.

        Args:
            query: Query text.
            k: Maximum number of results.

        Returns:
            ``{"results": [{"id": "...", "score": 0.9, "content": "..."}]}``.
            The list is empty when no index exists; on failure an ``"error"``
            key carries the message.
        """
        self._ensure_index()

        if self.index is None:
            return {"results": []}

        try:
            results = self.index.search(query=query, k=k)

            formatted = [
                {
                    "id": r.get("document_id", r.get("doc_id", "")),
                    "score": float(r.get("score", 0)),
                    "content": r.get("content", ""),
                }
                for r in results
            ]

            return {"results": formatted}
        except Exception as e:
            return {"results": [], "error": str(e)}

    def rerank(self, query: str, documents: list[dict], k: int = 10) -> dict:
        """Rerank the given documents against ``query`` using ColBERT.

        Args:
            query: Query text.
            documents: ``[{"id": "...", "content": "..."}]``
            k: Maximum number of results (capped at ``len(documents)``).

        Returns:
            ``{"results": [...]}`` in the same shape as ``search``; on failure
            an ``"error"`` key carries the message.

        Raises:
            RuntimeError: If RAGatouille is not installed (non-empty input only).
        """
        # Nothing to rerank: answer without loading the (expensive) model.
        if not documents:
            return {"results": []}

        self._ensure_model()

        doc_contents = [d["content"] for d in documents]

        try:
            results = self.model.rerank(
                query=query,
                documents=doc_contents,
                k=min(k, len(documents)),
            )

            formatted = []
            for r in results:
                # result_index points back into the list that was passed in,
                # which is how the caller's ids are recovered.
                idx = r.get("result_index", 0)
                if idx < len(documents):
                    formatted.append({
                        "id": documents[idx]["id"],
                        "score": float(r.get("score", 0)),
                        "content": documents[idx]["content"],
                    })

            return {"results": formatted}
        except Exception as e:
            return {"results": [], "error": str(e)}

    def delete_documents(self, doc_ids: list[str]) -> dict:
        """Delete documents from the index by id.

        Returns:
            ``{"success": True, "count": n}`` (``count`` is the number of ids
            requested, or 0 when there is no index), otherwise
            ``{"success": False, "error": "<message>"}``.
        """
        self._ensure_index()

        if self.index is None:
            return {"success": True, "count": 0}

        try:
            self.index.delete_from_index(document_ids=doc_ids)
            return {"success": True, "count": len(doc_ids)}
        except Exception as e:
            return {"success": False, "error": str(e)}
173
+
174
+
175
def main():
    """Serve JSON commands: one command per stdin line, one reply per stdout line.

    Supported ``action`` values: ``index``, ``add``, ``search``, ``rerank``,
    ``delete``, ``ping`` and ``quit``. A ``{"status": "ready"}`` line is
    written once at startup. Errors are reported as ``{"error": "..."}``
    replies rather than terminating the loop.

    The index location comes from ``ENGRAM_INDEX_PATH`` (default ``~/.engram``).
    """
    index_path = os.environ.get("ENGRAM_INDEX_PATH", os.path.expanduser("~/.engram"))
    bridge = ColBERTBridge(index_path)

    # Reserve the real stdout for protocol messages. Anything a library prints
    # via sys.stdout while loading or indexing goes to stderr instead, so it
    # cannot be mistaken for a reply by the process on the other end.
    protocol_out = sys.stdout
    sys.stdout = sys.stderr

    def reply(payload: dict) -> None:
        """Write one JSON reply line to the protocol stream."""
        print(json.dumps(payload), file=protocol_out, flush=True)

    try:
        # Signal ready
        reply({"status": "ready"})

        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            try:
                cmd = json.loads(line)
                if not isinstance(cmd, dict):
                    raise ValueError("Command must be a JSON object")
                action = cmd.get("action")

                if action == "index":
                    result = bridge.index_documents(cmd.get("documents", []))
                elif action == "add":
                    result = bridge.add_documents(cmd.get("documents", []))
                elif action == "search":
                    result = bridge.search(cmd.get("query", ""), cmd.get("k", 10))
                elif action == "rerank":
                    result = bridge.rerank(
                        cmd.get("query", ""),
                        cmd.get("documents", []),
                        cmd.get("k", 10),
                    )
                elif action == "delete":
                    result = bridge.delete_documents(cmd.get("ids", []))
                elif action == "ping":
                    result = {"status": "ok"}
                elif action == "quit":
                    # Acknowledge before exiting so a caller that waits for one
                    # reply per command is not left hanging.
                    reply({"status": "bye"})
                    break
                else:
                    result = {"error": f"Unknown action: {action}"}

                reply(result)

            except json.JSONDecodeError as e:
                reply({"error": f"Invalid JSON: {e}"})
            except Exception as e:
                # Top-level boundary: keep serving after any command failure.
                reply({"error": str(e)})
    finally:
        sys.stdout = protocol_out


if __name__ == "__main__":
    main()
@@ -0,0 +1,317 @@
1
+ /**
2
+ * ColBERT retriever - TypeScript wrapper for Python bridge
3
+ */
4
+
5
+ import { spawn, ChildProcess } from "child_process";
6
+ import { createInterface, Interface } from "readline";
7
+ import path from "path";
8
+ import { fileURLToPath } from "url";
9
+
10
// ES modules have no built-in __filename/__dirname, so derive them from the
// module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// The Python bridge lives in src/, not dist/: climb from dist/retrieval to the
// project root, then descend into src/retrieval.
const BRIDGE_PATH = path.join(
  __dirname,
  "..",
  "..",
  "src",
  "retrieval",
  "colbert-bridge.py",
);
15
+
16
/** A document handed to a retriever for indexing or reranking. */
export interface Document {
  /** Caller-chosen identifier, echoed back in search results. */
  id: string;
  /** Text that is indexed and matched against queries. */
  content: string;
}
20
+
21
/** One hit returned by a retriever's search or rerank call. */
export interface SearchResult {
  /** Identifier of the matched document. */
  id: string;
  /** Text of the matched document. */
  content: string;
  /** Relevance score; results are ordered by this value. */
  score: number;
}
26
+
27
/**
 * One JSON line received from the Python bridge. Every field is optional
 * because the fields present depend on which command was answered.
 */
interface BridgeResponse {
  /** Set on the startup signal ("ready") and on ping replies ("ok"). */
  status?: string;
  /** Outcome flag of index / add / delete commands. */
  success?: boolean;
  /** Number of documents an index / add / delete command reports. */
  count?: number;
  /** Hits of a search / rerank command. */
  results?: SearchResult[];
  /** Error message when the bridge reports a failure. */
  error?: string;
}
34
+
35
/**
 * Retriever backed by the Python ColBERT bridge.
 *
 * The bridge runs as a child process and speaks newline-delimited JSON: one
 * command per line on stdin, one response per line on stdout. Responses carry
 * no id, so they are matched to requests strictly in send order.
 */
export class ColBERTRetriever {
  private process: ChildProcess | null = null;
  private readline: Interface | null = null;
  // Callers awaiting a response. A Map iterates in insertion order, so its
  // first entry is always the oldest outstanding request.
  private pendingRequests: Map<number, {
    resolve: (value: BridgeResponse) => void;
    reject: (error: Error) => void;
  }> = new Map();
  private requestId = 0;
  private ready = false;
  // Created per process start (not once per instance) so that a restart waits
  // for the new process's own ready signal.
  private readyPromise: Promise<void> | null = null;
  private readyResolve: (() => void) | null = null;
  private readyReject: ((error: Error) => void) | null = null;

  constructor(private indexPath: string) {}

  /**
   * Start the Python bridge process and wait for its ready signal.
   *
   * Resolves once the bridge reports ready. Rejects if the process cannot be
   * spawned or exits before becoming ready. Calling it while a process is
   * already running (or starting) waits on that same process.
   */
  async start(): Promise<void> {
    if (this.process && this.readyPromise) {
      return this.readyPromise;
    }

    const child = spawn("python3", [BRIDGE_PATH], {
      env: {
        ...process.env,
        ENGRAM_INDEX_PATH: this.indexPath,
      },
      stdio: ["pipe", "pipe", "pipe"],
    });
    this.process = child;
    this.ready = false;

    const ready = new Promise<void>((resolve, reject) => {
      this.readyResolve = resolve;
      this.readyReject = reject;
    });
    this.readyPromise = ready;

    this.readline = createInterface({
      input: child.stdout!,
      crlfDelay: Infinity,
    });

    this.readline.on("line", (line) => {
      // Ignore output from a process that has since been stopped/replaced so
      // a stale line cannot answer a request sent to its successor.
      if (this.process === child) {
        this.handleLine(line);
      }
    });

    child.stderr?.on("data", (data) => {
      // Log Python errors for debugging
      console.error(`[ColBERT] ${data.toString()}`);
    });

    // Without a listener, a write to a dead process (EPIPE) would surface as
    // an uncaught 'error' event.
    child.stdin?.on("error", (error) => {
      console.error(`[ColBERT] stdin error: ${error.message}`);
    });

    // Emitted e.g. when python3 is not installed; unhandled it would crash
    // the host process instead of letting callers fall back.
    child.on("error", (error) => {
      if (this.process === child) {
        this.teardown(new Error(`Failed to start ColBERT bridge: ${error.message}`));
      }
    });

    child.on("exit", (code) => {
      console.error(`[ColBERT] Process exited with code ${code}`);
      if (this.process === child) {
        this.teardown(new Error(`ColBERT bridge exited with code ${code}`));
      }
    });

    // Wait for ready signal
    await ready;
  }

  /**
   * Forget the current process and fail everything that was waiting on it:
   * the ready promise (a no-op if already resolved) and all pending requests.
   */
  private teardown(error: Error): void {
    this.process = null;
    this.readline = null;
    this.ready = false;

    const rejectReady = this.readyReject;
    this.readyPromise = null;
    this.readyResolve = null;
    this.readyReject = null;
    rejectReady?.(error);

    const waiting = Array.from(this.pendingRequests.values());
    this.pendingRequests.clear();
    for (const { reject } of waiting) {
      reject(error);
    }
  }

  /**
   * Handle one stdout line from the bridge: either the ready signal or the
   * response to the oldest outstanding request. Unparseable lines are logged
   * and skipped.
   */
  private handleLine(line: string): void {
    let response: BridgeResponse | null;
    try {
      response = JSON.parse(line) as BridgeResponse | null;
    } catch {
      console.error(`[ColBERT] Failed to parse: ${line}`);
      return;
    }

    // Valid JSON that is not an object (e.g. `null`, `42`) is not a response.
    if (response === null || typeof response !== "object") {
      console.error(`[ColBERT] Failed to parse: ${line}`);
      return;
    }

    // Check for ready signal
    if (response.status === "ready") {
      this.ready = true;
      this.readyResolve?.();
      return;
    }

    // Handle response (simple protocol - responses come in order)
    const oldest = this.pendingRequests.entries().next();
    if (oldest.done) {
      return;
    }
    const [id, { resolve }] = oldest.value;
    this.pendingRequests.delete(id);
    resolve(response);
  }

  /**
   * Send one command and wait for its response, starting the bridge first if
   * necessary. Rejects if the bridge cannot be started, the write fails, or
   * the process goes away before answering.
   */
  private async send(command: Record<string, unknown>): Promise<BridgeResponse> {
    if (!this.process || !this.ready) {
      await this.start();
    }

    const stdin = this.process?.stdin;
    if (!stdin) {
      throw new Error("ColBERT bridge is not running");
    }

    return new Promise((resolve, reject) => {
      const id = this.requestId++;
      this.pendingRequests.set(id, { resolve, reject });

      const json = JSON.stringify(command) + "\n";
      stdin.write(json, (error) => {
        // A command that never reached the bridge will never be answered;
        // drop it from the queue so later responses stay aligned.
        if (error && this.pendingRequests.delete(id)) {
          reject(error);
        }
      });
    });
  }

  /**
   * Index documents for search
   */
  async index(documents: Document[]): Promise<{ success: boolean; count: number }> {
    const response = await this.send({
      action: "index",
      documents,
    });

    return {
      success: response.success ?? false,
      count: response.count ?? 0,
    };
  }

  /**
   * Add documents to existing index
   */
  async add(documents: Document[]): Promise<{ success: boolean; count: number }> {
    const response = await this.send({
      action: "add",
      documents,
    });

    return {
      success: response.success ?? false,
      count: response.count ?? 0,
    };
  }

  /**
   * Search for documents. Returns an empty list when the bridge response
   * carries no results.
   */
  async search(query: string, k: number = 10): Promise<SearchResult[]> {
    const response = await this.send({
      action: "search",
      query,
      k,
    });

    return response.results ?? [];
  }

  /**
   * Rerank documents using ColBERT. Returns an empty list when the bridge
   * response carries no results.
   */
  async rerank(query: string, documents: Document[], k: number = 10): Promise<SearchResult[]> {
    const response = await this.send({
      action: "rerank",
      query,
      documents,
      k,
    });

    return response.results ?? [];
  }

  /**
   * Delete documents from index
   */
  async delete(ids: string[]): Promise<{ success: boolean; count: number }> {
    const response = await this.send({
      action: "delete",
      ids,
    });

    return {
      success: response.success ?? false,
      count: response.count ?? 0,
    };
  }

  /**
   * Check if bridge is ready. Never throws; any failure yields false.
   */
  async ping(): Promise<boolean> {
    try {
      const response = await this.send({ action: "ping" });
      return response.status === "ok";
    } catch {
      return false;
    }
  }

  /**
   * Stop the Python bridge. Outstanding requests are rejected.
   */
  async stop(): Promise<void> {
    const child = this.process;
    if (!child) {
      return;
    }

    // Ask for a clean exit, but do not wait for an acknowledgement: the
    // process is killed right after, so a reply may never arrive.
    try {
      child.stdin?.write(JSON.stringify({ action: "quit" }) + "\n");
    } catch {
      // Ignore errors during shutdown
    }

    this.teardown(new Error("ColBERT bridge stopped"));
    child.kill();
  }
}
231
+
232
/**
 * Fallback retriever when ColBERT is not available.
 *
 * Keeps documents in memory and scores each one by the fraction of query
 * terms that occur in it as a case-insensitive substring.
 */
export class SimpleRetriever {
  private documents: Map<string, Document> = new Map();

  /**
   * Store documents, replacing any stored document with the same id.
   * Documents already stored under other ids are kept.
   */
  async index(documents: Document[]): Promise<{ success: boolean; count: number }> {
    for (const doc of documents) {
      this.documents.set(doc.id, doc);
    }
    return { success: true, count: documents.length };
  }

  /** Same as {@link index}. */
  async add(documents: Document[]): Promise<{ success: boolean; count: number }> {
    return this.index(documents);
  }

  /**
   * Return up to `k` stored documents matching `query`, best score first.
   * A query with no terms (empty or whitespace only) matches nothing.
   */
  async search(query: string, k: number = 10): Promise<SearchResult[]> {
    return this.rank(query, this.documents.values(), k);
  }

  /**
   * Score only the given documents against `query`; the stored documents are
   * neither consulted nor modified. If an id occurs more than once, the last
   * occurrence wins.
   */
  async rerank(query: string, documents: Document[], k: number = 10): Promise<SearchResult[]> {
    const candidates = new Map<string, Document>();
    for (const doc of documents) {
      candidates.set(doc.id, doc);
    }
    return this.rank(query, candidates.values(), k);
  }

  /** Remove documents by id; `count` is the number actually removed. */
  async delete(ids: string[]): Promise<{ success: boolean; count: number }> {
    let count = 0;
    for (const id of ids) {
      if (this.documents.delete(id)) {
        count++;
      }
    }
    return { success: true, count };
  }

  /** Score `documents` against `query` and return the top `k` matches. */
  private rank(query: string, documents: Iterable<Document>, k: number): SearchResult[] {
    // Splitting "" or a padded query yields empty strings, and every string
    // includes(""), so empty terms must be dropped or they match everything.
    const terms = query
      .toLowerCase()
      .split(/\s+/)
      .filter((term) => term.length > 0);
    if (terms.length === 0) {
      return [];
    }

    const results: SearchResult[] = [];
    for (const doc of documents) {
      const contentLower = doc.content.toLowerCase();
      let hits = 0;

      for (const term of terms) {
        if (contentLower.includes(term)) {
          hits += 1;
        }
      }

      if (hits > 0) {
        results.push({ id: doc.id, score: hits / terms.length, content: doc.content });
      }
    }

    return results
      .sort((a, b) => b.score - a.score)
      .slice(0, k);
  }
}
298
+
299
/**
 * Create the best available retriever.
 *
 * Tries to start the ColBERT bridge and ping it. If that succeeds the ColBERT
 * retriever is returned; if starting or pinging throws, or the ping reports
 * failure, an in-memory SimpleRetriever is returned instead.
 */
export async function createRetriever(indexPath: string): Promise<ColBERTRetriever | SimpleRetriever> {
  const colbert = new ColBERTRetriever(indexPath);
  let colbertUsable = false;

  try {
    await colbert.start();
    colbertUsable = await colbert.ping();
  } catch (error) {
    console.error("[Engram] ColBERT not available, using simple retriever:", error);
  }

  if (colbertUsable) {
    console.error("[Engram] Using ColBERT retriever");
    return colbert;
  }

  console.error("[Engram] Using simple fallback retriever");
  return new SimpleRetriever();
}