gitdb-vectors 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gitdb/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """GitDB — GPU-accelerated version-controlled vector database."""
2
+
3
+ from gitdb.core import GitDB, Transaction
4
+ from gitdb.hooks import HookManager
5
+ from gitdb.schema import Schema, SchemaError
6
+ from gitdb.types import (
7
+ Results, CommitInfo, DiffResult, MergeResult,
8
+ BlameEntry, BisectResult, StashEntry,
9
+ )
10
+
11
+ __version__ = "0.6.0"
12
+ __all__ = [
13
+ "GitDB", "Transaction", "HookManager", "Schema", "SchemaError",
14
+ "Results", "CommitInfo", "DiffResult",
15
+ "MergeResult", "BlameEntry", "BisectResult", "StashEntry",
16
+ ]
gitdb/ambient.py ADDED
@@ -0,0 +1,459 @@
1
+ """Emirati AC — Spreading Activation for GitDB.
2
+
3
+ Named after how Emiratis leave their car AC running so it's pre-chilled
4
+ when they walk out. The GPU continuously pattern-matches recent operations
5
+ against the entire vector store. Vectors activate neighboring vectors
6
+ through multi-hop chains. By the time you query, the answer is already hot.
7
+
8
+ Architecture (from SoulKeeper, adapted for version-controlled vectors):
9
+ 1. Context feed: every gitdb operation (log, diff, add, query) feeds context
10
+ 2. Direct activation: GPU matmul finds top matches (microseconds)
11
+ 3. Spreading activation: active vectors activate their neighbors
12
+ 4. Decay: activations fade over time (vectors cool down)
13
+ 5. Hot cache: top N activated vectors, sorted by activation level
14
+ 6. Drift detection: monitors semantic centroid drift on current branch
15
+
16
+ Multi-hop chains emerge naturally:
17
+ "auth tokens" → session_management (0.65) → security_audit (0.58)
18
+ → vulnerability_scan (0.52) → penetration_test (0.48)
19
+ """
20
+
21
+ import hashlib
22
+ import threading
23
+ import time
24
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple
25
+
26
+ import torch
27
+ import torch.nn.functional as F
28
+
29
+ from gitdb.types import Results, VectorMeta
30
+
31
+
32
class EmiratiAC:
    """Background spreading activation engine for GitDB.

    Continuously pattern-matches recent operations against the vector
    store in a daemon thread, so likely answers are already "hot"
    (activated and ranked) before a query arrives.

    Usage:
        db = GitDB("store", dim=1024, device="mps")
        db.ac.start()           # Engine on, AC running
        db.add(texts=["..."])   # Context auto-fed
        hot = db.ac.primed(10)  # Already-ranked vectors, zero compute
        db.ac.stop()            # Engine off
    """

    # ── Tuning knobs ──────────────────────────────────────────
    POLL_INTERVAL = 3.0       # Seconds between ambient cycles
    ACTIVATION_DECAY = 0.90   # 10% decay per cycle
    SPREAD_FACTOR = 0.25      # 25% of activation spreads to neighbors
    SPREAD_MIN_SIM = 0.40     # Only spread to vectors with sim > 0.4
    SPREAD_MIN_LEVEL = 0.20   # Only spread from vectors activated > 0.2
    MAX_SPREAD_SOURCES = 15   # Max vectors to spread from per cycle
    NEIGHBOR_K = 5            # Neighbors per vector for spreading
    HOT_CACHE_SIZE = 50       # Keep top 50 activated vectors
    MIN_ACTIVATION = 0.05     # Below this, vector is deactivated
    DIRECT_WEIGHT = 0.5       # Weight for direct matmul matches
    REINFORCE_BONUS = 0.1     # Bonus when same vector re-activates
    DRIFT_WINDOW = 20         # Track last N additions for drift detection
57
+ def __init__(self, gitdb: Any):
58
+ self.db = gitdb
59
+
60
+ # Activation state
61
+ self.activations: Dict[int, float] = {} # row_idx → activation level
62
+ self.hot_cache: List[Tuple[int, float]] = [] # sorted (idx, level) pairs
63
+ self.context_buffer: List[str] = [] # recent operation descriptions
64
+ self._last_context_hash = ""
65
+
66
+ # Drift tracking
67
+ self._branch_centroid: Optional[torch.Tensor] = None
68
+ self._recent_additions: List[torch.Tensor] = []
69
+ self._drift_alerts: List[Dict] = []
70
+
71
+ # Thread control
72
+ self._running = False
73
+ self._thread: Optional[threading.Thread] = None
74
+ self._lock = threading.RLock()
75
+ self._new_context = threading.Event()
76
+
77
+ # Callbacks
78
+ self._on_drift: Optional[Callable] = None
79
+
80
+ # Stats
81
+ self.cycles = 0
82
+ self.last_cycle_ms = 0.0
83
+ self.total_spread_hops = 0
84
+ self.peak_active = 0
85
+
86
+ # ── Lifecycle ──────────────────────────────────────────────
87
+
88
+ def start(self):
89
+ """Start the ambient activation loop."""
90
+ if self._running:
91
+ return
92
+ self._running = True
93
+ self._compute_centroid()
94
+ self._thread = threading.Thread(
95
+ target=self._run_loop, daemon=True, name="EmiratiAC")
96
+ self._thread.start()
97
+
98
+ def stop(self):
99
+ """Stop the ambient loop."""
100
+ self._running = False
101
+ self._new_context.set()
102
+ if self._thread:
103
+ self._thread.join(timeout=5)
104
+ self._thread = None
105
+
106
    @property
    def running(self) -> bool:
        """Whether the ambient activation loop is currently running."""
        return self._running
109
+
110
+ # ── Context Feeding ────────────────────────────────────────
111
+
112
+ def feed(self, operation: str, detail: str = ""):
113
+ """Feed context from a gitdb operation.
114
+
115
+ Called automatically by GitDB methods when AC is running.
116
+ Examples:
117
+ feed("query", "search for authentication vectors")
118
+ feed("commit", "Add 50 finance embeddings")
119
+ feed("diff", "main vs feature-branch")
120
+ feed("log", "viewing last 20 commits")
121
+ """
122
+ with self._lock:
123
+ ctx = f"{operation}: {detail}" if detail else operation
124
+ self.context_buffer.append(ctx)
125
+ self.context_buffer = self.context_buffer[-20:] # sliding window
126
+ self._new_context.set()
127
+
128
+ def feed_text(self, text: str):
129
+ """Feed raw text context (e.g., query text, document text)."""
130
+ self.feed("text", text)
131
+
132
+ def feed_vectors(self, vectors: torch.Tensor):
133
+ """Feed vectors directly (e.g., from add or query operations).
134
+
135
+ Runs direct activation without needing an embedding model.
136
+ """
137
+ if self.db.tree.embeddings is None or self.db.tree.embeddings.shape[0] == 0:
138
+ return
139
+
140
+ with self._lock:
141
+ try:
142
+ emb = self.db.tree.embeddings
143
+ if vectors.dim() == 1:
144
+ vectors = vectors.unsqueeze(0)
145
+ vectors = vectors.to(emb.device)
146
+
147
+ # Direct matmul for activation
148
+ norms_a = F.normalize(vectors, p=2, dim=1)
149
+ norms_b = F.normalize(emb, p=2, dim=1)
150
+ sims = torch.mm(norms_a, norms_b.t()) # (Q, N)
151
+
152
+ # Activate top matches from each query vector
153
+ for q in range(sims.shape[0]):
154
+ top_scores, top_idx = torch.topk(
155
+ sims[q], k=min(20, sims.shape[1]))
156
+ for idx_t, score_t in zip(top_idx, top_scores):
157
+ idx = int(idx_t.item())
158
+ score = float(score_t.item())
159
+ if score > 0.20:
160
+ current = self.activations.get(idx, 0)
161
+ if current > 0:
162
+ new_level = current + score * self.DIRECT_WEIGHT + self.REINFORCE_BONUS
163
+ else:
164
+ new_level = score * self.DIRECT_WEIGHT
165
+ self.activations[idx] = min(1.0, new_level)
166
+ except Exception:
167
+ pass
168
+
169
+ self._new_context.set()
170
+
171
+ def track_addition(self, vectors: torch.Tensor):
172
+ """Track new additions for drift detection."""
173
+ with self._lock:
174
+ if vectors.dim() == 1:
175
+ vectors = vectors.unsqueeze(0)
176
+ for i in range(vectors.shape[0]):
177
+ self._recent_additions.append(vectors[i].cpu().clone())
178
+ self._recent_additions = self._recent_additions[-self.DRIFT_WINDOW:]
179
+
180
+ # ── Query Integration ──────────────────────────────────────
181
+
182
+ def primed(self, top_k: int = 10) -> Results:
183
+ """Return pre-activated vectors — instant, zero computation.
184
+
185
+ This is the payoff: results are already ranked before you ask.
186
+ """
187
+ with self._lock:
188
+ ids, scores, docs, metas = [], [], [], []
189
+ for idx, level in self.hot_cache[:top_k]:
190
+ if idx < len(self.db.tree.metadata):
191
+ meta = self.db.tree.metadata[idx]
192
+ ids.append(meta.id)
193
+ scores.append(round(level, 4))
194
+ docs.append(meta.document)
195
+ metas.append(meta.metadata)
196
+ return Results(ids=ids, scores=scores, documents=docs, metadata=metas)
197
+
198
+ def boost_results(self, results: Results, boost_weight: float = 0.3) -> Results:
199
+ """Boost query results using activation levels.
200
+
201
+ Re-ranks results by combining cosine similarity with activation level.
202
+ Activated vectors get a score boost — the AC "priming" effect.
203
+ """
204
+ if not results.ids:
205
+ return results
206
+
207
+ activation_map = self.get_activation_map()
208
+ boosted = []
209
+ for i in range(len(results.ids)):
210
+ vid = results.ids[i]
211
+ base_score = results.scores[i]
212
+ # Find activation for this vector
213
+ act_level = 0.0
214
+ for idx, level in self.hot_cache:
215
+ if idx < len(self.db.tree.metadata) and self.db.tree.metadata[idx].id == vid:
216
+ act_level = level
217
+ break
218
+ boosted_score = base_score * (1 - boost_weight) + act_level * boost_weight
219
+ boosted.append((i, boosted_score))
220
+
221
+ # Re-sort by boosted score
222
+ boosted.sort(key=lambda x: x[1], reverse=True)
223
+ return Results(
224
+ ids=[results.ids[i] for i, _ in boosted],
225
+ scores=[s for _, s in boosted],
226
+ documents=[results.documents[i] for i, _ in boosted],
227
+ metadata=[results.metadata[i] for i, _ in boosted],
228
+ )
229
+
230
+ def get_activation_map(self) -> Dict[int, float]:
231
+ """Return current activation levels."""
232
+ with self._lock:
233
+ return dict(self.activations)
234
+
235
+ # ── Drift Detection ────────────────────────────────────────
236
+
237
+ def drift(self) -> Optional[Dict]:
238
+ """Check semantic drift of recent additions vs branch centroid.
239
+
240
+ Returns None if no drift, or dict with drift magnitude and details.
241
+ """
242
+ with self._lock:
243
+ if self._branch_centroid is None or not self._recent_additions:
244
+ return None
245
+
246
+ recent = torch.stack(self._recent_additions)
247
+ recent_centroid = F.normalize(recent.mean(dim=0, keepdim=True), p=2, dim=1)
248
+ branch_norm = F.normalize(self._branch_centroid.unsqueeze(0), p=2, dim=1)
249
+
250
+ drift_sim = F.cosine_similarity(recent_centroid, branch_norm).item()
251
+ drift_magnitude = 1.0 - drift_sim
252
+
253
+ if drift_magnitude > 0.3: # Significant drift
254
+ return {
255
+ "magnitude": round(drift_magnitude, 4),
256
+ "similarity_to_centroid": round(drift_sim, 4),
257
+ "recent_count": len(self._recent_additions),
258
+ "severity": "high" if drift_magnitude > 0.5 else "medium",
259
+ }
260
+ return None
261
+
262
    def on_drift(self, callback: Callable):
        """Register a callback invoked with the drift dict on each alert."""
        self._on_drift = callback
265
+
266
+ # ── Stats ──────────────────────────────────────────────────
267
+
268
+ def stats(self) -> Dict:
269
+ """Current AC stats."""
270
+ with self._lock:
271
+ return {
272
+ "running": self._running,
273
+ "active_vectors": len(self.activations),
274
+ "hot_cache_size": len(self.hot_cache),
275
+ "cycles": self.cycles,
276
+ "last_cycle_ms": round(self.last_cycle_ms, 1),
277
+ "peak_active": self.peak_active,
278
+ "total_spread_hops": self.total_spread_hops,
279
+ "top_score": round(self.hot_cache[0][1], 3) if self.hot_cache else 0,
280
+ "context_depth": len(self.context_buffer),
281
+ "drift": self.drift(),
282
+ }
283
+
284
+ # ── Internal ───────────────────────────────────────────────
285
+
286
+ def _compute_centroid(self):
287
+ """Compute semantic centroid of current branch state."""
288
+ if self.db.tree.embeddings is not None and self.db.tree.embeddings.shape[0] > 0:
289
+ self._branch_centroid = self.db.tree.embeddings.mean(dim=0).cpu()
290
+
291
+ def _run_loop(self):
292
+ """Main loop — runs until stopped."""
293
+ while self._running:
294
+ self._new_context.wait(timeout=self.POLL_INTERVAL)
295
+ self._new_context.clear()
296
+ if not self._running:
297
+ break
298
+ try:
299
+ self._cycle()
300
+ except Exception:
301
+ pass
302
+
303
+ def _cycle(self):
304
+ """One activation cycle: context → matmul → spread → cache → drift check."""
305
+ if self.db.tree.embeddings is None or self.db.tree.embeddings.shape[0] == 0:
306
+ return
307
+
308
+ t0 = time.time()
309
+
310
+ # Check if context changed
311
+ with self._lock:
312
+ if not self.context_buffer:
313
+ self._decay_only()
314
+ return
315
+ context_text = " | ".join(self.context_buffer[-10:])
316
+
317
+ context_hash = hashlib.md5(context_text.encode()).hexdigest()
318
+ if context_hash == self._last_context_hash:
319
+ self._decay_only()
320
+ return
321
+ self._last_context_hash = context_hash
322
+
323
+ # Step 1: Try to embed context text (requires embed module)
324
+ context_vec = self._embed_context(context_text)
325
+
326
+ if context_vec is not None:
327
+ # Step 2: GPU matmul — direct activation
328
+ emb = self.db.tree.embeddings
329
+ device = emb.device
330
+ q = F.normalize(context_vec.unsqueeze(0).to(device), p=2, dim=1)
331
+ norms = F.normalize(emb, p=2, dim=1)
332
+ sims = torch.mm(q, norms.t()).squeeze(0)
333
+
334
+ top_scores, top_idx = torch.topk(sims, k=min(20, len(sims)))
335
+
336
+ # Step 3: Decay existing activations
337
+ with self._lock:
338
+ for idx in list(self.activations.keys()):
339
+ self.activations[idx] *= self.ACTIVATION_DECAY
340
+ if self.activations[idx] < self.MIN_ACTIVATION:
341
+ del self.activations[idx]
342
+
343
+ # Step 4: Activate direct matches
344
+ for idx_t, score_t in zip(top_idx, top_scores):
345
+ idx = int(idx_t.item())
346
+ score = float(score_t.item())
347
+ if score > 0.20:
348
+ with self._lock:
349
+ current = self.activations.get(idx, 0)
350
+ if current > 0:
351
+ new_level = current + score * self.DIRECT_WEIGHT + self.REINFORCE_BONUS
352
+ else:
353
+ new_level = score * self.DIRECT_WEIGHT
354
+ self.activations[idx] = min(1.0, new_level)
355
+ else:
356
+ # No embedder — just decay
357
+ with self._lock:
358
+ for idx in list(self.activations.keys()):
359
+ self.activations[idx] *= self.ACTIVATION_DECAY
360
+ if self.activations[idx] < self.MIN_ACTIVATION:
361
+ del self.activations[idx]
362
+
363
+ # Step 5: Spreading activation (the magic)
364
+ hops = self._spread()
365
+
366
+ # Step 6: Update hot cache
367
+ with self._lock:
368
+ sorted_acts = sorted(
369
+ self.activations.items(), key=lambda x: x[1], reverse=True)
370
+ self.hot_cache = sorted_acts[:self.HOT_CACHE_SIZE]
371
+ active_count = len(self.activations)
372
+ if active_count > self.peak_active:
373
+ self.peak_active = active_count
374
+
375
+ # Step 7: Drift detection
376
+ drift = self.drift()
377
+ if drift and self._on_drift:
378
+ self._on_drift(drift)
379
+ self._drift_alerts.append({**drift, "timestamp": time.time()})
380
+
381
+ self.cycles += 1
382
+ self.total_spread_hops += hops
383
+ self.last_cycle_ms = (time.time() - t0) * 1000
384
+
385
+ def _decay_only(self):
386
+ """Decay activations when context hasn't changed."""
387
+ with self._lock:
388
+ for idx in list(self.activations.keys()):
389
+ self.activations[idx] *= self.ACTIVATION_DECAY
390
+ if self.activations[idx] < self.MIN_ACTIVATION:
391
+ del self.activations[idx]
392
+ sorted_acts = sorted(
393
+ self.activations.items(), key=lambda x: x[1], reverse=True)
394
+ self.hot_cache = sorted_acts[:self.HOT_CACHE_SIZE]
395
+
396
+ def _spread(self) -> int:
397
+ """Spreading activation — active vectors activate their neighbors.
398
+
399
+ This is how human memory works: searching for "auth tokens" doesn't just
400
+ find auth vectors — it activates session management, security audit,
401
+ vulnerability scanning through multi-hop cosine similarity chains.
402
+ """
403
+ emb = self.db.tree.embeddings
404
+ if emb is None:
405
+ return 0
406
+
407
+ with self._lock:
408
+ to_spread = [
409
+ (idx, level) for idx, level in self.activations.items()
410
+ if level > self.SPREAD_MIN_LEVEL
411
+ ][:self.MAX_SPREAD_SOURCES]
412
+
413
+ if not to_spread:
414
+ return 0
415
+
416
+ hops = 0
417
+ n = emb.shape[0]
418
+ device = emb.device
419
+
420
+ for idx, level in to_spread:
421
+ if idx >= n:
422
+ continue
423
+ try:
424
+ # Cosine similarity of this vector vs all others
425
+ vec = F.normalize(emb[idx].unsqueeze(0), p=2, dim=1)
426
+ all_norm = F.normalize(emb, p=2, dim=1)
427
+ sims = torch.mm(vec, all_norm.t()).squeeze(0)
428
+
429
+ # Top-k neighbors (excluding self)
430
+ top_scores, top_idx = torch.topk(
431
+ sims, k=min(self.NEIGHBOR_K + 1, n))
432
+
433
+ for n_idx_t, n_score_t in zip(top_idx, top_scores):
434
+ n_idx = int(n_idx_t.item())
435
+ n_score = float(n_score_t.item())
436
+ if n_idx == idx:
437
+ continue
438
+ if n_score < self.SPREAD_MIN_SIM:
439
+ continue
440
+
441
+ spread = level * self.SPREAD_FACTOR * n_score
442
+ with self._lock:
443
+ current = self.activations.get(n_idx, 0)
444
+ if current < spread:
445
+ self.activations[n_idx] = min(1.0, current + spread)
446
+ if current == 0:
447
+ hops += 1
448
+ except Exception:
449
+ continue
450
+
451
+ return hops
452
+
453
+ def _embed_context(self, text: str) -> Optional[torch.Tensor]:
454
+ """Try to embed context text. Returns None if no embedder available."""
455
+ try:
456
+ from gitdb.embed import embed_query
457
+ return embed_query(text, dim=self.db.dim)
458
+ except (ImportError, Exception):
459
+ return None