brainlayer 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. brainlayer/__init__.py +3 -0
  2. brainlayer/cli/__init__.py +1545 -0
  3. brainlayer/cli/wizard.py +132 -0
  4. brainlayer/cli_new.py +151 -0
  5. brainlayer/client.py +164 -0
  6. brainlayer/clustering.py +736 -0
  7. brainlayer/daemon.py +1105 -0
  8. brainlayer/dashboard/README.md +129 -0
  9. brainlayer/dashboard/__init__.py +5 -0
  10. brainlayer/dashboard/app.py +151 -0
  11. brainlayer/dashboard/search.py +229 -0
  12. brainlayer/dashboard/views.py +230 -0
  13. brainlayer/embeddings.py +131 -0
  14. brainlayer/engine.py +550 -0
  15. brainlayer/index_new.py +87 -0
  16. brainlayer/mcp/__init__.py +1558 -0
  17. brainlayer/migrate.py +205 -0
  18. brainlayer/paths.py +43 -0
  19. brainlayer/pipeline/__init__.py +47 -0
  20. brainlayer/pipeline/analyze_communication.py +508 -0
  21. brainlayer/pipeline/brain_graph.py +567 -0
  22. brainlayer/pipeline/chat_tags.py +63 -0
  23. brainlayer/pipeline/chunk.py +422 -0
  24. brainlayer/pipeline/classify.py +472 -0
  25. brainlayer/pipeline/cluster_sampling.py +73 -0
  26. brainlayer/pipeline/enrichment.py +810 -0
  27. brainlayer/pipeline/extract.py +66 -0
  28. brainlayer/pipeline/extract_claude_desktop.py +149 -0
  29. brainlayer/pipeline/extract_corrections.py +231 -0
  30. brainlayer/pipeline/extract_markdown.py +195 -0
  31. brainlayer/pipeline/extract_whatsapp.py +227 -0
  32. brainlayer/pipeline/git_overlay.py +301 -0
  33. brainlayer/pipeline/longitudinal_analyzer.py +568 -0
  34. brainlayer/pipeline/obsidian_export.py +455 -0
  35. brainlayer/pipeline/operation_grouping.py +486 -0
  36. brainlayer/pipeline/plan_linking.py +313 -0
  37. brainlayer/pipeline/sanitize.py +549 -0
  38. brainlayer/pipeline/semantic_style.py +574 -0
  39. brainlayer/pipeline/session_enrichment.py +472 -0
  40. brainlayer/pipeline/style_embed.py +67 -0
  41. brainlayer/pipeline/style_index.py +139 -0
  42. brainlayer/pipeline/temporal_chains.py +203 -0
  43. brainlayer/pipeline/time_batcher.py +248 -0
  44. brainlayer/pipeline/unified_timeline.py +569 -0
  45. brainlayer/storage.py +66 -0
  46. brainlayer/store.py +155 -0
  47. brainlayer/taxonomy.json +80 -0
  48. brainlayer/vector_store.py +1891 -0
  49. brainlayer-1.0.0.dist-info/METADATA +313 -0
  50. brainlayer-1.0.0.dist-info/RECORD +53 -0
  51. brainlayer-1.0.0.dist-info/WHEEL +4 -0
  52. brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
  53. brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,736 @@
1
+ """Hierarchical clustering pipeline for BrainLayer chunks.
2
+
3
+ Builds a 3-level cluster hierarchy using Recursive Leiden on a Faiss KNN graph.
4
+ Stores results in SQLite tables (clusters, chunk_clusters, vec_cluster_centroids).
5
+
6
+ Architecture:
7
+ 245K chunk embeddings (sqlite-vec, 1024 dims)
8
+ → L2-normalize
9
+ → Faiss IndexFlatIP k=30 KNN graph (~2 GB, 5-15 min)
10
+ → igraph conversion
11
+ → Recursive Leiden at 3 resolutions
12
+ Level 0: ~40 clusters (resolution ~0.005)
13
+ Level 1: ~400 clusters (~10 per L0, resolution ~0.05)
14
+ Level 2: ~4000 clusters (~10 per L1, resolution ~0.5)
15
+ → Centroids + materialized paths → SQLite
16
+
17
+ Usage:
18
+ python3 -m brainlayer.clustering [--db-path PATH] [--k 30] [--dry-run]
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import logging
24
+ import struct
25
+ import time
26
+ import uuid
27
+ from collections import defaultdict
28
+ from datetime import datetime, timezone
29
+ from pathlib import Path
30
+ from typing import Optional
31
+
32
+ import apsw
33
+ import faiss
34
+ import igraph as ig
35
+ import leidenalg
36
+ import numpy as np
37
+ import sqlite_vec
38
+
39
# Runs at import time: configures the root logger with INFO level and a
# short HH:MM:SS timestamp prefix.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# Default database file, located under the current user's home directory.
DEFAULT_DB = Path.home() / ".local" / "share" / "brainlayer" / "brainlayer.db"
# Number of float32 components per embedding; extract_embeddings skips any
# stored blob whose byte length is not EMBEDDING_DIM * 4.
EMBEDDING_DIM = 1024

# Target cluster counts at each level
LEVEL_TARGETS = [40, 10, 10]  # L0: ~40 top, L1: ~10 per L0, L2: ~10 per L1
51
+
52
+
53
def serialize_f32(vector) -> bytes:
    """Pack a sequence of numbers into a float32 byte string for sqlite-vec.

    Note: this intentionally mirrors vector_store.serialize_f32. The module
    runs as a standalone script (python -m brainlayer.clustering) and
    importing vector_store would pull in heavy dependencies.
    """
    layout = "%df" % len(vector)
    return struct.pack(layout, *vector)
61
+
62
+
63
+ # ─── Step 1: Extract Embeddings ─────────────────────────────────
64
+
65
+
66
def extract_embeddings(db_path: str):
    """Batch-read all embeddings from sqlite-vec.

    Opens the database read-only, loads the sqlite-vec extension, and reads
    every row of ``chunk_vectors``. Rows are kept only when the blob is
    exactly ``EMBEDDING_DIM`` float32 values and the chunk id exists in
    ``chunks``. The result is ordered by ``chunks.rowid``.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        chunk_ids: list of chunk IDs (same order as embeddings)
        embeddings: float32 numpy array of shape (N, EMBEDDING_DIM); shape
            (0, EMBEDDING_DIM) when nothing matched.
    """
    conn = apsw.Connection(db_path, flags=apsw.SQLITE_OPEN_READONLY)
    # try/finally so the connection is released even when a query fails.
    try:
        conn.enableloadextension(True)
        conn.loadextension(sqlite_vec.loadable_path())
        conn.enableloadextension(False)
        cursor = conn.cursor()

        total = list(cursor.execute("SELECT COUNT(*) FROM chunk_vectors"))[0][0]
        logger.info(f"Extracting {total} embeddings...")

        # Read all chunk IDs first (ordered by rowid for deterministic ordering)
        logger.info(" Reading chunk IDs...")
        all_chunk_ids = [row[0] for row in cursor.execute("SELECT id FROM chunks ORDER BY rowid")]
        logger.info(f" Got {len(all_chunk_ids)} chunk IDs")

        # Set for O(1) membership tests while streaming the vectors.
        chunk_id_set = set(all_chunk_ids)

        # AIDEV-NOTE: vec0 virtual tables return rows in insertion order
        # (not guaranteed sorted) and do not support ORDER BY or OFFSET, so
        # everything is read and then re-ordered to match chunks.rowid order.
        logger.info(" Reading all vectors from chunk_vectors...")
        expected_bytes = EMBEDDING_DIM * 4
        vec_map = {}
        for chunk_id, emb_bytes in cursor.execute("SELECT chunk_id, embedding FROM chunk_vectors"):
            if emb_bytes and len(emb_bytes) == expected_bytes and chunk_id in chunk_id_set:
                # .copy() detaches the array from the read-only SQLite buffer.
                vec_map[chunk_id] = np.frombuffer(emb_bytes, dtype=np.float32).copy()
    finally:
        conn.close()

    # Re-order to match chunks.rowid order
    chunk_ids = [cid for cid in all_chunk_ids if cid in vec_map]
    logger.info(f" Matched {len(chunk_ids)}/{len(all_chunk_ids)} chunks with vectors")

    # Fill a preallocated matrix instead of list + vstack, which would hold
    # one more full copy of every vector at peak.
    embeddings_array = np.empty((len(chunk_ids), EMBEDDING_DIM), dtype=np.float32)
    for row, cid in enumerate(chunk_ids):
        embeddings_array[row] = vec_map[cid]
    del vec_map  # Free memory

    logger.info(f"Extracted {len(chunk_ids)} embeddings, shape: {embeddings_array.shape}")
    return chunk_ids, embeddings_array
116
+
117
+
118
+ # ─── Step 2: Build KNN Graph ────────────────────────────────────
119
+
120
+
121
def build_knn_graph(embeddings: np.ndarray, k: int = 30):
    """Build KNN graph using Faiss IndexFlatIP after L2 normalization.

    L2 normalization converts dot product (IP) → cosine similarity.
    Note that ``embeddings`` is normalized IN PLACE, so the caller's array
    holds unit-length rows afterwards.

    Args:
        embeddings: (N, d) array of vectors; modified in place.
        k: number of neighbors to return per vector (self excluded).

    Returns:
        distances: (N, k) array of cosine similarities
        indices: (N, k) array of neighbor indices
    """
    n, d = embeddings.shape
    logger.info(f"L2-normalizing {n} embeddings...")
    faiss.normalize_L2(embeddings)  # in-place

    logger.info(f"Building Faiss IndexFlatIP for KNN (k={k})...")
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)

    logger.info(f"Searching {k} nearest neighbors for {n} vectors...")
    t0 = time.time()
    # k+1 because each point normally appears among its own neighbors
    distances, indices = index.search(embeddings, k + 1)
    elapsed = time.time() - t0
    logger.info(f"KNN search completed in {elapsed:.1f}s")

    # Remove exactly one column per row: the self-match wherever it landed.
    # With duplicate (or tied) vectors the self-match is not guaranteed to be
    # in column 0, so blindly dropping column 0 could discard a real neighbor
    # and keep the self-match instead.
    drop = indices == np.arange(n, dtype=indices.dtype)[:, None]
    # Rows whose self-match fell outside the k+1 results lose their weakest
    # (last) neighbor so every row still has exactly k entries.
    self_missing = ~drop.any(axis=1)
    drop[self_missing, -1] = True
    keep = ~drop

    return distances[keep].reshape(n, k), indices[keep].reshape(n, k)
147
+
148
+
149
+ # ─── Step 3: Convert to igraph ──────────────────────────────────
150
+
151
+
152
def knn_to_igraph(indices: np.ndarray, distances: np.ndarray, n: int):
    """Turn KNN search output into a weighted, undirected igraph graph.

    Each directed neighbor pair (i → j) is folded into one undirected edge;
    when both directions are present the larger similarity wins. Neighbor
    ids outside [0, n) and self-references are skipped, and final weights
    are floored at 0.001.
    """
    logger.info(f"Building igraph from KNN ({n} nodes)...")

    seen_pairs = set()
    best_weight = {}

    for src in range(n):
        for col in range(indices.shape[1]):
            dst = int(indices[src, col])
            if not (0 <= dst < n) or dst == src:
                continue
            similarity = float(distances[src, col])
            pair = (src, dst) if src < dst else (dst, src)
            if pair in seen_pairs:
                # Keep the stronger of the two directed similarities.
                if similarity > best_weight[pair]:
                    best_weight[pair] = similarity
            else:
                seen_pairs.add(pair)
                best_weight[pair] = similarity

    edges = list(seen_pairs)
    weights = [max(0.001, best_weight[pair]) for pair in edges]

    graph = ig.Graph(n=n, edges=edges, directed=False)
    graph.es["weight"] = weights

    logger.info(f"igraph: {graph.vcount()} nodes, {graph.ecount()} edges")
    return graph
183
+
184
+
185
+ # ─── Step 4: Resolution Binary Search ───────────────────────────
186
+
187
+
188
def find_resolution_for_target(
    graph: ig.Graph,
    weights: list,
    target: int,
    lo: float = 0.0001,
    hi: float = 5.0,
    tolerance: float = 0.2,
    max_iters: int = 20,
) -> float:
    """Bisect the Leiden resolution until it yields roughly ``target`` clusters.

    Each probe runs a short (3-iteration) Leiden pass at the midpoint of the
    current [lo, hi] interval. The search stops early once the relative
    miss is below ``tolerance`` (e.g., 0.2 = within 20%); otherwise the
    closest resolution seen after ``max_iters`` probes is returned.
    """
    low, high = lo, hi
    closest_res = (low + high) / 2
    smallest_gap = float("inf")

    for _ in range(max_iters):
        candidate = (low + high) / 2
        partition = leidenalg.find_partition(
            graph,
            leidenalg.RBConfigurationVertexPartition,
            weights=weights,
            resolution_parameter=candidate,
            n_iterations=3,  # Fewer iterations for search
            seed=42,
        )
        found = len(set(partition.membership))
        gap = abs(found - target) / target

        if gap < smallest_gap:
            smallest_gap, closest_res = gap, candidate

        if gap < tolerance:
            logger.info(f" Resolution {candidate:.6f} → {found} clusters (target: {target}, diff: {gap:.1%})")
            return candidate

        if found > target:
            # Too many clusters → lower resolution
            high = candidate
        else:
            # Too few → higher resolution
            low = candidate

    logger.info(f" Best resolution {closest_res:.6f} after {max_iters} iters (diff: {smallest_gap:.1%})")
    return closest_res
232
+
233
+
234
+ # ─── Step 5: Recursive Leiden ────────────────────────────────────
235
+
236
+
237
def recursive_leiden(
    graph: ig.Graph,
    node_indices: np.ndarray,
    level_targets: list,
    embeddings: np.ndarray,
    level: int = 0,
    parent_id: Optional[str] = None,
    parent_path: str = "",
):
    """Build a strictly nested cluster hierarchy with repeated Leiden runs.

    The (sub)graph is split into roughly ``level_targets[level]`` communities;
    every community with at least 6 members is then split again at the next
    level using only its own induced subgraph.

    Args:
        graph: graph whose vertex i corresponds to ``node_indices[i]``.
        node_indices: global embedding row index of each vertex in ``graph``.
        level_targets: desired cluster count per level.
        embeddings: full embedding matrix, used for centroids.
        level: current depth (0 = top).
        parent_id: id of the enclosing cluster, or None at the top.
        parent_path: "/"-joined ids of all ancestors ("" at the top).

    Returns:
        list of cluster dicts: {id, level, parent_id, path, node_indices,
        centroid, chunk_count}; parents precede their descendants.
    """
    depth_limit = len(level_targets)
    if level >= depth_limit:
        return []

    def _make_cluster(members, centroid):
        # Ids are the first 12 characters of a random UUID string.
        new_id = str(uuid.uuid4())[:12]
        return {
            "id": new_id,
            "level": level,
            "parent_id": parent_id,
            "path": f"{parent_path}/{new_id}" if parent_path else new_id,
            "node_indices": members,
            "centroid": centroid,
            "chunk_count": len(members),
        }

    target = level_targets[level]
    n = graph.vcount()

    if n < 3:
        # Too small to cluster further
        if len(node_indices) > 0:
            lone_centroid = embeddings[node_indices].mean(axis=0)
        else:
            lone_centroid = np.zeros(EMBEDDING_DIM)
        return [_make_cluster(node_indices, lone_centroid)]

    # Get weights for this subgraph
    weights = graph.es["weight"] if graph.es else None

    # Don't try to make more clusters than we have points / 3
    actual_target = min(target, max(2, n // 3))
    resolution = find_resolution_for_target(graph, weights, actual_target)

    # Run final Leiden with more iterations
    partition = leidenalg.find_partition(
        graph,
        leidenalg.RBConfigurationVertexPartition,
        weights=weights,
        resolution_parameter=resolution,
        n_iterations=-1,  # Until convergence
        seed=42,
    )

    # community id → local vertex positions, in vertex order
    members_by_community = {}
    for position, community in enumerate(partition.membership):
        members_by_community.setdefault(community, []).append(position)

    logger.info(f"Level {level}: {len(members_by_community)} clusters from {n} nodes (res={resolution:.6f})")

    can_go_deeper = level + 1 < depth_limit
    collected = []

    for community in sorted(members_by_community):
        local_members = members_by_community[community]
        global_members = np.array([node_indices[pos] for pos in local_members])

        cluster = _make_cluster(global_members, embeddings[global_members].mean(axis=0))
        collected.append(cluster)

        # Recurse into sub-clusters if we have enough points and more levels
        if can_go_deeper and len(local_members) >= 6:
            collected.extend(
                recursive_leiden(
                    graph.subgraph(local_members),
                    global_members,
                    level_targets,
                    embeddings,
                    level=level + 1,
                    parent_id=cluster["id"],
                    parent_path=cluster["path"],
                )
            )

    return collected
338
+
339
+
340
+ # ─── Step 6: Write to SQLite ────────────────────────────────────
341
+
342
+
343
def create_cluster_schema(conn: apsw.Connection):
    """Ensure all clustering tables and indexes exist in the BrainLayer DB.

    Every statement uses IF NOT EXISTS, so calling this on an already
    initialized database changes nothing.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS clusters (
            id TEXT PRIMARY KEY,
            level INTEGER NOT NULL,
            parent_id TEXT,
            path TEXT NOT NULL,
            label TEXT,
            ctfidf_label TEXT,
            chunk_count INTEGER DEFAULT 0,
            silhouette_score REAL,
            avg_intra_dist REAL,
            created_at TEXT,
            updated_at TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS chunk_clusters (
            chunk_id TEXT NOT NULL,
            cluster_id TEXT NOT NULL,
            level INTEGER NOT NULL,
            dist_to_centroid REAL,
            assignment_method TEXT DEFAULT 'initial',
            assigned_at TEXT,
            PRIMARY KEY (chunk_id, level)
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_chunk_clusters_cluster ON chunk_clusters(cluster_id)",
        "CREATE INDEX IF NOT EXISTS idx_chunk_clusters_level ON chunk_clusters(level)",
        "CREATE INDEX IF NOT EXISTS idx_clusters_level ON clusters(level)",
        "CREATE INDEX IF NOT EXISTS idx_clusters_parent ON clusters(parent_id)",
        "CREATE INDEX IF NOT EXISTS idx_clusters_path ON clusters(path)",
        """
        CREATE TABLE IF NOT EXISTS clustering_runs (
            id TEXT PRIMARY KEY,
            started_at TEXT,
            completed_at TEXT,
            status TEXT DEFAULT 'running',
            total_chunks INTEGER,
            level_counts TEXT,
            params TEXT,
            silhouette_scores TEXT
        )
        """,
        # vec_cluster_centroids — virtual table for centroid KNN search
        """
        CREATE VIRTUAL TABLE IF NOT EXISTS vec_cluster_centroids USING vec0(
            cluster_id TEXT PRIMARY KEY,
            centroid FLOAT[1024]
        )
        """,
    )

    cursor = conn.cursor()
    for statement in ddl_statements:
        cursor.execute(statement)

    logger.info("Cluster schema created/verified")
403
+
404
+
405
def write_clusters(
    conn: apsw.Connection,
    clusters: list,
    chunk_ids: list,
    embeddings: np.ndarray,
    run_id: str,
):
    """Write cluster hierarchy and chunk assignments to SQLite.

    Wrapped in a single transaction — either all clusters are written or none
    (prevents partial state if an insert fails after clearing old data).

    Args:
        conn: writable connection with the sqlite-vec extension loaded.
        clusters: cluster dicts as produced by recursive_leiden.
        chunk_ids: chunk id for each embedding row index.
        embeddings: embedding matrix indexed by the clusters' node_indices.
        run_id: currently unused; kept so existing callers keep working.

    Raises:
        Exception: whatever failed during the write, re-raised after the
            transaction has been rolled back.
    """
    cluster_sql = "INSERT INTO clusters (id, level, parent_id, path, chunk_count, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?)"
    centroid_sql = "INSERT INTO vec_cluster_centroids (cluster_id, centroid) VALUES (?, ?)"
    assignment_sql = "INSERT OR REPLACE INTO chunk_clusters (chunk_id, cluster_id, level, dist_to_centroid, assignment_method, assigned_at) VALUES (?, ?, ?, ?, 'initial', ?)"

    cursor = conn.cursor()
    cursor.execute("PRAGMA busy_timeout = 5000")
    now = datetime.now(timezone.utc).isoformat()

    cursor.execute("BEGIN")
    try:
        # Clear previous data
        cursor.execute("DELETE FROM clusters")
        cursor.execute("DELETE FROM chunk_clusters")
        cursor.execute("DELETE FROM vec_cluster_centroids")

        logger.info(f"Writing {len(clusters)} clusters...")

        for cluster in clusters:
            cluster_id = cluster["id"]
            cluster_level = cluster["level"]
            raw_centroid = cluster["centroid"]

            # Write cluster
            cursor.execute(
                cluster_sql,
                (
                    cluster_id,
                    cluster_level,
                    cluster["parent_id"],
                    cluster["path"],
                    cluster["chunk_count"],
                    now,
                    now,
                ),
            )

            # Write centroid (L2-normalize for cosine similarity via sqlite-vec match)
            centroid = raw_centroid.copy()
            norm = np.linalg.norm(centroid)
            if norm > 0:
                centroid /= norm
            cursor.execute(centroid_sql, (cluster_id, serialize_f32(centroid.tolist())))

            # Write chunk assignments in one batched call. The stored
            # distance is Euclidean, measured against the un-normalized
            # centroid.
            cursor.executemany(
                assignment_sql,
                (
                    (
                        chunk_ids[global_idx],
                        cluster_id,
                        cluster_level,
                        float(np.linalg.norm(embeddings[global_idx] - raw_centroid)),
                        now,
                    )
                    for global_idx in cluster["node_indices"]
                ),
            )

        cursor.execute("COMMIT")
        logger.info(f"Written {len(clusters)} clusters to DB")
    except Exception:
        # SQLite may already have ended the transaction for some failures;
        # a failing ROLLBACK must not hide the error that got us here.
        try:
            cursor.execute("ROLLBACK")
        except apsw.Error as rollback_error:
            logger.warning(f"ROLLBACK failed: {rollback_error}")
        logger.error("Failed to write clusters — rolled back")
        raise
470
+
471
+
472
+ # ─── Step 7: c-TF-IDF Labeling ──────────────────────────────────
473
+
474
+
475
def generate_ctfidf_labels(
    conn: apsw.Connection,
    clusters: list,
    chunk_ids: list,
):
    """Generate c-TF-IDF labels for all clusters.

    Per level, each cluster becomes one document built from up to 50 member
    chunks (each truncated to 2000 characters). A TF-IDF model is fitted
    over the level's documents and the top 3 positively weighted terms,
    joined by " / ", are stored in both ``ctfidf_label`` and ``label``.
    Clusters without any text get no label; a level whose vectorizer fit
    raises ValueError is skipped with a warning.

    Args:
        conn: writable connection to the BrainLayer DB.
        clusters: cluster dicts as produced by recursive_leiden.
        chunk_ids: chunk id for each embedding row index.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    cursor = conn.cursor()
    cursor.execute("PRAGMA busy_timeout = 5000")

    # Map chunk id → content for every chunk in the database.
    logger.info("Loading chunk content for labeling...")
    content_map = {}
    for row in cursor.execute("SELECT id, content FROM chunks"):
        content_map[row[0]] = row[1] or ""

    # Group clusters by level
    by_level = defaultdict(list)
    for c in clusters:
        by_level[c["level"]].append(c)

    for level in sorted(by_level):
        level_clusters = by_level[level]
        logger.info(f"Labeling {len(level_clusters)} clusters at level {level}...")

        # Build per-cluster documents
        docs = []
        cluster_ids_for_docs = []
        for c in level_clusters:
            # Sample up to 50 chunks per cluster for labeling (slicing is a
            # no-op on shorter clusters).
            texts = []
            for idx in c["node_indices"][:50]:
                text = content_map.get(chunk_ids[idx], "")
                if text:
                    texts.append(text[:2000])
            doc = " ".join(texts)
            if doc.strip():
                docs.append(doc)
                cluster_ids_for_docs.append(c["id"])

        if not docs:
            continue

        vectorizer = TfidfVectorizer(
            max_features=2000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=1,
            max_df=0.8,
        )

        try:
            tfidf = vectorizer.fit_transform(docs)
        except ValueError as exc:
            # Best effort: leave this level unlabeled, but say so.
            logger.warning(f"Skipping labels for level {level}: {exc}")
            continue

        feature_names = vectorizer.get_feature_names_out()

        updates = []
        for i, cluster_id in enumerate(cluster_ids_for_docs):
            # Densify the row once instead of indexing the sparse matrix
            # element by element.
            term_scores = tfidf[i].toarray()[0]
            top_indices = term_scores.argsort()[-5:][::-1]
            top_terms = [feature_names[idx] for idx in top_indices if term_scores[idx] > 0]
            label = " / ".join(top_terms[:3]) if top_terms else f"cluster-{cluster_id}"
            updates.append((label, label, cluster_id))

        # One transaction per level instead of one autocommit per UPDATE.
        with conn:
            cursor.executemany(
                "UPDATE clusters SET ctfidf_label = ?, label = ? WHERE id = ?",
                updates,
            )

    logger.info("c-TF-IDF labeling complete")
550
+
551
+
552
+ # ─── Step 8: Silhouette Score ────────────────────────────────────
553
+
554
+
555
def compute_silhouette_sample(
    embeddings: np.ndarray,
    clusters: list,
    chunk_ids: list,
    sample_size: int = 5000,
):
    """Estimate a cosine silhouette score per hierarchy level from a sample.

    Levels with fewer than 10 assigned points, or fewer than 2 distinct
    clusters in the sample, are skipped. A scoring failure is logged as a
    warning and that level is left out.

    Returns dict of {level: silhouette_score}.
    """
    from sklearn.metrics import silhouette_score

    clusters_by_level = {}
    for cluster in clusters:
        clusters_by_level.setdefault(cluster["level"], []).append(cluster)

    scores = {}
    for level in sorted(clusters_by_level):
        # global embedding index → id of the cluster that owns it at this level
        owner = {idx: c["id"] for c in clusters_by_level[level] for idx in c["node_indices"]}

        picked = sorted(owner)
        if len(picked) < 10:
            continue

        # Sample for efficiency
        if len(picked) > sample_size:
            rng = np.random.default_rng(42)
            picked = sorted(rng.choice(picked, sample_size, replace=False))

        X = embeddings[picked]
        labels = [owner[i] for i in picked]

        # Need at least 2 unique labels
        if len(set(labels)) < 2:
            continue

        try:
            score = silhouette_score(X, labels, metric="cosine", sample_size=min(len(X), sample_size))
        except Exception as e:
            logger.warning(f"Silhouette failed for L{level}: {e}")
        else:
            scores[level] = float(score)
            logger.info(f"Silhouette score L{level}: {score:.4f}")

    return scores
606
+
607
+
608
+ # ─── Main Pipeline ───────────────────────────────────────────────
609
+
610
+
611
def run_clustering(
    db_path: str = str(DEFAULT_DB),
    k: int = 30,
    level_targets: Optional[list] = None,
    dry_run: bool = False,
):
    """Run the full clustering pipeline.

    Steps: extract embeddings → KNN graph → igraph → recursive Leiden →
    write clusters → c-TF-IDF labels → silhouette scores → record the run
    in ``clustering_runs``.

    Args:
        db_path: path of the BrainLayer SQLite database.
        k: number of nearest neighbors per chunk in the KNN graph.
        level_targets: target cluster count per level; defaults to
            LEVEL_TARGETS when None.
        dry_run: when True, only extract embeddings and log what would run.

    Returns:
        None for a dry run; otherwise a dict with run_id, total_chunks,
        level_counts, silhouette_scores and elapsed_minutes.
    """
    if level_targets is None:
        level_targets = LEVEL_TARGETS

    run_id = str(uuid.uuid4())[:12]
    t0 = time.time()

    # Step 1: Extract
    logger.info("=" * 60)
    logger.info("STEP 1: Extracting embeddings")
    logger.info("=" * 60)
    chunk_ids, embeddings = extract_embeddings(db_path)
    n = len(chunk_ids)

    if dry_run:
        logger.info(f"DRY RUN: {n} chunks extracted. Would cluster with k={k}, targets={level_targets}")
        return

    # Step 2: KNN
    logger.info("=" * 60)
    logger.info("STEP 2: Building KNN graph")
    logger.info("=" * 60)
    distances, indices = build_knn_graph(embeddings, k=k)

    # Step 3: igraph
    logger.info("=" * 60)
    logger.info("STEP 3: Converting to igraph")
    logger.info("=" * 60)
    graph = knn_to_igraph(indices, distances, n)

    # Step 4+5: Recursive Leiden
    logger.info("=" * 60)
    logger.info("STEP 4: Running recursive Leiden clustering")
    logger.info("=" * 60)
    all_indices = np.arange(n)
    clusters = recursive_leiden(graph, all_indices, level_targets, embeddings)

    # Report cluster counts
    by_level = defaultdict(int)
    for c in clusters:
        by_level[c["level"]] += 1
    level_counts = {f"L{lv}": cnt for lv, cnt in sorted(by_level.items())}
    logger.info(f"Cluster counts: {level_counts}")

    # Step 6: Write to DB
    logger.info("=" * 60)
    logger.info("STEP 5: Writing to SQLite")
    logger.info("=" * 60)
    conn = apsw.Connection(db_path)
    # try/finally so the write connection is released even if a later step
    # (labeling, silhouette, run record) raises.
    try:
        conn.enableloadextension(True)
        conn.loadextension(sqlite_vec.loadable_path())
        conn.enableloadextension(False)

        create_cluster_schema(conn)
        write_clusters(conn, clusters, chunk_ids, embeddings, run_id)

        # Step 7: c-TF-IDF labels
        logger.info("=" * 60)
        logger.info("STEP 6: Generating c-TF-IDF labels")
        logger.info("=" * 60)
        generate_ctfidf_labels(conn, clusters, chunk_ids)

        # Step 8: Silhouette
        logger.info("=" * 60)
        logger.info("STEP 7: Computing silhouette scores")
        logger.info("=" * 60)
        silhouette_scores = compute_silhouette_sample(embeddings, clusters, chunk_ids)

        # Record run
        now = datetime.now(timezone.utc).isoformat()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO clustering_runs (id, started_at, completed_at, status, total_chunks, level_counts, params, silhouette_scores) VALUES (?, ?, ?, 'completed', ?, ?, ?, ?)",
            (
                run_id,
                datetime.fromtimestamp(t0, tz=timezone.utc).isoformat(),
                now,
                n,
                json.dumps(level_counts),
                json.dumps({"k": k, "level_targets": level_targets}),
                json.dumps(silhouette_scores),
            ),
        )
    finally:
        conn.close()

    elapsed = time.time() - t0
    logger.info("=" * 60)
    logger.info(f"DONE in {elapsed / 60:.1f} minutes")
    logger.info(f" Chunks: {n}")
    logger.info(f" Clusters: {level_counts}")
    logger.info(f" Silhouette: {silhouette_scores}")
    logger.info(f" Run ID: {run_id}")
    logger.info("=" * 60)

    return {
        "run_id": run_id,
        "total_chunks": n,
        "level_counts": level_counts,
        "silhouette_scores": silhouette_scores,
        "elapsed_minutes": elapsed / 60,
    }
719
+
720
+
721
if __name__ == "__main__":
    # Command-line entry point: python3 -m brainlayer.clustering [options]
    parser = argparse.ArgumentParser(description="Run hierarchical clustering on BrainLayer chunks")
    parser.add_argument("--db-path", type=str, default=str(DEFAULT_DB))
    parser.add_argument("--k", type=int, default=30, help="KNN neighbors")
    parser.add_argument("--dry-run", action="store_true")
    # Defaults come from LEVEL_TARGETS so the CLI and the library entry point
    # cannot drift apart.
    parser.add_argument("--l0", type=int, default=LEVEL_TARGETS[0], help="Target L0 clusters")
    parser.add_argument("--l1", type=int, default=LEVEL_TARGETS[1], help="Target L1 per L0")
    parser.add_argument("--l2", type=int, default=LEVEL_TARGETS[2], help="Target L2 per L1")
    args = parser.parse_args()

    # Reject non-positive values up front: a zero target would otherwise only
    # fail with a division by zero in the resolution search, after the
    # expensive extraction and KNN steps.
    if min(args.k, args.l0, args.l1, args.l2) < 1:
        parser.error("--k, --l0, --l1 and --l2 must be positive integers")

    run_clustering(
        db_path=args.db_path,
        k=args.k,
        level_targets=[args.l0, args.l1, args.l2],
        dry_run=args.dry_run,
    )