brainlayer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brainlayer/__init__.py +3 -0
- brainlayer/cli/__init__.py +1545 -0
- brainlayer/cli/wizard.py +132 -0
- brainlayer/cli_new.py +151 -0
- brainlayer/client.py +164 -0
- brainlayer/clustering.py +736 -0
- brainlayer/daemon.py +1105 -0
- brainlayer/dashboard/README.md +129 -0
- brainlayer/dashboard/__init__.py +5 -0
- brainlayer/dashboard/app.py +151 -0
- brainlayer/dashboard/search.py +229 -0
- brainlayer/dashboard/views.py +230 -0
- brainlayer/embeddings.py +131 -0
- brainlayer/engine.py +550 -0
- brainlayer/index_new.py +87 -0
- brainlayer/mcp/__init__.py +1558 -0
- brainlayer/migrate.py +205 -0
- brainlayer/paths.py +43 -0
- brainlayer/pipeline/__init__.py +47 -0
- brainlayer/pipeline/analyze_communication.py +508 -0
- brainlayer/pipeline/brain_graph.py +567 -0
- brainlayer/pipeline/chat_tags.py +63 -0
- brainlayer/pipeline/chunk.py +422 -0
- brainlayer/pipeline/classify.py +472 -0
- brainlayer/pipeline/cluster_sampling.py +73 -0
- brainlayer/pipeline/enrichment.py +810 -0
- brainlayer/pipeline/extract.py +66 -0
- brainlayer/pipeline/extract_claude_desktop.py +149 -0
- brainlayer/pipeline/extract_corrections.py +231 -0
- brainlayer/pipeline/extract_markdown.py +195 -0
- brainlayer/pipeline/extract_whatsapp.py +227 -0
- brainlayer/pipeline/git_overlay.py +301 -0
- brainlayer/pipeline/longitudinal_analyzer.py +568 -0
- brainlayer/pipeline/obsidian_export.py +455 -0
- brainlayer/pipeline/operation_grouping.py +486 -0
- brainlayer/pipeline/plan_linking.py +313 -0
- brainlayer/pipeline/sanitize.py +549 -0
- brainlayer/pipeline/semantic_style.py +574 -0
- brainlayer/pipeline/session_enrichment.py +472 -0
- brainlayer/pipeline/style_embed.py +67 -0
- brainlayer/pipeline/style_index.py +139 -0
- brainlayer/pipeline/temporal_chains.py +203 -0
- brainlayer/pipeline/time_batcher.py +248 -0
- brainlayer/pipeline/unified_timeline.py +569 -0
- brainlayer/storage.py +66 -0
- brainlayer/store.py +155 -0
- brainlayer/taxonomy.json +80 -0
- brainlayer/vector_store.py +1891 -0
- brainlayer-1.0.0.dist-info/METADATA +313 -0
- brainlayer-1.0.0.dist-info/RECORD +53 -0
- brainlayer-1.0.0.dist-info/WHEEL +4 -0
- brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
- brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
brainlayer/clustering.py
ADDED
|
@@ -0,0 +1,736 @@
|
|
|
1
|
+
"""Hierarchical clustering pipeline for BrainLayer chunks.
|
|
2
|
+
|
|
3
|
+
Builds a 3-level cluster hierarchy using Recursive Leiden on a Faiss KNN graph.
|
|
4
|
+
Stores results in SQLite tables (clusters, chunk_clusters, vec_cluster_centroids).
|
|
5
|
+
|
|
6
|
+
Architecture:
|
|
7
|
+
245K chunk embeddings (sqlite-vec, 1024 dims)
|
|
8
|
+
→ L2-normalize
|
|
9
|
+
→ Faiss IndexFlatIP k=30 KNN graph (~2 GB, 5-15 min)
|
|
10
|
+
→ igraph conversion
|
|
11
|
+
→ Recursive Leiden at 3 resolutions
|
|
12
|
+
Level 0: ~40 clusters (resolution ~0.005)
|
|
13
|
+
Level 1: ~400 clusters (~10 per L0, resolution ~0.05)
|
|
14
|
+
Level 2: ~4000 clusters (~10 per L1, resolution ~0.5)
|
|
15
|
+
→ Centroids + materialized paths → SQLite
|
|
16
|
+
|
|
17
|
+
Usage:
|
|
18
|
+
python3 -m brainlayer.clustering [--db-path PATH] [--k 30] [--dry-run]
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
import logging
|
|
24
|
+
import struct
|
|
25
|
+
import time
|
|
26
|
+
import uuid
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
from datetime import datetime, timezone
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Optional
|
|
31
|
+
|
|
32
|
+
import apsw
|
|
33
|
+
import faiss
|
|
34
|
+
import igraph as ig
|
|
35
|
+
import leidenalg
|
|
36
|
+
import numpy as np
|
|
37
|
+
import sqlite_vec
|
|
38
|
+
|
|
39
|
+
# Root logging is configured at import time so that running this module as a
# script prints timestamped progress lines.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default database location under the user's home directory.
DEFAULT_DB = Path.home().joinpath(".local", "share", "brainlayer", "brainlayer.db")

# Number of float32 components in every stored embedding.
EMBEDDING_DIM = 1024

# Target cluster counts at each level:
#   L0: ~40 top-level clusters, L1: ~10 per L0, L2: ~10 per L1
LEVEL_TARGETS = [40, 10, 10]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def serialize_f32(vector) -> bytes:
    """Pack a sequence of numbers into a float32 byte blob for sqlite-vec.

    Note: this duplicates vector_store.serialize_f32 on purpose. clustering.py
    runs as a standalone script (python -m brainlayer.clustering) and importing
    vector_store would pull in heavy dependencies.

    Args:
        vector: Sized iterable of numbers.

    Returns:
        ``len(vector)`` consecutive 4-byte floats in native byte order.
    """
    pack_format = "%df" % len(vector)
    return struct.pack(pack_format, *vector)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ─── Step 1: Extract Embeddings ─────────────────────────────────
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def extract_embeddings(db_path: str):
    """Batch-read all embeddings from sqlite-vec.

    Opens the database read-only, reads every (chunk_id, embedding) row from
    ``chunk_vectors`` and returns the vectors ordered by ``chunks.rowid``.
    A row is skipped when its blob is empty, is not exactly
    ``EMBEDDING_DIM * 4`` bytes long, or its chunk_id is not present in
    ``chunks``.

    The connection is closed even when loading the extension or a query
    raises, so a failed run does not leak the database handle.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        chunk_ids: list of chunk IDs (same order as embeddings)
        embeddings: float32 numpy array of shape (N, EMBEDDING_DIM)
    """
    conn = apsw.Connection(db_path, flags=apsw.SQLITE_OPEN_READONLY)
    try:
        conn.enableloadextension(True)
        conn.loadextension(sqlite_vec.loadable_path())
        conn.enableloadextension(False)
        cursor = conn.cursor()

        total = list(cursor.execute("SELECT COUNT(*) FROM chunk_vectors"))[0][0]
        logger.info(f"Extracting {total} embeddings...")

        # Read all chunk IDs first (ordered by rowid for deterministic ordering)
        logger.info(" Reading chunk IDs...")
        all_chunk_ids = [row[0] for row in cursor.execute("SELECT id FROM chunks ORDER BY rowid")]
        logger.info(f" Got {len(all_chunk_ids)} chunk IDs")

        # Set for O(1) membership tests while streaming the vector table.
        chunk_id_set = set(all_chunk_ids)

        # Read all vectors — vec0 doesn't support ORDER BY or OFFSET,
        # so we read everything and match by chunk_id.
        # AIDEV-NOTE: vec0 virtual tables return rows in insertion order (not guaranteed sorted).
        # We read all and re-order to match chunks.rowid order.
        logger.info(" Reading all vectors from chunk_vectors...")
        expected_bytes = EMBEDDING_DIM * 4
        vec_map = {}
        for chunk_id, emb_bytes in cursor.execute("SELECT chunk_id, embedding FROM chunk_vectors"):
            if emb_bytes and len(emb_bytes) == expected_bytes and chunk_id in chunk_id_set:
                # .copy() detaches the array from the blob returned by the cursor.
                vec_map[chunk_id] = np.frombuffer(emb_bytes, dtype=np.float32).copy()
    finally:
        conn.close()

    # Re-order to match chunks.rowid order
    chunk_ids = [cid for cid in all_chunk_ids if cid in vec_map]
    embeddings = [vec_map[cid] for cid in chunk_ids]

    del vec_map  # Free memory
    logger.info(f" Matched {len(chunk_ids)}/{len(all_chunk_ids)} chunks with vectors")

    embeddings_array = np.vstack(embeddings) if embeddings else np.zeros((0, EMBEDDING_DIM), dtype=np.float32)
    logger.info(f"Extracted {len(chunk_ids)} embeddings, shape: {embeddings_array.shape}")
    return chunk_ids, embeddings_array
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ─── Step 2: Build KNN Graph ────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def build_knn_graph(embeddings: np.ndarray, k: int = 30):
    """Build KNN graph using Faiss IndexFlatIP after L2 normalization.

    L2 normalization converts dot product (IP) → cosine similarity.

    Note: ``embeddings`` is normalized IN PLACE, so the caller's array holds
    unit-length rows after this call.

    The search asks for ``k + 1`` neighbors and then drops one column per row:
    the row's own index wherever it appears, or the last (weakest) result if
    the row's own index is not among the results. The self-match is not
    guaranteed to be in column 0 when several vectors tie (e.g. exact
    duplicates), so blindly dropping column 0 could discard a real neighbor
    and keep the self-match instead.

    Args:
        embeddings: (N, d) array of vectors; modified in place.
        k: Number of neighbors to return per vector.

    Returns:
        distances: (N, k) array of cosine similarities
        indices: (N, k) array of neighbor indices
    """
    n, d = embeddings.shape
    logger.info(f"L2-normalizing {n} embeddings...")
    faiss.normalize_L2(embeddings)  # in-place

    logger.info(f"Building Faiss IndexFlatIP for KNN (k={k})...")
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)

    logger.info(f"Searching {k} nearest neighbors for {n} vectors...")
    t0 = time.time()
    # k+1 so that k neighbors remain once the point itself is removed
    distances, indices = index.search(embeddings, k + 1)
    elapsed = time.time() - t0
    logger.info(f"KNN search completed in {elapsed:.1f}s")

    # Mark exactly one column per row for removal: the self-match if present,
    # otherwise the final column.
    drop = indices == np.arange(n).reshape(-1, 1)
    rows_without_self = ~drop.any(axis=1)
    drop[rows_without_self, -1] = True
    keep = ~drop

    # Boolean masking flattens in row-major order, so reshaping restores rows.
    return distances[keep].reshape(n, k), indices[keep].reshape(n, k)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ─── Step 3: Convert to igraph ──────────────────────────────────
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def knn_to_igraph(indices: np.ndarray, distances: np.ndarray, n: int):
    """Turn KNN search output into a weighted, undirected igraph graph.

    Every directed neighbor relation i → j is folded into the unordered pair
    (min, max); when a pair is seen more than once the larger weight wins.
    Neighbor entries that are negative, out of range, or equal to the row
    index are ignored. Final edge weights are floored at 0.001.

    Args:
        indices: (N, k) neighbor indices.
        distances: (N, k) similarity values aligned with ``indices``.
        n: Number of vertices in the graph.

    Returns:
        ig.Graph with ``n`` vertices and a ``weight`` edge attribute.
    """
    logger.info(f"Building igraph from KNN ({n} nodes)...")

    seen_pairs = set()
    strongest = {}

    for src in range(n):
        for col in range(indices.shape[1]):
            dst = int(indices[src, col])
            if not (0 <= dst < n) or dst == src:
                continue
            similarity = float(distances[src, col])
            pair = (src, dst) if src < dst else (dst, src)
            if pair in seen_pairs:
                strongest[pair] = max(strongest[pair], similarity)
            else:
                seen_pairs.add(pair)
                strongest[pair] = similarity

    edges = list(seen_pairs)
    weights = [max(0.001, strongest[pair]) for pair in edges]

    g = ig.Graph(n=n, edges=edges, directed=False)
    g.es["weight"] = weights

    logger.info(f"igraph: {g.vcount()} nodes, {g.ecount()} edges")
    return g
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ─── Step 4: Resolution Binary Search ───────────────────────────
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def find_resolution_for_target(
    graph: ig.Graph,
    weights: list,
    target: int,
    lo: float = 0.0001,
    hi: float = 5.0,
    tolerance: float = 0.2,
    max_iters: int = 20,
) -> float:
    """Binary search for Leiden resolution that gives ~target clusters.

    Each iteration runs a short Leiden pass (3 iterations, seed 42) at the
    midpoint of ``[lo, hi]``. If that yields more clusters than ``target`` the
    upper bound is lowered, otherwise the lower bound is raised.

    Args:
        graph: Graph to partition.
        weights: Edge weights passed to Leiden (may be None).
        target: Desired number of clusters; must be >= 1.
        lo: Lower bound of the resolution search interval.
        hi: Upper bound of the resolution search interval.
        tolerance: fraction of target we accept (e.g., 0.2 = within 20%)
        max_iters: Maximum number of bisection steps.

    Returns:
        The first resolution whose relative error is below ``tolerance``, or
        the best resolution seen after ``max_iters`` steps.

    Raises:
        ValueError: If ``target`` is less than 1. A zero target would divide
            by zero and a negative one would be "accepted" immediately.
    """
    if target < 1:
        raise ValueError(f"target must be a positive cluster count, got {target}")

    best_res = (lo + hi) / 2
    best_diff = float("inf")

    for _ in range(max_iters):
        mid = (lo + hi) / 2
        partition = leidenalg.find_partition(
            graph,
            leidenalg.RBConfigurationVertexPartition,
            weights=weights,
            resolution_parameter=mid,
            n_iterations=3,  # Fewer iterations for search
            seed=42,
        )
        n_clusters = len(set(partition.membership))

        # Relative distance from the requested cluster count.
        diff = abs(n_clusters - target) / target
        if diff < best_diff:
            best_diff = diff
            best_res = mid

        if diff < tolerance:
            logger.info(f" Resolution {mid:.6f} → {n_clusters} clusters (target: {target}, diff: {diff:.1%})")
            return mid

        if n_clusters > target:
            hi = mid  # Too many clusters → lower resolution
        else:
            lo = mid  # Too few → higher resolution

    logger.info(f" Best resolution {best_res:.6f} after {max_iters} iters (diff: {best_diff:.1%})")
    return best_res
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# ─── Step 5: Recursive Leiden ────────────────────────────────────
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def recursive_leiden(
    graph: ig.Graph,
    node_indices: np.ndarray,
    level_targets: list,
    embeddings: np.ndarray,
    level: int = 0,
    parent_id: Optional[str] = None,
    parent_path: str = "",
):
    """Partition ``graph`` with Leiden, then recurse into each community.

    The current level is split into roughly ``level_targets[level]``
    communities. Each community with at least 6 vertices is partitioned again
    at the next level (if one exists) on its induced subgraph, so children
    are always nested inside their parent.

    Args:
        graph: Graph for the current level; vertex i corresponds to
            ``node_indices[i]``.
        node_indices: Global embedding row index for every vertex of ``graph``.
        level_targets: Target cluster count per level.
        embeddings: Full embedding matrix, indexed by global row index.
        level: Current depth (0 = top).
        parent_id: ID of the enclosing cluster, or None at the top level.
        parent_path: Materialized path of the enclosing cluster ("" at top).

    Returns:
        list of cluster dicts with keys
        {id, level, parent_id, path, node_indices, centroid, chunk_count}.
        A parent is followed by its descendants.
    """
    if level >= len(level_targets):
        return []

    wanted = level_targets[level]
    vertex_count = graph.vcount()

    def _mint_identity():
        """Return a fresh (cluster_id, materialized_path) pair under the parent."""
        fresh_id = str(uuid.uuid4())[:12]
        fresh_path = fresh_id if not parent_path else f"{parent_path}/{fresh_id}"
        return fresh_id, fresh_path

    if vertex_count < 3:
        # Too small to cluster further: emit the whole input as one cluster.
        leaf_id, leaf_path = _mint_identity()
        if len(node_indices) > 0:
            leaf_centroid = embeddings[node_indices].mean(axis=0)
        else:
            leaf_centroid = np.zeros(EMBEDDING_DIM)
        return [
            dict(
                id=leaf_id,
                level=level,
                parent_id=parent_id,
                path=leaf_path,
                node_indices=node_indices,
                centroid=leaf_centroid,
                chunk_count=len(node_indices),
            )
        ]

    # Edge weights of this (sub)graph; None when it has no edges.
    edge_weights = graph.es["weight"] if graph.es else None

    # Don't ask for more clusters than a third of the available points.
    capped_target = min(wanted, max(2, vertex_count // 3))
    resolution = find_resolution_for_target(graph, edge_weights, capped_target)

    # Final pass: iterate until convergence at the chosen resolution.
    final_partition = leidenalg.find_partition(
        graph,
        leidenalg.RBConfigurationVertexPartition,
        weights=edge_weights,
        resolution_parameter=resolution,
        n_iterations=-1,  # Until convergence
        seed=42,
    )

    members_by_community = defaultdict(list)
    for vertex, community in enumerate(final_partition.membership):
        members_by_community[community].append(vertex)

    logger.info(
        f"Level {level}: {len(members_by_community)} clusters from {vertex_count} nodes (res={resolution:.6f})"
    )

    has_deeper_level = level + 1 < len(level_targets)
    results = []

    for community in sorted(members_by_community):
        local_members = members_by_community[community]
        global_members = np.array([node_indices[v] for v in local_members])

        cluster_id, cluster_path = _mint_identity()
        results.append(
            dict(
                id=cluster_id,
                level=level,
                parent_id=parent_id,
                path=cluster_path,
                node_indices=global_members,
                centroid=embeddings[global_members].mean(axis=0),
                chunk_count=len(global_members),
            )
        )

        # Only descend when another level exists and the community is big enough.
        if not has_deeper_level or len(local_members) < 6:
            continue

        results.extend(
            recursive_leiden(
                graph.subgraph(local_members),
                global_members,
                level_targets,
                embeddings,
                level=level + 1,
                parent_id=cluster_id,
                parent_path=cluster_path,
            )
        )

    return results
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
# ─── Step 6: Write to SQLite ────────────────────────────────────
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def create_cluster_schema(conn: apsw.Connection):
    """Ensure every clustering table and index exists in the BrainLayer DB.

    All statements use IF NOT EXISTS, so calling this on an already
    initialized database leaves existing objects untouched.

    Args:
        conn: Open connection; the ``vec0`` module must be available on it
            for the centroid virtual table.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS clusters (
            id TEXT PRIMARY KEY,
            level INTEGER NOT NULL,
            parent_id TEXT,
            path TEXT NOT NULL,
            label TEXT,
            ctfidf_label TEXT,
            chunk_count INTEGER DEFAULT 0,
            silhouette_score REAL,
            avg_intra_dist REAL,
            created_at TEXT,
            updated_at TEXT
        )
    """,
        """
        CREATE TABLE IF NOT EXISTS chunk_clusters (
            chunk_id TEXT NOT NULL,
            cluster_id TEXT NOT NULL,
            level INTEGER NOT NULL,
            dist_to_centroid REAL,
            assignment_method TEXT DEFAULT 'initial',
            assigned_at TEXT,
            PRIMARY KEY (chunk_id, level)
        )
    """,
        "CREATE INDEX IF NOT EXISTS idx_chunk_clusters_cluster ON chunk_clusters(cluster_id)",
        "CREATE INDEX IF NOT EXISTS idx_chunk_clusters_level ON chunk_clusters(level)",
        "CREATE INDEX IF NOT EXISTS idx_clusters_level ON clusters(level)",
        "CREATE INDEX IF NOT EXISTS idx_clusters_parent ON clusters(parent_id)",
        "CREATE INDEX IF NOT EXISTS idx_clusters_path ON clusters(path)",
        """
        CREATE TABLE IF NOT EXISTS clustering_runs (
            id TEXT PRIMARY KEY,
            started_at TEXT,
            completed_at TEXT,
            status TEXT DEFAULT 'running',
            total_chunks INTEGER,
            level_counts TEXT,
            params TEXT,
            silhouette_scores TEXT
        )
    """,
        # vec_cluster_centroids — virtual table for centroid KNN search
        """
        CREATE VIRTUAL TABLE IF NOT EXISTS vec_cluster_centroids USING vec0(
            cluster_id TEXT PRIMARY KEY,
            centroid FLOAT[1024]
        )
    """,
    )

    db = conn.cursor()
    for statement in ddl_statements:
        db.execute(statement)

    logger.info("Cluster schema created/verified")
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def write_clusters(
    conn: apsw.Connection,
    clusters: list,
    chunk_ids: list,
    embeddings: np.ndarray,
    run_id: str,
):
    """Replace the stored cluster hierarchy with ``clusters``.

    Everything happens inside one explicit transaction: the three cluster
    tables are emptied and then refilled. If any statement fails the
    transaction is rolled back and the exception re-raised, so the previous
    data is never left half-replaced.

    Args:
        conn: Open, writable connection with the cluster schema present.
        clusters: Cluster dicts as produced by ``recursive_leiden``.
        chunk_ids: Chunk IDs indexed by global embedding row.
        embeddings: Embedding matrix indexed by global embedding row.
        run_id: Identifier of the clustering run (not used by this function).
    """
    db = conn.cursor()
    db.execute("PRAGMA busy_timeout = 5000")
    stamp = datetime.now(timezone.utc).isoformat()

    db.execute("BEGIN")
    try:
        # Clear previous data
        for wipe_sql in (
            "DELETE FROM clusters",
            "DELETE FROM chunk_clusters",
            "DELETE FROM vec_cluster_centroids",
        ):
            db.execute(wipe_sql)

        logger.info(f"Writing {len(clusters)} clusters...")

        for entry in clusters:
            entry_id = entry["id"]
            entry_level = entry["level"]
            raw_centroid = entry["centroid"]

            # Cluster row
            db.execute(
                "INSERT INTO clusters (id, level, parent_id, path, chunk_count, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    entry_id,
                    entry_level,
                    entry["parent_id"],
                    entry["path"],
                    entry["chunk_count"],
                    stamp,
                    stamp,
                ),
            )

            # Centroid row (L2-normalize for cosine similarity via sqlite-vec match).
            # A zero vector is stored unchanged to avoid dividing by zero.
            unit_centroid = raw_centroid.copy()
            length = np.linalg.norm(unit_centroid)
            if length > 0:
                unit_centroid /= length
            db.execute(
                "INSERT INTO vec_cluster_centroids (cluster_id, centroid) VALUES (?, ?)",
                (entry_id, serialize_f32(unit_centroid.tolist())),
            )

            # Chunk assignments; distance is Euclidean to the un-normalized centroid.
            for row_idx in entry["node_indices"]:
                member_id = chunk_ids[row_idx]
                distance = float(np.linalg.norm(embeddings[row_idx] - raw_centroid))
                db.execute(
                    "INSERT OR REPLACE INTO chunk_clusters (chunk_id, cluster_id, level, dist_to_centroid, assignment_method, assigned_at) VALUES (?, ?, ?, ?, 'initial', ?)",
                    (member_id, entry_id, entry_level, distance, stamp),
                )

        db.execute("COMMIT")
        logger.info(f"Written {len(clusters)} clusters to DB")
    except Exception:
        db.execute("ROLLBACK")
        logger.error("Failed to write clusters — rolled back")
        raise
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
# ─── Step 7: c-TF-IDF Labeling ──────────────────────────────────
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def generate_ctfidf_labels(
    conn: apsw.Connection,
    clusters: list,
    chunk_ids: list,
):
    """Generate c-TF-IDF labels for all clusters.

    For each level, one document is built per cluster by concatenating the
    first 2000 characters of up to 50 of its member chunks. A TF-IDF model is
    fitted over the documents of that level, and each cluster is labeled with
    up to three of its highest-scoring terms joined by " / " (falling back to
    ``cluster-<id>`` when no term scores above zero). The label is written to
    both ``clusters.ctfidf_label`` and ``clusters.label``.

    A level whose vectorization fails (scikit-learn raises ValueError) is
    skipped with a warning rather than silently. All label updates for a
    level are committed as one transaction instead of one commit per row.

    Args:
        conn: Open, writable connection containing ``chunks`` and ``clusters``.
        clusters: Cluster dicts as produced by ``recursive_leiden``.
        chunk_ids: Chunk IDs indexed by global embedding row.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    cursor = conn.cursor()
    cursor.execute("PRAGMA busy_timeout = 5000")

    # Map chunk id → content (NULL content becomes an empty string).
    logger.info("Loading chunk content for labeling...")
    content_map = {}
    for row in cursor.execute("SELECT id, content FROM chunks"):
        content_map[row[0]] = row[1] or ""

    # Group clusters by level
    by_level = defaultdict(list)
    for c in clusters:
        by_level[c["level"]].append(c)

    for level in sorted(by_level.keys()):
        level_clusters = by_level[level]
        logger.info(f"Labeling {len(level_clusters)} clusters at level {level}...")

        # Build per-cluster documents
        docs = []
        cluster_ids_for_docs = []
        for c in level_clusters:
            # Sample up to 50 chunks per cluster for labeling
            # (slicing is a no-op when there are fewer than 50).
            texts = []
            for idx in c["node_indices"][:50]:
                text = content_map.get(chunk_ids[idx], "")
                if text:
                    texts.append(text[:2000])
            doc = " ".join(texts)
            if doc.strip():
                docs.append(doc)
                cluster_ids_for_docs.append(c["id"])

        if not docs:
            continue

        vectorizer = TfidfVectorizer(
            max_features=2000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=1,
            max_df=0.8,
        )

        try:
            tfidf = vectorizer.fit_transform(docs)
        except ValueError as exc:
            logger.warning(f"c-TF-IDF vectorization failed at level {level}: {exc}")
            continue

        feature_names = vectorizer.get_feature_names_out()

        # One transaction per level: committed on success, rolled back if an
        # UPDATE raises.
        with conn:
            for i, cluster_id in enumerate(cluster_ids_for_docs):
                # Densify the row once instead of indexing the sparse matrix per term.
                row_scores = tfidf[i].toarray()[0]
                top_indices = row_scores.argsort()[-5:][::-1]
                top_terms = [feature_names[idx] for idx in top_indices if row_scores[idx] > 0]
                label = " / ".join(top_terms[:3]) if top_terms else f"cluster-{cluster_id}"

                cursor.execute(
                    "UPDATE clusters SET ctfidf_label = ?, label = ? WHERE id = ?",
                    (label, label, cluster_id),
                )

    logger.info("c-TF-IDF labeling complete")
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
# ─── Step 8: Silhouette Score ────────────────────────────────────
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def compute_silhouette_sample(
    embeddings: np.ndarray,
    clusters: list,
    chunk_ids: list,
    sample_size: int = 5000,
):
    """Estimate a cosine silhouette score for every hierarchy level.

    For each level the member rows of all clusters are collected. When there
    are more than ``sample_size`` of them, a fixed-seed random subset of that
    size is used. Levels with fewer than 10 rows, or with fewer than 2
    distinct clusters in the selection, are skipped. A failure while scoring
    a level is logged as a warning and that level is omitted.

    Args:
        embeddings: Embedding matrix indexed by global embedding row.
        clusters: Cluster dicts as produced by ``recursive_leiden``.
        chunk_ids: Chunk IDs indexed by global row (not used by this function).
        sample_size: Maximum number of rows scored per level.

    Returns:
        dict of {level: silhouette_score}.
    """
    from sklearn.metrics import silhouette_score

    clusters_at = defaultdict(list)
    for cluster in clusters:
        clusters_at[cluster["level"]].append(cluster)

    level_scores = {}
    for level in sorted(clusters_at):
        # global row index → id of the cluster that owns it at this level
        owner_of = {
            member: cluster["id"]
            for cluster in clusters_at[level]
            for member in cluster["node_indices"]
        }

        members = sorted(owner_of)
        if len(members) < 10:
            continue

        # Sample for efficiency
        if len(members) > sample_size:
            rng = np.random.default_rng(42)
            members = sorted(rng.choice(members, sample_size, replace=False))

        points = embeddings[members]
        assigned = [owner_of[member] for member in members]

        # silhouette needs at least two distinct labels
        if len(set(assigned)) < 2:
            continue

        try:
            value = silhouette_score(
                points,
                assigned,
                metric="cosine",
                sample_size=min(len(points), sample_size),
            )
            level_scores[level] = float(value)
            logger.info(f"Silhouette score L{level}: {value:.4f}")
        except Exception as exc:
            logger.warning(f"Silhouette failed for L{level}: {exc}")

    return level_scores
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
# ─── Main Pipeline ───────────────────────────────────────────────
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def run_clustering(
    db_path: str = str(DEFAULT_DB),
    k: int = 30,
    level_targets: Optional[list] = None,
    dry_run: bool = False,
):
    """Run the full clustering pipeline.

    Steps: extract embeddings → build KNN graph → convert to igraph →
    recursive Leiden → write hierarchy to SQLite → c-TF-IDF labels →
    silhouette scores → record the run in ``clustering_runs``.

    The write connection is closed even if a step after opening it raises.

    Args:
        db_path: Path to the SQLite database file.
        k: Number of nearest neighbors per chunk in the KNN graph.
        level_targets: Target cluster count per level; defaults to
            ``LEVEL_TARGETS`` when None.
        dry_run: When True, only extract embeddings, log what would be done
            and return None without clustering or writing anything.

    Returns:
        None for a dry run; otherwise a dict with keys ``run_id``,
        ``total_chunks``, ``level_counts``, ``silhouette_scores`` and
        ``elapsed_minutes``.
    """
    if level_targets is None:
        level_targets = LEVEL_TARGETS

    run_id = str(uuid.uuid4())[:12]
    t0 = time.time()

    # Step 1: Extract
    logger.info("=" * 60)
    logger.info("STEP 1: Extracting embeddings")
    logger.info("=" * 60)
    chunk_ids, embeddings = extract_embeddings(db_path)
    n = len(chunk_ids)

    if dry_run:
        logger.info(f"DRY RUN: {n} chunks extracted. Would cluster with k={k}, targets={level_targets}")
        return

    # Step 2: KNN
    logger.info("=" * 60)
    logger.info("STEP 2: Building KNN graph")
    logger.info("=" * 60)
    distances, indices = build_knn_graph(embeddings, k=k)

    # Step 3: igraph
    logger.info("=" * 60)
    logger.info("STEP 3: Converting to igraph")
    logger.info("=" * 60)
    graph = knn_to_igraph(indices, distances, n)

    # Step 4: Recursive Leiden (resolution search happens inside)
    logger.info("=" * 60)
    logger.info("STEP 4: Running recursive Leiden clustering")
    logger.info("=" * 60)
    all_indices = np.arange(n)
    clusters = recursive_leiden(graph, all_indices, level_targets, embeddings)

    # Report cluster counts
    by_level = defaultdict(int)
    for c in clusters:
        by_level[c["level"]] += 1
    level_counts = {f"L{lv}": cnt for lv, cnt in sorted(by_level.items())}
    logger.info(f"Cluster counts: {level_counts}")

    # Step 5: Write to DB
    logger.info("=" * 60)
    logger.info("STEP 5: Writing to SQLite")
    logger.info("=" * 60)
    conn = apsw.Connection(db_path)
    try:
        conn.enableloadextension(True)
        conn.loadextension(sqlite_vec.loadable_path())
        conn.enableloadextension(False)

        create_cluster_schema(conn)
        write_clusters(conn, clusters, chunk_ids, embeddings, run_id)

        # Step 6: c-TF-IDF labels
        logger.info("=" * 60)
        logger.info("STEP 6: Generating c-TF-IDF labels")
        logger.info("=" * 60)
        generate_ctfidf_labels(conn, clusters, chunk_ids)

        # Step 7: Silhouette
        logger.info("=" * 60)
        logger.info("STEP 7: Computing silhouette scores")
        logger.info("=" * 60)
        silhouette_scores = compute_silhouette_sample(embeddings, clusters, chunk_ids)

        # Record run (only reached when every previous step succeeded)
        now = datetime.now(timezone.utc).isoformat()
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO clustering_runs (id, started_at, completed_at, status, total_chunks, level_counts, params, silhouette_scores) VALUES (?, ?, ?, 'completed', ?, ?, ?, ?)",
            (
                run_id,
                datetime.fromtimestamp(t0, tz=timezone.utc).isoformat(),
                now,
                n,
                json.dumps(level_counts),
                json.dumps({"k": k, "level_targets": level_targets}),
                json.dumps(silhouette_scores),
            ),
        )
    finally:
        conn.close()

    elapsed = time.time() - t0
    logger.info("=" * 60)
    logger.info(f"DONE in {elapsed / 60:.1f} minutes")
    logger.info(f" Chunks: {n}")
    logger.info(f" Clusters: {level_counts}")
    logger.info(f" Silhouette: {silhouette_scores}")
    logger.info(f" Run ID: {run_id}")
    logger.info("=" * 60)

    return {
        "run_id": run_id,
        "total_chunks": n,
        "level_counts": level_counts,
        "silhouette_scores": silhouette_scores,
        "elapsed_minutes": elapsed / 60,
    }
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
if __name__ == "__main__":
    # Command-line entry point: parse options and run the whole pipeline.
    cli = argparse.ArgumentParser(description="Run hierarchical clustering on BrainLayer chunks")
    cli.add_argument("--db-path", type=str, default=str(DEFAULT_DB))
    cli.add_argument("--k", type=int, default=30, help="KNN neighbors")
    cli.add_argument("--dry-run", action="store_true")

    # One integer option per hierarchy level, registered in level order.
    for flag, default_count, help_text in (
        ("--l0", 40, "Target L0 clusters"),
        ("--l1", 10, "Target L1 per L0"),
        ("--l2", 10, "Target L2 per L1"),
    ):
        cli.add_argument(flag, type=int, default=default_count, help=help_text)

    opts = cli.parse_args()
    requested_targets = [opts.l0, opts.l1, opts.l2]

    run_clustering(
        db_path=opts.db_path,
        k=opts.k,
        level_targets=requested_targets,
        dry_run=opts.dry_run,
    )
|