claude_sql-0.4.0-py3-none-any.whl

@@ -0,0 +1,208 @@
+ """Cluster message embeddings via UMAP + HDBSCAN.
+
+ Pipeline
+ --------
+ 1. Read the embeddings parquet into a numpy float32 matrix.
+ 2. UMAP-reduce to 50d (for HDBSCAN) and 2d (for viz), both seeded with
+    ``settings.seed``.
+ 3. HDBSCAN on the 50d projection → cluster_id per row, -1 for noise.
+ 4. Write ``clusters.parquet`` with ``(uuid, cluster_id, x, y, is_noise)``.
+
+ Public API
+ ----------
+ run_clustering(settings, *, force=False) -> dict[str, int]
+     Read parquet, compute clusters, write output parquet. Returns stats dict.
+ """
+
+ from __future__ import annotations
+
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import polars as pl
+ from loguru import logger
+
+ from claude_sql.config import Settings
+ from claude_sql.parquet_shards import iter_part_files
+
+
+ def _load_embeddings(path: Path) -> tuple[list[str], np.ndarray]:
+     """Read parquet → (uuid_list, embedding_matrix[float32]). Matrix shape (N, dim).
+
+     Accepts either a legacy single-file ``embeddings.parquet`` or a sharded
+     directory containing ``part-*.parquet`` files; the union of all parts is
+     materialized in scan order.
+     """
+     parts = iter_part_files(path)
+     df = pl.read_parquet([str(p) for p in parts]) if parts else pl.read_parquet(path)
+     uuids = df["uuid"].to_list()
+     # polars Array[Float32, dim] → 2D numpy (N, dim). to_numpy() on an Array column
+     # returns 2D when the element type is fixed-size Array; fall back to stacking if not.
+     emb = df["embedding"].to_numpy()
+     if emb.ndim == 1:
+         # Object array of 1D arrays (ragged container); stack into a 2D matrix.
+         emb = np.stack(list(emb))
+     return uuids, np.ascontiguousarray(emb, dtype=np.float32)
+
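+ # Worked micro-example (illustrative shapes): three 2-d embeddings load as
+ # a (3, 2) matrix directly from a fixed-size Array column; if polars hands
+ # back an object array of shape (3,), np.stack rebuilds the same (3, 2)
+ # float32 matrix.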
+
+ def run_clustering(settings: Settings, *, force: bool = False) -> dict[str, int]:
+     """Run UMAP + HDBSCAN on the embeddings parquet.
+
+     Parameters
+     ----------
+     settings
+         Configured Settings; reads ``embeddings_parquet_path`` and writes
+         ``clusters_parquet_path``.
+     force
+         If False and clusters_parquet_path exists, return its stats without
+         recomputing. If True, always rerun.
+
+     Returns
+     -------
+     dict
+         ``{"total": N, "clusters": K, "noise": M}`` where K excludes the
+         noise cluster (label -1).
+     """
+     out_path = settings.clusters_parquet_path
+     in_path = settings.embeddings_parquet_path
+
+     # Sharded directory or legacy single file: insist on at least one part
+     # whose footer is plausibly populated (>16 bytes).
+     in_parts = [p for p in iter_part_files(in_path) if p.stat().st_size > 16]
+     if not in_parts:
+         raise FileNotFoundError(
+             f"Embeddings parquet missing at {in_path}. Run `claude-sql embed` first."
+         )
+
+     # Mtime-sidecar fast path: if the embeddings haven't changed since the
+     # last successful clustering, skip the ~40 s UMAP+HDBSCAN refit. The
+     # sidecar lives next to the output parquet and stores the newest
+     # part-file's nanosecond mtime; hand-edits to clusters.parquet alone
+     # won't invalidate the cache (pass ``force=True`` for that).
+     sidecar = out_path.with_suffix(out_path.suffix + ".embeddings_mtime")
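+     # e.g. out_path "clusters.parquet" -> sidecar "clusters.parquet.embeddings_mtime"
+     # (with_suffix swaps ".parquet" for ".parquet.embeddings_mtime"), holding
+     # the newest part-file mtime in integer nanoseconds.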
+     in_mtime_ns = max(p.stat().st_mtime_ns for p in in_parts)
+     if (
+         not force
+         and out_path.exists()
+         and out_path.stat().st_size > 16
+         and sidecar.exists()
+         and sidecar.read_text().strip() == str(in_mtime_ns)
+     ):
+         logger.info("Embeddings unchanged since last cluster run; reusing {}.", out_path)
+         df = pl.read_parquet(out_path)
+         return {
+             "total": len(df),
+             # Distinct non-noise labels, not a count of non-noise rows.
+             "clusters": int(df.filter(pl.col("cluster_id") >= 0)["cluster_id"].n_unique()),
+             "noise": int((df["cluster_id"] < 0).sum()),
+         }
+
+     # Legacy short-circuit: a clusters parquet exists but no sidecar (older
+     # install before the mtime-skip landed). Trust the parquet and stamp
+     # a sidecar so the next call hits the fast path. A rebuild happens only
+     # when ``force=True`` is passed explicitly.
+     if not force and out_path.exists() and out_path.stat().st_size > 16 and not sidecar.exists():
+         logger.info(
+             "Clusters parquet at {} predates sidecar; reusing and stamping mtime.",
+             out_path,
+         )
+         sidecar.write_text(str(in_mtime_ns))
+         df = pl.read_parquet(out_path)
+         return {
+             "total": len(df),
+             # Distinct non-noise labels, not a count of non-noise rows.
+             "clusters": int(df.filter(pl.col("cluster_id") >= 0)["cluster_id"].n_unique()),
+             "noise": int((df["cluster_id"] < 0).sum()),
+         }
+
+     # Heavy imports inside the function so module import stays cheap.
+     import hdbscan
+     import umap
+
+     t0 = time.monotonic()
+     uuids, X = _load_embeddings(in_path)  # noqa: N806 — X follows sklearn/ML matrix convention
+     logger.info("Loaded {} embeddings, shape={}, dtype={}", len(uuids), X.shape, X.dtype)
+
+     logger.info(
+         "UMAP → {}d (clustering): n_neighbors={}, min_dist={}, metric={}",
+         settings.umap_n_components_50,
+         settings.umap_n_neighbors,
+         settings.umap_min_dist_cluster,
+         settings.umap_metric,
+     )
+     reducer_cluster = umap.UMAP(
+         n_components=settings.umap_n_components_50,
+         n_neighbors=settings.umap_n_neighbors,
+         min_dist=settings.umap_min_dist_cluster,
+         metric=settings.umap_metric,
+         random_state=settings.seed,
+     )
+     X50 = reducer_cluster.fit_transform(X)  # noqa: N806 — 50d projection matrix
+     logger.info("UMAP 50d done in {:.1f}s", time.monotonic() - t0)
+
+     t1 = time.monotonic()
+     logger.info(
+         "UMAP → {}d (viz): n_neighbors={}, min_dist={}",
+         settings.umap_n_components_2,
+         settings.umap_n_neighbors,
+         settings.umap_min_dist_viz,
+     )
+     reducer_viz = umap.UMAP(
+         n_components=settings.umap_n_components_2,
+         n_neighbors=settings.umap_n_neighbors,
+         min_dist=settings.umap_min_dist_viz,
+         metric=settings.umap_metric,
+         random_state=settings.seed,
+     )
+     X2 = reducer_viz.fit_transform(X)  # noqa: N806 — 2d viz projection matrix
+     logger.info("UMAP 2d done in {:.1f}s", time.monotonic() - t1)
+
+     t2 = time.monotonic()
+     logger.info(
+         "HDBSCAN: min_cluster_size={}, min_samples={}",
+         settings.hdbscan_min_cluster_size,
+         settings.hdbscan_min_samples,
+     )
+     clusterer = hdbscan.HDBSCAN(
+         min_cluster_size=settings.hdbscan_min_cluster_size,
+         min_samples=settings.hdbscan_min_samples,
+         metric="euclidean",  # post-UMAP space is euclidean-friendly
+         core_dist_n_jobs=-1,
+         prediction_data=False,
+     )
+     labels = clusterer.fit_predict(X50)
+     k = int(labels.max()) + 1 if labels.max() >= 0 else 0
+     noise = int((labels < 0).sum())
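+     # Worked example (illustrative): labels = [-1, 0, 0, 1, 1, 1] gives
+     # k = max(labels) + 1 = 2 clusters and noise = 1 row (the single -1).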
+     logger.info(
+         "HDBSCAN done in {:.1f}s: {} clusters, {} noise ({:.1%})",
+         time.monotonic() - t2,
+         k,
+         noise,
+         noise / len(labels) if len(labels) else 0,
+     )
+
+     df = pl.DataFrame(
+         {
+             "uuid": uuids,
+             "cluster_id": labels.astype(np.int32).tolist(),
+             "x": X2[:, 0].astype(np.float32).tolist(),
+             "y": X2[:, 1].astype(np.float32).tolist(),
+             "is_noise": (labels < 0).tolist(),
+         },
+         schema={
+             "uuid": pl.Utf8,
+             "cluster_id": pl.Int32,
+             "x": pl.Float32,
+             "y": pl.Float32,
+             "is_noise": pl.Boolean,
+         },
+     )
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     df.write_parquet(out_path)
+     sidecar.write_text(str(in_mtime_ns))
+     logger.info(
+         "Wrote {} rows to {} (total elapsed: {:.1f}s)",
+         len(df),
+         out_path,
+         time.monotonic() - t0,
+     )
+
+     return {"total": len(uuids), "clusters": k, "noise": noise}
@@ -0,0 +1,306 @@
+ """Session-level community detection via cosine graph + Louvain.
+
+ Pipeline
+ --------
+ 1. Compute session centroid embeddings: group message embeddings by
+    ``messages.session_id`` and average (mean over rows, then L2-normalize).
+ 2. Pick an adaptive cosine threshold that yields a connected but not
+    dense graph (target average degree in ``[target_avg_degree_low,
+    target_avg_degree_high]``), falling back to ``louvain_edge_threshold``
+    when the adaptive fit fails.
+ 3. Threshold the pairwise similarity into a sparse graph.
+ 4. Run ``networkx.community.louvain_communities`` seeded with ``settings.seed``.
+ 5. Collapse communities smaller than ``louvain_min_community_size`` into a
+    single ``community_id=-1`` "noise" bucket so the output parquet stays
+    scannable.
+ 6. Write ``session_communities.parquet`` with
+    ``(session_id, community_id, size)``.
+
+ Why adaptive thresholding
+ -------------------------
+ The previous design used a fixed ``louvain_edge_threshold=0.75`` absolute
+ cosine cut. Against ``int8``-quantized embeddings the pairwise cosine
+ distribution is both narrower and higher-mean than against unquantized
+ ``float32``, so 0.75 excluded the majority of "related" session pairs and
+ Louvain produced thousands of singleton communities (on the real corpus:
+ 1,512 communities for 6,329 sessions, with only 21 ≥ 20 sessions).
+
+ The adaptive path picks the percentile-based cut that puts average degree
+ in the 8–15 range (Louvain's typical sweet spot for this kind of graph),
+ so the answer stabilizes whether the underlying embeddings are int8,
+ float16, or float32.
+
+ Public API
+ ----------
+ run_communities(con, settings, *, force=False) -> dict[str, int | float]
+     Expects ``con`` to have v1 views registered (``messages`` + access to
+     the embeddings parquet via ``message_embeddings`` table or direct parquet
+     read). Returns ``{sessions, communities, noise, threshold}``.
+ """
+
+ from __future__ import annotations
+
+ import time
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import polars as pl
+ from loguru import logger
+
+ from claude_sql.config import Settings
+ from claude_sql.parquet_shards import iter_part_files
+
+ if TYPE_CHECKING:
+     import duckdb
+
+
+ #: Community id used for sessions in sub-threshold or unclusterable
+ #: communities. -1 is an obvious out-of-band sentinel that stays negative
+ #: even after ``Int32`` serialization.
+ NOISE_COMMUNITY_ID: int = -1
+
+
+ def _load_session_centroids(
+     con: duckdb.DuckDBPyConnection, embeddings_parquet_path: Path
+ ) -> tuple[list[str], np.ndarray]:
+     """Return ``(session_ids, centroids)`` where centroids is ``(N_sessions, dim)`` float32.
+
+     Joins the embeddings parquet to the v1 ``messages`` view on uuid, then
+     aggregates inside DuckDB (unnest with position → ``avg`` per (session,
+     dim_index) → ordered ``list``). The L2-normalize step stays in numpy,
+     where ``np.linalg.norm`` is faster on a contiguous (N, dim) matrix than
+     a per-row SQL norm. The previous Python double-loop over ~50K rows is
+     replaced with a single vectorized DuckDB query.
+     """
+     logger.info("Loading message embeddings and joining to sessions...")
+     parts = iter_part_files(embeddings_parquet_path)
+     if not parts:
+         raise RuntimeError(
+             f"No embeddings parquet at {embeddings_parquet_path} - run `claude-sql embed`."
+         )
+     # read_parquet's file list can't be bound as a prepared parameter, so
+     # escape it inline. Single quotes inside paths get doubled per SQL rules.
+     path_literals = ", ".join("'{}'".format(str(p).replace("'", "''")) for p in parts)
+     sql = f"""
+         WITH joined AS (
+             SELECT CAST(m.session_id AS VARCHAR) AS session_id,
+                    e.embedding::FLOAT[] AS emb
+             FROM read_parquet([{path_literals}]) e
+             JOIN messages m
+               ON CAST(m.uuid AS VARCHAR) = e.uuid
+         ),
+         unrolled AS (
+             SELECT session_id,
+                    generate_subscripts(emb, 1) AS pos,
+                    unnest(emb) AS v
+             FROM joined
+         ),
+         agg AS (
+             SELECT session_id, pos, avg(v) AS m
+             FROM unrolled
+             GROUP BY 1, 2
+         )
+         SELECT session_id, list(m ORDER BY pos) AS centroid
+         FROM agg
+         GROUP BY 1
+         ORDER BY 1
+     """
+     df = con.execute(sql).pl()
+     if len(df) == 0:
+         raise RuntimeError(
+             "No rows returned joining embeddings to messages - check that the "
+             "embeddings parquet exists and the messages view is registered."
+         )
+
+     sids = df["session_id"].to_list()
+     centroids = np.stack([np.asarray(c, dtype=np.float32) for c in df["centroid"].to_list()])
+     norms = np.linalg.norm(centroids, axis=1, keepdims=True)
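+     # An all-zero centroid (norm 0) divides by 1.0 instead, staying zero
+     # rather than turning into NaNs.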
117
+ centroids = centroids / np.where(norms == 0, 1.0, norms)
118
+ logger.info("Computed {} session centroids (dim={})", len(sids), centroids.shape[1])
119
+ return sids, centroids
120
+
121
+
122
+ def _pick_adaptive_threshold(
123
+ sim: np.ndarray,
124
+ *,
125
+ floor: float,
126
+ target_avg_degree_low: float,
127
+ target_avg_degree_high: float,
128
+ ) -> float:
129
+ """Pick a cosine threshold that puts the average graph degree in the target band.
130
+
131
+ Binary-searches the upper-triangular similarity quantiles. Falls back to
132
+ ``floor`` when the graph is too sparse to hit the band even at q=0.
133
+ """
134
+ n = sim.shape[0]
135
+ if n < 2:
136
+ return floor
137
+ # Upper triangle only -- pairwise matrix is symmetric; diag already zeroed.
138
+ iu = np.triu_indices(n, k=1)
139
+ upper = sim[iu]
140
+ if upper.size == 0:
141
+ return floor
142
+
143
+ target_edges_low = n * target_avg_degree_low / 2.0
144
+ target_edges_high = n * target_avg_degree_high / 2.0
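+     # Worked example with the corpus size quoted in the module docstring:
+     # n = 6_329 sessions and a degree band of [8, 15] give
+     # target_edges_low = 25_316 and target_edges_high ≈ 47_468.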
+
+     # Quantile scan: walk candidate thresholds from tight (q=0.999) to loose
+     # (q=0.80) and return the first whose edge count lands in the target
+     # band, clamped up to the floor. np.quantile over a descending quantile
+     # grid yields monotonically decreasing thresholds -> monotonically
+     # increasing edge counts, so a linear scan suffices; no binary search
+     # machinery needed.
+     quantiles = np.linspace(0.999, 0.80, 20)
+     candidates = np.quantile(upper, quantiles)
+     for thr in candidates:
+         n_edges = int((upper >= thr).sum())
+         if target_edges_low <= n_edges <= target_edges_high:
+             return max(float(thr), floor)
+     # No quantile hit the target band: pick the tightest threshold that stays
+     # at or under the high target to avoid a hairball.
+     for thr in candidates:
+         n_edges = int((upper >= thr).sum())
+         if n_edges <= target_edges_high:
+             return max(float(thr), floor)
+     return floor
+
+
+ def run_communities(
+     con: duckdb.DuckDBPyConnection, settings: Settings, *, force: bool = False
+ ) -> dict[str, int | float]:
+     """Run Louvain on session centroids, write parquet.
+
+     Parameters
+     ----------
+     con
+         An open DuckDB connection with the v1 ``messages`` view registered.
+     settings
+         Configured Settings. Honors ``louvain_edge_threshold`` (the floor
+         cosine value), ``louvain_resolution``, ``louvain_target_avg_degree_low``,
+         ``louvain_target_avg_degree_high``, and ``louvain_min_community_size``.
+     force
+         If False and the output parquet exists, skip recomputation.
+
+     Returns
+     -------
+     dict
+         ``{sessions, communities, noise, threshold}`` — ``communities`` counts
+         only real (>= min-size) communities; sessions in smaller communities
+         are aggregated into ``noise``. ``threshold`` is the adaptive cut
+         actually used.
+     """
+     out_path = settings.communities_parquet_path
+
+     if out_path.exists() and out_path.stat().st_size > 16 and not force:
+         logger.info("Communities parquet exists at {}; pass force=True to rebuild", out_path)
+         df = pl.read_parquet(out_path)
+         real = df.filter(pl.col("community_id") != NOISE_COMMUNITY_ID)
+         noise_n = df.height - real.height
+         n_comm = int(real["community_id"].n_unique()) if real.height else 0
+         return {
+             "sessions": int(df.height),
+             "communities": n_comm,
+             "noise": noise_n,
+             "threshold": float("nan"),
+         }
+
+     import networkx as nx
+     from scipy.sparse import csr_matrix
+
+     sids, centroids = _load_session_centroids(con, settings.embeddings_parquet_path)
+
+     t0 = time.monotonic()
+     logger.info("Computing pairwise cosine similarity over {} sessions...", len(sids))
+     sim = centroids @ centroids.T
+     np.fill_diagonal(sim, 0.0)
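+     # Rows are unit-norm, so the matmul above is exactly cosine similarity.
+     # The dense (N, N) float32 matrix is the memory ceiling: roughly 160 MB
+     # for the 6,329-session corpus quoted in the module docstring.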
+
+     floor = settings.louvain_edge_threshold
+     threshold = _pick_adaptive_threshold(
+         sim,
+         floor=floor,
+         target_avg_degree_low=settings.louvain_target_avg_degree_low,
+         target_avg_degree_high=settings.louvain_target_avg_degree_high,
+     )
+
+     # Apply the chosen threshold. Operate on a copy so the raw sim matrix
+     # stays intact in case we need to probe it.
+     masked = np.where(sim >= threshold, sim, 0.0).astype(np.float32, copy=False)
+     sp = csr_matrix(masked)
+     logger.info(
+         "Graph: {} nodes, {} edges (threshold={:.3f}, avg_degree={:.1f}) in {:.1f}s",
+         sp.shape[0],
+         sp.nnz // 2,
+         threshold,
+         (sp.nnz / sp.shape[0]) if sp.shape[0] else 0.0,
+         time.monotonic() - t0,
+     )
+
+     graph = nx.from_scipy_sparse_array(sp)
+     t1 = time.monotonic()
+     communities = nx.community.louvain_communities(
+         graph,
+         weight="weight",
+         resolution=settings.louvain_resolution,
+         seed=settings.seed,
+     )
+
+     min_size = settings.louvain_min_community_size
+     rows: list[tuple[str, int, int]] = []
+     real_communities = 0
+     noise_count = 0
+     # networkx.community.louvain_communities returns list[set[int]] at
+     # runtime but the default stubs only expose it as Sized. Cast through
+     # typing.cast so the type checker agrees with reality.
+     from typing import cast
+
+     typed_communities = cast("list[set[int]]", communities)
+     sorted_communities = cast(
+         "list[set[int]]",
+         sorted(typed_communities, key=len, reverse=True),
+     )
+     for cid, comm in enumerate(sorted_communities):
+         size = len(comm)
+         if size < min_size:
+             for node in comm:
+                 rows.append((sids[node], NOISE_COMMUNITY_ID, 0))
+                 noise_count += 1
+             continue
+         real_communities += 1
+         # PERF401: a comprehension would obscure the nested control flow above.
+         for node in comm:
+             rows.append((sids[node], cid, size))  # noqa: PERF401
+
+     logger.info(
+         "Louvain: {} raw communities ({} kept >= {} sessions, {} sessions -> noise) in {:.1f}s",
+         len(communities),
+         real_communities,
+         min_size,
+         noise_count,
+         time.monotonic() - t1,
+     )
+
+     df = pl.DataFrame(
+         rows,
+         schema={
+             "session_id": pl.Utf8,
+             "community_id": pl.Int32,
+             "size": pl.Int32,
+         },
+         orient="row",
+     )
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     df.write_parquet(out_path)
+     logger.info("Wrote {} rows to {}", len(df), out_path)
+
+     size_dist = (
+         df.filter(pl.col("community_id") != NOISE_COMMUNITY_ID)
+         .group_by("community_id")
+         .agg(pl.len().alias("n"))
+         .sort("n", descending=True)
+         .head(5)
+     )
+     logger.info("Top 5 community sizes: {}", size_dist.to_dicts())
+
+     return {
+         "sessions": int(df.height),
+         "communities": real_communities,
+         "noise": noise_count,
+         "threshold": float(threshold),
+     }