claude-sql 0.4.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_sql/__init__.py +5 -0
- claude_sql/binding.py +740 -0
- claude_sql/blind_handover.py +155 -0
- claude_sql/checkpointer.py +202 -0
- claude_sql/cli.py +2344 -0
- claude_sql/cluster_worker.py +208 -0
- claude_sql/community_worker.py +306 -0
- claude_sql/config.py +380 -0
- claude_sql/embed_worker.py +482 -0
- claude_sql/freeze.py +189 -0
- claude_sql/friction_worker.py +561 -0
- claude_sql/install_source.py +77 -0
- claude_sql/judge_worker.py +459 -0
- claude_sql/judges.py +239 -0
- claude_sql/kappa_worker.py +257 -0
- claude_sql/llm_worker.py +1760 -0
- claude_sql/logging_setup.py +95 -0
- claude_sql/output.py +248 -0
- claude_sql/parquet_shards.py +172 -0
- claude_sql/retry_queue.py +180 -0
- claude_sql/review_sheet_render.py +167 -0
- claude_sql/review_sheet_worker.py +463 -0
- claude_sql/schemas.py +454 -0
- claude_sql/session_text.py +387 -0
- claude_sql/skills_catalog.py +354 -0
- claude_sql/sql_views.py +1751 -0
- claude_sql/terms_worker.py +145 -0
- claude_sql/ungrounded_worker.py +190 -0
- claude_sql-0.4.0.dist-info/METADATA +530 -0
- claude_sql-0.4.0.dist-info/RECORD +32 -0
- claude_sql-0.4.0.dist-info/WHEEL +4 -0
- claude_sql-0.4.0.dist-info/entry_points.txt +3 -0
claude_sql/cluster_worker.py
@@ -0,0 +1,208 @@
"""Cluster message embeddings via UMAP + HDBSCAN.

Pipeline
--------
1. Read the embeddings parquet into a numpy float32 matrix.
2. UMAP reduce to 50d (for HDBSCAN) and 2d (for viz), both with ``seed=42``.
3. HDBSCAN on the 50d projection → cluster_id per row, -1 for noise.
4. Write ``clusters.parquet`` with ``(uuid, cluster_id, x, y, is_noise)``.

Public API
----------
run_clustering(settings, *, force=False) -> dict[str, int]
    Read parquet, compute clusters, write output parquet. Returns stats dict.
"""

from __future__ import annotations

import time
from pathlib import Path

import numpy as np
import polars as pl
from loguru import logger

from claude_sql.config import Settings
from claude_sql.parquet_shards import iter_part_files


def _load_embeddings(path: Path) -> tuple[list[str], np.ndarray]:
    """Read parquet → (uuid_list, embedding_matrix[float32]). Matrix shape (N, dim).

    Accepts either a legacy single-file ``embeddings.parquet`` or a sharded
    directory containing ``part-*.parquet`` files; the union of all parts is
    materialized in scan order.
    """
    parts = iter_part_files(path)
    df = pl.read_parquet([str(p) for p in parts]) if parts else pl.read_parquet(path)
    uuids = df["uuid"].to_list()
    # polars Array[Float32, dim] → 2D numpy (N, dim). to_numpy() on an Array column
    # returns 2D when the element type is fixed-size Array; fall back to stacking if not.
    emb = df["embedding"].to_numpy()
    if emb.ndim == 1:
        # Object array of 1D arrays (ragged container); stack into a 2D matrix.
        emb = np.stack(list(emb))
    return uuids, np.ascontiguousarray(emb, dtype=np.float32)


def run_clustering(settings: Settings, *, force: bool = False) -> dict[str, int]:
    """Run UMAP + HDBSCAN on the embeddings parquet.

    Parameters
    ----------
    settings
        Configured Settings; reads ``embeddings_parquet_path`` and writes
        ``clusters_parquet_path``.
    force
        If False and clusters_parquet_path exists, return its stats without
        recomputing. If True, always rerun.

    Returns
    -------
    dict
        ``{"total": N, "clusters": K, "noise": M}`` where K excludes the
        noise cluster (label -1).
    """
    out_path = settings.clusters_parquet_path
    in_path = settings.embeddings_parquet_path

    # Sharded directory or legacy single file: insist on at least one part
    # whose footer is plausibly populated (>16 bytes).
    in_parts = [p for p in iter_part_files(in_path) if p.stat().st_size > 16]
    if not in_parts:
        raise FileNotFoundError(
            f"Embeddings parquet missing at {in_path}. Run `claude-sql embed` first."
        )

    # Mtime-sidecar fast path: if the embeddings haven't moved since the
    # last successful clustering, skip the ~40 s UMAP+HDBSCAN refit. The
    # sidecar lives next to the output parquet and stores the latest
    # part-file's nanosecond mtime; coarse hand-edits of clusters.parquet
    # alone won't bust the cache (use ``force=True`` for that).
    sidecar = out_path.with_suffix(out_path.suffix + ".embeddings_mtime")
    in_mtime_ns = max(p.stat().st_mtime_ns for p in in_parts)
    if (
        not force
        and out_path.exists()
        and out_path.stat().st_size > 16
        and sidecar.exists()
        and sidecar.read_text().strip() == str(in_mtime_ns)
    ):
        logger.info("Embeddings unchanged since last cluster run; reusing {}.", out_path)
        df = pl.read_parquet(out_path)
        return {
            "total": len(df),
            # Count distinct non-noise labels so K matches what a fresh run returns.
            "clusters": int(df.filter(pl.col("cluster_id") >= 0)["cluster_id"].n_unique()),
            "noise": int((df["cluster_id"] < 0).sum()),
        }

    # Legacy short-circuit: a clusters parquet exists but no sidecar (older
    # install before the mtime-skip landed). Trust the parquet and stamp
    # a sidecar so the next call hits the fast path. A rebuild happens only
    # when ``force=True`` is passed explicitly.
    if not force and out_path.exists() and out_path.stat().st_size > 16 and not sidecar.exists():
        logger.info(
            "Clusters parquet at {} predates sidecar; reusing and stamping mtime.",
            out_path,
        )
        sidecar.write_text(str(in_mtime_ns))
        df = pl.read_parquet(out_path)
        return {
            "total": len(df),
            "clusters": int(df.filter(pl.col("cluster_id") >= 0)["cluster_id"].n_unique()),
            "noise": int((df["cluster_id"] < 0).sum()),
        }

    # Heavy imports inside the function so module import stays cheap.
    import hdbscan
    import umap

    t0 = time.monotonic()
    uuids, X = _load_embeddings(in_path)  # noqa: N806 — X follows sklearn/ML matrix convention
    logger.info("Loaded {} embeddings, shape={}, dtype={}", len(uuids), X.shape, X.dtype)

    logger.info(
        "UMAP → {}d (clustering): n_neighbors={}, min_dist={}, metric={}",
        settings.umap_n_components_50,
        settings.umap_n_neighbors,
        settings.umap_min_dist_cluster,
        settings.umap_metric,
    )
    reducer_cluster = umap.UMAP(
        n_components=settings.umap_n_components_50,
        n_neighbors=settings.umap_n_neighbors,
        min_dist=settings.umap_min_dist_cluster,
        metric=settings.umap_metric,
        random_state=settings.seed,
    )
    X50 = reducer_cluster.fit_transform(X)  # noqa: N806 — 50d projection matrix
    logger.info("UMAP 50d done in {:.1f}s", time.monotonic() - t0)

    t1 = time.monotonic()
    logger.info(
        "UMAP → {}d (viz): n_neighbors={}, min_dist={}",
        settings.umap_n_components_2,
        settings.umap_n_neighbors,
        settings.umap_min_dist_viz,
    )
    reducer_viz = umap.UMAP(
        n_components=settings.umap_n_components_2,
        n_neighbors=settings.umap_n_neighbors,
        min_dist=settings.umap_min_dist_viz,
        metric=settings.umap_metric,
        random_state=settings.seed,
    )
    X2 = reducer_viz.fit_transform(X)  # noqa: N806 — 2d viz projection matrix
    logger.info("UMAP 2d done in {:.1f}s", time.monotonic() - t1)

    t2 = time.monotonic()
    logger.info(
        "HDBSCAN: min_cluster_size={}, min_samples={}",
        settings.hdbscan_min_cluster_size,
        settings.hdbscan_min_samples,
    )
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=settings.hdbscan_min_cluster_size,
        min_samples=settings.hdbscan_min_samples,
        metric="euclidean",  # post-UMAP space is euclidean-friendly
        core_dist_n_jobs=-1,
        prediction_data=False,
    )
    labels = clusterer.fit_predict(X50)
    k = int(labels.max()) + 1 if labels.max() >= 0 else 0
    noise = int((labels < 0).sum())
    logger.info(
        "HDBSCAN done in {:.1f}s: {} clusters, {} noise ({:.1%})",
        time.monotonic() - t2,
        k,
        noise,
        noise / len(labels) if len(labels) else 0,
    )

    df = pl.DataFrame(
        {
            "uuid": uuids,
            "cluster_id": labels.astype(np.int32).tolist(),
            "x": X2[:, 0].astype(np.float32).tolist(),
            "y": X2[:, 1].astype(np.float32).tolist(),
            "is_noise": (labels < 0).tolist(),
        },
        schema={
            "uuid": pl.Utf8,
            "cluster_id": pl.Int32,
            "x": pl.Float32,
            "y": pl.Float32,
            "is_noise": pl.Boolean,
        },
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(out_path)
    sidecar.write_text(str(in_mtime_ns))
    logger.info(
        "Wrote {} rows to {} (total elapsed: {:.1f}s)",
        len(df),
        out_path,
        time.monotonic() - t0,
    )

    return {"total": len(uuids), "clusters": k, "noise": noise}
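
Editor's note: the worker above is entirely settings-driven, so the pipeline shape is easier to see in a minimal standalone sketch on synthetic data. The parameter values here (embedding dim 384, n_neighbors=15, min_cluster_size=25) are illustrative assumptions, not the package's configured defaults.

    import hdbscan
    import numpy as np
    import umap

    rng = np.random.default_rng(42)
    X = rng.normal(size=(500, 384)).astype(np.float32)  # stand-in for message embeddings

    # 50d projection for clustering, 2d for viz, both seeded as the worker does.
    X50 = umap.UMAP(n_components=50, n_neighbors=15, min_dist=0.0,
                    metric="cosine", random_state=42).fit_transform(X)
    X2 = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1,
                   metric="cosine", random_state=42).fit_transform(X)

    labels = hdbscan.HDBSCAN(min_cluster_size=25, min_samples=5,
                             metric="euclidean").fit_predict(X50)
    k = int(labels.max()) + 1 if labels.max() >= 0 else 0  # clusters, excluding noise (-1)
    print(f"{k} clusters, {(labels < 0).sum()} noise points, viz shape {X2.shape}")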
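
The mtime-sidecar skip is the control-flow piece worth internalizing separately. A reduced sketch of just the freshness check (the helper name is hypothetical; the sidecar suffix and size guard mirror the worker):

    from pathlib import Path

    def cached_output_is_fresh(out_path: Path, in_parts: list[Path]) -> bool:
        """True iff the cached parquet can be reused: sidecar matches the newest input mtime."""
        sidecar = out_path.with_suffix(out_path.suffix + ".embeddings_mtime")
        if not (out_path.exists() and out_path.stat().st_size > 16 and sidecar.exists()):
            return False
        newest_ns = max(p.stat().st_mtime_ns for p in in_parts)
        return sidecar.read_text().strip() == str(newest_ns)

Keying on st_mtime_ns rather than a content hash keeps the check O(1) per part, at the cost of a spurious rebuild when an input file is touched without actually changing.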
claude_sql/community_worker.py
@@ -0,0 +1,306 @@
"""Session-level community detection via cosine graph + Louvain.

Pipeline
--------
1. Compute session centroid embeddings: group message embeddings by
   ``messages.session_id`` and average (mean over rows, then L2-normalize).
2. Pick an adaptive cosine threshold that yields a connected but not
   dense graph (target average degree in ``[target_avg_degree_low,
   target_avg_degree_high]``), falling back to ``louvain_edge_threshold``
   when the adaptive fit fails.
3. Threshold the pairwise similarity into a sparse graph.
4. Run ``networkx.community.louvain_communities`` with seed=42.
5. Collapse communities smaller than ``louvain_min_community_size`` into a
   single ``community_id=-1`` "noise" bucket so the output parquet stays
   scannable.
6. Write ``session_communities.parquet`` with
   ``(session_id, community_id, size)``.

Why adaptive thresholding
-------------------------
The previous design used a fixed ``louvain_edge_threshold=0.75`` absolute
cosine cut. Against ``int8``-quantized embeddings the pairwise cosine
distribution is both narrower and higher-mean than against unquantized
``float32``, so 0.75 excluded the majority of "related" session pairs and
Louvain produced mostly tiny, often singleton communities (on the real
corpus: 1,512 communities for 6,329 sessions, only 21 of them ≥ 20 sessions).

The adaptive path picks the percentile-based cut that puts average degree
in the 8–15 range (Louvain's typical sweet spot for this kind of graph),
so the answer stabilizes whether the underlying embeddings are int8,
float16, or float32.

Public API
----------
run_communities(con, settings, *, force=False) -> dict[str, int | float]
    Expects ``con`` to have v1 views registered (``messages`` + access to
    the embeddings parquet via ``message_embeddings`` table or direct parquet
    read). Returns ``{sessions, communities, noise, threshold}``.
"""

from __future__ import annotations

import time
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import polars as pl
from loguru import logger

from claude_sql.config import Settings
from claude_sql.parquet_shards import iter_part_files

if TYPE_CHECKING:
    import duckdb


#: Community id used for singleton / unclusterable sessions. -1 is an obvious
#: out-of-band sentinel that stays negative even after ``Int32`` serialization.
NOISE_COMMUNITY_ID: int = -1


def _load_session_centroids(
    con: duckdb.DuckDBPyConnection, embeddings_parquet_path: Path
) -> tuple[list[str], np.ndarray]:
    """Return ``(session_ids, centroids)`` where centroids is ``(N_sessions, dim)`` float32.

    Joins the embeddings parquet to the v1 ``messages`` view on uuid, then
    aggregates inside DuckDB (unnest with position → ``avg`` per (session,
    dim_index) → ordered ``list``). The L2-normalize step stays in numpy
    where ``np.linalg.norm`` is faster on a contiguous (N, dim) matrix than
    a per-row SQL norm. The previous Python double-loop over ~50K rows is
    replaced with a single vectorized DuckDB query.
    """
    logger.info("Loading message embeddings and joining to sessions...")
    parts = iter_part_files(embeddings_parquet_path)
    if not parts:
        raise RuntimeError(
            f"No embeddings parquet at {embeddings_parquet_path} - run `claude-sql embed`."
        )
    # DDL doesn't accept prepared params for read_parquet's file list, so
    # escape inline. Single quotes inside paths get doubled per SQL rules.
    path_literals = ", ".join("'{}'".format(str(p).replace("'", "''")) for p in parts)
    sql = f"""
        WITH joined AS (
            SELECT CAST(m.session_id AS VARCHAR) AS session_id,
                   e.embedding::FLOAT[] AS emb
            FROM read_parquet([{path_literals}]) e
            JOIN messages m
              ON CAST(m.uuid AS VARCHAR) = e.uuid
        ),
        unrolled AS (
            SELECT session_id,
                   generate_subscripts(emb, 1) AS pos,
                   unnest(emb) AS v
            FROM joined
        ),
        agg AS (
            SELECT session_id, pos, avg(v) AS m
            FROM unrolled
            GROUP BY 1, 2
        )
        SELECT session_id, list(m ORDER BY pos) AS centroid
        FROM agg
        GROUP BY 1
        ORDER BY 1
    """
    df = con.execute(sql).pl()
    if len(df) == 0:
        raise RuntimeError(
            "No rows returned joining embeddings to messages - check that the "
            "embeddings parquet exists and the messages view is registered."
        )

    sids = df["session_id"].to_list()
    centroids = np.stack([np.asarray(c, dtype=np.float32) for c in df["centroid"].to_list()])
    norms = np.linalg.norm(centroids, axis=1, keepdims=True)
    centroids = centroids / np.where(norms == 0, 1.0, norms)
    logger.info("Computed {} session centroids (dim={})", len(sids), centroids.shape[1])
    return sids, centroids


def _pick_adaptive_threshold(
    sim: np.ndarray,
    *,
    floor: float,
    target_avg_degree_low: float,
    target_avg_degree_high: float,
) -> float:
    """Pick a cosine threshold that puts the average graph degree in the target band.

    Scans percentile-based candidate cuts over the upper-triangular
    similarities, tightest first, and returns the first whose edge count
    lands in the band (clamped up to ``floor``). Falls back to ``floor``
    when no candidate keeps the edge count at or under the high target.
    """
    n = sim.shape[0]
    if n < 2:
        return floor
    # Upper triangle only -- pairwise matrix is symmetric; diag already zeroed.
    iu = np.triu_indices(n, k=1)
    upper = sim[iu]
    if upper.size == 0:
        return floor

    target_edges_low = n * target_avg_degree_low / 2.0
    target_edges_high = n * target_avg_degree_high / 2.0

    # Quantile scan: walk candidate thresholds from tightest (q=0.999) to
    # loosest (q=0.80) and take the first whose edge count lands in the
    # target band, clamped up to floor. np.quantile keeps the candidates
    # monotonically decreasing -> edge count monotonically increasing -- no
    # binary-search machinery needed.
    quantiles = np.linspace(0.999, 0.80, 20)
    candidates = np.quantile(upper, quantiles)
    for thr in candidates:
        n_edges = int((upper >= thr).sum())
        if target_edges_low <= n_edges <= target_edges_high:
            return max(float(thr), floor)
    # No quantile hit the target band: pick the tightest threshold that stays
    # at or under the high target to avoid a hairball.
    for thr in candidates:
        n_edges = int((upper >= thr).sum())
        if n_edges <= target_edges_high:
            return max(float(thr), floor)
    return floor


def run_communities(
    con: duckdb.DuckDBPyConnection, settings: Settings, *, force: bool = False
) -> dict[str, int | float]:
    """Run Louvain on session centroids, write parquet.

    Parameters
    ----------
    con
        An open DuckDB connection with the v1 ``messages`` view registered.
    settings
        Configured Settings. Honors ``louvain_edge_threshold`` (the floor
        cosine value), ``louvain_resolution``, ``louvain_target_avg_degree_low``,
        ``louvain_target_avg_degree_high``, and ``louvain_min_community_size``.
    force
        If False and the output parquet exists, skip recomputation.

    Returns
    -------
    dict
        ``{sessions, communities, noise, threshold}`` — ``communities`` counts
        only real (>= min-size) communities; smaller ones are aggregated into
        ``noise``. ``threshold`` is the adaptive cut actually used.
    """
    out_path = settings.communities_parquet_path

    if out_path.exists() and out_path.stat().st_size > 16 and not force:
        logger.info("Communities parquet exists at {}; pass force=True to rebuild", out_path)
        df = pl.read_parquet(out_path)
        real = df.filter(pl.col("community_id") != NOISE_COMMUNITY_ID)
        noise_n = df.height - real.height
        n_comm = int(real["community_id"].n_unique()) if real.height else 0
        return {
            "sessions": int(df.height),
            "communities": n_comm,
            "noise": noise_n,
            "threshold": float("nan"),
        }

    # Heavy imports inside the function so module import stays cheap.
    import networkx as nx
    from scipy.sparse import csr_matrix

    sids, centroids = _load_session_centroids(con, settings.embeddings_parquet_path)

    t0 = time.monotonic()
    logger.info("Computing pairwise cosine similarity over {} sessions...", len(sids))
    sim = centroids @ centroids.T
    np.fill_diagonal(sim, 0.0)

    floor = settings.louvain_edge_threshold
    threshold = _pick_adaptive_threshold(
        sim,
        floor=floor,
        target_avg_degree_low=settings.louvain_target_avg_degree_low,
        target_avg_degree_high=settings.louvain_target_avg_degree_high,
    )

    # Apply the chosen threshold. Operate on a copy so the raw sim matrix
    # stays intact in case we need to probe it for logging.
    masked = np.where(sim >= threshold, sim, 0.0).astype(np.float32, copy=False)
    sp = csr_matrix(masked)
    logger.info(
        "Graph: {} nodes, {} edges (threshold={:.3f}, avg_degree={:.1f}) in {:.1f}s",
        sp.shape[0],
        sp.nnz // 2,
        threshold,
        (sp.nnz / sp.shape[0]) if sp.shape[0] else 0.0,
        time.monotonic() - t0,
    )

    graph = nx.from_scipy_sparse_array(sp)
    t1 = time.monotonic()
    communities = nx.community.louvain_communities(
        graph,
        weight="weight",
        resolution=settings.louvain_resolution,
        seed=settings.seed,
    )

    min_size = settings.louvain_min_community_size
    rows: list[tuple[str, int, int]] = []
    real_communities = 0
    noise_count = 0
    # networkx.community.louvain_communities returns list[set[int]] at
    # runtime but the default stubs only expose it as Sized. Cast through
    # typing.cast so the type checker agrees with reality.
    from typing import cast

    typed_communities = cast("list[set[int]]", communities)
    sorted_communities = cast(
        "list[set[int]]",
        sorted(typed_communities, key=len, reverse=True),
    )
    for cid, comm in enumerate(sorted_communities):
        size = len(comm)
        if size < min_size:
            for node in comm:
                rows.append((sids[node], NOISE_COMMUNITY_ID, 0))
                noise_count += 1
            continue
        real_communities += 1
        # PERF401: a comprehension would obscure the nested control flow above.
        for node in comm:
            rows.append((sids[node], cid, size))  # noqa: PERF401

    logger.info(
        "Louvain: {} raw communities ({} kept >= {} sessions, {} sessions -> noise) in {:.1f}s",
        len(communities),
        real_communities,
        min_size,
        noise_count,
        time.monotonic() - t1,
    )

    df = pl.DataFrame(
        rows,
        schema={
            "session_id": pl.Utf8,
            "community_id": pl.Int32,
            "size": pl.Int32,
        },
        orient="row",
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(out_path)
    logger.info("Wrote {} rows to {}", len(df), out_path)

    size_dist = (
        df.filter(pl.col("community_id") != NOISE_COMMUNITY_ID)
        .group_by("community_id")
        .agg(pl.len().alias("n"))
        .sort("n", descending=True)
        .head(5)
    )
    logger.info("Top 5 community sizes: {}", size_dist.to_dicts())

    return {
        "sessions": int(df.height),
        "communities": real_communities,
        "noise": noise_count,
        "threshold": float(threshold),
    }
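
Editor's note: the unnest-with-position aggregation in _load_session_centroids is the least obvious SQL in the file. Here is a self-contained toy (a hypothetical msg_emb table with 2-dim embeddings) showing the same generate_subscripts / unnest / ordered-list pattern against an in-memory DuckDB:

    import duckdb

    con = duckdb.connect()
    con.execute("""
        CREATE TABLE msg_emb AS
        SELECT * FROM (VALUES
            ('s1', [1.0, 0.0]),
            ('s1', [0.0, 1.0]),
            ('s2', [2.0, 2.0])
        ) AS t(session_id, emb)
    """)
    df = con.execute("""
        WITH unrolled AS (
            SELECT session_id, generate_subscripts(emb, 1) AS pos, unnest(emb) AS v
            FROM msg_emb
        ),
        agg AS (
            SELECT session_id, pos, avg(v) AS m FROM unrolled GROUP BY 1, 2
        )
        SELECT session_id, list(m ORDER BY pos) AS centroid
        FROM agg GROUP BY 1 ORDER BY 1
    """).pl()
    print(df)  # s1 -> [0.5, 0.5], s2 -> [2.0, 2.0]

The worker's query relies on generate_subscripts and unnest expanding in lockstep within one SELECT, so each (pos, v) pair stays aligned to its source row.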
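
And the adaptive-threshold plus Louvain core, end to end on synthetic centroids. The floor of 0.0 and the degree band [8, 15] are illustrative stand-ins for settings.louvain_edge_threshold and the target-degree settings:

    import networkx as nx
    import numpy as np
    from scipy.sparse import csr_matrix

    rng = np.random.default_rng(42)
    C = rng.normal(size=(200, 64)).astype(np.float32)
    C /= np.linalg.norm(C, axis=1, keepdims=True)  # L2-normalize: C @ C.T is cosine sim

    sim = C @ C.T
    np.fill_diagonal(sim, 0.0)

    # Quantile scan as in _pick_adaptive_threshold: tightest cut whose edge
    # count lands in the average-degree band [8, 15].
    upper = sim[np.triu_indices(len(C), k=1)]
    lo, hi = len(C) * 8 / 2.0, len(C) * 15 / 2.0
    floor = 0.0  # illustrative; the worker clamps up to settings.louvain_edge_threshold
    threshold = floor
    for thr in np.quantile(upper, np.linspace(0.999, 0.80, 20)):
        if lo <= (upper >= thr).sum() <= hi:
            threshold = max(float(thr), floor)
            break

    graph = nx.from_scipy_sparse_array(csr_matrix(np.where(sim >= threshold, sim, 0.0)))
    comms = nx.community.louvain_communities(graph, weight="weight", seed=42)
    print(f"threshold={threshold:.3f}, {len(comms)} communities")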