claude-sql 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {claude_sql-0.4.0 → claude_sql-0.5.0}/PKG-INFO +34 -11
- {claude_sql-0.4.0 → claude_sql-0.5.0}/README.md +31 -9
- {claude_sql-0.4.0 → claude_sql-0.5.0}/pyproject.toml +3 -2
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/cli.py +122 -13
- claude_sql-0.5.0/src/claude_sql/community_worker.py +662 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/config.py +34 -17
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/sql_views.py +6 -0
- claude_sql-0.4.0/src/claude_sql/community_worker.py +0 -306
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/__init__.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/binding.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/blind_handover.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/checkpointer.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/cluster_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/embed_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/freeze.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/friction_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/install_source.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/judge_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/judges.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/kappa_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/llm_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/logging_setup.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/output.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/parquet_shards.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/retry_queue.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/review_sheet_render.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/review_sheet_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/schemas.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/session_text.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/skills_catalog.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/terms_worker.py +0 -0
- {claude_sql-0.4.0 → claude_sql-0.5.0}/src/claude_sql/ungrounded_worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: claude-sql
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
|
|
5
5
|
Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
|
|
6
6
|
Author: Laith Al-Saadoon
|
|
@@ -23,8 +23,9 @@ Requires-Dist: boto3>=1.42.91
|
|
|
23
23
|
Requires-Dist: cyclopts>=4.10.2
|
|
24
24
|
Requires-Dist: duckdb>=1.5.2
|
|
25
25
|
Requires-Dist: hdbscan>=0.8.40
|
|
26
|
+
Requires-Dist: igraph>=1.0.0,<2.0
|
|
27
|
+
Requires-Dist: leidenalg>=0.11.0,<0.12
|
|
26
28
|
Requires-Dist: loguru>=0.7.3
|
|
27
|
-
Requires-Dist: networkx>=3.4
|
|
28
29
|
Requires-Dist: numpy>=2.4.4
|
|
29
30
|
Requires-Dist: packaging>=26.2
|
|
30
31
|
Requires-Dist: polars>=1.40.0
|
|
@@ -122,7 +123,7 @@ flowchart LR
|
|
|
122
123
|
L --> PA["session_classifications/, message_trajectory/,<br/>session_conflicts/, user_friction/<br/>(sharded part-*.parquet)"]
|
|
123
124
|
P --> C["claude-sql cluster<br/>(UMAP + HDBSCAN)"]
|
|
124
125
|
C --> PC["clusters + cluster_terms<br/>(c-TF-IDF)"]
|
|
125
|
-
P --> CM["claude-sql community<br/>(
|
|
126
|
+
P --> CM["claude-sql community<br/>(Leiden + CPM over mutual-kNN centroids)"]
|
|
126
127
|
CM --> PM["session_communities<br/>parquet"]
|
|
127
128
|
PA --> AV[analytics views + macros]
|
|
128
129
|
PC --> AV
|
|
@@ -272,7 +273,7 @@ Commands that spend real Bedrock money default to `--dry-run`.
|
|
|
272
273
|
| `conflicts` | Per-session stance-conflict detection |
|
|
273
274
|
| `friction` | Regex + Sonnet 4.6 → status pings, unmet expectations, confusion, etc. |
|
|
274
275
|
| `cluster` | UMAP → HDBSCAN → c-TF-IDF over message embeddings |
|
|
275
|
-
| `community` |
|
|
276
|
+
| `community` | Leiden + CPM over mutual-kNN session centroids; emits medoid + coherence + resolution profile + `--neighbors-of` |
|
|
276
277
|
| `skills sync` | Walk `~/.claude/skills/` + `~/.claude/plugins/cache/` → seedable skills catalog |
|
|
277
278
|
| `skills ls` | List catalog entries, filterable by `--kind` and `--plugin` |
|
|
278
279
|
| `analyze` | Run the whole pipeline in dependency order |
|
|
@@ -325,7 +326,8 @@ Commands that spend real Bedrock money default to `--dry-run`.
|
|
|
325
326
|
| `session_conflicts` | per-session stance conflicts | `stance_a`, `stance_b`, `resolution` |
|
|
326
327
|
| `message_clusters` | cluster id + 2d viz coords | `cluster_id`, `x`, `y`, `is_noise` |
|
|
327
328
|
| `cluster_terms` | c-TF-IDF top terms per cluster | `cluster_id`, `term`, `weight`, `rank` |
|
|
328
|
-
| `session_communities` |
|
|
329
|
+
| `session_communities` | Leiden+CPM community per session | `community_id`, `size`, `is_medoid`, `coherence`, `gamma_used` |
|
|
330
|
+
| `community_profile` | Resolution-profile sidecar (auto-γ runs only) | `gamma`, `n_communities`, `quality`, `plateau_length` |
|
|
329
331
|
| `user_friction` | one row per classified short user message | `label` (7-way), `rationale`, `source` (`regex` / `llm` / `refused`), `confidence` |
|
|
330
332
|
| `skills_catalog` | one row per known skill / slash command (seed by `claude-sql skills sync`) | `skill_id`, `name`, `plugin`, `plugin_version`, `source_kind` (`user-skill` / `plugin-skill` / `plugin-command` / `builtin`), `description` |
|
|
331
333
|
| `skill_usage` | `skill_invocations` ⟕ `skills_catalog` | `source`, `skill_id`, `skill_name`, `plugin`, `is_builtin`, `description` |
|
|
@@ -380,7 +382,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
|
|
|
380
382
|
| `CLAUDE_SQL_SKILLS_CATALOG_PARQUET_PATH` | `~/.claude/skills_catalog.parquet` | Skills catalog parquet |
|
|
381
383
|
| `CLAUDE_SQL_USER_SKILLS_DIR` | `~/.claude/skills` | Root scanned for user-installed skills |
|
|
382
384
|
| `CLAUDE_SQL_PLUGINS_CACHE_DIR` | `~/.claude/plugins/cache` | Root scanned for plugin skills + commands |
|
|
383
|
-
| `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN /
|
|
385
|
+
| `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Leiden determinism |
|
|
386
|
+
| `CLAUDE_SQL_LEIDEN_KNN_K` | `15` | Mutual-kNN k for the session-centroid graph |
|
|
387
|
+
| `CLAUDE_SQL_LEIDEN_EDGE_FLOOR` | `0.3` | Cosine floor below which edges are dropped |
|
|
388
|
+
| `CLAUDE_SQL_LEIDEN_MIN_COMMUNITY_SIZE` | `3` | Communities below this collapse to noise (`-1`) |
|
|
389
|
+
| `CLAUDE_SQL_LEIDEN_RESOLUTION` | unset (auto) | Explicit CPM γ; skips the resolution profile |
|
|
390
|
+
| `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_LO` | `0.05` | Lower bound for `Optimiser.resolution_profile` bisection |
|
|
391
|
+
| `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_HI` | `0.95` | Upper bound for the bisection |
|
|
392
|
+
| `CLAUDE_SQL_LEIDEN_N_ITERATIONS` | `-1` | Iterate to convergence; `2` is leidenalg's default |
|
|
393
|
+
| `CLAUDE_SQL_COMMUNITY_PROFILE_PARQUET_PATH` | `~/.claude/community_profile.parquet` | Resolution-profile sidecar path |
|
|
384
394
|
|
|
385
395
|
## Development
|
|
386
396
|
|
|
@@ -497,12 +507,25 @@ See `docs/adr/0015-stack-modernization.md` and
|
|
|
497
507
|
with adaptive thinking on. Pydantic v2 schemas are flattened (inline
|
|
498
508
|
`$ref`, inject `additionalProperties: false`, strip the numeric /
|
|
499
509
|
string constraints the validator rejects from Draft 2020-12).
|
|
500
|
-
- **Determinism.** UMAP, HDBSCAN, and
|
|
510
|
+
- **Determinism.** UMAP, HDBSCAN, and Leiden all seed from
|
|
501
511
|
`CLAUDE_SQL_SEED=42` (default) so cluster IDs and community IDs are
|
|
502
|
-
stable across reruns.
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
512
|
+
stable across reruns. The Leiden seed flows into both
|
|
513
|
+
`leidenalg.find_partition(seed=...)` and `Optimiser.set_rng_seed(...)`
|
|
514
|
+
for the resolution-profile bisection — same seed + same input ⇒
|
|
515
|
+
byte-equal parquets across runs.
|
|
516
|
+
- **Communities = `leidenalg` + CPM.** Reference Leiden implementation
|
|
517
|
+
(`leidenalg>=0.11.0`) over an `igraph>=1.0` mutual-kNN cosine graph
|
|
518
|
+
(k=15, edge floor 0.3) of session centroids. CPM γ has direct cosine
|
|
519
|
+
semantics (internal density ≥ γ, external ≤ γ); auto-γ via
|
|
520
|
+
`Optimiser.resolution_profile` + longest-plateau picker; warn-only
|
|
521
|
+
connectivity check. Output is signal-rich: `is_medoid`, `coherence`,
|
|
522
|
+
`gamma_used` per row, plus a `community_profile` sidecar with one row
|
|
523
|
+
per γ tested, so an LLM agent can ask "what γ would yield 50
|
|
524
|
+
communities?" without rerunning Leiden. Top terms per community come
|
|
525
|
+
from the live `community_top_topics(cid, n)` macro composed from
|
|
526
|
+
`cluster_terms`, not a frozen column. See
|
|
527
|
+
[`docs/research_notes.md`](docs/research_notes.md) for the
|
|
528
|
+
Louvain → Leiden+CPM swap rationale.
|
|
506
529
|
- **Hybrid friction pipeline.** A hand-curated regex bank catches the
|
|
507
530
|
unambiguous `status_ping` / `interruption` / `correction` cases at
|
|
508
531
|
zero Bedrock cost; the ambiguous class — especially
|
|
@@ -77,7 +77,7 @@ flowchart LR
|
|
|
77
77
|
L --> PA["session_classifications/, message_trajectory/,<br/>session_conflicts/, user_friction/<br/>(sharded part-*.parquet)"]
|
|
78
78
|
P --> C["claude-sql cluster<br/>(UMAP + HDBSCAN)"]
|
|
79
79
|
C --> PC["clusters + cluster_terms<br/>(c-TF-IDF)"]
|
|
80
|
-
P --> CM["claude-sql community<br/>(
|
|
80
|
+
P --> CM["claude-sql community<br/>(Leiden + CPM over mutual-kNN centroids)"]
|
|
81
81
|
CM --> PM["session_communities<br/>parquet"]
|
|
82
82
|
PA --> AV[analytics views + macros]
|
|
83
83
|
PC --> AV
|
|
@@ -227,7 +227,7 @@ Commands that spend real Bedrock money default to `--dry-run`.
|
|
|
227
227
|
| `conflicts` | Per-session stance-conflict detection |
|
|
228
228
|
| `friction` | Regex + Sonnet 4.6 → status pings, unmet expectations, confusion, etc. |
|
|
229
229
|
| `cluster` | UMAP → HDBSCAN → c-TF-IDF over message embeddings |
|
|
230
|
-
| `community` |
|
|
230
|
+
| `community` | Leiden + CPM over mutual-kNN session centroids; emits medoid + coherence + resolution profile + `--neighbors-of` |
|
|
231
231
|
| `skills sync` | Walk `~/.claude/skills/` + `~/.claude/plugins/cache/` → seedable skills catalog |
|
|
232
232
|
| `skills ls` | List catalog entries, filterable by `--kind` and `--plugin` |
|
|
233
233
|
| `analyze` | Run the whole pipeline in dependency order |
|
|
@@ -280,7 +280,8 @@ Commands that spend real Bedrock money default to `--dry-run`.
|
|
|
280
280
|
| `session_conflicts` | per-session stance conflicts | `stance_a`, `stance_b`, `resolution` |
|
|
281
281
|
| `message_clusters` | cluster id + 2d viz coords | `cluster_id`, `x`, `y`, `is_noise` |
|
|
282
282
|
| `cluster_terms` | c-TF-IDF top terms per cluster | `cluster_id`, `term`, `weight`, `rank` |
|
|
283
|
-
| `session_communities` |
|
|
283
|
+
| `session_communities` | Leiden+CPM community per session | `community_id`, `size`, `is_medoid`, `coherence`, `gamma_used` |
|
|
284
|
+
| `community_profile` | Resolution-profile sidecar (auto-γ runs only) | `gamma`, `n_communities`, `quality`, `plateau_length` |
|
|
284
285
|
| `user_friction` | one row per classified short user message | `label` (7-way), `rationale`, `source` (`regex` / `llm` / `refused`), `confidence` |
|
|
285
286
|
| `skills_catalog` | one row per known skill / slash command (seed by `claude-sql skills sync`) | `skill_id`, `name`, `plugin`, `plugin_version`, `source_kind` (`user-skill` / `plugin-skill` / `plugin-command` / `builtin`), `description` |
|
|
286
287
|
| `skill_usage` | `skill_invocations` ⟕ `skills_catalog` | `source`, `skill_id`, `skill_name`, `plugin`, `is_builtin`, `description` |
|
|
@@ -335,7 +336,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
|
|
|
335
336
|
| `CLAUDE_SQL_SKILLS_CATALOG_PARQUET_PATH` | `~/.claude/skills_catalog.parquet` | Skills catalog parquet |
|
|
336
337
|
| `CLAUDE_SQL_USER_SKILLS_DIR` | `~/.claude/skills` | Root scanned for user-installed skills |
|
|
337
338
|
| `CLAUDE_SQL_PLUGINS_CACHE_DIR` | `~/.claude/plugins/cache` | Root scanned for plugin skills + commands |
|
|
338
|
-
| `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN /
|
|
339
|
+
| `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Leiden determinism |
|
|
340
|
+
| `CLAUDE_SQL_LEIDEN_KNN_K` | `15` | Mutual-kNN k for the session-centroid graph |
|
|
341
|
+
| `CLAUDE_SQL_LEIDEN_EDGE_FLOOR` | `0.3` | Cosine floor below which edges are dropped |
|
|
342
|
+
| `CLAUDE_SQL_LEIDEN_MIN_COMMUNITY_SIZE` | `3` | Communities below this collapse to noise (`-1`) |
|
|
343
|
+
| `CLAUDE_SQL_LEIDEN_RESOLUTION` | unset (auto) | Explicit CPM γ; skips the resolution profile |
|
|
344
|
+
| `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_LO` | `0.05` | Lower bound for `Optimiser.resolution_profile` bisection |
|
|
345
|
+
| `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_HI` | `0.95` | Upper bound for the bisection |
|
|
346
|
+
| `CLAUDE_SQL_LEIDEN_N_ITERATIONS` | `-1` | Iterate to convergence; `2` is leidenalg's default |
|
|
347
|
+
| `CLAUDE_SQL_COMMUNITY_PROFILE_PARQUET_PATH` | `~/.claude/community_profile.parquet` | Resolution-profile sidecar path |
|
|
339
348
|
|
|
340
349
|
## Development
|
|
341
350
|
|
|
@@ -452,12 +461,25 @@ See `docs/adr/0015-stack-modernization.md` and
|
|
|
452
461
|
with adaptive thinking on. Pydantic v2 schemas are flattened (inline
|
|
453
462
|
`$ref`, inject `additionalProperties: false`, strip the numeric /
|
|
454
463
|
string constraints the validator rejects from Draft 2020-12).
|
|
455
|
-
- **Determinism.** UMAP, HDBSCAN, and
|
|
464
|
+
- **Determinism.** UMAP, HDBSCAN, and Leiden all seed from
|
|
456
465
|
`CLAUDE_SQL_SEED=42` (default) so cluster IDs and community IDs are
|
|
457
|
-
stable across reruns.
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
466
|
+
stable across reruns. The Leiden seed flows into both
|
|
467
|
+
`leidenalg.find_partition(seed=...)` and `Optimiser.set_rng_seed(...)`
|
|
468
|
+
for the resolution-profile bisection — same seed + same input ⇒
|
|
469
|
+
byte-equal parquets across runs.
|
|
470
|
+
- **Communities = `leidenalg` + CPM.** Reference Leiden implementation
|
|
471
|
+
(`leidenalg>=0.11.0`) over an `igraph>=1.0` mutual-kNN cosine graph
|
|
472
|
+
(k=15, edge floor 0.3) of session centroids. CPM γ has direct cosine
|
|
473
|
+
semantics (internal density ≥ γ, external ≤ γ); auto-γ via
|
|
474
|
+
`Optimiser.resolution_profile` + longest-plateau picker; warn-only
|
|
475
|
+
connectivity check. Output is signal-rich: `is_medoid`, `coherence`,
|
|
476
|
+
`gamma_used` per row, plus a `community_profile` sidecar with one row
|
|
477
|
+
per γ tested, so an LLM agent can ask "what γ would yield 50
|
|
478
|
+
communities?" without rerunning Leiden. Top terms per community come
|
|
479
|
+
from the live `community_top_topics(cid, n)` macro composed from
|
|
480
|
+
`cluster_terms`, not a frozen column. See
|
|
481
|
+
[`docs/research_notes.md`](docs/research_notes.md) for the
|
|
482
|
+
Louvain → Leiden+CPM swap rationale.
|
|
461
483
|
- **Hybrid friction pipeline.** A hand-curated regex bank catches the
|
|
462
484
|
unambiguous `status_ping` / `interruption` / `correction` cases at
|
|
463
485
|
zero Bedrock cost; the ambiguous class — especially
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "claude-sql"
|
|
3
|
-
version = "0.4.0"
|
|
3
|
+
version = "0.5.0"
|
|
4
4
|
description = "Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { text = "Apache-2.0" }
|
|
@@ -33,8 +33,9 @@ dependencies = [
|
|
|
33
33
|
"cyclopts>=4.10.2",
|
|
34
34
|
"duckdb>=1.5.2",
|
|
35
35
|
"hdbscan>=0.8.40",
|
|
36
|
+
"igraph>=1.0.0,<2.0",
|
|
37
|
+
"leidenalg>=0.11.0,<0.12",
|
|
36
38
|
"loguru>=0.7.3",
|
|
37
|
-
"networkx>=3.4",
|
|
38
39
|
"numpy>=2.4.4",
|
|
39
40
|
"packaging>=26.2",
|
|
40
41
|
"polars>=1.40.0",
|
|
@@ -50,7 +50,11 @@ from claude_sql import (
|
|
|
50
50
|
ungrounded_worker as _ungrounded_worker,
|
|
51
51
|
)
|
|
52
52
|
from claude_sql.cluster_worker import run_clustering
|
|
53
|
-
from claude_sql.community_worker import
|
|
53
|
+
from claude_sql.community_worker import (
|
|
54
|
+
ResolutionLevel,
|
|
55
|
+
neighbors_of,
|
|
56
|
+
run_communities,
|
|
57
|
+
)
|
|
54
58
|
from claude_sql.config import Settings
|
|
55
59
|
from claude_sql.embed_worker import embed_query, run_backfill
|
|
56
60
|
from claude_sql.friction_worker import detect_user_friction
|
|
@@ -96,7 +100,7 @@ Surfaces at a glance
|
|
|
96
100
|
embed / search Cohere Embed v4 + HNSW cosine search
|
|
97
101
|
classify / trajectory / Sonnet 4.6 analytics -- each defaults to
|
|
98
102
|
conflicts / friction --dry-run; pass --no-dry-run to spend
|
|
99
|
-
cluster / terms / community UMAP+HDBSCAN, c-TF-IDF,
|
|
103
|
+
cluster / terms / community UMAP+HDBSCAN, c-TF-IDF, Leiden+CPM
|
|
100
104
|
analyze composite pipeline over every stage above
|
|
101
105
|
|
|
102
106
|
Flag placement (important for agents)
|
|
@@ -678,6 +682,7 @@ def list_cache(*, common: Common | None = None) -> None:
|
|
|
678
682
|
_describe_cache_entry("message_clusters", settings.clusters_parquet_path),
|
|
679
683
|
_describe_cache_entry("cluster_terms", settings.cluster_terms_parquet_path),
|
|
680
684
|
_describe_cache_entry("session_communities", settings.communities_parquet_path),
|
|
685
|
+
_describe_cache_entry("community_profile", settings.community_profile_parquet_path),
|
|
681
686
|
_describe_cache_entry("user_friction", settings.user_friction_parquet_path),
|
|
682
687
|
_describe_cache_entry("skills_catalog", settings.skills_catalog_parquet_path),
|
|
683
688
|
_describe_checkpoint_entry(settings.checkpoint_db_path),
|
|
@@ -1480,33 +1485,137 @@ def terms(*, force: bool = False, common: Common | None = None) -> None:
|
|
|
1480
1485
|
|
|
1481
1486
|
|
|
1482
1487
|
@app.command
|
|
1483
|
-
def community(
|
|
1484
|
-
|
|
1488
|
+
def community(
|
|
1489
|
+
*,
|
|
1490
|
+
force: bool = False,
|
|
1491
|
+
gamma: float | None = None,
|
|
1492
|
+
resolution: ResolutionLevel = "medium",
|
|
1493
|
+
neighbors_of_session: Annotated[str | None, Parameter(name=["--neighbors-of"])] = None,
|
|
1494
|
+
top_k: int = 15,
|
|
1495
|
+
dry_run: bool = False,
|
|
1496
|
+
common: Common | None = None,
|
|
1497
|
+
) -> None:
|
|
1498
|
+
"""Session-level Leiden+CPM community detection over a mutual-kNN cosine graph.
|
|
1485
1499
|
|
|
1486
1500
|
Prereq: ``embed`` (needs the embeddings parquet).
|
|
1487
1501
|
|
|
1488
1502
|
Output columns (``session_communities`` view)
|
|
1489
1503
|
---------------------------------------------
|
|
1490
|
-
session_id, community_id (int; -1 =
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1504
|
+
session_id, community_id (int; -1 = noise / sub-min-size),
|
|
1505
|
+
size (int), is_medoid (bool — best representative session of its
|
|
1506
|
+
community), coherence (float — mean intra-community cosine),
|
|
1507
|
+
gamma_used (float — the CPM γ used at run time).
|
|
1508
|
+
|
|
1509
|
+
Sidecar (``community_profile`` view, written when auto-γ runs)
|
|
1510
|
+
--------------------------------------------------------------
|
|
1511
|
+
gamma, n_communities, quality, plateau_length — one row per γ tested
|
|
1512
|
+
by ``leidenalg.Optimiser.resolution_profile``. Lets the agent ask
|
|
1513
|
+
"what γ would yield 50 communities?" without rerunning Leiden.
|
|
1514
|
+
|
|
1515
|
+
Method: build a session-centroid mutual-kNN graph (k=15, edge floor 0.3
|
|
1516
|
+
by default), then ``leidenalg.find_partition`` with
|
|
1517
|
+
``CPMVertexPartition``. CPM γ is auto-picked from the resolution
|
|
1518
|
+
profile via the longest-plateau heuristic (Traag et al.); the
|
|
1519
|
+
``--resolution {coarse, medium, fine}`` flag picks alternate plateaus
|
|
1520
|
+
of the same profile (no extra Leiden runs).
|
|
1521
|
+
|
|
1522
|
+
Cost: zero (CPU only). Seeded by ``CLAUDE_SQL_SEED=42``. For top
|
|
1523
|
+
terms per community, run
|
|
1524
|
+
``claude-sql query "SELECT * FROM community_top_topics(<cid>, 10)"``.
|
|
1495
1525
|
|
|
1496
1526
|
Flags
|
|
1497
1527
|
-----
|
|
1498
|
-
--force
|
|
1528
|
+
--force Re-detect even if session_communities.parquet exists.
|
|
1529
|
+
--gamma FLOAT Explicit CPM γ; skips the resolution profile + sidecar.
|
|
1530
|
+
Mutually exclusive with --neighbors-of; --resolution is ignored when set.
|
|
1531
|
+
--resolution {coarse,medium,fine}
|
|
1532
|
+
Pick a γ plateau without specifying a value.
|
|
1533
|
+
Default 'medium' = longest plateau. Ignored if --gamma set.
|
|
1534
|
+
--neighbors-of SID Early-return path: skip Leiden, return top-k cosine
|
|
1535
|
+
neighbors of SID. Reads centroids on the fly +
|
|
1536
|
+
joins session_communities.parquet if it exists.
|
|
1537
|
+
--top-k N Used with --neighbors-of (default 15).
|
|
1538
|
+
--dry-run Plan-only: count candidate sessions via SQL, do not
|
|
1539
|
+
run Leiden. Honors agent JSON output for free.
|
|
1540
|
+
|
|
1541
|
+
Exit codes
|
|
1542
|
+
----------
|
|
1543
|
+
0 success
|
|
1544
|
+
64 invalid input (e.g., --neighbors-of combined with partition flags)
|
|
1499
1545
|
"""
|
|
1500
1546
|
_configure(common)
|
|
1501
1547
|
settings = _resolve_settings(common)
|
|
1548
|
+
fmt = _fmt(common)
|
|
1549
|
+
|
|
1550
|
+
if neighbors_of_session is not None and (gamma is not None or force or dry_run):
|
|
1551
|
+
err = ClassifiedError(
|
|
1552
|
+
kind="invalid_input",
|
|
1553
|
+
exit_code=EXIT_CODES["invalid_input"],
|
|
1554
|
+
message=(
|
|
1555
|
+
"--neighbors-of is mutually exclusive with --gamma, --force, "
|
|
1556
|
+
"and --dry-run; pass only --neighbors-of and --top-k."
|
|
1557
|
+
),
|
|
1558
|
+
hint="Run `claude-sql community --neighbors-of <sid> --top-k 15` alone.",
|
|
1559
|
+
)
|
|
1560
|
+
emit_error(err, fmt)
|
|
1561
|
+
sys.exit(err.exit_code)
|
|
1562
|
+
|
|
1502
1563
|
con = _open_connection(settings)
|
|
1503
1564
|
try:
|
|
1504
|
-
|
|
1565
|
+
if neighbors_of_session is not None:
|
|
1566
|
+
df = neighbors_of(con, settings, neighbors_of_session, top_k=top_k)
|
|
1567
|
+
emit_dataframe(df, fmt, table_rows=top_k)
|
|
1568
|
+
return
|
|
1569
|
+
|
|
1570
|
+
if dry_run:
|
|
1571
|
+
row = con.execute(
|
|
1572
|
+
"""
|
|
1573
|
+
SELECT COUNT(DISTINCT m.session_id) AS candidate_sessions
|
|
1574
|
+
FROM read_parquet(?) e
|
|
1575
|
+
JOIN messages m
|
|
1576
|
+
ON CAST(m.uuid AS VARCHAR) = e.uuid
|
|
1577
|
+
""",
|
|
1578
|
+
[str(settings.embeddings_parquet_path)],
|
|
1579
|
+
).fetchone()
|
|
1580
|
+
n = int(row[0]) if row else 0
|
|
1581
|
+
plan: dict[str, object] = {
|
|
1582
|
+
"pipeline": "community",
|
|
1583
|
+
"candidate_sessions": n,
|
|
1584
|
+
"knn_k": settings.leiden_knn_k,
|
|
1585
|
+
"edge_floor": settings.leiden_edge_floor,
|
|
1586
|
+
"min_community_size": settings.leiden_min_community_size,
|
|
1587
|
+
"gamma": gamma if gamma is not None else "auto",
|
|
1588
|
+
"resolution": resolution,
|
|
1589
|
+
"would_write": [
|
|
1590
|
+
str(settings.communities_parquet_path),
|
|
1591
|
+
]
|
|
1592
|
+
+ ([] if gamma is not None else [str(settings.community_profile_parquet_path)]),
|
|
1593
|
+
"dry_run": True,
|
|
1594
|
+
}
|
|
1595
|
+
emit_json(plan, fmt)
|
|
1596
|
+
return
|
|
1597
|
+
|
|
1598
|
+
stats = run_communities(
|
|
1599
|
+
con,
|
|
1600
|
+
settings,
|
|
1601
|
+
force=force,
|
|
1602
|
+
gamma=gamma,
|
|
1603
|
+
resolution=resolution,
|
|
1604
|
+
)
|
|
1605
|
+
import math
|
|
1606
|
+
|
|
1607
|
+
quality_val = stats["quality"]
|
|
1608
|
+
quality_log = (
|
|
1609
|
+
quality_val if isinstance(quality_val, float) and not math.isnan(quality_val) else 0.0
|
|
1610
|
+
)
|
|
1505
1611
|
logger.info(
|
|
1506
|
-
"community: {} sessions
|
|
1612
|
+
"community: {} sessions, {} communities (γ={:.4f}, quality={:.4f})",
|
|
1507
1613
|
stats["sessions"],
|
|
1508
1614
|
stats["communities"],
|
|
1615
|
+
stats["gamma_used"],
|
|
1616
|
+
quality_log,
|
|
1509
1617
|
)
|
|
1618
|
+
_emit_worker_result(stats, common, pipeline="community")
|
|
1510
1619
|
finally:
|
|
1511
1620
|
con.close()
|
|
1512
1621
|
|
|
@@ -1538,7 +1647,7 @@ def analyze(
|
|
|
1538
1647
|
1. embed (Bedrock Cohere Embed v4; honors --dry-run)
|
|
1539
1648
|
2. cluster (UMAP+HDBSCAN; zero-cost; --force-cluster to rebuild)
|
|
1540
1649
|
3. terms (c-TF-IDF labels for clusters; zero-cost)
|
|
1541
|
-
4. community (
|
|
1650
|
+
4. community (Leiden+CPM; zero-cost; --force-community to rebuild)
|
|
1542
1651
|
5. classify (Sonnet 4.6; honors --dry-run)
|
|
1543
1652
|
6. trajectory (Sonnet 4.6; honors --dry-run)
|
|
1544
1653
|
7. conflicts (Sonnet 4.6; honors --dry-run)
|