claude-sql 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {claude_sql-0.4.0 → claude_sql-0.6.0}/PKG-INFO +35 -11
  2. {claude_sql-0.4.0 → claude_sql-0.6.0}/README.md +32 -9
  3. {claude_sql-0.4.0 → claude_sql-0.6.0}/pyproject.toml +3 -2
  4. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/cli.py +325 -87
  5. claude_sql-0.6.0/src/claude_sql/community_worker.py +662 -0
  6. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/config.py +36 -18
  7. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/output.py +10 -4
  8. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/sql_views.py +576 -257
  9. claude_sql-0.4.0/src/claude_sql/community_worker.py +0 -306
  10. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/__init__.py +0 -0
  11. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/binding.py +0 -0
  12. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/blind_handover.py +0 -0
  13. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/checkpointer.py +0 -0
  14. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/cluster_worker.py +0 -0
  15. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/embed_worker.py +0 -0
  16. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/freeze.py +0 -0
  17. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/friction_worker.py +0 -0
  18. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/install_source.py +0 -0
  19. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/judge_worker.py +0 -0
  20. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/judges.py +0 -0
  21. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/kappa_worker.py +0 -0
  22. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/llm_worker.py +0 -0
  23. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/logging_setup.py +0 -0
  24. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/parquet_shards.py +0 -0
  25. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/retry_queue.py +0 -0
  26. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/review_sheet_render.py +0 -0
  27. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/review_sheet_worker.py +0 -0
  28. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/schemas.py +0 -0
  29. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/session_text.py +0 -0
  30. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/skills_catalog.py +0 -0
  31. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/terms_worker.py +0 -0
  32. {claude_sql-0.4.0 → claude_sql-0.6.0}/src/claude_sql/ungrounded_worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: claude-sql
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
5
5
  Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
6
6
  Author: Laith Al-Saadoon
@@ -23,8 +23,9 @@ Requires-Dist: boto3>=1.42.91
23
23
  Requires-Dist: cyclopts>=4.10.2
24
24
  Requires-Dist: duckdb>=1.5.2
25
25
  Requires-Dist: hdbscan>=0.8.40
26
+ Requires-Dist: igraph>=1.0.0,<2.0
27
+ Requires-Dist: leidenalg>=0.11.0,<0.12
26
28
  Requires-Dist: loguru>=0.7.3
27
- Requires-Dist: networkx>=3.4
28
29
  Requires-Dist: numpy>=2.4.4
29
30
  Requires-Dist: packaging>=26.2
30
31
  Requires-Dist: polars>=1.40.0
@@ -122,7 +123,7 @@ flowchart LR
122
123
  L --> PA["session_classifications/, message_trajectory/,<br/>session_conflicts/, user_friction/<br/>(sharded part-*.parquet)"]
123
124
  P --> C["claude-sql cluster<br/>(UMAP + HDBSCAN)"]
124
125
  C --> PC["clusters + cluster_terms<br/>(c-TF-IDF)"]
125
- P --> CM["claude-sql community<br/>(Louvain over centroids)"]
126
+ P --> CM["claude-sql community<br/>(Leiden + CPM over mutual-kNN centroids)"]
126
127
  CM --> PM["session_communities<br/>parquet"]
127
128
  PA --> AV[analytics views + macros]
128
129
  PC --> AV
@@ -272,7 +273,7 @@ Commands that spend real Bedrock money default to `--dry-run`.
272
273
  | `conflicts` | Per-session stance-conflict detection |
273
274
  | `friction` | Regex + Sonnet 4.6 → status pings, unmet expectations, confusion, etc. |
274
275
  | `cluster` | UMAP → HDBSCAN → c-TF-IDF over message embeddings |
275
- | `community` | Louvain over session centroids |
276
+ | `community` | Leiden + CPM over mutual-kNN session centroids; emits medoid + coherence + resolution profile + `--neighbors-of` |
276
277
  | `skills sync` | Walk `~/.claude/skills/` + `~/.claude/plugins/cache/` → seedable skills catalog |
277
278
  | `skills ls` | List catalog entries, filterable by `--kind` and `--plugin` |
278
279
  | `analyze` | Run the whole pipeline in dependency order |
@@ -325,7 +326,8 @@ Commands that spend real Bedrock money default to `--dry-run`.
325
326
  | `session_conflicts` | per-session stance conflicts | `stance_a`, `stance_b`, `resolution` |
326
327
  | `message_clusters` | cluster id + 2d viz coords | `cluster_id`, `x`, `y`, `is_noise` |
327
328
  | `cluster_terms` | c-TF-IDF top terms per cluster | `cluster_id`, `term`, `weight`, `rank` |
328
- | `session_communities` | Louvain community per session | `community_id`, `size` |
329
+ | `session_communities` | Leiden+CPM community per session | `community_id`, `size`, `is_medoid`, `coherence`, `gamma_used` |
330
+ | `community_profile` | Resolution-profile sidecar (auto-γ runs only) | `gamma`, `n_communities`, `quality`, `plateau_length` |
329
331
  | `user_friction` | one row per classified short user message | `label` (7-way), `rationale`, `source` (`regex` / `llm` / `refused`), `confidence` |
330
332
  | `skills_catalog` | one row per known skill / slash command (seed by `claude-sql skills sync`) | `skill_id`, `name`, `plugin`, `plugin_version`, `source_kind` (`user-skill` / `plugin-skill` / `plugin-command` / `builtin`), `description` |
331
333
  | `skill_usage` | `skill_invocations` ⟕ `skills_catalog` | `source`, `skill_id`, `skill_name`, `plugin`, `is_builtin`, `description` |
@@ -334,6 +336,7 @@ Commands that spend real Bedrock money default to `--dry-run`.
334
336
 
335
337
  | Macro | Signature | What it does |
336
338
  |---|---|---|
339
+ | `ago(interval_text)` | scalar → `TIMESTAMP` | `current_timestamp - INTERVAL <text>` -- e.g. `WHERE ts >= ago('30 days')` |
337
340
  | `model_used(sid)` | scalar → `VARCHAR` | Latest `model` observed in the session |
338
341
  | `cost_estimate(sid)` | scalar → `DOUBLE` | USD spend (dated model IDs prefix-matched) |
339
342
  | `tool_rank(last_n_days)` | table | Tool-use leaderboard over a window |
@@ -380,7 +383,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
380
383
  | `CLAUDE_SQL_SKILLS_CATALOG_PARQUET_PATH` | `~/.claude/skills_catalog.parquet` | Skills catalog parquet |
381
384
  | `CLAUDE_SQL_USER_SKILLS_DIR` | `~/.claude/skills` | Root scanned for user-installed skills |
382
385
  | `CLAUDE_SQL_PLUGINS_CACHE_DIR` | `~/.claude/plugins/cache` | Root scanned for plugin skills + commands |
383
- | `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Louvain determinism |
386
+ | `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Leiden determinism |
387
+ | `CLAUDE_SQL_LEIDEN_KNN_K` | `15` | Mutual-kNN k for the session-centroid graph |
388
+ | `CLAUDE_SQL_LEIDEN_EDGE_FLOOR` | `0.3` | Cosine floor below which edges are dropped |
389
+ | `CLAUDE_SQL_LEIDEN_MIN_COMMUNITY_SIZE` | `3` | Communities below this collapse to noise (`-1`) |
390
+ | `CLAUDE_SQL_LEIDEN_RESOLUTION` | unset (auto) | Explicit CPM γ; skips the resolution profile |
391
+ | `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_LO` | `0.05` | Lower bound for `Optimiser.resolution_profile` bisection |
392
+ | `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_HI` | `0.95` | Upper bound for the bisection |
393
+ | `CLAUDE_SQL_LEIDEN_N_ITERATIONS` | `-1` | Iterate to convergence; `2` is leidenalg's default |
394
+ | `CLAUDE_SQL_COMMUNITY_PROFILE_PARQUET_PATH` | `~/.claude/community_profile.parquet` | Resolution-profile sidecar path |
384
395
 
385
396
  ## Development
386
397
 
@@ -497,12 +508,25 @@ See `docs/adr/0015-stack-modernization.md` and
497
508
  with adaptive thinking on. Pydantic v2 schemas are flattened (inline
498
509
  `$ref`, inject `additionalProperties: false`, strip the numeric /
499
510
  string constraints the validator rejects from Draft 2020-12).
500
- - **Determinism.** UMAP, HDBSCAN, and Louvain all seed from
511
+ - **Determinism.** UMAP, HDBSCAN, and Leiden all seed from
501
512
  `CLAUDE_SQL_SEED=42` (default) so cluster IDs and community IDs are
502
- stable across reruns.
503
- - **Louvain = `networkx`.** `networkx.community.louvain_communities`,
504
- built into `networkx >= 3.4`. The abandoned `python-louvain` package
505
- is not used.
513
+ stable across reruns. The Leiden seed flows into both
514
+ `leidenalg.find_partition(seed=...)` and `Optimiser.set_rng_seed(...)`
515
+ for the resolution-profile bisection same seed + same input ⇒
516
+ byte-equal parquets across runs.
517
+ - **Communities = `leidenalg` + CPM.** Reference Leiden implementation
518
+ (`leidenalg>=0.11.0`) over an `igraph>=1.0` mutual-kNN cosine graph
519
+ (k=15, edge floor 0.3) of session centroids. CPM γ has direct cosine
520
+ semantics (internal density ≥ γ, external ≤ γ); auto-γ via
521
+ `Optimiser.resolution_profile` + longest-plateau picker; warn-only
522
+ connectivity check. Output is signal-rich: `is_medoid`, `coherence`,
523
+ `gamma_used` per row, plus a `community_profile` sidecar with one row
524
+ per γ tested, so an LLM agent can ask "what γ would yield 50
525
+ communities?" without rerunning Leiden. Top terms per community come
526
+ from the live `community_top_topics(cid, n)` macro composed from
527
+ `cluster_terms`, not a frozen column. See
528
+ [`docs/research_notes.md`](docs/research_notes.md) for the
529
+ Louvain → Leiden+CPM swap rationale.
506
530
  - **Hybrid friction pipeline.** A hand-curated regex bank catches the
507
531
  unambiguous `status_ping` / `interruption` / `correction` cases at
508
532
  zero Bedrock cost; the ambiguous class — especially
@@ -77,7 +77,7 @@ flowchart LR
77
77
  L --> PA["session_classifications/, message_trajectory/,<br/>session_conflicts/, user_friction/<br/>(sharded part-*.parquet)"]
78
78
  P --> C["claude-sql cluster<br/>(UMAP + HDBSCAN)"]
79
79
  C --> PC["clusters + cluster_terms<br/>(c-TF-IDF)"]
80
- P --> CM["claude-sql community<br/>(Louvain over centroids)"]
80
+ P --> CM["claude-sql community<br/>(Leiden + CPM over mutual-kNN centroids)"]
81
81
  CM --> PM["session_communities<br/>parquet"]
82
82
  PA --> AV[analytics views + macros]
83
83
  PC --> AV
@@ -227,7 +227,7 @@ Commands that spend real Bedrock money default to `--dry-run`.
227
227
  | `conflicts` | Per-session stance-conflict detection |
228
228
  | `friction` | Regex + Sonnet 4.6 → status pings, unmet expectations, confusion, etc. |
229
229
  | `cluster` | UMAP → HDBSCAN → c-TF-IDF over message embeddings |
230
- | `community` | Louvain over session centroids |
230
+ | `community` | Leiden + CPM over mutual-kNN session centroids; emits medoid + coherence + resolution profile + `--neighbors-of` |
231
231
  | `skills sync` | Walk `~/.claude/skills/` + `~/.claude/plugins/cache/` → seedable skills catalog |
232
232
  | `skills ls` | List catalog entries, filterable by `--kind` and `--plugin` |
233
233
  | `analyze` | Run the whole pipeline in dependency order |
@@ -280,7 +280,8 @@ Commands that spend real Bedrock money default to `--dry-run`.
280
280
  | `session_conflicts` | per-session stance conflicts | `stance_a`, `stance_b`, `resolution` |
281
281
  | `message_clusters` | cluster id + 2d viz coords | `cluster_id`, `x`, `y`, `is_noise` |
282
282
  | `cluster_terms` | c-TF-IDF top terms per cluster | `cluster_id`, `term`, `weight`, `rank` |
283
- | `session_communities` | Louvain community per session | `community_id`, `size` |
283
+ | `session_communities` | Leiden+CPM community per session | `community_id`, `size`, `is_medoid`, `coherence`, `gamma_used` |
284
+ | `community_profile` | Resolution-profile sidecar (auto-γ runs only) | `gamma`, `n_communities`, `quality`, `plateau_length` |
284
285
  | `user_friction` | one row per classified short user message | `label` (7-way), `rationale`, `source` (`regex` / `llm` / `refused`), `confidence` |
285
286
  | `skills_catalog` | one row per known skill / slash command (seed by `claude-sql skills sync`) | `skill_id`, `name`, `plugin`, `plugin_version`, `source_kind` (`user-skill` / `plugin-skill` / `plugin-command` / `builtin`), `description` |
286
287
  | `skill_usage` | `skill_invocations` ⟕ `skills_catalog` | `source`, `skill_id`, `skill_name`, `plugin`, `is_builtin`, `description` |
@@ -289,6 +290,7 @@ Commands that spend real Bedrock money default to `--dry-run`.
289
290
 
290
291
  | Macro | Signature | What it does |
291
292
  |---|---|---|
293
+ | `ago(interval_text)` | scalar → `TIMESTAMP` | `current_timestamp - INTERVAL <text>` -- e.g. `WHERE ts >= ago('30 days')` |
292
294
  | `model_used(sid)` | scalar → `VARCHAR` | Latest `model` observed in the session |
293
295
  | `cost_estimate(sid)` | scalar → `DOUBLE` | USD spend (dated model IDs prefix-matched) |
294
296
  | `tool_rank(last_n_days)` | table | Tool-use leaderboard over a window |
@@ -335,7 +337,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
335
337
  | `CLAUDE_SQL_SKILLS_CATALOG_PARQUET_PATH` | `~/.claude/skills_catalog.parquet` | Skills catalog parquet |
336
338
  | `CLAUDE_SQL_USER_SKILLS_DIR` | `~/.claude/skills` | Root scanned for user-installed skills |
337
339
  | `CLAUDE_SQL_PLUGINS_CACHE_DIR` | `~/.claude/plugins/cache` | Root scanned for plugin skills + commands |
338
- | `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Louvain determinism |
340
+ | `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Leiden determinism |
341
+ | `CLAUDE_SQL_LEIDEN_KNN_K` | `15` | Mutual-kNN k for the session-centroid graph |
342
+ | `CLAUDE_SQL_LEIDEN_EDGE_FLOOR` | `0.3` | Cosine floor below which edges are dropped |
343
+ | `CLAUDE_SQL_LEIDEN_MIN_COMMUNITY_SIZE` | `3` | Communities below this collapse to noise (`-1`) |
344
+ | `CLAUDE_SQL_LEIDEN_RESOLUTION` | unset (auto) | Explicit CPM γ; skips the resolution profile |
345
+ | `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_LO` | `0.05` | Lower bound for `Optimiser.resolution_profile` bisection |
346
+ | `CLAUDE_SQL_LEIDEN_RESOLUTION_RANGE_HI` | `0.95` | Upper bound for the bisection |
347
+ | `CLAUDE_SQL_LEIDEN_N_ITERATIONS` | `-1` | Iterate to convergence; `2` is leidenalg's default |
348
+ | `CLAUDE_SQL_COMMUNITY_PROFILE_PARQUET_PATH` | `~/.claude/community_profile.parquet` | Resolution-profile sidecar path |
339
349
 
340
350
  ## Development
341
351
 
@@ -452,12 +462,25 @@ See `docs/adr/0015-stack-modernization.md` and
452
462
  with adaptive thinking on. Pydantic v2 schemas are flattened (inline
453
463
  `$ref`, inject `additionalProperties: false`, strip the numeric /
454
464
  string constraints the validator rejects from Draft 2020-12).
455
- - **Determinism.** UMAP, HDBSCAN, and Louvain all seed from
465
+ - **Determinism.** UMAP, HDBSCAN, and Leiden all seed from
456
466
  `CLAUDE_SQL_SEED=42` (default) so cluster IDs and community IDs are
457
- stable across reruns.
458
- - **Louvain = `networkx`.** `networkx.community.louvain_communities`,
459
- built into `networkx >= 3.4`. The abandoned `python-louvain` package
460
- is not used.
467
+ stable across reruns. The Leiden seed flows into both
468
+ `leidenalg.find_partition(seed=...)` and `Optimiser.set_rng_seed(...)`
469
+ for the resolution-profile bisection; same seed + same input ⇒
470
+ byte-equal parquets across runs.
471
+ - **Communities = `leidenalg` + CPM.** Reference Leiden implementation
472
+ (`leidenalg>=0.11.0`) over an `igraph>=1.0` mutual-kNN cosine graph
473
+ (k=15, edge floor 0.3) of session centroids. CPM γ has direct cosine
474
+ semantics (internal density ≥ γ, external ≤ γ); auto-γ via
475
+ `Optimiser.resolution_profile` + longest-plateau picker; warn-only
476
+ connectivity check. Output is signal-rich: `is_medoid`, `coherence`,
477
+ `gamma_used` per row, plus a `community_profile` sidecar with one row
478
+ per γ tested, so an LLM agent can ask "what γ would yield 50
479
+ communities?" without rerunning Leiden. Top terms per community come
480
+ from the live `community_top_topics(cid, n)` macro composed from
481
+ `cluster_terms`, not a frozen column. See
482
+ [`docs/research_notes.md`](docs/research_notes.md) for the
483
+ Louvain → Leiden+CPM swap rationale.
461
484
  - **Hybrid friction pipeline.** A hand-curated regex bank catches the
462
485
  unambiguous `status_ping` / `interruption` / `correction` cases at
463
486
  zero Bedrock cost; the ambiguous class — especially
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "claude-sql"
3
- version = "0.4.0"
3
+ version = "0.6.0"
4
4
  description = "Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts."
5
5
  readme = "README.md"
6
6
  license = { text = "Apache-2.0" }
@@ -33,8 +33,9 @@ dependencies = [
33
33
  "cyclopts>=4.10.2",
34
34
  "duckdb>=1.5.2",
35
35
  "hdbscan>=0.8.40",
36
+ "igraph>=1.0.0,<2.0",
37
+ "leidenalg>=0.11.0,<0.12",
36
38
  "loguru>=0.7.3",
37
- "networkx>=3.4",
38
39
  "numpy>=2.4.4",
39
40
  "packaging>=26.2",
40
41
  "polars>=1.40.0",