claude-sql 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. claude_sql-0.4.0/PKG-INFO +530 -0
  2. claude_sql-0.4.0/README.md +485 -0
  3. claude_sql-0.4.0/pyproject.toml +314 -0
  4. claude_sql-0.4.0/src/claude_sql/__init__.py +5 -0
  5. claude_sql-0.4.0/src/claude_sql/binding.py +740 -0
  6. claude_sql-0.4.0/src/claude_sql/blind_handover.py +155 -0
  7. claude_sql-0.4.0/src/claude_sql/checkpointer.py +202 -0
  8. claude_sql-0.4.0/src/claude_sql/cli.py +2344 -0
  9. claude_sql-0.4.0/src/claude_sql/cluster_worker.py +208 -0
  10. claude_sql-0.4.0/src/claude_sql/community_worker.py +306 -0
  11. claude_sql-0.4.0/src/claude_sql/config.py +380 -0
  12. claude_sql-0.4.0/src/claude_sql/embed_worker.py +482 -0
  13. claude_sql-0.4.0/src/claude_sql/freeze.py +189 -0
  14. claude_sql-0.4.0/src/claude_sql/friction_worker.py +561 -0
  15. claude_sql-0.4.0/src/claude_sql/install_source.py +77 -0
  16. claude_sql-0.4.0/src/claude_sql/judge_worker.py +459 -0
  17. claude_sql-0.4.0/src/claude_sql/judges.py +239 -0
  18. claude_sql-0.4.0/src/claude_sql/kappa_worker.py +257 -0
  19. claude_sql-0.4.0/src/claude_sql/llm_worker.py +1760 -0
  20. claude_sql-0.4.0/src/claude_sql/logging_setup.py +95 -0
  21. claude_sql-0.4.0/src/claude_sql/output.py +248 -0
  22. claude_sql-0.4.0/src/claude_sql/parquet_shards.py +172 -0
  23. claude_sql-0.4.0/src/claude_sql/retry_queue.py +180 -0
  24. claude_sql-0.4.0/src/claude_sql/review_sheet_render.py +167 -0
  25. claude_sql-0.4.0/src/claude_sql/review_sheet_worker.py +463 -0
  26. claude_sql-0.4.0/src/claude_sql/schemas.py +454 -0
  27. claude_sql-0.4.0/src/claude_sql/session_text.py +387 -0
  28. claude_sql-0.4.0/src/claude_sql/skills_catalog.py +354 -0
  29. claude_sql-0.4.0/src/claude_sql/sql_views.py +1751 -0
  30. claude_sql-0.4.0/src/claude_sql/terms_worker.py +145 -0
  31. claude_sql-0.4.0/src/claude_sql/ungrounded_worker.py +190 -0
@@ -0,0 +1,530 @@
1
+ Metadata-Version: 2.3
2
+ Name: claude-sql
3
+ Version: 0.4.0
4
+ Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
5
+ Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
6
+ Author: Laith Al-Saadoon
7
+ Author-email: Laith Al-Saadoon <lalsaado@amazon.com>
8
+ License: Apache-2.0
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Topic :: Software Development
16
+ Classifier: Topic :: Database
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Utilities
19
+ Classifier: Typing :: Typed
20
+ Requires-Dist: anthropic>=0.40
21
+ Requires-Dist: anyio>=4.13.0
22
+ Requires-Dist: boto3>=1.42.91
23
+ Requires-Dist: cyclopts>=4.10.2
24
+ Requires-Dist: duckdb>=1.5.2
25
+ Requires-Dist: hdbscan>=0.8.40
26
+ Requires-Dist: loguru>=0.7.3
27
+ Requires-Dist: networkx>=3.4
28
+ Requires-Dist: numpy>=2.4.4
29
+ Requires-Dist: packaging>=26.2
30
+ Requires-Dist: polars>=1.40.0
31
+ Requires-Dist: pyarrow>=23.0.1
32
+ Requires-Dist: pydantic>=2.13.2
33
+ Requires-Dist: pydantic-settings>=2.13.1
34
+ Requires-Dist: pyyaml>=6.0.3
35
+ Requires-Dist: scikit-learn>=1.5
36
+ Requires-Dist: scipy>=1.13
37
+ Requires-Dist: tenacity>=9.1.4
38
+ Requires-Dist: umap-learn>=0.5.12
39
+ Requires-Python: >=3.13
40
+ Project-URL: Homepage, https://github.com/theagenticguy/claude-sql
41
+ Project-URL: Repository, https://github.com/theagenticguy/claude-sql
42
+ Project-URL: Issues, https://github.com/theagenticguy/claude-sql/issues
43
+ Project-URL: Changelog, https://github.com/theagenticguy/claude-sql/blob/main/CHANGELOG.md
44
+ Description-Content-Type: text/markdown
45
+
46
+ # claude-sql
47
+
48
+ [![CI](https://github.com/theagenticguy/claude-sql/actions/workflows/ci.yml/badge.svg)](https://github.com/theagenticguy/claude-sql/actions/workflows/ci.yml)
49
+ [![CodeQL](https://github.com/theagenticguy/claude-sql/actions/workflows/codeql.yml/badge.svg)](https://github.com/theagenticguy/claude-sql/actions/workflows/codeql.yml)
50
+ [![Semgrep](https://github.com/theagenticguy/claude-sql/actions/workflows/semgrep.yml/badge.svg)](https://github.com/theagenticguy/claude-sql/actions/workflows/semgrep.yml)
51
+ [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/theagenticguy/claude-sql/badge)](https://securityscorecards.dev/viewer/?uri=github.com/theagenticguy/claude-sql)
52
+ [![codecov](https://codecov.io/gh/theagenticguy/claude-sql/graph/badge.svg)](https://codecov.io/gh/theagenticguy/claude-sql)
53
+ [![Python 3.13+](https://img.shields.io/badge/python-3.13+-blue.svg)](https://www.python.org/downloads/release/python-3130/)
54
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](./LICENSE)
55
+
56
+ > **Ask your Claude Code transcripts anything.**
57
+ > Your sessions are already on disk. `claude-sql` turns them into a
58
+ > searchable, explorable, self-improving record of your work — in place,
59
+ > with zero copy.
60
+
61
+ ## What you get out of it
62
+
63
+ **Remember what you worked on.**
64
+
65
+ - "What was that thing I did last Tuesday with DuckDB and HNSW?"
66
+ - "Show me every conversation I've had about temporal workflows, ranked by
67
+ relevance."
68
+ - "Which week did I finally figure out that memory bug?"
69
+
70
+ **See where your time and money actually go.**
71
+
72
+ - "Which sessions cost me more than $5 on Opus this month — and what was I
73
+ trying to do?"
74
+ - "Which tools am I leaning on most? Which ones fail the most?"
75
+ - "Where am I spending hours on prose vs. on tool calls?"
76
+
77
+ **Notice patterns in how you work.**
78
+
79
+ - "When do I hand-hold the agent step-by-step vs. let it run on its own?
80
+ Has that shifted over time?"
81
+ - "What kinds of work am I doing most — coding, strategy, admin, writing?"
82
+ - "Which session types actually finish successfully vs. trail off?"
83
+ - "Which todos do I create and never close out?"
84
+
85
+ **Surface themes across hundreds of conversations.**
86
+
87
+ - "Group my sessions by what they're *about* and tell me what moved this
88
+ month."
89
+ - "Show me the biggest themes in my work and what's trending."
90
+ - "When I've wrestled with the same problem across multiple sessions,
91
+ group them together so I can see the arc."
92
+
93
+ **Catch yourself disagreeing with yourself.**
94
+
95
+ - "Find sessions where I took two opposing positions on the same decision
96
+ — and flag which ones got resolved vs. abandoned."
97
+
98
+ **Spot where the agent left you hanging.**
99
+
100
+ - "Which sessions had me pinging for status the most? What was the agent
101
+ doing?"
102
+ - "Show me every time I asked a one-word question like *screenshot?*
103
+ because the agent didn't proactively share one."
104
+ - "Rank sessions by how often I had to interrupt or correct the agent."
105
+
106
+ `claude-sql` turns every one of those into a SQL query that runs in under
107
+ a second on the live JSONL corpus — no export, no pipeline.
108
+
109
+ ## How it works
110
+
111
+ ```mermaid
112
+ flowchart LR
113
+ J["~/.claude/projects/<br/>*.jsonl"] -->|read_json| R[raw views]
114
+ J2["subagents/<br/>agent-*.jsonl"] -->|read_json| R
115
+ R --> V[business views]
116
+ V --> Q[["claude-sql query<br/>claude-sql explain<br/>claude-sql schema"]]
117
+ V --> E["claude-sql embed<br/>(Cohere Embed v4 on Bedrock)"]
118
+ E --> P["embeddings/<br/>part-*.parquet (sharded)"]
119
+ P --> H["HNSW index<br/>(persisted to<br/>~/.claude/hnsw.duckdb)"]
120
+ H --> S[["claude-sql search"]]
121
+ V --> L["claude-sql classify / trajectory /<br/>conflicts / friction (Sonnet 4.6 +<br/>output_config.format)"]
122
+ L --> PA["session_classifications/, message_trajectory/,<br/>session_conflicts/, user_friction/<br/>(sharded part-*.parquet)"]
123
+ P --> C["claude-sql cluster<br/>(UMAP + HDBSCAN)"]
124
+ C --> PC["clusters + cluster_terms<br/>(c-TF-IDF)"]
125
+ P --> CM["claude-sql community<br/>(Louvain over centroids)"]
126
+ CM --> PM["session_communities<br/>parquet"]
127
+ PA --> AV[analytics views + macros]
128
+ PC --> AV
129
+ PM --> AV
130
+ AV --> Q
131
+ ```
132
+
133
+ Every parquet is cached and rebuilt only on explicit re-run. Views
134
+ register over whichever parquets exist at connection open — missing ones
135
+ warn and no-op, never crash.
136
+
137
+ ## Install
138
+
139
+ ### As a uv tool (recommended)
140
+
141
+ `claude-sql` is **not published to PyPI**. Install it from a local
142
+ checkout. `mise run tool:install` wraps `uv tool install --from .
143
+ claude-sql --force --reinstall` so the binary on your `PATH` lands in an
144
+ isolated uv-managed venv.
145
+
146
+ ```bash
147
+ git clone https://github.com/theagenticguy/claude-sql.git
148
+ cd claude-sql
149
+ mise run tool:install # → uv tool install --from . claude-sql --force --reinstall
150
+ claude-sql --version
151
+ ```
152
+
153
+ Upgrade after pulling new commits by re-running the same task (the
154
+ `tool:upgrade` alias runs an identical command with clearer intent):
155
+
156
+ ```bash
157
+ git pull
158
+ mise run tool:upgrade
159
+ ```
160
+
161
+ > **Note.** `uv tool upgrade claude-sql` does **not** work — it resolves
162
+ > against PyPI, which has no `claude-sql` package. Always reinstall from
163
+ > your checkout.
164
+
165
+ Remove it with:
166
+
167
+ ```bash
168
+ mise run tool:uninstall # → uv tool uninstall claude-sql
169
+ ```
170
+
171
+ ### Project install (for development)
172
+
173
+ ```bash
174
+ git clone https://github.com/theagenticguy/claude-sql.git
175
+ cd claude-sql
176
+ mise install # fetch pinned Python + uv
177
+ mise run install # uv sync --all-extras + install lefthook git hooks
178
+ mise run check # ruff + fmt + ty + pytest
179
+ ```
180
+
181
+ `mise` auto-activates `.venv` on `cd`. Every command below is also
182
+ available as a mise task — run `mise tasks` for the full list.
183
+
184
+ `mise run install` also installs the lefthook git hooks. Re-run
185
+ `mise run hooks:install` any time `lefthook.yml` changes.
186
+
187
+ ### AWS credentials
188
+
189
+ Semantic search and Sonnet classification require Bedrock access.
190
+
191
+ ```bash
192
+ export AWS_PROFILE=your-profile
193
+ ```
194
+
195
+ The IAM policy needs `bedrock:InvokeModel` on:
196
+
197
+ - `inference-profile/global.cohere.embed-v4:0`
198
+ - `inference-profile/global.anthropic.claude-sonnet-4-6`
199
+
200
+ ## Quick tour
201
+
202
+ ```bash
203
+ # Inspect every registered view + macro.
204
+ claude-sql schema
205
+
206
+ # Opus sessions over $5 in the last 30 days.
207
+ claude-sql query "
208
+ SELECT session_id, model_used(session_id) AS model,
209
+ cost_estimate(session_id) AS usd
210
+ FROM sessions
211
+ WHERE started_at >= current_timestamp - INTERVAL 30 DAY
212
+ AND model_used(session_id) LIKE '%opus%'
213
+ AND cost_estimate(session_id) > 5.0
214
+ ORDER BY usd DESC
215
+ "
216
+
217
+ # See the EXPLAIN plan (static by default — no execution).
218
+ claude-sql explain "SELECT * FROM messages WHERE session_id = '<uuid>' LIMIT 1"
219
+
220
+ # Drop into the DuckDB REPL with everything pre-registered.
221
+ claude-sql shell
222
+
223
+ # Backfill embeddings (Cohere Embed v4 via global CRIS).
224
+ claude-sql embed --since-days 30
225
+
226
+ # Semantic search.
227
+ claude-sql search "temporal workflow determinism" --k 10
228
+
229
+ # Classify every recent session (dry-run prints a cost estimate first).
230
+ claude-sql classify --dry-run --since-days 30
231
+ claude-sql classify --no-dry-run --since-days 30
232
+
233
+ # Friction classifier — status pings, unmet expectations, interruptions.
234
+ claude-sql friction --dry-run --since-days 14
235
+ claude-sql friction --no-dry-run --since-days 14
236
+ claude-sql query "SELECT * FROM friction_counts(14)"
237
+ claude-sql query "SELECT * FROM friction_examples('unmet_expectation', 10)"
238
+
239
+ # Seed the Skills catalog from ~/.claude/skills + ~/.claude/plugins/cache.
240
+ claude-sql skills sync
241
+ claude-sql skills ls --kind plugin-skill | head
242
+ claude-sql query "SELECT * FROM skill_rank(30) LIMIT 15"
243
+ claude-sql query "SELECT * FROM skill_source_mix(30) WHERE skill_id LIKE '%erpaval%'"
244
+ claude-sql query "SELECT * FROM unused_skills(30) LIMIT 20"
245
+
246
+ # Full analytics pipeline (includes a zero-cost `skills sync` at step 0).
247
+ claude-sql analyze --since-days 30 --no-dry-run
248
+ ```
249
+
250
+ More recipes in [`docs/cookbook.md`](docs/cookbook.md) (v1: sessions,
251
+ messages, tools, todos, subagents, semantic search) and
252
+ [`docs/analytics_cookbook.md`](docs/analytics_cookbook.md) (v2: clusters,
253
+ communities, classifications, trajectory, conflicts, friction).
254
+
255
+ ## CLI surface
256
+
257
+ Every subcommand shares the top-level flags: `--verbose` / `--quiet`,
258
+ `--glob`, `--subagent-glob`, and `--format {auto,table,json,ndjson,csv}`.
259
+ Commands that spend real Bedrock money default to `--dry-run`.
260
+
261
+ | Command | Purpose |
262
+ |---|---|
263
+ | `schema` | List every view + its columns, plus every macro |
264
+ | `query <sql>` | Run a query, emit as table / JSON / NDJSON / CSV |
265
+ | `explain <sql>` | Static `EXPLAIN` by default; `--analyze` for `EXPLAIN ANALYZE` |
266
+ | `shell` | Launch the `duckdb` REPL with everything pre-registered |
267
+ | `list-cache` | Report freshness + row counts for every parquet cache |
268
+ | `embed` | Backfill embeddings via Cohere Embed v4 on Bedrock |
269
+ | `search <text>` | HNSW cosine semantic search over embeddings |
270
+ | `classify` | Sonnet 4.6 → session autonomy + work category + success + goal |
271
+ | `trajectory` | Per-message sentiment + `is_transition` |
272
+ | `conflicts` | Per-session stance-conflict detection |
273
+ | `friction` | Regex + Sonnet 4.6 → status pings, unmet expectations, confusion, etc. |
274
+ | `cluster` | UMAP → HDBSCAN → c-TF-IDF over message embeddings |
275
+ | `community` | Louvain over session centroids |
276
+ | `skills sync` | Walk `~/.claude/skills/` + `~/.claude/plugins/cache/` → seedable skills catalog |
277
+ | `skills ls` | List catalog entries, filterable by `--kind` and `--plugin` |
278
+ | `analyze` | Run the whole pipeline in dependency order |
279
+
280
+ ### Agent-friendly defaults
281
+
282
+ - **`--format auto`** emits a human table on a TTY and JSON when stdout
283
+ is piped, so agents calling `claude-sql` via subprocess get JSON for
284
+ free. `json`, `ndjson`, and `csv` are always available explicitly.
285
+ - **Classified exit codes** for DuckDB errors — `64` for parse errors,
286
+ `65` for unknown view / column / macro, `70` for other runtime
287
+ errors, and `2` when `search` is called before `embed` has run. On
288
+ non-TTY stdout the error also comes back as
289
+ `{"error": {"kind", "message", "hint"}}` on stderr, so agents don't
290
+ have to scrape tracebacks.
291
+ - **`list-cache`** reports every parquet (embeddings, classifications,
292
+ trajectory, conflicts, clusters, cluster terms, communities,
293
+ friction) with its `{exists, bytes, mtime, rows}`, so an agent can
294
+ decide whether to run a prerequisite stage before issuing a `search`
295
+ or `query`.
296
+ - **`explain`** is a static plan by default (no query execution); pass
297
+ `--analyze` for `EXPLAIN ANALYZE` when you want real timings.
298
+ - **`--quiet`** drops INFO / WARNING logs to ERROR-only. View
299
+ registration happens at DEBUG level, so the default `query` stderr is
300
+ already empty unless something actually warrants attention.
301
+
302
+ ## Views
303
+
304
+ | View | Grain | Key columns |
305
+ |---|---|---|
306
+ | `sessions` | one per transcript file | `session_id`, `started_at`, `ended_at` |
307
+ | `messages` | one per chat message | `uuid`, `session_id`, `role`, `model`, token usage |
308
+ | `content_blocks` | flattened `message.content[]` | `block_type`, `tool_name` |
309
+ | `messages_text` | text blocks aggregated per message | `uuid`, `text_content` |
310
+ | `tool_calls` | `content_blocks` where `type='tool_use'` | `tool_name`, `tool_use_id` |
311
+ | `tool_results` | `content_blocks` where `type='tool_result'` | `tool_use_id`, `content` |
312
+ | `todo_events` | one row per todo per `TodoWrite` snapshot (legacy + `--print`/SDK) | `subject`, `status`, `snapshot_ix` |
313
+ | `todo_state_current` | latest status per `(session, subject)` for `TodoWrite` | `status`, `written_at` |
314
+ | `subagent_spawns` | `Task` / `Agent` launch sites (Claude Code v2.1.63 renamed `Task`→`Agent`) | `subagent_type`, `description`, `prompt` |
315
+ | `task_creations` | `TaskCreate` / `mcp__tasks__task_create` (interactive task tracker, v2.1.16+) | `subject`, `description`, `active_form`, `metadata` |
316
+ | `task_updates` | `TaskUpdate` / `mcp__tasks__task_update` lifecycle events | `task_id`, `status`, `add_blocked_by`, `owner` |
317
+ | `tasks_state_current` | latest status per `(session, task_id)` for the v2.1.16+ family | `task_id`, `subject`, `status`, `last_updated_at` |
318
+ | `task_spawns` *(deprecated)* | `subagent_spawns` ∪ `task_creations` shim, removed next minor | `spawn_tool`, `subagent_type`, `description`, `prompt` |
319
+ | `skill_invocations` | every `Skill` tool call + `<command-name>/foo</command-name>` user slash | `source` (`tool` / `slash_command`), `skill_id`, `args` |
320
+ | `subagent_sessions` | rolled-up subagent runs | `parent_session_id`, `agent_hex`, `agent_type`, `description`, `started_at`, `ended_at`, `message_count`, `transcript_path` |
321
+ | `subagent_messages` | user + assistant events from subagent transcripts | `uuid`, `parent_session_id` |
322
+ | `session_classifications` | one row per classified session | `autonomy_tier`, `work_category`, `success`, `goal` |
323
+ | `session_goals` | projection over classifications | `session_id`, `goal` |
324
+ | `message_trajectory` | per-message sentiment + `is_transition` | `sentiment_delta` (`positive` / `neutral` / `negative`), `is_transition` |
325
+ | `session_conflicts` | per-session stance conflicts | `stance_a`, `stance_b`, `resolution` |
326
+ | `message_clusters` | cluster id + 2d viz coords | `cluster_id`, `x`, `y`, `is_noise` |
327
+ | `cluster_terms` | c-TF-IDF top terms per cluster | `cluster_id`, `term`, `weight`, `rank` |
328
+ | `session_communities` | Louvain community per session | `community_id`, `size` |
329
+ | `user_friction` | one row per classified short user message | `label` (7-way), `rationale`, `source` (`regex` / `llm` / `refused`), `confidence` |
330
+ | `skills_catalog` | one row per known skill / slash command (seed by `claude-sql skills sync`) | `skill_id`, `name`, `plugin`, `plugin_version`, `source_kind` (`user-skill` / `plugin-skill` / `plugin-command` / `builtin`), `description` |
331
+ | `skill_usage` | `skill_invocations` ⟕ `skills_catalog` | `source`, `skill_id`, `skill_name`, `plugin`, `is_builtin`, `description` |
332
+
333
+ ## Macros
334
+
335
+ | Macro | Signature | What it does |
336
+ |---|---|---|
337
+ | `model_used(sid)` | scalar → `VARCHAR` | Latest `model` observed in the session |
338
+ | `cost_estimate(sid)` | scalar → `DOUBLE` | USD spend (dated model IDs prefix-matched) |
339
+ | `tool_rank(last_n_days)` | table | Tool-use leaderboard over a window |
340
+ | `todo_velocity(sid)` | scalar → `DOUBLE` | Completed / distinct todos ratio |
341
+ | `subagent_fanout(sid)` | scalar → `INT` | Subagent runs for a session |
342
+ | `semantic_search(query_vec, k)` | table | HNSW top-k over embeddings |
343
+ | `autonomy_trend(window_days)` | table | Weekly autonomy-tier mix |
344
+ | `work_mix(since_days)` | table | Work-category distribution |
345
+ | `success_rate_by_work(since_days)` | table | Success / failure / partial rates per category |
346
+ | `cluster_top_terms(cid, n)` | table | Top-N terms for a cluster |
347
+ | `community_top_topics(cid, n)` | table | Dominant clusters within a community |
348
+ | `sentiment_arc(sid)` | table | Per-message sentiment timeline for one session |
349
+ | `friction_counts(since_days)` | table | Count + session breadth per friction label |
350
+ | `friction_rate(since_days)` | table | Per-session friction pressure vs. user message count |
351
+ | `friction_examples(label, n)` | table | Top-N example messages for a friction label |
352
+ | `skill_rank(last_n_days)` | table | Skill / slash leaderboard over a window (counts both `tool` and `slash_command` sources) |
353
+ | `skill_source_mix(last_n_days)` | table | Per skill `n_tool` vs. `n_slash` — how is each skill invoked? |
354
+ | `unused_skills(last_n_days)` | table | Catalog entries with zero invocations in the window (needs `skills sync`) |
355
+
356
+ ## Environment variables
357
+
358
+ Every option is configurable via `CLAUDE_SQL_*`:
359
+
360
+ | Variable | Default | Purpose |
361
+ |---|---|---|
362
+ | `CLAUDE_SQL_DEFAULT_GLOB` | `~/.claude/projects/*/*.jsonl` | Main transcript glob |
363
+ | `CLAUDE_SQL_SUBAGENT_GLOB` | `~/.claude/projects/*/*/subagents/agent-*.jsonl` | Subagent transcripts |
364
+ | `CLAUDE_SQL_TEAM_CORPUS_ROOT` | `None` | Team-corpus root; when set, derives all three globs from `<root>/<author>/projects/*` (replaces the personal corpus) |
365
+ | `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region |
366
+ | `CLAUDE_SQL_MODEL_ID` | `global.cohere.embed-v4:0` | Embedding model |
367
+ | `CLAUDE_SQL_SONNET_MODEL_ID` | `global.anthropic.claude-sonnet-4-6` | Classification model |
368
+ | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
369
+ | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
370
+ | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
371
+ | `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
372
+ | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
373
+ | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
374
+ | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
375
+ | `CLAUDE_SQL_FRICTION_MAX_CHARS` | `300` | Short-message cutoff for the friction classifier |
376
+ | `CLAUDE_SQL_HNSW_DB_PATH` | `~/.claude/hnsw.duckdb` | Persistent HNSW store (rebuilt automatically when stale) |
377
+ | `CLAUDE_SQL_DUCKDB_THREADS` | `os.cpu_count()` | DuckDB worker threads |
378
+ | `CLAUDE_SQL_DUCKDB_MEMORY_LIMIT` | `'70%'` | DuckDB memory ceiling (percentage or absolute size) |
379
+ | `CLAUDE_SQL_DUCKDB_TEMP_DIR` | `~/.claude/duckdb_tmp` | DuckDB spill directory (avoids `/tmp` tmpfs thrash) |
380
+ | `CLAUDE_SQL_SKILLS_CATALOG_PARQUET_PATH` | `~/.claude/skills_catalog.parquet` | Skills catalog parquet |
381
+ | `CLAUDE_SQL_USER_SKILLS_DIR` | `~/.claude/skills` | Root scanned for user-installed skills |
382
+ | `CLAUDE_SQL_PLUGINS_CACHE_DIR` | `~/.claude/plugins/cache` | Root scanned for plugin skills + commands |
383
+ | `CLAUDE_SQL_SEED` | `42` | UMAP / HDBSCAN / Louvain determinism |
384
+
385
+ ## Development
386
+
387
+ ```bash
388
+ mise run check # lint + fmt-check + typecheck + tests
389
+ mise run fmt:write # auto-apply ruff formatting
390
+ mise run upgrade # uv lock --upgrade && uv sync
391
+ mise run build # uv build → dist/*.whl + *.tar.gz
392
+ mise run tool:install # install claude-sql as a uv tool (global)
393
+ mise run cli -- schema # run the CLI in the project venv
394
+ mise tasks # list every mise task
395
+ ```
396
+
397
+ ### Git hooks + conventional commits
398
+
399
+ `mise run install` installs **lefthook** git hooks:
400
+
401
+ - **pre-commit** — parallel `ruff check --fix` + `ruff format` on staged
402
+ Python files (auto-staged), `ty check src/ tests/` across the whole tree
403
+ (strict mode), and `uv lock --check` when `pyproject.toml` or `uv.lock`
404
+ is staged.
405
+ - **commit-msg** — validates the message via `cz check --allow-abort`
406
+ against the conventional-commits schema.
407
+ - **pre-push** — runs the full pytest suite before the push lands.
408
+
409
+ Config lives in `lefthook.yml`. Reinstall any time that file changes:
410
+
411
+ ```bash
412
+ mise run hooks:install
413
+ mise run hooks:uninstall # if you need to opt out
414
+ ```
415
+
416
+ ### Version bumps + changelog (commitizen)
417
+
418
+ Commit messages follow
419
+ [Conventional Commits](https://www.conventionalcommits.org/). Supported
420
+ types: `feat`, `fix`, `docs`, `style`, `refactor`, `perf`, `test`,
421
+ `build`, `ci`, `chore`, `revert`. Use `mise run commit` for the
422
+ interactive wizard, or write the message yourself — either way the
423
+ `commit-msg` hook validates it.
424
+
425
+ Version bumps are driven by `cz bump`, which reads commit history and
426
+ decides MAJOR / MINOR / PATCH from the conventional-commits types:
427
+
428
+ ```bash
429
+ mise run bump:dry-run # preview next version + tag
430
+ mise run bump # bump + changelog + annotated tag (vX.Y.Z)
431
+ mise run changelog # regenerate CHANGELOG.md without bumping
432
+ ```
433
+
434
+ `[tool.commitizen]` is wired to `version_provider = "uv"`, so every bump
435
+ keeps `pyproject.toml[project.version]` and `uv.lock` in sync
436
+ atomically.
437
+
438
+ ### Quality gates
439
+
440
+ Local `mise run check` (`lint + fmt + typecheck + test`) and GitHub
441
+ Actions run the same commands against the same pinned tool versions (via
442
+ `jdx/mise-action`), so contributors who pass the hooks locally can trust
443
+ CI agrees. On top of the local gate, CI layers in:
444
+
445
+ - **Semgrep** — `p/auto` + `p/owasp-top-ten` rulesets, SARIF uploaded
446
+ to GitHub code scanning.
447
+ - **Bandit** — Python SAST as defense-in-depth alongside ruff's
448
+ `flake8-bandit` (`S`) selectors. Principled skips align 1:1 with the
449
+ ruff S-ignores in `pyproject.toml`. SARIF to code scanning.
450
+ - **CodeQL** — `security-and-quality` query pack, weekly cron.
451
+ - **OSV-Scanner** — known-CVE scan of `uv.lock`, fails on findings.
452
+ - **Betterleaks** — secrets sweep over full git history (gitleaks
453
+ successor). SARIF to code scanning.
454
+ - **OpenSSF Scorecard** — weekly, SARIF to code scanning.
455
+ - **Codecov** — coverage.xml uploaded via tokenless OIDC.
456
+ - **CycloneDX SBOM** — generated + attached on every release.
457
+
458
+ ### Local security sweep
459
+
460
+ `mise run security` runs all four SAST/SCA/secrets scanners in parallel
461
+ against the working tree:
462
+
463
+ ```bash
464
+ mise run security # bandit + semgrep + osv + leaks → .sarif/
465
+ mise run security:bandit # individual scanners also runnable
466
+ mise run security:semgrep
467
+ mise run security:osv
468
+ mise run security:leaks
469
+ ```
470
+
471
+ Each scanner emits SARIF under `.sarif/` (gitignored). The local sweep
472
+ mirrors what CI uploads to GitHub code scanning, so contributors can
473
+ reproduce findings without waiting on CI. All four use the
474
+ "report, don't gate" pattern (`--exit-zero` / `--exit-code=0`) — gating
475
+ happens through code-scanning branch protection, not the scanner exit.
476
+ See `.erpaval/solutions/best-practices/sarif-scanner-report-vs-gate.md`
477
+ for the rationale.
478
+
479
+ See `docs/adr/0015-stack-modernization.md` and
480
+ `docs/adr/0016-ci-hardening.md` for the full rationale.
481
+
482
+ ## Design notes
483
+
484
+ - **Zero-copy reads.** `read_json(..., filename=true, union_by_name=true,
485
+ sample_size=-1, ignore_errors=true)` so the corpus is queried in place.
486
+ - **Lazy content blocks.** Nested `message.content[]` stays as JSON and
487
+ flattens via `UNNEST + json_extract_string`, not eagerly shredded —
488
+ resilient to new block types (`thinking`, MCP shapes, etc.).
489
+ - **Global CRIS for Cohere.** The `global.cohere.embed-v4:0` profile
490
+ sustains the highest throughput with no throttling in testing; direct
491
+ and US CRIS both saturate at low TPM.
492
+ - **HNSW index caching.** The cosine-metric DuckDB VSS index lives in
493
+ `~/.claude/hnsw.duckdb` and is rebuilt from the embeddings parquet
494
+ automatically whenever the store is stale.
495
+ - **Structured output, GA path.** Sonnet 4.6 classification uses
496
+ Bedrock's GA `output_config.format` (not `tool_use` / `tool_choice`)
497
+ with adaptive thinking on. Pydantic v2 schemas are flattened (inline
498
+ `$ref`, inject `additionalProperties: false`, strip the numeric /
499
+ string constraints the validator rejects from Draft 2020-12).
500
+ - **Determinism.** UMAP, HDBSCAN, and Louvain all seed from
501
+ `CLAUDE_SQL_SEED=42` (default) so cluster IDs and community IDs are
502
+ stable across reruns.
503
+ - **Louvain = `networkx`.** `networkx.community.louvain_communities`,
504
+ built into `networkx >= 3.4`. The abandoned `python-louvain` package
505
+ is not used.
506
+ - **Hybrid friction pipeline.** A hand-curated regex bank catches the
507
+ unambiguous `status_ping` / `interruption` / `correction` cases at
508
+ zero Bedrock cost; the ambiguous class — especially
509
+ `unmet_expectation` (one-word questions like `screenshot?` that imply
510
+ the agent missed a proactive step) — falls through to Sonnet 4.6
511
+ structured output. Scoped to user-role messages under 300 characters
512
+ by default; longer turns are almost always genuine task instructions.
513
+
514
+ See [`docs/research_notes.md`](docs/research_notes.md) for deeper design
515
+ rationale.
516
+
517
+ ## Links
518
+
519
+ - [Cookbook (v1)](docs/cookbook.md) — sessions, messages, tools, todos,
520
+ subagents, semantic search.
521
+ - [Analytics cookbook (v2)](docs/analytics_cookbook.md) — clusters,
522
+ communities, classifications, trajectory, conflicts, friction.
523
+ - [Research notes](docs/research_notes.md) — design decisions and
524
+ tuning knobs.
525
+ - [JSONL schema reference](docs/jsonl_schema_v1.sql) — column listings
526
+ for every registered view.
527
+
528
+ ## License
529
+
530
+ Apache 2.0. See [LICENSE](LICENSE).