claude-sql 0.7.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {claude_sql-0.7.0 → claude_sql-1.0.0}/PKG-INFO +3 -2
  2. {claude_sql-0.7.0 → claude_sql-1.0.0}/pyproject.toml +4 -3
  3. claude_sql-1.0.0/src/claude_sql/classify_worker.py +254 -0
  4. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/cli.py +245 -17
  5. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/config.py +28 -13
  6. claude_sql-1.0.0/src/claude_sql/conflicts_worker.py +341 -0
  7. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/embed_worker.py +49 -1
  8. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/friction_worker.py +200 -20
  9. claude_sql-1.0.0/src/claude_sql/home.py +93 -0
  10. claude_sql-1.0.0/src/claude_sql/ingest.py +526 -0
  11. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/judge_worker.py +3 -0
  12. claude_sql-0.7.0/src/claude_sql/llm_worker.py → claude_sql-1.0.0/src/claude_sql/llm_shared.py +401 -820
  13. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/review_sheet_worker.py +2 -2
  14. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/schemas.py +187 -44
  15. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/sql_views.py +164 -8
  16. claude_sql-1.0.0/src/claude_sql/trajectory_worker.py +993 -0
  17. {claude_sql-0.7.0 → claude_sql-1.0.0}/README.md +0 -0
  18. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/__init__.py +0 -0
  19. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/binding.py +0 -0
  20. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/blind_handover.py +0 -0
  21. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/checkpointer.py +0 -0
  22. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/cluster_worker.py +0 -0
  23. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/community_worker.py +0 -0
  24. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/freeze.py +0 -0
  25. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/install_source.py +0 -0
  26. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/judges.py +0 -0
  27. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/kappa_worker.py +0 -0
  28. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/lance_store.py +0 -0
  29. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/logging_setup.py +0 -0
  30. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/output.py +0 -0
  31. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/parquet_shards.py +0 -0
  32. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/retry_queue.py +0 -0
  33. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/review_sheet_render.py +0 -0
  34. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/session_text.py +0 -0
  35. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/skills_catalog.py +0 -0
  36. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/terms_worker.py +0 -0
  37. {claude_sql-0.7.0 → claude_sql-1.0.0}/src/claude_sql/ungrounded_worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: claude-sql
3
- Version: 0.7.0
3
+ Version: 1.0.0
4
4
  Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
5
5
  Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
6
6
  Author: Laith Al-Saadoon
@@ -8,7 +8,7 @@ Author-email: Laith Al-Saadoon <lalsaado@amazon.com>
8
8
  License: Apache-2.0
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.13
11
- Classifier: Development Status :: 4 - Beta
11
+ Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Operating System :: POSIX :: Linux
14
14
  Classifier: Operating System :: MacOS
@@ -37,6 +37,7 @@ Requires-Dist: pyyaml>=6.0.3
37
37
  Requires-Dist: scikit-learn>=1.5
38
38
  Requires-Dist: scipy>=1.13
39
39
  Requires-Dist: tenacity>=9.1.4
40
+ Requires-Dist: tiktoken>=0.12.0
40
41
  Requires-Dist: umap-learn>=0.5.12
41
42
  Requires-Python: >=3.13
42
43
  Project-URL: Homepage, https://github.com/theagenticguy/claude-sql
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "claude-sql"
3
- version = "0.7.0"
3
+ version = "1.0.0"
4
4
  description = "Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts."
5
5
  readme = "README.md"
6
6
  license = { text = "Apache-2.0" }
@@ -16,7 +16,7 @@ keywords = [
16
16
  classifiers = [
17
17
  "Programming Language :: Python :: 3",
18
18
  "Programming Language :: Python :: 3.13",
19
- "Development Status :: 4 - Beta",
19
+ "Development Status :: 5 - Production/Stable",
20
20
  "Intended Audience :: Developers",
21
21
  "Operating System :: POSIX :: Linux",
22
22
  "Operating System :: MacOS",
@@ -48,6 +48,7 @@ dependencies = [
48
48
  "scikit-learn>=1.5",
49
49
  "scipy>=1.13",
50
50
  "tenacity>=9.1.4",
51
+ "tiktoken>=0.12.0",
51
52
  "umap-learn>=0.5.12",
52
53
  ]
53
54
 
@@ -304,7 +305,7 @@ version_provider = "uv"
304
305
  version_scheme = "pep440"
305
306
  tag_format = "v$version"
306
307
  update_changelog_on_bump = true
307
- major_version_zero = true
308
+ major_version_zero = false
308
309
  annotated_tag = true
309
310
  changelog_incremental = true
310
311
  changelog_merge_prerelease = true
@@ -0,0 +1,254 @@
1
+ """Session classification pipeline.
2
+
3
+ Reads complete Claude Code session transcripts and emits one row per session
4
+ into ``settings.classifications_parquet_path`` with autonomy_tier,
5
+ work_category, success, goal, and confidence fields. Pull-once / write-many
6
+ shape: anti-join against the parquet, dispatch parallel Bedrock calls under
7
+ ``settings.llm_concurrency``, write results in chunks of
8
+ ``max(batch_size * 4, 256)`` for crash-resilience.
9
+
10
+ All Bedrock plumbing — client construction, retry, structured-output
11
+ parsing, the per-pipeline cache-stat accumulator — lives in
12
+ :mod:`claude_sql.llm_shared`.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import time
19
+ from datetime import UTC, datetime
20
+ from typing import TYPE_CHECKING, Any
21
+
22
+ import anyio
23
+ import polars as pl
24
+ from loguru import logger
25
+
26
+ from claude_sql import checkpointer, retry_queue
27
+ from claude_sql.llm_shared import (
28
+ CLASSIFY_SYSTEM_PROMPT,
29
+ _build_bedrock_client,
30
+ _count_pending_sessions,
31
+ _estimate_cost,
32
+ classify_one,
33
+ pipeline_cache_stats,
34
+ )
35
+ from claude_sql.parquet_shards import read_all, write_part
36
+ from claude_sql.schemas import SESSION_CLASSIFICATION_SCHEMA
37
+ from claude_sql.session_text import iter_session_texts, session_bounds
38
+
39
+ if TYPE_CHECKING:
40
+ import duckdb
41
+
42
+ from claude_sql.config import Settings
43
+
44
+
45
async def _classify_sessions_async(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None,
    limit: int | None,
    thinking_mode: str,
) -> int:
    """Async implementation behind :func:`classify_sessions`.

    Workflow:

    1. Collect the session ids already present in
       ``settings.classifications_parquet_path``.
    2. Pass the ``session_bounds`` of every candidate session through
       ``checkpointer.filter_unchanged``; the ids it returns are the ones
       kept for processing and its second value is logged as the number
       skipped via checkpoint.
    3. Drain the ``classify`` retry queue. Drained ids are re-processed even
       when they are already in the parquet or would be checkpoint-skipped,
       provided ``iter_session_texts`` yields them.
    4. Classify the pending sessions in chunks. Per chunk, failures are
       enqueued for retry; successes are written with ``write_part``, then
       checkpointed and cleared from the retry queue.

    Args:
        con: DuckDB connection handed to the session helpers.
        settings: Source of paths, model id, concurrency and batch size.
        since_days: Forwarded to ``session_bounds`` / ``iter_session_texts``.
        limit: Forwarded to ``session_bounds`` / ``iter_session_texts``.
        thinking_mode: Forwarded to every ``classify_one`` call.

    Returns:
        The number of classification rows written across all chunks.
    """
    already: set[str] = set()
    done_df = read_all(settings.classifications_parquet_path)
    if done_df is not None and done_df.height > 0:
        already = set(done_df["session_id"].to_list())

    # Checkpoint skip: compare current (last_ts, mtime) against the last run.
    bounds = session_bounds(con, since_days=since_days, limit=limit)
    unchanged_pending, skipped = checkpointer.filter_unchanged(
        ((sid, lt, mt) for sid, (lt, mt) in bounds.items()),
        pipeline="classify",
        checkpoint_db_path=settings.checkpoint_db_path,
    )
    keep = set(unchanged_pending)

    # Retry queue: pull pending retries first so they're re-enqueued into
    # `keep` even when the checkpoint would otherwise skip them.
    retry_ids = set(retry_queue.drain(settings.checkpoint_db_path, pipeline="classify"))
    if retry_ids:
        logger.info("classify: draining {} retry-queue entries", len(retry_ids))
        keep |= retry_ids

    # A session is pending when it is kept AND either new or explicitly retried.
    pending: list[tuple[str, str]] = [
        (sid, text)
        for sid, text in iter_session_texts(
            con, settings=settings, since_days=since_days, limit=limit
        )
        if sid in keep and (sid in retry_ids or sid not in already)
    ]

    if not pending:
        logger.info("classify: no pending sessions (skipped={} via checkpoint)", skipped)
        return 0
    if skipped:
        logger.info("classify: skipped {} sessions via checkpoint", skipped)

    client = _build_bedrock_client(settings)
    # Shared limiter handed to every classify_one call
    # (presumably what enforces settings.llm_concurrency -- confirm in llm_shared).
    sem = anyio.CapacityLimiter(settings.llm_concurrency)
    chunk_size = max(settings.batch_size * 4, 256)
    total_chunks = (len(pending) + chunk_size - 1) // chunk_size
    logger.info(
        "classify: {} pending, model={}, thinking={}, concurrency={}, chunks of {}",
        len(pending),
        settings.sonnet_model_id,
        thinking_mode,
        settings.llm_concurrency,
        chunk_size,
    )

    written = 0
    for chunk_no, start in enumerate(range(0, len(pending), chunk_size), start=1):
        chunk = pending[start : start + chunk_size]
        t0 = time.monotonic()
        coros = [
            classify_one(
                client,
                settings.sonnet_model_id,
                SESSION_CLASSIFICATION_SCHEMA,
                text,
                max_tokens=settings.classify_max_tokens,
                thinking_mode=thinking_mode,
                sem=sem,
                system=CLASSIFY_SYSTEM_PROMPT,
                pipeline="classify",
            )
            for _, text in chunk
        ]
        # return_exceptions=True: one failed session must not sink the chunk.
        results = await asyncio.gather(*coros, return_exceptions=True)
        elapsed = time.monotonic() - t0

        now = datetime.now(UTC)
        ok_rows: list[dict[str, Any]] = []
        errors = 0
        for (sid, _), res in zip(chunk, results, strict=True):
            if isinstance(res, BaseException):
                errors += 1
                logger.warning("classify: {} failed (queued for retry): {}", sid, res)
                retry_queue.enqueue(
                    settings.checkpoint_db_path,
                    pipeline="classify",
                    unit_id=sid,
                    error=str(res),
                )
                continue
            res_dict: dict[str, Any] = res
            # A null confidence would make float() raise and abort the chunk
            # before any successful row is written; fall back to the same 0.0
            # that is used when the key is missing altogether.
            confidence = res_dict.get("confidence")
            ok_rows.append(
                {
                    "session_id": sid,
                    "autonomy_tier": res_dict.get("autonomy_tier"),
                    "work_category": res_dict.get("work_category"),
                    "success": res_dict.get("success"),
                    "goal": res_dict.get("goal"),
                    "confidence": float(confidence) if confidence is not None else 0.0,
                    "classified_at": now,
                }
            )

        if ok_rows:
            df = pl.DataFrame(
                ok_rows,
                schema={
                    "session_id": pl.Utf8,
                    "autonomy_tier": pl.Utf8,
                    "work_category": pl.Utf8,
                    "success": pl.Utf8,
                    "goal": pl.Utf8,
                    "confidence": pl.Float32,
                    "classified_at": pl.Datetime("us", "UTC"),
                },
            )
            write_part(settings.classifications_parquet_path, df)

            # Checkpoint the sessions we just classified -- at their CURRENT
            # bounds, so a later re-run with no new messages is a no-op. Also
            # clear those sessions from the retry queue. Done only after the
            # parquet write so a failed write is not recorded as completed.
            ok_sids = [row["session_id"] for row in ok_rows]
            checkpointer.mark_completed(
                settings.checkpoint_db_path,
                pipeline="classify",
                rows=[(sid, *bounds.get(sid, (None, None))) for sid in ok_sids],
            )
            retry_queue.mark_done(
                settings.checkpoint_db_path,
                pipeline="classify",
                unit_ids=ok_sids,
            )

        written += len(ok_rows)
        logger.info(
            "classify chunk {}/{}: {} ok, {} errors, {:.1f}s ({:.1f} sess/s)",
            chunk_no,
            total_chunks,
            len(ok_rows),
            errors,
            elapsed,
            len(ok_rows) / elapsed if elapsed > 0 else 0,
        )

    logger.info("classify: wrote {} total rows", written)
    return written
193
+
194
+
195
def classify_sessions(
    con: duckdb.DuckDBPyConnection,
    settings: Settings,
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    no_thinking: bool = False,
) -> int | dict[str, Any]:
    """Synchronous entry point for the session classification pipeline.

    A normal run drives :func:`_classify_sessions_async` under
    ``pipeline_cache_stats("classify")`` and returns the number of rows it
    wrote.

    With ``dry_run=True`` nothing is classified. A plan dict is returned
    instead so the CLI can emit it as structured JSON; its keys are
    ``pipeline``, ``candidates``, ``llm_calls``, ``avg_input_tokens``,
    ``avg_output_tokens``, ``estimated_cost_usd``, ``model``, ``thinking``,
    ``since_days``, ``limit`` and ``dry_run``.

    ``no_thinking=True`` forces the thinking mode to ``"disabled"``;
    otherwise ``settings.classify_thinking`` is used.
    """
    thinking_mode = "disabled" if no_thinking else settings.classify_thinking

    if not dry_run:
        with pipeline_cache_stats("classify"):
            return asyncio.run(
                _classify_sessions_async(
                    con,
                    settings,
                    since_days=since_days,
                    limit=limit,
                    thinking_mode=thinking_mode,
                )
            )

    # Back-of-envelope: avg 8K input tokens, 300 output per session.
    est_input_tokens = 8000
    est_output_tokens = 300

    done_df = read_all(settings.classifications_parquet_path)
    has_done_rows = done_df is not None and done_df.height > 0
    already: set[str] = set(done_df["session_id"].to_list()) if has_done_rows else set()

    pending_count = _count_pending_sessions(
        con, already=already, since_days=since_days, limit=limit
    )
    cost = _estimate_cost(
        pending_count, est_input_tokens, est_output_tokens, settings.sonnet_pricing
    )
    logger.info(
        "classify --dry-run: {} sessions pending. Estimated cost ~${:.2f} "
        "(thinking={}, model={})",
        pending_count,
        cost,
        thinking_mode,
        settings.sonnet_model_id,
    )

    plan: dict[str, Any] = {
        "pipeline": "classify",
        "candidates": pending_count,
        "llm_calls": pending_count,
        "avg_input_tokens": est_input_tokens,
        "avg_output_tokens": est_output_tokens,
        "estimated_cost_usd": round(cost, 4),
        "model": settings.sonnet_model_id,
        "thinking": thinking_mode,
        "since_days": since_days,
        "limit": limit,
        "dry_run": True,
    }
    return plan