claude-sql 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {claude_sql-1.0.0 → claude_sql-1.1.0}/PKG-INFO +30 -4
  2. {claude_sql-1.0.0 → claude_sql-1.1.0}/README.md +28 -2
  3. claude_sql-1.1.0/pyproject.toml +66 -0
  4. claude_sql-1.1.0/src/claude_sql/analytics/__init__.py +1 -0
  5. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/classify_worker.py +8 -8
  6. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/cluster_worker.py +19 -6
  7. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/community_worker.py +48 -29
  8. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/conflicts_worker.py +8 -8
  9. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/embed_worker.py +12 -30
  10. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/friction_worker.py +7 -7
  11. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/ingest.py +2 -2
  12. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/skills_catalog.py +1 -1
  13. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/terms_worker.py +1 -1
  14. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/trajectory_worker.py +19 -7
  15. claude_sql-1.1.0/src/claude_sql/app/__init__.py +1 -0
  16. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/app}/cli.py +240 -34
  17. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/app}/install_source.py +2 -1
  18. claude_sql-1.1.0/src/claude_sql/core/__init__.py +1 -0
  19. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/checkpointer.py +5 -2
  20. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/config.py +23 -32
  21. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/lance_store.py +10 -3
  22. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/llm_shared.py +14 -12
  23. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/parquet_shards.py +97 -2
  24. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/retry_queue.py +1 -1
  25. claude_sql-1.1.0/src/claude_sql/core/s3_source.py +134 -0
  26. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/schemas.py +9 -9
  27. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/session_text.py +1 -1
  28. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/sql_views.py +64 -97
  29. claude_sql-1.1.0/src/claude_sql/evals/__init__.py +1 -0
  30. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/freeze.py +1 -1
  31. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/judge_worker.py +3 -3
  32. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/kappa_worker.py +17 -3
  33. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/ungrounded_worker.py +4 -3
  34. claude_sql-1.1.0/src/claude_sql/provenance/__init__.py +1 -0
  35. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/provenance}/binding.py +4 -1
  36. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/provenance}/review_sheet_worker.py +7 -5
  37. claude_sql-1.0.0/pyproject.toml +0 -318
  38. claude_sql-1.0.0/src/claude_sql/__init__.py +0 -5
  39. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/home.py +0 -0
  40. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/logging_setup.py +0 -0
  41. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/output.py +0 -0
  42. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/blind_handover.py +0 -0
  43. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/judges.py +0 -0
  44. {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/provenance}/review_sheet_render.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: claude-sql
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
5
5
  Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
6
6
  Author: Laith Al-Saadoon
@@ -31,8 +31,8 @@ Requires-Dist: numpy>=2.4.4
31
31
  Requires-Dist: packaging>=26.2
32
32
  Requires-Dist: polars>=1.40.0
33
33
  Requires-Dist: pyarrow>=23.0.1
34
- Requires-Dist: pydantic>=2.13.2
35
34
  Requires-Dist: pydantic-settings>=2.13.1
35
+ Requires-Dist: pydantic>=2.13.2
36
36
  Requires-Dist: pyyaml>=6.0.3
37
37
  Requires-Dist: scikit-learn>=1.5
38
38
  Requires-Dist: scipy>=1.13
@@ -200,6 +200,30 @@ The IAM policy needs `bedrock:InvokeModel` on:
200
200
  - `inference-profile/global.cohere.embed-v4:0`
201
201
  - `inference-profile/global.anthropic.claude-sonnet-4-6`
202
202
 
203
+ ### Reading transcripts from S3
204
+
205
+ claude-sql reads the local JSONL corpus by default, but any transcript glob
206
+ can be an `s3://` URI instead — point it at sessions mirrored to S3 by the
207
+ [`claude-agent-sdk` `S3SessionStore`](https://github.com/anthropics/claude-agent-sdk-python/tree/main/examples/session_stores)
208
+ (layout `s3://{bucket}/{prefix}{project}/{session}/part-*.jsonl`). DuckDB reads
209
+ the parts zero-copy over HTTP range requests — no download step — and every
210
+ view and macro works unchanged.
211
+
212
+ ```bash
213
+ # Personal corpus on S3 instead of ~/.claude/projects.
214
+ export CLAUDE_SQL_DEFAULT_GLOB='s3://my-bucket/transcripts/*/*/part-*.jsonl'
215
+ export AWS_PROFILE=your-profile # credentials via the standard AWS chain
216
+ claude-sql schema
217
+ claude-sql query "SELECT session_id, started_at FROM sessions ORDER BY started_at DESC LIMIT 10"
218
+ ```
219
+
220
+ claude-sql loads DuckDB's `httpfs` extension and creates a `credential_chain`
221
+ S3 secret automatically when it sees an `s3://` glob — no keys are embedded
222
+ anywhere. For a non-AWS store (MinIO) or a local mock, set
223
+ `CLAUDE_SQL_S3_ENDPOINT`, `CLAUDE_SQL_S3_URL_STYLE=path`, and
224
+ `CLAUDE_SQL_S3_USE_SSL=false`. The IAM policy needs `s3:GetObject` on the
225
+ objects under the prefix and `s3:ListBucket` on the bucket itself (DuckDB lists keys to expand the glob).
226
+
203
227
  ## Quick tour
204
228
 
205
229
  ```bash
@@ -367,13 +391,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
367
391
  | `CLAUDE_SQL_DEFAULT_GLOB` | `~/.claude/projects/*/*.jsonl` | Main transcript glob |
368
392
  | `CLAUDE_SQL_SUBAGENT_GLOB` | `~/.claude/projects/*/*/subagents/agent-*.jsonl` | Subagent transcripts |
369
393
  | `CLAUDE_SQL_TEAM_CORPUS_ROOT` | `None` | Team-corpus root; when set, derives all three globs from `<root>/<author>/projects/*` (replaces the personal corpus) |
370
- | `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region |
394
+ | `CLAUDE_SQL_S3_ENDPOINT` | `None` | Custom S3 endpoint `host[:port]` for non-AWS stores (MinIO) or a local mock; unset uses default AWS S3. Only consulted when a glob is an `s3://` URI |
395
+ | `CLAUDE_SQL_S3_URL_STYLE` | `vhost` | S3 addressing style (`vhost` or `path`); set `path` for MinIO / moto |
396
+ | `CLAUDE_SQL_S3_USE_SSL` | `true` | Toggle TLS for the S3 endpoint; set `false` for a local mock |
397
+ | `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region **and** the S3 secret region |
371
398
  | `CLAUDE_SQL_MODEL_ID` | `global.cohere.embed-v4:0` | Embedding model |
372
399
  | `CLAUDE_SQL_SONNET_MODEL_ID` | `global.anthropic.claude-sonnet-4-6` | Classification model |
373
400
  | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
374
401
  | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
375
402
  | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
376
- | `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
377
403
  | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
378
404
  | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
379
405
  | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
@@ -152,6 +152,30 @@ The IAM policy needs `bedrock:InvokeModel` on:
152
152
  - `inference-profile/global.cohere.embed-v4:0`
153
153
  - `inference-profile/global.anthropic.claude-sonnet-4-6`
154
154
 
155
+ ### Reading transcripts from S3
156
+
157
+ claude-sql reads the local JSONL corpus by default, but any transcript glob
158
+ can be an `s3://` URI instead — point it at sessions mirrored to S3 by the
159
+ [`claude-agent-sdk` `S3SessionStore`](https://github.com/anthropics/claude-agent-sdk-python/tree/main/examples/session_stores)
160
+ (layout `s3://{bucket}/{prefix}{project}/{session}/part-*.jsonl`). DuckDB reads
161
+ the parts zero-copy over HTTP range requests — no download step — and every
162
+ view and macro works unchanged.
163
+
164
+ ```bash
165
+ # Personal corpus on S3 instead of ~/.claude/projects.
166
+ export CLAUDE_SQL_DEFAULT_GLOB='s3://my-bucket/transcripts/*/*/part-*.jsonl'
167
+ export AWS_PROFILE=your-profile # credentials via the standard AWS chain
168
+ claude-sql schema
169
+ claude-sql query "SELECT session_id, started_at FROM sessions ORDER BY started_at DESC LIMIT 10"
170
+ ```
171
+
172
+ claude-sql loads DuckDB's `httpfs` extension and creates a `credential_chain`
173
+ S3 secret automatically when it sees an `s3://` glob — no keys are embedded
174
+ anywhere. For a non-AWS store (MinIO) or a local mock, set
175
+ `CLAUDE_SQL_S3_ENDPOINT`, `CLAUDE_SQL_S3_URL_STYLE=path`, and
176
+ `CLAUDE_SQL_S3_USE_SSL=false`. The IAM policy needs `s3:GetObject` on the
177
+ objects under the prefix and `s3:ListBucket` on the bucket itself (DuckDB lists keys to expand the glob).
178
+
155
179
  ## Quick tour
156
180
 
157
181
  ```bash
@@ -319,13 +343,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
319
343
  | `CLAUDE_SQL_DEFAULT_GLOB` | `~/.claude/projects/*/*.jsonl` | Main transcript glob |
320
344
  | `CLAUDE_SQL_SUBAGENT_GLOB` | `~/.claude/projects/*/*/subagents/agent-*.jsonl` | Subagent transcripts |
321
345
  | `CLAUDE_SQL_TEAM_CORPUS_ROOT` | `None` | Team-corpus root; when set, derives all three globs from `<root>/<author>/projects/*` (replaces the personal corpus) |
322
- | `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region |
346
+ | `CLAUDE_SQL_S3_ENDPOINT` | `None` | Custom S3 endpoint `host[:port]` for non-AWS stores (MinIO) or a local mock; unset uses default AWS S3. Only consulted when a glob is an `s3://` URI |
347
+ | `CLAUDE_SQL_S3_URL_STYLE` | `vhost` | S3 addressing style (`vhost` or `path`); set `path` for MinIO / moto |
348
+ | `CLAUDE_SQL_S3_USE_SSL` | `true` | Toggle TLS for the S3 endpoint; set `false` for a local mock |
349
+ | `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region **and** the S3 secret region |
323
350
  | `CLAUDE_SQL_MODEL_ID` | `global.cohere.embed-v4:0` | Embedding model |
324
351
  | `CLAUDE_SQL_SONNET_MODEL_ID` | `global.anthropic.claude-sonnet-4-6` | Classification model |
325
352
  | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
326
353
  | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
327
354
  | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
328
- | `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
329
355
  | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
330
356
  | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
331
357
  | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
@@ -0,0 +1,66 @@
1
+ # GENERATED by mise-tasks/build-dist — do not edit. The source of truth is
2
+ # packages/*/pyproject.toml. This file bundles all five members into the one
3
+ # publishable ``claude-sql`` wheel. See the task docstring for why.
4
+ [project]
5
+ name = "claude-sql"
6
+ version = "1.1.0"
7
+ description = 'Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.'
8
+ readme = "README.md"
9
+ license = { text = "Apache-2.0" }
10
+ authors = [{ name = "Laith Al-Saadoon", email = "lalsaado@amazon.com" }]
11
+ requires-python = ">=3.13"
12
+ keywords = ["claude", "claude-code", "anthropic", "duckdb", "sql", "semantic-search", "embeddings", "bedrock", "transcripts", "analytics", "observability"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.13",
16
+ "Development Status :: 5 - Production/Stable",
17
+ "Intended Audience :: Developers",
18
+ "Operating System :: POSIX :: Linux",
19
+ "Operating System :: MacOS",
20
+ "Topic :: Software Development",
21
+ "Topic :: Database",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "Topic :: Utilities",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = [
27
+ "anthropic>=0.40",
28
+ "anyio>=4.13.0",
29
+ "boto3>=1.42.91",
30
+ "cyclopts>=4.10.2",
31
+ "duckdb>=1.5.2,<2",
32
+ "hdbscan>=0.8.40",
33
+ "igraph>=1.0.0,<2.0",
34
+ "lancedb>=0.30,<0.31",
35
+ "leidenalg>=0.11.0,<0.12",
36
+ "loguru>=0.7.3",
37
+ "numpy>=2.4.4",
38
+ "packaging>=26.2",
39
+ "polars>=1.40.0",
40
+ "pyarrow>=23.0.1",
41
+ "pydantic-settings>=2.13.1",
42
+ "pydantic>=2.13.2",
43
+ "pyyaml>=6.0.3",
44
+ "scikit-learn>=1.5",
45
+ "scipy>=1.13",
46
+ "tenacity>=9.1.4",
47
+ "tiktoken>=0.12.0",
48
+ "umap-learn>=0.5.12",
49
+ ]
50
+
51
+ [project.scripts]
52
+ claude-sql = "claude_sql.app.cli:main"
53
+
54
+ [project.urls]
55
+ Homepage = "https://github.com/theagenticguy/claude-sql"
56
+ Repository = "https://github.com/theagenticguy/claude-sql"
57
+ Issues = "https://github.com/theagenticguy/claude-sql/issues"
58
+ Changelog = "https://github.com/theagenticguy/claude-sql/blob/main/CHANGELOG.md"
59
+
60
+ [build-system]
61
+ requires = ["uv_build>=0.11.14,<0.12"]
62
+ build-backend = "uv_build"
63
+
64
+ [tool.uv.build-backend]
65
+ module-name = "claude_sql"
66
+ namespace = true
@@ -0,0 +1 @@
1
+ """claude-sql analytics: embed, classify, trajectory, conflicts, friction, cluster, terms, community, ingest."""
@@ -23,8 +23,8 @@ import anyio
23
23
  import polars as pl
24
24
  from loguru import logger
25
25
 
26
- from claude_sql import checkpointer, retry_queue
27
- from claude_sql.llm_shared import (
26
+ from claude_sql.core import checkpointer, retry_queue
27
+ from claude_sql.core.llm_shared import (
28
28
  CLASSIFY_SYSTEM_PROMPT,
29
29
  _build_bedrock_client,
30
30
  _count_pending_sessions,
@@ -32,14 +32,14 @@ from claude_sql.llm_shared import (
32
32
  classify_one,
33
33
  pipeline_cache_stats,
34
34
  )
35
- from claude_sql.parquet_shards import read_all, write_part
36
- from claude_sql.schemas import SESSION_CLASSIFICATION_SCHEMA
37
- from claude_sql.session_text import iter_session_texts, session_bounds
35
+ from claude_sql.core.parquet_shards import read_all, write_part
36
+ from claude_sql.core.schemas import SESSION_CLASSIFICATION_SCHEMA
37
+ from claude_sql.core.session_text import iter_session_texts, session_bounds
38
38
 
39
39
  if TYPE_CHECKING:
40
40
  import duckdb
41
41
 
42
- from claude_sql.config import Settings
42
+ from claude_sql.core.config import Settings
43
43
 
44
44
 
45
45
  async def _classify_sessions_async(
@@ -52,7 +52,7 @@ async def _classify_sessions_async(
52
52
  ) -> int:
53
53
  """Async implementation behind :func:`classify_sessions`."""
54
54
  already: set[str] = set()
55
- done_df = read_all(settings.classifications_parquet_path)
55
+ done_df = read_all(settings.classifications_parquet_path, columns=["session_id"])
56
56
  if done_df is not None and done_df.height > 0:
57
57
  already = set(done_df["session_id"].to_list())
58
58
 
@@ -212,7 +212,7 @@ def classify_sessions(
212
212
 
213
213
  if dry_run:
214
214
  already: set[str] = set()
215
- done_df = read_all(settings.classifications_parquet_path)
215
+ done_df = read_all(settings.classifications_parquet_path, columns=["session_id"])
216
216
  if done_df is not None and done_df.height > 0:
217
217
  already = set(done_df["session_id"].to_list())
218
218
  pending_count = _count_pending_sessions(
@@ -22,8 +22,7 @@ import numpy as np
22
22
  import polars as pl
23
23
  from loguru import logger
24
24
 
25
- from claude_sql import lance_store
26
- from claude_sql.config import Settings
25
+ from claude_sql.core.config import Settings
27
26
 
28
27
 
29
28
  def _load_embeddings(path: Path) -> tuple[list[str], np.ndarray]:
@@ -33,6 +32,10 @@ def _load_embeddings(path: Path) -> tuple[list[str], np.ndarray]:
33
32
  through the DuckDB ``message_embeddings`` view) so this worker can run
34
33
  independently of view registration on the calling connection.
35
34
  """
35
+ # Deferred so importing this module via the CLI for a non-cluster command
36
+ # doesn't pull in the ~2.6s lancedb import subtree.
37
+ from claude_sql.core import lance_store
38
+
36
39
  db = lance_store.connect_db(path)
37
40
  if not lance_store._has_table(db, lance_store.TABLE_NAME):
38
41
  return [], np.zeros((0, 0), dtype=np.float32)
@@ -65,6 +68,10 @@ def run_clustering(settings: Settings, *, force: bool = False) -> dict[str, int]
65
68
  ``{"total": N, "clusters": K, "noise": M}`` where K excludes the
66
69
  noise cluster (label -1).
67
70
  """
71
+ # Deferred (see _load_embeddings) — keeps the lancedb import off the CLI's
72
+ # module-load path for non-cluster commands.
73
+ from claude_sql.core import lance_store
74
+
68
75
  out_path = settings.clusters_parquet_path
69
76
  in_path = settings.lance_uri
70
77
 
@@ -186,13 +193,19 @@ def run_clustering(settings: Settings, *, force: bool = False) -> dict[str, int]
186
193
  noise / len(labels) if len(labels) else 0,
187
194
  )
188
195
 
196
+ # Hand polars the numpy arrays directly — it ingests contiguous arrays
197
+ # near-zero-copy. Round-tripping through ``.tolist()`` materialized N
198
+ # boxed Python ints/floats/bools per column just to have polars re-parse
199
+ # them back into the typed columns the schema already pins (mirrors the
200
+ # read-side boxing fix in #68, now on the write side). ``X2`` columns are
201
+ # sliced views, so copy to contiguous float32 before handing them over.
189
202
  df = pl.DataFrame(
190
203
  {
191
204
  "uuid": uuids,
192
- "cluster_id": labels.astype(np.int32).tolist(),
193
- "x": X2[:, 0].astype(np.float32).tolist(),
194
- "y": X2[:, 1].astype(np.float32).tolist(),
195
- "is_noise": (labels < 0).tolist(),
205
+ "cluster_id": labels.astype(np.int32),
206
+ "x": np.ascontiguousarray(X2[:, 0], dtype=np.float32),
207
+ "y": np.ascontiguousarray(X2[:, 1], dtype=np.float32),
208
+ "is_noise": labels < 0,
196
209
  },
197
210
  schema={
198
211
  "uuid": pl.Utf8,
@@ -63,7 +63,7 @@ import numpy as np
63
63
  import polars as pl
64
64
  from loguru import logger
65
65
 
66
- from claude_sql.config import Settings
66
+ from claude_sql.core.config import Settings
67
67
 
68
68
  if TYPE_CHECKING:
69
69
  import duckdb
@@ -85,10 +85,12 @@ def _load_session_centroids(
85
85
  """Return ``(session_ids, centroids)`` where centroids is ``(N_sessions, dim)`` float32.
86
86
 
87
87
  Joins the ``message_embeddings`` view (LanceDB-backed via ``register_vss``)
88
- to the v1 ``messages`` view on uuid, then aggregates inside DuckDB
89
- (unnest with position ``avg`` per (session, dim_index) ordered
90
- ``list``). The L2-normalize step stays in numpy where
91
- ``np.linalg.norm`` is faster on a contiguous (N, dim) matrix.
88
+ to the v1 ``messages`` view on uuid, pulls one ``(session_id, embedding)``
89
+ row per message ordered by session, then computes per-session means in
90
+ numpy with a single ``np.add.reduceat`` segmented sum (sessions are
91
+ contiguous after the ``ORDER BY``) followed by an L2-normalize. This keeps
92
+ the intermediate at ``N_messages`` rows rather than the ``N_messages ×
93
+ dim`` explosion the prior ``unnest``-per-dimension aggregation produced.
92
94
 
93
95
  ``embeddings_parquet_path`` is accepted for back-compat with callers that
94
96
  still pass it but is no longer consulted — the connection's
@@ -96,29 +98,24 @@ def _load_session_centroids(
96
98
  """
97
99
  del embeddings_parquet_path # legacy kwarg — view is the source of truth now
98
100
  logger.info("Loading message embeddings and joining to sessions...")
101
+ # Pull one row per message (session_id, embedding) ordered by session, then
102
+ # compute per-session means in numpy via a single segmented reduction.
103
+ #
104
+ # The prior implementation unnested every embedding into ``dim`` rows
105
+ # (``generate_subscripts`` + ``unnest``) and grouped on (session, pos) —
106
+ # that explodes the working set to N_messages × dim rows before the
107
+ # average. Carrying the FLOAT[dim] vector through the join and reducing it
108
+ # in numpy keeps the intermediate at N_messages rows and is 1.4–1.8×
109
+ # faster on a 24k–96k-message corpus (measured), with the win widening as
110
+ # the corpus grows. ``ORDER BY session_id`` makes the sessions contiguous
111
+ # so ``np.add.reduceat`` can segment-sum without a Python per-session loop.
99
112
  sql = """
100
- WITH joined AS (
101
- SELECT CAST(m.session_id AS VARCHAR) AS session_id,
102
- e.embedding::FLOAT[] AS emb
103
- FROM message_embeddings e
104
- JOIN messages m
105
- ON CAST(m.uuid AS VARCHAR) = e.uuid
106
- ),
107
- unrolled AS (
108
- SELECT session_id,
109
- generate_subscripts(emb, 1) AS pos,
110
- unnest(emb) AS v
111
- FROM joined
112
- ),
113
- agg AS (
114
- SELECT session_id, pos, avg(v) AS m
115
- FROM unrolled
116
- GROUP BY 1, 2
117
- )
118
- SELECT session_id, list(m ORDER BY pos) AS centroid
119
- FROM agg
120
- GROUP BY 1
121
- ORDER BY 1
113
+ SELECT CAST(m.session_id AS VARCHAR) AS session_id,
114
+ e.embedding AS emb
115
+ FROM message_embeddings e
116
+ JOIN messages m
117
+ ON CAST(m.uuid AS VARCHAR) = e.uuid
118
+ ORDER BY session_id
122
119
  """
123
120
  try:
124
121
  df = con.execute(sql).pl()
@@ -136,10 +133,32 @@ def _load_session_centroids(
136
133
  "Lance embeddings exist and the messages view is registered."
137
134
  )
138
135
 
139
- sids = df["session_id"].to_list()
140
- centroids = np.stack([np.asarray(c, dtype=np.float32) for c in df["centroid"].to_list()])
136
+ # ``emb`` is a DuckDB ``FLOAT[dim]`` column; polars surfaces it as a
137
+ # fixed-size ``Array(Float32, dim)`` dtype (occasionally a variable
138
+ # ``List`` if the cast was lost upstream). ``Series.to_numpy()`` extracts
139
+ # the buffer directly into a contiguous ``(N_messages, dim)`` matrix; for
140
+ # the fixed-``Array`` case it's a near-zero-copy view. The prior
141
+ # ``np.asarray(series.to_list(), ...)`` boxed every one of the
142
+ # N_messages × dim float32 values into a Python ``float`` object first —
143
+ # measured at 6–10× slower and ~43× higher peak RSS on a 6k–96k-message
144
+ # corpus (e.g. 7.4 s / 3.5 GB → 1.2 s / 83 MB at 96k). This is the
145
+ # read-side analog of the SQL-side ``unnest`` explosion removed in #65.
146
+ emb_arr = df["emb"].to_numpy()
147
+ if emb_arr.ndim == 1:
148
+ # Variable ``List`` dtype (or object array of rows) — stack to 2-D.
149
+ emb_arr = np.stack(list(emb_arr))
150
+ emb_np = np.ascontiguousarray(emb_arr, dtype=np.float32)
151
+ sessions = df["session_id"].to_numpy()
152
+ # ``return_index`` gives each group's first row offset on the sorted array;
153
+ # ``np.unique`` returns the labels already sorted, matching the prior
154
+ # ``ORDER BY 1`` contract. ``reduceat`` sums each [start_i, start_{i+1})
155
+ # segment in one pass.
156
+ sids_arr, starts, counts = np.unique(sessions, return_index=True, return_counts=True)
157
+ summed = np.add.reduceat(emb_np, starts, axis=0)
158
+ centroids = (summed / counts[:, None]).astype(np.float32)
141
159
  norms = np.linalg.norm(centroids, axis=1, keepdims=True)
142
160
  centroids = centroids / np.where(norms == 0, 1.0, norms)
161
+ sids = sids_arr.tolist()
143
162
  logger.info("Computed {} session centroids (dim={})", len(sids), centroids.shape[1])
144
163
  return sids, centroids
145
164
 
@@ -41,8 +41,8 @@ import pyarrow as pa
41
41
  import pyarrow.parquet as pq
42
42
  from loguru import logger
43
43
 
44
- from claude_sql import checkpointer, retry_queue
45
- from claude_sql.llm_shared import (
44
+ from claude_sql.core import checkpointer, retry_queue
45
+ from claude_sql.core.llm_shared import (
46
46
  CONFLICTS_SYSTEM_PROMPT,
47
47
  _build_bedrock_client,
48
48
  _count_pending_sessions,
@@ -50,16 +50,16 @@ from claude_sql.llm_shared import (
50
50
  classify_one,
51
51
  pipeline_cache_stats,
52
52
  )
53
- from claude_sql.parquet_shards import iter_part_files, read_all, write_part
54
- from claude_sql.schemas import SESSION_CONFLICTS_SCHEMA
55
- from claude_sql.session_text import iter_session_texts, session_bounds
53
+ from claude_sql.core.parquet_shards import iter_part_files, read_all, write_part
54
+ from claude_sql.core.schemas import SESSION_CONFLICTS_SCHEMA
55
+ from claude_sql.core.session_text import iter_session_texts, session_bounds
56
56
 
57
57
  if TYPE_CHECKING:
58
58
  from pathlib import Path
59
59
 
60
60
  import duckdb
61
61
 
62
- from claude_sql.config import Settings
62
+ from claude_sql.core.config import Settings
63
63
 
64
64
 
65
65
  # v1.0 parquet schema — kept as a module constant so the worker, the test
@@ -143,7 +143,7 @@ async def _conflicts_async(
143
143
  _purge_legacy_shards(settings.conflicts_parquet_path)
144
144
 
145
145
  already: set[str] = set()
146
- done_df = read_all(settings.conflicts_parquet_path)
146
+ done_df = read_all(settings.conflicts_parquet_path, columns=["session_id"])
147
147
  if done_df is not None and done_df.height > 0:
148
148
  already = set(done_df["session_id"].to_list())
149
149
 
@@ -304,7 +304,7 @@ def detect_conflicts(
304
304
  thinking_mode = "disabled" if no_thinking else settings.classify_thinking
305
305
  if dry_run:
306
306
  already: set[str] = set()
307
- done_df = read_all(settings.conflicts_parquet_path)
307
+ done_df = read_all(settings.conflicts_parquet_path, columns=["session_id"])
308
308
  if done_df is not None and done_df.height > 0:
309
309
  already = set(done_df["session_id"].to_list())
310
310
  pending_count = _count_pending_sessions(
@@ -22,9 +22,7 @@ from datetime import UTC, datetime
22
22
  from pathlib import Path
23
23
  from typing import TYPE_CHECKING, Any
24
24
 
25
- import boto3
26
25
  import polars as pl
27
- from botocore.config import Config as BotoConfig
28
26
  from botocore.exceptions import (
29
27
  ClientError,
30
28
  ConnectionError as BotoConnectionError,
@@ -40,9 +38,9 @@ from tenacity import (
40
38
  wait_exponential,
41
39
  )
42
40
 
43
- from claude_sql import lance_store
44
- from claude_sql.config import Settings
45
- from claude_sql.logging_setup import loguru_before_sleep
41
+ from claude_sql.core.config import Settings
42
+ from claude_sql.core.llm_shared import _build_bedrock_client
43
+ from claude_sql.core.logging_setup import loguru_before_sleep
46
44
 
47
45
  if TYPE_CHECKING:
48
46
  import duckdb
@@ -135,6 +133,11 @@ def discover_unembedded(
135
133
  list of (uuid, text) tuples
136
134
  Messages needing embedding, in DuckDB's scan order.
137
135
  """
136
+ # Deferred so importing this module (e.g. via the CLI for a non-embed
137
+ # command) doesn't drag in the ~2.6s lancedb import subtree. lance_store
138
+ # is only touched once an embed-path function actually runs.
139
+ from claude_sql.core import lance_store
140
+
138
141
  # Read the already-embedded uuids straight from Lance via its Python API.
139
142
  # We don't go through the DuckDB ``message_embeddings`` view here because
140
143
  # the embed command runs with ``register_vss`` skipped (see the embed command in ``claude_sql.app.cli``),
@@ -179,31 +182,6 @@ def discover_unembedded(
179
182
  return pairs
180
183
 
181
184
 
182
- def _build_bedrock_client(settings: Settings) -> Any:
183
- """Construct a boto3 ``bedrock-runtime`` client from settings.
184
-
185
- Parameters
186
- ----------
187
- settings
188
- Application settings providing the target AWS region.
189
-
190
- Returns
191
- -------
192
- botocore client
193
- A low-level ``bedrock-runtime`` client.
194
- """
195
- # Disable botocore's internal retry layer so tenacity sees throttling
196
- # immediately — otherwise botocore silently absorbs 4 retries and our
197
- # retry policy never kicks in. Also bump read_timeout for large batches.
198
- boto_cfg = BotoConfig(
199
- region_name=settings.region,
200
- retries={"max_attempts": 0, "mode": "standard"},
201
- read_timeout=60,
202
- connect_timeout=10,
203
- )
204
- return boto3.client("bedrock-runtime", config=boto_cfg)
205
-
206
-
207
185
  @retry(
208
186
  # Cohere Embed v4 on Bedrock has a strict TPM bucket that replenishes over
209
187
  # tens of seconds; wait up to 60s between attempts and try up to 10 times
@@ -462,6 +440,10 @@ async def run_backfill(
462
440
  "dry_run": True,
463
441
  }
464
442
 
443
+ # Deferred (see discover_unembedded) — keeps the lancedb import off the
444
+ # dry-run / nothing-pending paths above, which return before this point.
445
+ from claude_sql.core import lance_store
446
+
465
447
  # Checkpoint every N messages so a throttling-induced timeout doesn't
466
448
  # discard work already embedded. chunk must be a multiple of batch_size.
467
449
  chunk_size = max(settings.batch_size * 4, 256)
@@ -49,8 +49,8 @@ import duckdb
49
49
  import polars as pl
50
50
  from loguru import logger
51
51
 
52
- from claude_sql import checkpointer, retry_queue
53
- from claude_sql.llm_shared import (
52
+ from claude_sql.core import checkpointer, retry_queue
53
+ from claude_sql.core.llm_shared import (
54
54
  USER_FRICTION_SYSTEM_PROMPT,
55
55
  BedrockRefusalError,
56
56
  _build_bedrock_client,
@@ -58,12 +58,12 @@ from claude_sql.llm_shared import (
58
58
  classify_one,
59
59
  pipeline_cache_stats,
60
60
  )
61
- from claude_sql.parquet_shards import read_all, write_part
62
- from claude_sql.schemas import USER_FRICTION_SCHEMA
63
- from claude_sql.session_text import session_bounds
61
+ from claude_sql.core.parquet_shards import read_all, write_part
62
+ from claude_sql.core.schemas import USER_FRICTION_SCHEMA
63
+ from claude_sql.core.session_text import session_bounds
64
64
 
65
65
  if TYPE_CHECKING:
66
- from claude_sql.config import Settings
66
+ from claude_sql.core.config import Settings
67
67
 
68
68
 
69
69
  # ---------------------------------------------------------------------------
@@ -428,7 +428,7 @@ async def _classify_async(
428
428
  """Async body behind :func:`detect_user_friction`."""
429
429
  out_path = settings.user_friction_parquet_path
430
430
  already: set[str] = set()
431
- done_df = read_all(out_path)
431
+ done_df = read_all(out_path, columns=["uuid"])
432
432
  if done_df is not None and done_df.height > 0:
433
433
  already = set(done_df["uuid"].to_list())
434
434
 
@@ -49,8 +49,8 @@ import polars as pl
49
49
  import tiktoken
50
50
  from loguru import logger
51
51
 
52
- from claude_sql.config import Settings
53
- from claude_sql.parquet_shards import iter_part_files, write_part
52
+ from claude_sql.core.config import Settings
53
+ from claude_sql.core.parquet_shards import iter_part_files, write_part
54
54
 
55
55
  if TYPE_CHECKING:
56
56
  import duckdb
@@ -41,7 +41,7 @@ import yaml
41
41
  from loguru import logger
42
42
  from packaging.version import InvalidVersion, Version as _Version
43
43
 
44
- from claude_sql.config import Settings
44
+ from claude_sql.core.config import Settings
45
45
 
46
46
  # Built-in Claude Code slash commands. These never map to a SKILL.md on
47
47
  # disk but show up as ``<command-name>/clear</command-name>`` in the
@@ -23,7 +23,7 @@ from loguru import logger
23
23
  if TYPE_CHECKING:
24
24
  import duckdb
25
25
 
26
- from claude_sql.config import Settings
26
+ from claude_sql.core.config import Settings
27
27
 
28
28
 
29
29
  def run_terms(
@@ -43,22 +43,22 @@ import anyio
43
43
  import polars as pl
44
44
  from loguru import logger
45
45
 
46
- from claude_sql import checkpointer, retry_queue
47
- from claude_sql.llm_shared import (
46
+ from claude_sql.core import checkpointer, retry_queue
47
+ from claude_sql.core.llm_shared import (
48
48
  BedrockRefusalError,
49
49
  _build_bedrock_client,
50
50
  _estimate_cost,
51
51
  classify_one,
52
52
  pipeline_cache_stats,
53
53
  )
54
- from claude_sql.parquet_shards import iter_part_files, write_part
55
- from claude_sql.schemas import TRAJECTORY_ARRAY_SCHEMA
56
- from claude_sql.session_text import session_bounds
54
+ from claude_sql.core.parquet_shards import iter_part_files, replace_sessions, write_part
55
+ from claude_sql.core.schemas import TRAJECTORY_ARRAY_SCHEMA
56
+ from claude_sql.core.session_text import session_bounds
57
57
 
58
58
  if TYPE_CHECKING:
59
59
  import duckdb
60
60
 
61
- from claude_sql.config import Settings
61
+ from claude_sql.core.config import Settings
62
62
 
63
63
 
64
64
  # ---------------------------------------------------------------------------
@@ -726,7 +726,7 @@ async def _trajectory_async(
726
726
 
727
727
  # Group by session to chunk per-session (anchor-sharing requires
728
728
  # contiguous windows from the same session in chunk order).
729
- by_session: dict[str, list] = defaultdict(list)
729
+ by_session: dict[str, list[Any]] = defaultdict(list)
730
730
  for row in raw_rows:
731
731
  by_session[row[0]].append(row)
732
732
 
@@ -886,8 +886,20 @@ async def _trajectory_async(
886
886
  # don't collide on filenames — but we still keep the lock so the
887
887
  # in-memory ``written_box`` / ``processed_sessions`` set updates
888
888
  # in lockstep with the on-disk write.
889
+ #
890
+ # replace_sessions drops any prior rows for ``sid`` still sitting
891
+ # in the cache from earlier runs. The checkpointer gates
892
+ # computation on advancing (latest_ts, message_count) bounds but
893
+ # does NOT touch the parquet cache; without this step a growing
894
+ # active session duplicates its (prev_uuid, curr_uuid) pairs
895
+ # on every rerun. See GH #45.
889
896
  df = pl.DataFrame(all_rows, schema=_PARQUET_SCHEMA)
890
897
  async with write_lock:
898
+ replace_sessions(
899
+ settings.trajectory_parquet_path,
900
+ key_column="session_id",
901
+ session_ids=[sid],
902
+ )
891
903
  write_part(settings.trajectory_parquet_path, df)
892
904
  written_box[0] += len(all_rows)
893
905
  processed_sessions.add(sid)
@@ -0,0 +1 @@
1
+ """claude-sql binary: cyclopts CLI + entry point."""