claude-sql 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {claude_sql-1.0.0 → claude_sql-1.1.0}/PKG-INFO +30 -4
- {claude_sql-1.0.0 → claude_sql-1.1.0}/README.md +28 -2
- claude_sql-1.1.0/pyproject.toml +66 -0
- claude_sql-1.1.0/src/claude_sql/analytics/__init__.py +1 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/classify_worker.py +8 -8
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/cluster_worker.py +19 -6
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/community_worker.py +48 -29
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/conflicts_worker.py +8 -8
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/embed_worker.py +12 -30
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/friction_worker.py +7 -7
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/ingest.py +2 -2
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/skills_catalog.py +1 -1
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/terms_worker.py +1 -1
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/trajectory_worker.py +19 -7
- claude_sql-1.1.0/src/claude_sql/app/__init__.py +1 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/app}/cli.py +240 -34
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/app}/install_source.py +2 -1
- claude_sql-1.1.0/src/claude_sql/core/__init__.py +1 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/checkpointer.py +5 -2
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/config.py +23 -32
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/lance_store.py +10 -3
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/llm_shared.py +14 -12
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/parquet_shards.py +97 -2
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/retry_queue.py +1 -1
- claude_sql-1.1.0/src/claude_sql/core/s3_source.py +134 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/schemas.py +9 -9
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/session_text.py +1 -1
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/sql_views.py +64 -97
- claude_sql-1.1.0/src/claude_sql/evals/__init__.py +1 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/freeze.py +1 -1
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/judge_worker.py +3 -3
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/kappa_worker.py +17 -3
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/ungrounded_worker.py +4 -3
- claude_sql-1.1.0/src/claude_sql/provenance/__init__.py +1 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/provenance}/binding.py +4 -1
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/provenance}/review_sheet_worker.py +7 -5
- claude_sql-1.0.0/pyproject.toml +0 -318
- claude_sql-1.0.0/src/claude_sql/__init__.py +0 -5
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/home.py +0 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/logging_setup.py +0 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/core}/output.py +0 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/blind_handover.py +0 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/evals}/judges.py +0 -0
- {claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/provenance}/review_sheet_render.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: claude-sql
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
|
|
5
5
|
Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
|
|
6
6
|
Author: Laith Al-Saadoon
|
|
@@ -31,8 +31,8 @@ Requires-Dist: numpy>=2.4.4
|
|
|
31
31
|
Requires-Dist: packaging>=26.2
|
|
32
32
|
Requires-Dist: polars>=1.40.0
|
|
33
33
|
Requires-Dist: pyarrow>=23.0.1
|
|
34
|
-
Requires-Dist: pydantic>=2.13.2
|
|
35
34
|
Requires-Dist: pydantic-settings>=2.13.1
|
|
35
|
+
Requires-Dist: pydantic>=2.13.2
|
|
36
36
|
Requires-Dist: pyyaml>=6.0.3
|
|
37
37
|
Requires-Dist: scikit-learn>=1.5
|
|
38
38
|
Requires-Dist: scipy>=1.13
|
|
@@ -200,6 +200,30 @@ The IAM policy needs `bedrock:InvokeModel` on:
|
|
|
200
200
|
- `inference-profile/global.cohere.embed-v4:0`
|
|
201
201
|
- `inference-profile/global.anthropic.claude-sonnet-4-6`
|
|
202
202
|
|
|
203
|
+
### Reading transcripts from S3
|
|
204
|
+
|
|
205
|
+
claude-sql reads the local JSONL corpus by default, but any transcript glob
|
|
206
|
+
can be an `s3://` URI instead — point it at sessions mirrored to S3 by the
|
|
207
|
+
[`claude-agent-sdk` `S3SessionStore`](https://github.com/anthropics/claude-agent-sdk-python/tree/main/examples/session_stores)
|
|
208
|
+
(layout `s3://{bucket}/{prefix}{project}/{session}/part-*.jsonl`). DuckDB reads
|
|
209
|
+
the parts zero-copy over HTTP range requests — no download step — and every
|
|
210
|
+
view and macro works unchanged.
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
# Personal corpus on S3 instead of ~/.claude/projects.
|
|
214
|
+
export CLAUDE_SQL_DEFAULT_GLOB='s3://my-bucket/transcripts/*/*/part-*.jsonl'
|
|
215
|
+
export AWS_PROFILE=your-profile # credentials via the standard AWS chain
|
|
216
|
+
claude-sql schema
|
|
217
|
+
claude-sql query "SELECT session_id, started_at FROM sessions ORDER BY started_at DESC LIMIT 10"
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
claude-sql loads DuckDB's `httpfs` extension and creates a `credential_chain`
|
|
221
|
+
S3 secret automatically when it sees an `s3://` glob — no keys are embedded
|
|
222
|
+
anywhere. For a non-AWS store (MinIO) or a local mock, set
|
|
223
|
+
`CLAUDE_SQL_S3_ENDPOINT`, `CLAUDE_SQL_S3_URL_STYLE=path`, and
|
|
224
|
+
`CLAUDE_SQL_S3_USE_SSL=false`. The IAM policy needs `s3:GetObject` +
|
|
225
|
+
`s3:ListBucket` on the prefix.
|
|
226
|
+
|
|
203
227
|
## Quick tour
|
|
204
228
|
|
|
205
229
|
```bash
|
|
@@ -367,13 +391,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
|
|
|
367
391
|
| `CLAUDE_SQL_DEFAULT_GLOB` | `~/.claude/projects/*/*.jsonl` | Main transcript glob |
|
|
368
392
|
| `CLAUDE_SQL_SUBAGENT_GLOB` | `~/.claude/projects/*/*/subagents/agent-*.jsonl` | Subagent transcripts |
|
|
369
393
|
| `CLAUDE_SQL_TEAM_CORPUS_ROOT` | `None` | Team-corpus root; when set, derives all three globs from `<root>/<author>/projects/*` (replaces the personal corpus) |
|
|
370
|
-
| `
|
|
394
|
+
| `CLAUDE_SQL_S3_ENDPOINT` | `None` | Custom S3 endpoint `host[:port]` for non-AWS stores (MinIO) or a local mock; unset uses default AWS S3. Only consulted when a glob is an `s3://` URI |
|
|
395
|
+
| `CLAUDE_SQL_S3_URL_STYLE` | `vhost` | S3 addressing style (`vhost` or `path`); set `path` for MinIO / moto |
|
|
396
|
+
| `CLAUDE_SQL_S3_USE_SSL` | `true` | Toggle TLS for the S3 endpoint; set `false` for a local mock |
|
|
397
|
+
| `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region **and** the S3 secret region |
|
|
371
398
|
| `CLAUDE_SQL_MODEL_ID` | `global.cohere.embed-v4:0` | Embedding model |
|
|
372
399
|
| `CLAUDE_SQL_SONNET_MODEL_ID` | `global.anthropic.claude-sonnet-4-6` | Classification model |
|
|
373
400
|
| `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
|
|
374
401
|
| `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
|
|
375
402
|
| `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
|
|
376
|
-
| `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
|
|
377
403
|
| `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
|
|
378
404
|
| `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
|
|
379
405
|
| `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
|
|
@@ -152,6 +152,30 @@ The IAM policy needs `bedrock:InvokeModel` on:
|
|
|
152
152
|
- `inference-profile/global.cohere.embed-v4:0`
|
|
153
153
|
- `inference-profile/global.anthropic.claude-sonnet-4-6`
|
|
154
154
|
|
|
155
|
+
### Reading transcripts from S3
|
|
156
|
+
|
|
157
|
+
claude-sql reads the local JSONL corpus by default, but any transcript glob
|
|
158
|
+
can be an `s3://` URI instead — point it at sessions mirrored to S3 by the
|
|
159
|
+
[`claude-agent-sdk` `S3SessionStore`](https://github.com/anthropics/claude-agent-sdk-python/tree/main/examples/session_stores)
|
|
160
|
+
(layout `s3://{bucket}/{prefix}{project}/{session}/part-*.jsonl`). DuckDB reads
|
|
161
|
+
the parts zero-copy over HTTP range requests — no download step — and every
|
|
162
|
+
view and macro works unchanged.
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Personal corpus on S3 instead of ~/.claude/projects.
|
|
166
|
+
export CLAUDE_SQL_DEFAULT_GLOB='s3://my-bucket/transcripts/*/*/part-*.jsonl'
|
|
167
|
+
export AWS_PROFILE=your-profile # credentials via the standard AWS chain
|
|
168
|
+
claude-sql schema
|
|
169
|
+
claude-sql query "SELECT session_id, started_at FROM sessions ORDER BY started_at DESC LIMIT 10"
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
claude-sql loads DuckDB's `httpfs` extension and creates a `credential_chain`
|
|
173
|
+
S3 secret automatically when it sees an `s3://` glob — no keys are embedded
|
|
174
|
+
anywhere. For a non-AWS store (MinIO) or a local mock, set
|
|
175
|
+
`CLAUDE_SQL_S3_ENDPOINT`, `CLAUDE_SQL_S3_URL_STYLE=path`, and
|
|
176
|
+
`CLAUDE_SQL_S3_USE_SSL=false`. The IAM policy needs `s3:GetObject` +
|
|
177
|
+
`s3:ListBucket` on the prefix.
|
|
178
|
+
|
|
155
179
|
## Quick tour
|
|
156
180
|
|
|
157
181
|
```bash
|
|
@@ -319,13 +343,15 @@ Every option is configurable via `CLAUDE_SQL_*`:
|
|
|
319
343
|
| `CLAUDE_SQL_DEFAULT_GLOB` | `~/.claude/projects/*/*.jsonl` | Main transcript glob |
|
|
320
344
|
| `CLAUDE_SQL_SUBAGENT_GLOB` | `~/.claude/projects/*/*/subagents/agent-*.jsonl` | Subagent transcripts |
|
|
321
345
|
| `CLAUDE_SQL_TEAM_CORPUS_ROOT` | `None` | Team-corpus root; when set, derives all three globs from `<root>/<author>/projects/*` (replaces the personal corpus) |
|
|
322
|
-
| `
|
|
346
|
+
| `CLAUDE_SQL_S3_ENDPOINT` | `None` | Custom S3 endpoint `host[:port]` for non-AWS stores (MinIO) or a local mock; unset uses default AWS S3. Only consulted when a glob is an `s3://` URI |
|
|
347
|
+
| `CLAUDE_SQL_S3_URL_STYLE` | `vhost` | S3 addressing style (`vhost` or `path`); set `path` for MinIO / moto |
|
|
348
|
+
| `CLAUDE_SQL_S3_USE_SSL` | `true` | Toggle TLS for the S3 endpoint; set `false` for a local mock |
|
|
349
|
+
| `CLAUDE_SQL_REGION` | `us-east-1` | Bedrock region **and** the S3 secret region |
|
|
323
350
|
| `CLAUDE_SQL_MODEL_ID` | `global.cohere.embed-v4:0` | Embedding model |
|
|
324
351
|
| `CLAUDE_SQL_SONNET_MODEL_ID` | `global.anthropic.claude-sonnet-4-6` | Classification model |
|
|
325
352
|
| `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
|
|
326
353
|
| `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
|
|
327
354
|
| `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
|
|
328
|
-
| `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
|
|
329
355
|
| `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
|
|
330
356
|
| `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
|
|
331
357
|
| `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# GENERATED by mise-tasks/build-dist — do not edit. The source of truth is
|
|
2
|
+
# packages/*/pyproject.toml. This file bundles all five members into the one
|
|
3
|
+
# publishable ``claude-sql`` wheel. See the task docstring for why.
|
|
4
|
+
[project]
|
|
5
|
+
name = "claude-sql"
|
|
6
|
+
version = "1.1.0"
|
|
7
|
+
description = 'Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.'
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = { text = "Apache-2.0" }
|
|
10
|
+
authors = [{ name = "Laith Al-Saadoon", email = "lalsaado@amazon.com" }]
|
|
11
|
+
requires-python = ">=3.13"
|
|
12
|
+
keywords = ["claude", "claude-code", "anthropic", "duckdb", "sql", "semantic-search", "embeddings", "bedrock", "transcripts", "analytics", "observability"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.13",
|
|
16
|
+
"Development Status :: 5 - Production/Stable",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Operating System :: POSIX :: Linux",
|
|
19
|
+
"Operating System :: MacOS",
|
|
20
|
+
"Topic :: Software Development",
|
|
21
|
+
"Topic :: Database",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Topic :: Utilities",
|
|
24
|
+
"Typing :: Typed",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"anthropic>=0.40",
|
|
28
|
+
"anyio>=4.13.0",
|
|
29
|
+
"boto3>=1.42.91",
|
|
30
|
+
"cyclopts>=4.10.2",
|
|
31
|
+
"duckdb>=1.5.2,<2",
|
|
32
|
+
"hdbscan>=0.8.40",
|
|
33
|
+
"igraph>=1.0.0,<2.0",
|
|
34
|
+
"lancedb>=0.30,<0.31",
|
|
35
|
+
"leidenalg>=0.11.0,<0.12",
|
|
36
|
+
"loguru>=0.7.3",
|
|
37
|
+
"numpy>=2.4.4",
|
|
38
|
+
"packaging>=26.2",
|
|
39
|
+
"polars>=1.40.0",
|
|
40
|
+
"pyarrow>=23.0.1",
|
|
41
|
+
"pydantic-settings>=2.13.1",
|
|
42
|
+
"pydantic>=2.13.2",
|
|
43
|
+
"pyyaml>=6.0.3",
|
|
44
|
+
"scikit-learn>=1.5",
|
|
45
|
+
"scipy>=1.13",
|
|
46
|
+
"tenacity>=9.1.4",
|
|
47
|
+
"tiktoken>=0.12.0",
|
|
48
|
+
"umap-learn>=0.5.12",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.scripts]
|
|
52
|
+
claude-sql = "claude_sql.app.cli:main"
|
|
53
|
+
|
|
54
|
+
[project.urls]
|
|
55
|
+
Homepage = "https://github.com/theagenticguy/claude-sql"
|
|
56
|
+
Repository = "https://github.com/theagenticguy/claude-sql"
|
|
57
|
+
Issues = "https://github.com/theagenticguy/claude-sql/issues"
|
|
58
|
+
Changelog = "https://github.com/theagenticguy/claude-sql/blob/main/CHANGELOG.md"
|
|
59
|
+
|
|
60
|
+
[build-system]
|
|
61
|
+
requires = ["uv_build>=0.11.14,<0.12"]
|
|
62
|
+
build-backend = "uv_build"
|
|
63
|
+
|
|
64
|
+
[tool.uv.build-backend]
|
|
65
|
+
module-name = "claude_sql"
|
|
66
|
+
namespace = true
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""claude-sql analytics: embed, classify, trajectory, conflicts, friction, cluster, terms, community, ingest."""
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/classify_worker.py
RENAMED
|
@@ -23,8 +23,8 @@ import anyio
|
|
|
23
23
|
import polars as pl
|
|
24
24
|
from loguru import logger
|
|
25
25
|
|
|
26
|
-
from claude_sql import checkpointer, retry_queue
|
|
27
|
-
from claude_sql.llm_shared import (
|
|
26
|
+
from claude_sql.core import checkpointer, retry_queue
|
|
27
|
+
from claude_sql.core.llm_shared import (
|
|
28
28
|
CLASSIFY_SYSTEM_PROMPT,
|
|
29
29
|
_build_bedrock_client,
|
|
30
30
|
_count_pending_sessions,
|
|
@@ -32,14 +32,14 @@ from claude_sql.llm_shared import (
|
|
|
32
32
|
classify_one,
|
|
33
33
|
pipeline_cache_stats,
|
|
34
34
|
)
|
|
35
|
-
from claude_sql.parquet_shards import read_all, write_part
|
|
36
|
-
from claude_sql.schemas import SESSION_CLASSIFICATION_SCHEMA
|
|
37
|
-
from claude_sql.session_text import iter_session_texts, session_bounds
|
|
35
|
+
from claude_sql.core.parquet_shards import read_all, write_part
|
|
36
|
+
from claude_sql.core.schemas import SESSION_CLASSIFICATION_SCHEMA
|
|
37
|
+
from claude_sql.core.session_text import iter_session_texts, session_bounds
|
|
38
38
|
|
|
39
39
|
if TYPE_CHECKING:
|
|
40
40
|
import duckdb
|
|
41
41
|
|
|
42
|
-
from claude_sql.config import Settings
|
|
42
|
+
from claude_sql.core.config import Settings
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
async def _classify_sessions_async(
|
|
@@ -52,7 +52,7 @@ async def _classify_sessions_async(
|
|
|
52
52
|
) -> int:
|
|
53
53
|
"""Async implementation behind :func:`classify_sessions`."""
|
|
54
54
|
already: set[str] = set()
|
|
55
|
-
done_df = read_all(settings.classifications_parquet_path)
|
|
55
|
+
done_df = read_all(settings.classifications_parquet_path, columns=["session_id"])
|
|
56
56
|
if done_df is not None and done_df.height > 0:
|
|
57
57
|
already = set(done_df["session_id"].to_list())
|
|
58
58
|
|
|
@@ -212,7 +212,7 @@ def classify_sessions(
|
|
|
212
212
|
|
|
213
213
|
if dry_run:
|
|
214
214
|
already: set[str] = set()
|
|
215
|
-
done_df = read_all(settings.classifications_parquet_path)
|
|
215
|
+
done_df = read_all(settings.classifications_parquet_path, columns=["session_id"])
|
|
216
216
|
if done_df is not None and done_df.height > 0:
|
|
217
217
|
already = set(done_df["session_id"].to_list())
|
|
218
218
|
pending_count = _count_pending_sessions(
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/cluster_worker.py
RENAMED
|
@@ -22,8 +22,7 @@ import numpy as np
|
|
|
22
22
|
import polars as pl
|
|
23
23
|
from loguru import logger
|
|
24
24
|
|
|
25
|
-
from claude_sql import lance_store
|
|
26
|
-
from claude_sql.config import Settings
|
|
25
|
+
from claude_sql.core.config import Settings
|
|
27
26
|
|
|
28
27
|
|
|
29
28
|
def _load_embeddings(path: Path) -> tuple[list[str], np.ndarray]:
|
|
@@ -33,6 +32,10 @@ def _load_embeddings(path: Path) -> tuple[list[str], np.ndarray]:
|
|
|
33
32
|
through the DuckDB ``message_embeddings`` view) so this worker can run
|
|
34
33
|
independently of view registration on the calling connection.
|
|
35
34
|
"""
|
|
35
|
+
# Deferred so importing this module via the CLI for a non-cluster command
|
|
36
|
+
# doesn't pull in the ~2.6s lancedb import subtree.
|
|
37
|
+
from claude_sql.core import lance_store
|
|
38
|
+
|
|
36
39
|
db = lance_store.connect_db(path)
|
|
37
40
|
if not lance_store._has_table(db, lance_store.TABLE_NAME):
|
|
38
41
|
return [], np.zeros((0, 0), dtype=np.float32)
|
|
@@ -65,6 +68,10 @@ def run_clustering(settings: Settings, *, force: bool = False) -> dict[str, int]
|
|
|
65
68
|
``{"total": N, "clusters": K, "noise": M}`` where K excludes the
|
|
66
69
|
noise cluster (label -1).
|
|
67
70
|
"""
|
|
71
|
+
# Deferred (see _load_embeddings) — keeps the lancedb import off the CLI's
|
|
72
|
+
# module-load path for non-cluster commands.
|
|
73
|
+
from claude_sql.core import lance_store
|
|
74
|
+
|
|
68
75
|
out_path = settings.clusters_parquet_path
|
|
69
76
|
in_path = settings.lance_uri
|
|
70
77
|
|
|
@@ -186,13 +193,19 @@ def run_clustering(settings: Settings, *, force: bool = False) -> dict[str, int]
|
|
|
186
193
|
noise / len(labels) if len(labels) else 0,
|
|
187
194
|
)
|
|
188
195
|
|
|
196
|
+
# Hand polars the numpy arrays directly — it ingests contiguous arrays
|
|
197
|
+
# near-zero-copy. Round-tripping through ``.tolist()`` materialized N
|
|
198
|
+
# boxed Python ints/floats/bools per column just to have polars re-parse
|
|
199
|
+
# them back into the typed columns the schema already pins (mirrors the
|
|
200
|
+
# read-side boxing fix in #68, now on the write side). ``X2`` columns are
|
|
201
|
+
# sliced views, so copy to contiguous float32 before handing them over.
|
|
189
202
|
df = pl.DataFrame(
|
|
190
203
|
{
|
|
191
204
|
"uuid": uuids,
|
|
192
|
-
"cluster_id": labels.astype(np.int32)
|
|
193
|
-
"x": X2[:, 0]
|
|
194
|
-
"y": X2[:, 1]
|
|
195
|
-
"is_noise":
|
|
205
|
+
"cluster_id": labels.astype(np.int32),
|
|
206
|
+
"x": np.ascontiguousarray(X2[:, 0], dtype=np.float32),
|
|
207
|
+
"y": np.ascontiguousarray(X2[:, 1], dtype=np.float32),
|
|
208
|
+
"is_noise": labels < 0,
|
|
196
209
|
},
|
|
197
210
|
schema={
|
|
198
211
|
"uuid": pl.Utf8,
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/community_worker.py
RENAMED
|
@@ -63,7 +63,7 @@ import numpy as np
|
|
|
63
63
|
import polars as pl
|
|
64
64
|
from loguru import logger
|
|
65
65
|
|
|
66
|
-
from claude_sql.config import Settings
|
|
66
|
+
from claude_sql.core.config import Settings
|
|
67
67
|
|
|
68
68
|
if TYPE_CHECKING:
|
|
69
69
|
import duckdb
|
|
@@ -85,10 +85,12 @@ def _load_session_centroids(
|
|
|
85
85
|
"""Return ``(session_ids, centroids)`` where centroids is ``(N_sessions, dim)`` float32.
|
|
86
86
|
|
|
87
87
|
Joins the ``message_embeddings`` view (LanceDB-backed via ``register_vss``)
|
|
88
|
-
to the v1 ``messages`` view on uuid,
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
88
|
+
to the v1 ``messages`` view on uuid, pulls one ``(session_id, embedding)``
|
|
89
|
+
row per message ordered by session, then computes per-session means in
|
|
90
|
+
numpy with a single ``np.add.reduceat`` segmented sum (sessions are
|
|
91
|
+
contiguous after the ``ORDER BY``) followed by an L2-normalize. This keeps
|
|
92
|
+
the intermediate at ``N_messages`` rows rather than the ``N_messages ×
|
|
93
|
+
dim`` explosion the prior ``unnest``-per-dimension aggregation produced.
|
|
92
94
|
|
|
93
95
|
``embeddings_parquet_path`` is accepted for back-compat with callers that
|
|
94
96
|
still pass it but is no longer consulted — the connection's
|
|
@@ -96,29 +98,24 @@ def _load_session_centroids(
|
|
|
96
98
|
"""
|
|
97
99
|
del embeddings_parquet_path # legacy kwarg — view is the source of truth now
|
|
98
100
|
logger.info("Loading message embeddings and joining to sessions...")
|
|
101
|
+
# Pull one row per message (session_id, embedding) ordered by session, then
|
|
102
|
+
# compute per-session means in numpy via a single segmented reduction.
|
|
103
|
+
#
|
|
104
|
+
# The prior implementation unnested every embedding into ``dim`` rows
|
|
105
|
+
# (``generate_subscripts`` + ``unnest``) and grouped on (session, pos) —
|
|
106
|
+
# that explodes the working set to N_messages × dim rows before the
|
|
107
|
+
# average. Carrying the FLOAT[dim] vector through the join and reducing it
|
|
108
|
+
# in numpy keeps the intermediate at N_messages rows and is 1.4–1.8×
|
|
109
|
+
# faster on a 24k–96k-message corpus (measured), with the win widening as
|
|
110
|
+
# the corpus grows. ``ORDER BY session_id`` makes the sessions contiguous
|
|
111
|
+
# so ``np.add.reduceat`` can segment-sum without a Python per-session loop.
|
|
99
112
|
sql = """
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
),
|
|
107
|
-
unrolled AS (
|
|
108
|
-
SELECT session_id,
|
|
109
|
-
generate_subscripts(emb, 1) AS pos,
|
|
110
|
-
unnest(emb) AS v
|
|
111
|
-
FROM joined
|
|
112
|
-
),
|
|
113
|
-
agg AS (
|
|
114
|
-
SELECT session_id, pos, avg(v) AS m
|
|
115
|
-
FROM unrolled
|
|
116
|
-
GROUP BY 1, 2
|
|
117
|
-
)
|
|
118
|
-
SELECT session_id, list(m ORDER BY pos) AS centroid
|
|
119
|
-
FROM agg
|
|
120
|
-
GROUP BY 1
|
|
121
|
-
ORDER BY 1
|
|
113
|
+
SELECT CAST(m.session_id AS VARCHAR) AS session_id,
|
|
114
|
+
e.embedding AS emb
|
|
115
|
+
FROM message_embeddings e
|
|
116
|
+
JOIN messages m
|
|
117
|
+
ON CAST(m.uuid AS VARCHAR) = e.uuid
|
|
118
|
+
ORDER BY session_id
|
|
122
119
|
"""
|
|
123
120
|
try:
|
|
124
121
|
df = con.execute(sql).pl()
|
|
@@ -136,10 +133,32 @@ def _load_session_centroids(
|
|
|
136
133
|
"Lance embeddings exist and the messages view is registered."
|
|
137
134
|
)
|
|
138
135
|
|
|
139
|
-
|
|
140
|
-
|
|
136
|
+
# ``emb`` is a DuckDB ``FLOAT[dim]`` column; polars surfaces it as a
|
|
137
|
+
# fixed-size ``Array(Float32, dim)`` dtype (occasionally a variable
|
|
138
|
+
# ``List`` if the cast was lost upstream). ``Series.to_numpy()`` extracts
|
|
139
|
+
# the buffer directly into a contiguous ``(N_messages, dim)`` matrix; for
|
|
140
|
+
# the fixed-``Array`` case it's a near-zero-copy view. The prior
|
|
141
|
+
# ``np.asarray(series.to_list(), ...)`` boxed every one of the
|
|
142
|
+
# N_messages × dim float32 values into a Python ``float`` object first —
|
|
143
|
+
# measured at 6–10× slower and ~43× higher peak RSS on a 6k–96k-message
|
|
144
|
+
# corpus (e.g. 7.4 s / 3.5 GB → 1.2 s / 83 MB at 96k). This is the
|
|
145
|
+
# read-side analog of the SQL-side ``unnest`` explosion removed in #65.
|
|
146
|
+
emb_arr = df["emb"].to_numpy()
|
|
147
|
+
if emb_arr.ndim == 1:
|
|
148
|
+
# Variable ``List`` dtype (or object array of rows) — stack to 2-D.
|
|
149
|
+
emb_arr = np.stack(list(emb_arr))
|
|
150
|
+
emb_np = np.ascontiguousarray(emb_arr, dtype=np.float32)
|
|
151
|
+
sessions = df["session_id"].to_numpy()
|
|
152
|
+
# ``return_index`` gives each group's first row offset on the sorted array;
|
|
153
|
+
# ``np.unique`` returns the labels already sorted, matching the prior
|
|
154
|
+
# ``ORDER BY 1`` contract. ``reduceat`` sums each [start_i, start_{i+1})
|
|
155
|
+
# segment in one pass.
|
|
156
|
+
sids_arr, starts, counts = np.unique(sessions, return_index=True, return_counts=True)
|
|
157
|
+
summed = np.add.reduceat(emb_np, starts, axis=0)
|
|
158
|
+
centroids = (summed / counts[:, None]).astype(np.float32)
|
|
141
159
|
norms = np.linalg.norm(centroids, axis=1, keepdims=True)
|
|
142
160
|
centroids = centroids / np.where(norms == 0, 1.0, norms)
|
|
161
|
+
sids = sids_arr.tolist()
|
|
143
162
|
logger.info("Computed {} session centroids (dim={})", len(sids), centroids.shape[1])
|
|
144
163
|
return sids, centroids
|
|
145
164
|
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/conflicts_worker.py
RENAMED
|
@@ -41,8 +41,8 @@ import pyarrow as pa
|
|
|
41
41
|
import pyarrow.parquet as pq
|
|
42
42
|
from loguru import logger
|
|
43
43
|
|
|
44
|
-
from claude_sql import checkpointer, retry_queue
|
|
45
|
-
from claude_sql.llm_shared import (
|
|
44
|
+
from claude_sql.core import checkpointer, retry_queue
|
|
45
|
+
from claude_sql.core.llm_shared import (
|
|
46
46
|
CONFLICTS_SYSTEM_PROMPT,
|
|
47
47
|
_build_bedrock_client,
|
|
48
48
|
_count_pending_sessions,
|
|
@@ -50,16 +50,16 @@ from claude_sql.llm_shared import (
|
|
|
50
50
|
classify_one,
|
|
51
51
|
pipeline_cache_stats,
|
|
52
52
|
)
|
|
53
|
-
from claude_sql.parquet_shards import iter_part_files, read_all, write_part
|
|
54
|
-
from claude_sql.schemas import SESSION_CONFLICTS_SCHEMA
|
|
55
|
-
from claude_sql.session_text import iter_session_texts, session_bounds
|
|
53
|
+
from claude_sql.core.parquet_shards import iter_part_files, read_all, write_part
|
|
54
|
+
from claude_sql.core.schemas import SESSION_CONFLICTS_SCHEMA
|
|
55
|
+
from claude_sql.core.session_text import iter_session_texts, session_bounds
|
|
56
56
|
|
|
57
57
|
if TYPE_CHECKING:
|
|
58
58
|
from pathlib import Path
|
|
59
59
|
|
|
60
60
|
import duckdb
|
|
61
61
|
|
|
62
|
-
from claude_sql.config import Settings
|
|
62
|
+
from claude_sql.core.config import Settings
|
|
63
63
|
|
|
64
64
|
|
|
65
65
|
# v1.0 parquet schema — kept as a module constant so the worker, the test
|
|
@@ -143,7 +143,7 @@ async def _conflicts_async(
|
|
|
143
143
|
_purge_legacy_shards(settings.conflicts_parquet_path)
|
|
144
144
|
|
|
145
145
|
already: set[str] = set()
|
|
146
|
-
done_df = read_all(settings.conflicts_parquet_path)
|
|
146
|
+
done_df = read_all(settings.conflicts_parquet_path, columns=["session_id"])
|
|
147
147
|
if done_df is not None and done_df.height > 0:
|
|
148
148
|
already = set(done_df["session_id"].to_list())
|
|
149
149
|
|
|
@@ -304,7 +304,7 @@ def detect_conflicts(
|
|
|
304
304
|
thinking_mode = "disabled" if no_thinking else settings.classify_thinking
|
|
305
305
|
if dry_run:
|
|
306
306
|
already: set[str] = set()
|
|
307
|
-
done_df = read_all(settings.conflicts_parquet_path)
|
|
307
|
+
done_df = read_all(settings.conflicts_parquet_path, columns=["session_id"])
|
|
308
308
|
if done_df is not None and done_df.height > 0:
|
|
309
309
|
already = set(done_df["session_id"].to_list())
|
|
310
310
|
pending_count = _count_pending_sessions(
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/embed_worker.py
RENAMED
|
@@ -22,9 +22,7 @@ from datetime import UTC, datetime
|
|
|
22
22
|
from pathlib import Path
|
|
23
23
|
from typing import TYPE_CHECKING, Any
|
|
24
24
|
|
|
25
|
-
import boto3
|
|
26
25
|
import polars as pl
|
|
27
|
-
from botocore.config import Config as BotoConfig
|
|
28
26
|
from botocore.exceptions import (
|
|
29
27
|
ClientError,
|
|
30
28
|
ConnectionError as BotoConnectionError,
|
|
@@ -40,9 +38,9 @@ from tenacity import (
|
|
|
40
38
|
wait_exponential,
|
|
41
39
|
)
|
|
42
40
|
|
|
43
|
-
from claude_sql import lance_store
|
|
44
|
-
from claude_sql.config import Settings
|
|
45
|
-
from claude_sql.logging_setup import loguru_before_sleep
|
|
41
|
+
from claude_sql.core.config import Settings
|
|
42
|
+
from claude_sql.core.llm_shared import _build_bedrock_client
|
|
43
|
+
from claude_sql.core.logging_setup import loguru_before_sleep
|
|
46
44
|
|
|
47
45
|
if TYPE_CHECKING:
|
|
48
46
|
import duckdb
|
|
@@ -135,6 +133,11 @@ def discover_unembedded(
|
|
|
135
133
|
list of (uuid, text) tuples
|
|
136
134
|
Messages needing embedding, in DuckDB's scan order.
|
|
137
135
|
"""
|
|
136
|
+
# Deferred so importing this module (e.g. via the CLI for a non-embed
|
|
137
|
+
# command) doesn't drag in the ~2.6s lancedb import subtree. lance_store
|
|
138
|
+
# is only touched once an embed-path function actually runs.
|
|
139
|
+
from claude_sql.core import lance_store
|
|
140
|
+
|
|
138
141
|
# Read the already-embedded uuids straight from Lance via its Python API.
|
|
139
142
|
# We don't go through the DuckDB ``message_embeddings`` view here because
|
|
140
143
|
# the embed command runs with ``register_vss`` skipped (cli.py:1205-1213),
|
|
@@ -179,31 +182,6 @@ def discover_unembedded(
|
|
|
179
182
|
return pairs
|
|
180
183
|
|
|
181
184
|
|
|
182
|
-
def _build_bedrock_client(settings: Settings) -> Any:
|
|
183
|
-
"""Construct a boto3 ``bedrock-runtime`` client from settings.
|
|
184
|
-
|
|
185
|
-
Parameters
|
|
186
|
-
----------
|
|
187
|
-
settings
|
|
188
|
-
Application settings providing the target AWS region.
|
|
189
|
-
|
|
190
|
-
Returns
|
|
191
|
-
-------
|
|
192
|
-
botocore client
|
|
193
|
-
A low-level ``bedrock-runtime`` client.
|
|
194
|
-
"""
|
|
195
|
-
# Disable botocore's internal retry layer so tenacity sees throttling
|
|
196
|
-
# immediately — otherwise botocore silently absorbs 4 retries and our
|
|
197
|
-
# retry policy never kicks in. Also bump read_timeout for large batches.
|
|
198
|
-
boto_cfg = BotoConfig(
|
|
199
|
-
region_name=settings.region,
|
|
200
|
-
retries={"max_attempts": 0, "mode": "standard"},
|
|
201
|
-
read_timeout=60,
|
|
202
|
-
connect_timeout=10,
|
|
203
|
-
)
|
|
204
|
-
return boto3.client("bedrock-runtime", config=boto_cfg)
|
|
205
|
-
|
|
206
|
-
|
|
207
185
|
@retry(
|
|
208
186
|
# Cohere Embed v4 on Bedrock has a strict TPM bucket that replenishes over
|
|
209
187
|
# tens of seconds; wait up to 60s between attempts and try up to 10 times
|
|
@@ -462,6 +440,10 @@ async def run_backfill(
|
|
|
462
440
|
"dry_run": True,
|
|
463
441
|
}
|
|
464
442
|
|
|
443
|
+
# Deferred (see discover_unembedded) — keeps the lancedb import off the
|
|
444
|
+
# dry-run / nothing-pending paths above, which return before this point.
|
|
445
|
+
from claude_sql.core import lance_store
|
|
446
|
+
|
|
465
447
|
# Checkpoint every N messages so a throttling-induced timeout doesn't
|
|
466
448
|
# discard work already embedded. chunk must be a multiple of batch_size.
|
|
467
449
|
chunk_size = max(settings.batch_size * 4, 256)
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/friction_worker.py
RENAMED
|
@@ -49,8 +49,8 @@ import duckdb
|
|
|
49
49
|
import polars as pl
|
|
50
50
|
from loguru import logger
|
|
51
51
|
|
|
52
|
-
from claude_sql import checkpointer, retry_queue
|
|
53
|
-
from claude_sql.llm_shared import (
|
|
52
|
+
from claude_sql.core import checkpointer, retry_queue
|
|
53
|
+
from claude_sql.core.llm_shared import (
|
|
54
54
|
USER_FRICTION_SYSTEM_PROMPT,
|
|
55
55
|
BedrockRefusalError,
|
|
56
56
|
_build_bedrock_client,
|
|
@@ -58,12 +58,12 @@ from claude_sql.llm_shared import (
|
|
|
58
58
|
classify_one,
|
|
59
59
|
pipeline_cache_stats,
|
|
60
60
|
)
|
|
61
|
-
from claude_sql.parquet_shards import read_all, write_part
|
|
62
|
-
from claude_sql.schemas import USER_FRICTION_SCHEMA
|
|
63
|
-
from claude_sql.session_text import session_bounds
|
|
61
|
+
from claude_sql.core.parquet_shards import read_all, write_part
|
|
62
|
+
from claude_sql.core.schemas import USER_FRICTION_SCHEMA
|
|
63
|
+
from claude_sql.core.session_text import session_bounds
|
|
64
64
|
|
|
65
65
|
if TYPE_CHECKING:
|
|
66
|
-
from claude_sql.config import Settings
|
|
66
|
+
from claude_sql.core.config import Settings
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
# ---------------------------------------------------------------------------
|
|
@@ -428,7 +428,7 @@ async def _classify_async(
|
|
|
428
428
|
"""Async body behind :func:`detect_user_friction`."""
|
|
429
429
|
out_path = settings.user_friction_parquet_path
|
|
430
430
|
already: set[str] = set()
|
|
431
|
-
done_df = read_all(out_path)
|
|
431
|
+
done_df = read_all(out_path, columns=["uuid"])
|
|
432
432
|
if done_df is not None and done_df.height > 0:
|
|
433
433
|
already = set(done_df["uuid"].to_list())
|
|
434
434
|
|
|
@@ -49,8 +49,8 @@ import polars as pl
|
|
|
49
49
|
import tiktoken
|
|
50
50
|
from loguru import logger
|
|
51
51
|
|
|
52
|
-
from claude_sql.config import Settings
|
|
53
|
-
from claude_sql.parquet_shards import iter_part_files, write_part
|
|
52
|
+
from claude_sql.core.config import Settings
|
|
53
|
+
from claude_sql.core.parquet_shards import iter_part_files, write_part
|
|
54
54
|
|
|
55
55
|
if TYPE_CHECKING:
|
|
56
56
|
import duckdb
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/skills_catalog.py
RENAMED
|
@@ -41,7 +41,7 @@ import yaml
|
|
|
41
41
|
from loguru import logger
|
|
42
42
|
from packaging.version import InvalidVersion, Version as _Version
|
|
43
43
|
|
|
44
|
-
from claude_sql.config import Settings
|
|
44
|
+
from claude_sql.core.config import Settings
|
|
45
45
|
|
|
46
46
|
# Built-in Claude Code slash commands. These never map to a SKILL.md on
|
|
47
47
|
# disk but show up as ``<command-name>/clear</command-name>`` in the
|
{claude_sql-1.0.0/src/claude_sql → claude_sql-1.1.0/src/claude_sql/analytics}/trajectory_worker.py
RENAMED
|
@@ -43,22 +43,22 @@ import anyio
|
|
|
43
43
|
import polars as pl
|
|
44
44
|
from loguru import logger
|
|
45
45
|
|
|
46
|
-
from claude_sql import checkpointer, retry_queue
|
|
47
|
-
from claude_sql.llm_shared import (
|
|
46
|
+
from claude_sql.core import checkpointer, retry_queue
|
|
47
|
+
from claude_sql.core.llm_shared import (
|
|
48
48
|
BedrockRefusalError,
|
|
49
49
|
_build_bedrock_client,
|
|
50
50
|
_estimate_cost,
|
|
51
51
|
classify_one,
|
|
52
52
|
pipeline_cache_stats,
|
|
53
53
|
)
|
|
54
|
-
from claude_sql.parquet_shards import iter_part_files, write_part
|
|
55
|
-
from claude_sql.schemas import TRAJECTORY_ARRAY_SCHEMA
|
|
56
|
-
from claude_sql.session_text import session_bounds
|
|
54
|
+
from claude_sql.core.parquet_shards import iter_part_files, replace_sessions, write_part
|
|
55
|
+
from claude_sql.core.schemas import TRAJECTORY_ARRAY_SCHEMA
|
|
56
|
+
from claude_sql.core.session_text import session_bounds
|
|
57
57
|
|
|
58
58
|
if TYPE_CHECKING:
|
|
59
59
|
import duckdb
|
|
60
60
|
|
|
61
|
-
from claude_sql.config import Settings
|
|
61
|
+
from claude_sql.core.config import Settings
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
# ---------------------------------------------------------------------------
|
|
@@ -726,7 +726,7 @@ async def _trajectory_async(
|
|
|
726
726
|
|
|
727
727
|
# Group by session to chunk per-session (anchor-sharing requires
|
|
728
728
|
# contiguous windows from the same session in chunk order).
|
|
729
|
-
by_session: dict[str, list] = defaultdict(list)
|
|
729
|
+
by_session: dict[str, list[Any]] = defaultdict(list)
|
|
730
730
|
for row in raw_rows:
|
|
731
731
|
by_session[row[0]].append(row)
|
|
732
732
|
|
|
@@ -886,8 +886,20 @@ async def _trajectory_async(
|
|
|
886
886
|
# don't collide on filenames — but we still keep the lock so the
|
|
887
887
|
# in-memory ``written_box`` / ``processed_sessions`` set updates
|
|
888
888
|
# in lockstep with the on-disk write.
|
|
889
|
+
#
|
|
890
|
+
# replace_sessions drops any prior rows for ``sid`` still sitting
|
|
891
|
+
# in the cache from earlier runs. The checkpointer gates
|
|
892
|
+
# computation on advancing (latest_ts, message_count) bounds but
|
|
893
|
+
# does NOT touch the parquet cache; without this step a growing
|
|
894
|
+
# active session duplicates its (prev_uuid, curr_uuid) pairs
|
|
895
|
+
# on every rerun. See GH #45.
|
|
889
896
|
df = pl.DataFrame(all_rows, schema=_PARQUET_SCHEMA)
|
|
890
897
|
async with write_lock:
|
|
898
|
+
replace_sessions(
|
|
899
|
+
settings.trajectory_parquet_path,
|
|
900
|
+
key_column="session_id",
|
|
901
|
+
session_ids=[sid],
|
|
902
|
+
)
|
|
891
903
|
write_part(settings.trajectory_parquet_path, df)
|
|
892
904
|
written_box[0] += len(all_rows)
|
|
893
905
|
processed_sessions.add(sid)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""claude-sql binary: cyclopts CLI + entry point."""
|