claude-sql 1.0.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {claude_sql-1.0.0 → claude_sql-1.0.1}/PKG-INFO +1 -2
  2. {claude_sql-1.0.0 → claude_sql-1.0.1}/README.md +0 -1
  3. {claude_sql-1.0.0 → claude_sql-1.0.1}/pyproject.toml +1 -1
  4. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/checkpointer.py +2 -0
  5. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/config.py +0 -31
  6. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/embed_worker.py +1 -27
  7. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/parquet_shards.py +81 -0
  8. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/sql_views.py +11 -88
  9. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/trajectory_worker.py +13 -1
  10. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/__init__.py +0 -0
  11. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/binding.py +0 -0
  12. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/blind_handover.py +0 -0
  13. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/classify_worker.py +0 -0
  14. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/cli.py +0 -0
  15. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/cluster_worker.py +0 -0
  16. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/community_worker.py +0 -0
  17. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/conflicts_worker.py +0 -0
  18. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/freeze.py +0 -0
  19. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/friction_worker.py +0 -0
  20. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/home.py +0 -0
  21. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/ingest.py +0 -0
  22. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/install_source.py +0 -0
  23. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/judge_worker.py +0 -0
  24. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/judges.py +0 -0
  25. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/kappa_worker.py +0 -0
  26. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/lance_store.py +0 -0
  27. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/llm_shared.py +0 -0
  28. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/logging_setup.py +0 -0
  29. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/output.py +0 -0
  30. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/retry_queue.py +0 -0
  31. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/review_sheet_render.py +0 -0
  32. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/review_sheet_worker.py +0 -0
  33. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/schemas.py +0 -0
  34. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/session_text.py +0 -0
  35. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/skills_catalog.py +0 -0
  36. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/terms_worker.py +0 -0
  37. {claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/ungrounded_worker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: claude-sql
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
5
5
  Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
6
6
  Author: Laith Al-Saadoon
@@ -373,7 +373,6 @@ Every option is configurable via `CLAUDE_SQL_*`:
373
373
  | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
374
374
  | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
375
375
  | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
376
- | `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
377
376
  | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
378
377
  | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
379
378
  | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
@@ -325,7 +325,6 @@ Every option is configurable via `CLAUDE_SQL_*`:
325
325
  | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
326
326
  | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
327
327
  | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
328
- | `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
329
328
  | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
330
329
  | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
331
330
  | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "claude-sql"
3
- version = "1.0.0"
3
+ version = "1.0.1"
4
4
  description = "Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts."
5
5
  readme = "README.md"
6
6
  license = { text = "Apache-2.0" }
@@ -207,6 +207,8 @@ def _migrate_from_duckdb_if_present(new_path: Path) -> None:
207
207
  )
208
208
  con.execute("COMMIT")
209
209
  except Exception:
210
+ # Rollback-and-reraise: any mid-bulk-INSERT failure must abort
211
+ # the txn cleanly and surface to the caller.
210
212
  con.execute("ROLLBACK")
211
213
  raise
212
214
  finally:
@@ -8,7 +8,6 @@ Defaults are picked for a single-user devbox install pointing at
8
8
  from __future__ import annotations
9
9
 
10
10
  import os
11
- import warnings
12
11
  from pathlib import Path
13
12
  from typing import Literal, Self
14
13
 
@@ -179,10 +178,6 @@ class Settings(BaseSettings):
179
178
  #: concurrency=16 scales that linearly with negligible throttle.
180
179
  #: Drop to 2–4 if a future model has a smaller TPM bucket.
181
180
  llm_concurrency: int = 16
182
- #: DEPRECATED: use ``embed_concurrency`` / ``llm_concurrency``. Kept for
183
- #: one release as a back-compat alias — when set explicitly (env or
184
- #: kwarg), it overrides both. Removed once downstream callers migrate.
185
- concurrency: int | None = None
186
181
  batch_size: int = 96
187
182
 
188
183
  embeddings_parquet_path: Path = Field(default_factory=_default_embeddings_parquet)
@@ -381,32 +376,6 @@ class Settings(BaseSettings):
381
376
  )
382
377
  return self
383
378
 
384
- @model_validator(mode="after")
385
- def _resolve_concurrency_alias(self) -> Settings:
386
- """Honor the deprecated ``concurrency`` field as an alias for both pipelines.
387
-
388
- When ``concurrency`` is set explicitly (env or kwarg) and the modern
389
- per-pipeline fields are at their defaults, mirror it onto both. We
390
- only override when the user clearly didn't set the new fields, so
391
- ``embed_concurrency=8, concurrency=4`` keeps the explicit 8.
392
- """
393
- if self.concurrency is None:
394
- return self
395
- warnings.warn(
396
- "CLAUDE_SQL_CONCURRENCY / Settings.concurrency is deprecated. "
397
- "Use CLAUDE_SQL_EMBED_CONCURRENCY (default 8) and "
398
- "CLAUDE_SQL_LLM_CONCURRENCY (default 2) instead. The single "
399
- "knob will be removed in the next release.",
400
- DeprecationWarning,
401
- stacklevel=2,
402
- )
403
- # Only apply the alias to fields still at their default value.
404
- if self.embed_concurrency == 8:
405
- object.__setattr__(self, "embed_concurrency", self.concurrency)
406
- if self.llm_concurrency == 16:
407
- object.__setattr__(self, "llm_concurrency", self.concurrency)
408
- return self
409
-
410
379
  @property
411
380
  def active_model_id(self) -> str:
412
381
  """Return the Bedrock embedding model ID (kept as a property for call-site stability)."""
@@ -22,9 +22,7 @@ from datetime import UTC, datetime
22
22
  from pathlib import Path
23
23
  from typing import TYPE_CHECKING, Any
24
24
 
25
- import boto3
26
25
  import polars as pl
27
- from botocore.config import Config as BotoConfig
28
26
  from botocore.exceptions import (
29
27
  ClientError,
30
28
  ConnectionError as BotoConnectionError,
@@ -42,6 +40,7 @@ from tenacity import (
42
40
 
43
41
  from claude_sql import lance_store
44
42
  from claude_sql.config import Settings
43
+ from claude_sql.llm_shared import _build_bedrock_client
45
44
  from claude_sql.logging_setup import loguru_before_sleep
46
45
 
47
46
  if TYPE_CHECKING:
@@ -179,31 +178,6 @@ def discover_unembedded(
179
178
  return pairs
180
179
 
181
180
 
182
- def _build_bedrock_client(settings: Settings) -> Any:
183
- """Construct a boto3 ``bedrock-runtime`` client from settings.
184
-
185
- Parameters
186
- ----------
187
- settings
188
- Application settings providing the target AWS region.
189
-
190
- Returns
191
- -------
192
- botocore client
193
- A low-level ``bedrock-runtime`` client.
194
- """
195
- # Disable botocore's internal retry layer so tenacity sees throttling
196
- # immediately — otherwise botocore silently absorbs 4 retries and our
197
- # retry policy never kicks in. Also bump read_timeout for large batches.
198
- boto_cfg = BotoConfig(
199
- region_name=settings.region,
200
- retries={"max_attempts": 0, "mode": "standard"},
201
- read_timeout=60,
202
- connect_timeout=10,
203
- )
204
- return boto3.client("bedrock-runtime", config=boto_cfg)
205
-
206
-
207
181
  @retry(
208
182
  # Cohere Embed v4 on Bedrock has a strict TPM bucket that replenishes over
209
183
  # tens of seconds; wait up to 60s between attempts and try up to 10 times
@@ -40,10 +40,12 @@ Public API
40
40
  from __future__ import annotations
41
41
 
42
42
  import time
43
+ from collections.abc import Iterable
43
44
  from pathlib import Path
44
45
  from typing import Any
45
46
 
46
47
  import polars as pl
48
+ from loguru import logger
47
49
 
48
50
  #: Glob pattern for shard part files within a sharded cache directory.
49
51
  PART_GLOB: str = "part-*.parquet"
@@ -162,11 +164,90 @@ def count_rows(target: Path) -> int:
162
164
  return total
163
165
 
164
166
 
167
+ def replace_sessions(
168
+ target: Path,
169
+ *,
170
+ key_column: str,
171
+ session_ids: Iterable[str],
172
+ ) -> int:
173
+ """Drop rows whose ``key_column`` is in ``session_ids`` across every shard.
174
+
175
+ Context
176
+ -------
177
+ Workers like :mod:`claude_sql.trajectory_worker` gate computation on a
178
+ ``(session_id, latest_ts, message_count)`` checkpoint that advances when
179
+ a session grows. When the checkpoint admits a session for re-scoring,
180
+ the pipeline rewrites rows it may have already produced under an earlier
181
+ shard — without this helper those prior rows accumulate and every
182
+ ``(session_id, prev_uuid, curr_uuid)`` pair duplicates on rerun.
183
+
184
+ Behavior
185
+ --------
186
+ * Shards containing *some* rows for ``session_ids`` are rewritten in place
187
+ with those rows filtered out; other sessions' rows are preserved.
188
+ * Shards that become empty are unlinked — leaving empty part files causes
189
+ DuckDB's ``read_parquet`` glob to bind a zero-row parquet and surfaces
190
+ no other harm, but the file is unreachable as data so we remove it.
191
+ * A shard with no matching rows is left untouched (cheap footer read).
192
+ * The legacy single-file branch mirrors the same shape: filter → rewrite
193
+ (or unlink when the filter empties the file).
194
+
195
+ Returns the total number of rows removed across the cache. Returns 0 on
196
+ an empty cache, a missing file, or an empty ``session_ids``.
197
+ """
198
+ ids = set(session_ids)
199
+ if not ids:
200
+ return 0
201
+ parts = iter_part_files(target)
202
+ if not parts:
203
+ return 0
204
+ removed_total = 0
205
+ for part in parts:
206
+ try:
207
+ df = pl.read_parquet(part)
208
+ except (OSError, pl.exceptions.ComputeError) as exc:
209
+ # A truncated or unreadable shard is worth flagging — don't
210
+ # silently let it block the replace. Leave it on disk; the
211
+ # caller's next write still lands, and the analytics view will
212
+ # surface the unreadable file the next time it binds.
213
+ logger.warning("replace_sessions: unreadable shard {} ({}); skipping", part, exc)
214
+ continue
215
+ if key_column not in df.columns or df.height == 0:
216
+ continue
217
+ mask = df[key_column].is_in(list(ids))
218
+ hit_count = int(mask.sum())
219
+ if hit_count == 0:
220
+ continue
221
+ removed_total += hit_count
222
+ kept = df.filter(~mask)
223
+ if kept.height == 0:
224
+ # Nothing left in this shard — remove it so the cache doesn't
225
+ # accumulate empty part files across reruns.
226
+ try:
227
+ part.unlink()
228
+ except OSError as exc:
229
+ logger.warning("replace_sessions: failed to unlink empty shard {}: {}", part, exc)
230
+ continue
231
+ # Rewrite in place. The legacy single-file branch lands here too
232
+ # (``iter_part_files`` returns ``[target]`` in that case), which is
233
+ # correct — we want to overwrite the same file.
234
+ kept.write_parquet(part)
235
+ if removed_total:
236
+ logger.info(
237
+ "replace_sessions: dropped {} row(s) for {} session(s) under {}",
238
+ removed_total,
239
+ len(ids),
240
+ target,
241
+ )
242
+ return removed_total
243
+
244
+
165
245
  __all__ = [
166
246
  "PART_GLOB",
167
247
  "count_rows",
168
248
  "is_sharded_dir",
169
249
  "iter_part_files",
170
250
  "read_all",
251
+ "replace_sessions",
171
252
  "write_part",
172
253
  ]
@@ -57,8 +57,8 @@ SUBAGENT_META_GLOB: str = os.path.expanduser("~/.claude/projects/*/*/subagents/a
57
57
 
58
58
  # Business-level views emitted by ``register_views``. Used by the
59
59
  # ``claude-sql schema`` subcommand for schema dumps. Includes the v2
60
- # analytics view names at the tail so ``describe_all`` can enumerate them
61
- # once :func:`register_analytics` has populated the corresponding parquets.
60
+ # analytics view names at the tail; the schema dump materializes only
61
+ # rows where :func:`register_analytics` has populated the matching parquets.
62
62
  VIEW_NAMES: tuple[str, ...] = (
63
63
  "sessions",
64
64
  "messages",
@@ -73,7 +73,6 @@ VIEW_NAMES: tuple[str, ...] = (
73
73
  "task_creations",
74
74
  "task_updates",
75
75
  "tasks_state_current",
76
- "task_spawns",
77
76
  "skill_invocations",
78
77
  "subagent_sessions",
79
78
  "subagent_messages",
@@ -109,11 +108,11 @@ VIEW_NAMES: tuple[str, ...] = (
109
108
  # they're correctly omitted because the source of truth for those views
110
109
  # is the parquet, not this dict.
111
110
  #
112
- # Drift is caught by :func:`tests.test_sql_views.test_view_schema_matches_describe_all`,
113
- # which registers the v1 views over the fixture corpus, runs
114
- # :func:`describe_all`, and asserts column-level equality with this dict.
115
- # A contributor who edits view DDL without updating ``VIEW_SCHEMA`` gets
116
- # a hard CI failure rather than a runtime mystery.
111
+ # Drift is caught by :func:`tests.test_sql_views.test_view_schema_matches_describe_inline`,
112
+ # which registers the v1 views over the fixture corpus, runs ``DESCRIBE``
113
+ # inline per view, and asserts column-level equality with this dict. A
114
+ # contributor who edits view DDL without updating ``VIEW_SCHEMA`` gets a
115
+ # hard CI failure rather than a runtime mystery.
117
116
  VIEW_SCHEMA: dict[str, tuple[tuple[str, str], ...]] = {
118
117
  "sessions": (
119
118
  ("session_id", "VARCHAR"),
@@ -251,16 +250,6 @@ VIEW_SCHEMA: dict[str, tuple[tuple[str, str], ...]] = {
251
250
  ("created_at", "TIMESTAMP"),
252
251
  ("last_updated_at", "TIMESTAMP"),
253
252
  ),
254
- "task_spawns": (
255
- ("session_id", "VARCHAR"),
256
- ("spawned_at", "TIMESTAMP"),
257
- ("message_uuid", "VARCHAR"),
258
- ("tool_use_id", "VARCHAR"),
259
- ("spawn_tool", "VARCHAR"),
260
- ("subagent_type", "VARCHAR"),
261
- ("description", "VARCHAR"),
262
- ("prompt", "VARCHAR"),
263
- ),
264
253
  "skill_invocations": (
265
254
  ("session_id", "VARCHAR"),
266
255
  ("ts", "TIMESTAMP"),
@@ -421,7 +410,7 @@ def _sql_str(value: str) -> str:
421
410
  # Drift discipline: when a downstream view in :func:`register_views` adds a
422
411
  # new top-level field reference, add it here too — otherwise the view will
423
412
  # silently return NULL for that column. The
424
- # ``test_view_schema_matches_describe_all`` drift test catches the column
413
+ # ``test_view_schema_matches_describe_inline`` drift test catches the column
425
414
  # disappearing from any of the 18 v1 views.
426
415
  _MESSAGE_STRUCT_TYPE: str = (
427
416
  "STRUCT("
@@ -610,6 +599,7 @@ def register_raw(
610
599
  )
611
600
  logger.debug("Registered v_raw_subagent_meta from glob {}", subagent_meta_glob)
612
601
  except Exception:
602
+ # register-or-fail-loud — any DuckDB error must surface to the caller.
613
603
  logger.exception("Failed to register raw views")
614
604
  raise
615
605
 
@@ -626,8 +616,7 @@ def register_views(con: duckdb.DuckDBPyConnection) -> None:
626
616
  ``sessions``, ``messages``, ``content_blocks``, ``messages_text``,
627
617
  ``tool_calls``, ``tool_results``, ``todo_events``, ``todo_state_current``,
628
618
  ``subagent_spawns``, ``task_creations``, ``task_updates``,
629
- ``tasks_state_current``, ``task_spawns`` (deprecated alias),
630
- ``subagent_sessions``, ``subagent_messages``.
619
+ ``tasks_state_current``, ``subagent_sessions``, ``subagent_messages``.
631
620
 
632
621
  The split between ``subagent_spawns`` and ``task_creations`` reflects
633
622
  the Claude Code v2.1.63 ``Task``→``Agent`` rename and the v2.1.16
@@ -973,30 +962,6 @@ def register_views(con: duckdb.DuckDBPyConnection) -> None:
973
962
  )
974
963
  logger.debug("Registered view: tasks_state_current")
975
964
 
976
- # DEPRECATED: ``task_spawns`` predates the Task→Agent rename (v2.1.63)
977
- # and the TodoWrite→TaskCreate split (v2.1.16). It conflated subagent
978
- # launchers with task-tracker creation. Kept as a UNION ALL alias for
979
- # one release; new analytics should use ``subagent_spawns`` or
980
- # ``task_creations`` directly. Removed in the next minor release.
981
- con.execute(
982
- """
983
- CREATE OR REPLACE VIEW task_spawns AS
984
- SELECT
985
- session_id, spawned_at, message_uuid, tool_use_id,
986
- spawn_tool, subagent_type, description, prompt
987
- FROM subagent_spawns
988
- UNION ALL
989
- SELECT
990
- session_id, created_at AS spawned_at, message_uuid, tool_use_id,
991
- create_tool AS spawn_tool,
992
- NULL AS subagent_type,
993
- description,
994
- NULL AS prompt
995
- FROM task_creations;
996
- """
997
- )
998
- logger.debug("Registered view: task_spawns (deprecated)")
999
-
1000
965
  # Every Skill / slash-command invocation observable in the transcripts,
1001
966
  # unioned across the two shapes they take:
1002
967
  #
@@ -1108,6 +1073,7 @@ def register_views(con: duckdb.DuckDBPyConnection) -> None:
1108
1073
  )
1109
1074
  logger.debug("Registered view: subagent_messages")
1110
1075
  except Exception:
1076
+ # register-or-fail-loud — any DuckDB error must surface to the caller.
1111
1077
  logger.exception("Failed to register derived views")
1112
1078
  raise
1113
1079
 
@@ -2148,49 +2114,6 @@ def register_all(
2148
2114
  # ---------------------------------------------------------------------------
2149
2115
 
2150
2116
 
2151
- def describe_all(con: duckdb.DuckDBPyConnection) -> dict[str, list[tuple[str, str]]]:
2152
- """Return the column schema of every business-level view.
2153
-
2154
- .. deprecated::
2155
- Use :data:`VIEW_SCHEMA` for static introspection. ``describe_all``
2156
- opens a DuckDB connection and runs ``DESCRIBE`` per view, which on
2157
- the live corpus takes ~14 s -- prohibitive for the agent-facing
2158
- ``schema`` command. Kept for one release as a fallback and as the
2159
- ground truth used by the ``test_view_schema_matches_describe_all``
2160
- drift test; it will be removed once that test moves to a different
2161
- introspection path.
2162
-
2163
- Parameters
2164
- ----------
2165
- con
2166
- Open DuckDB connection with views registered.
2167
-
2168
- Returns
2169
- -------
2170
- dict
2171
- ``{view_name: [(column_name, column_type), ...]}``. Views that fail to
2172
- describe (e.g. missing because ``register_views`` was not called) map
2173
- to an empty list and emit a warning.
2174
- """
2175
- import warnings
2176
-
2177
- warnings.warn(
2178
- "describe_all is deprecated; use VIEW_SCHEMA for static "
2179
- "introspection. Will be removed in a future release.",
2180
- DeprecationWarning,
2181
- stacklevel=2,
2182
- )
2183
- out: dict[str, list[tuple[str, str]]] = {}
2184
- for name in VIEW_NAMES:
2185
- try:
2186
- rows = con.execute(f"DESCRIBE {name}").fetchall()
2187
- out[name] = [(str(r[0]), str(r[1])) for r in rows]
2188
- except duckdb.Error as exc:
2189
- logger.warning("Could not describe {}: {}", name, exc)
2190
- out[name] = []
2191
- return out
2192
-
2193
-
2194
2117
  def list_macros(con: duckdb.DuckDBPyConnection) -> list[tuple[str, tuple[str, ...]]]:
2195
2118
  """Return ``(name, params)`` for every registered macro.
2196
2119
 
@@ -51,7 +51,7 @@ from claude_sql.llm_shared import (
51
51
  classify_one,
52
52
  pipeline_cache_stats,
53
53
  )
54
- from claude_sql.parquet_shards import iter_part_files, write_part
54
+ from claude_sql.parquet_shards import iter_part_files, replace_sessions, write_part
55
55
  from claude_sql.schemas import TRAJECTORY_ARRAY_SCHEMA
56
56
  from claude_sql.session_text import session_bounds
57
57
 
@@ -886,8 +886,20 @@ async def _trajectory_async(
886
886
  # don't collide on filenames — but we still keep the lock so the
887
887
  # in-memory ``written_box`` / ``processed_sessions`` set updates
888
888
  # in lockstep with the on-disk write.
889
+ #
890
+ # replace_sessions drops any prior rows for ``sid`` still sitting
891
+ # in the cache from earlier runs. The checkpointer gates
892
+ # computation on advancing (latest_ts, message_count) bounds but
893
+ # does NOT touch the parquet cache; without this step a growing
894
+ # active session duplicates its (prev_uuid, curr_uuid) pairs
895
+ # on every rerun. See GH #45.
889
896
  df = pl.DataFrame(all_rows, schema=_PARQUET_SCHEMA)
890
897
  async with write_lock:
898
+ replace_sessions(
899
+ settings.trajectory_parquet_path,
900
+ key_column="session_id",
901
+ session_ids=[sid],
902
+ )
891
903
  write_part(settings.trajectory_parquet_path, df)
892
904
  written_box[0] += len(all_rows)
893
905
  processed_sessions.add(sid)