PyPI - claude-sql - Versions diffs - 1.0.0__tar.gz → 1.0.1__tar.gz - Mend

claude-sql 1.0.0 (tar.gz) → 1.0.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

{claude_sql-1.0.0 → claude_sql-1.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: claude-sql
-Version: 1.0.0
+Version: 1.0.1
 Summary: Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts.
 Keywords: claude,claude-code,anthropic,duckdb,sql,semantic-search,embeddings,bedrock,transcripts,analytics,observability
 Author: Laith Al-Saadoon
@@ -373,7 +373,6 @@ Every option is configurable via `CLAUDE_SQL_*`:
 | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
 | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
 | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
-| `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
 | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
 | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
 | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |

{claude_sql-1.0.0 → claude_sql-1.0.1}/README.md RENAMED Viewed

@@ -325,7 +325,6 @@ Every option is configurable via `CLAUDE_SQL_*`:
 | `CLAUDE_SQL_OUTPUT_DIMENSION` | `1024` | Matryoshka embedding dimension |
 | `CLAUDE_SQL_EMBED_CONCURRENCY` | `8` | Parallel Cohere Embed v4 calls (global CRIS) |
 | `CLAUDE_SQL_LLM_CONCURRENCY` | `2` | Parallel Sonnet 4.6 calls (global CRIS) |
-| `CLAUDE_SQL_CONCURRENCY` | `None` | DEPRECATED single knob — aliases onto both pipelines with a warning |
 | `CLAUDE_SQL_BATCH_SIZE` | `96` | Cohere batch size |
 | `CLAUDE_SQL_EMBEDDINGS_PARQUET_PATH` | `~/.claude/embeddings/` | Embeddings cache (sharded directory of `part-*.parquet`) |
 | `CLAUDE_SQL_USER_FRICTION_PARQUET_PATH` | `~/.claude/user_friction/` | Friction cache (sharded) |

{claude_sql-1.0.0 → claude_sql-1.0.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "claude-sql"
-version = "1.0.0"
+version = "1.0.1"
 description = "Zero-copy SQL + semantic search + LLM analytics over ~/.claude/ transcripts."
 readme = "README.md"
 license = { text = "Apache-2.0" }

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/checkpointer.py RENAMED Viewed

@@ -207,6 +207,8 @@ def _migrate_from_duckdb_if_present(new_path: Path) -> None:
                     )
                 con.execute("COMMIT")
             except Exception:
+                # Rollback-and-reraise: any mid-bulk-INSERT failure must abort
+                # the txn cleanly and surface to the caller.
                 con.execute("ROLLBACK")
                 raise
     finally:

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/config.py RENAMED Viewed

@@ -8,7 +8,6 @@ Defaults are picked for a single-user devbox install pointing at
 from __future__ import annotations
 import os
-import warnings
 from pathlib import Path
 from typing import Literal, Self
@@ -179,10 +178,6 @@ class Settings(BaseSettings):
     #: concurrency=16 scales that linearly with negligible throttle.
     #: Drop to 2–4 if a future model has a smaller TPM bucket.
     llm_concurrency: int = 16
-    #: DEPRECATED: use ``embed_concurrency`` / ``llm_concurrency``. Kept for
-    #: one release as a back-compat alias — when set explicitly (env or
-    #: kwarg), it overrides both. Removed once downstream callers migrate.
-    concurrency: int | None = None
     batch_size: int = 96
     embeddings_parquet_path: Path = Field(default_factory=_default_embeddings_parquet)
@@ -381,32 +376,6 @@ class Settings(BaseSettings):
         )
         return self
-    @model_validator(mode="after")
-    def _resolve_concurrency_alias(self) -> Settings:
-        """Honor the deprecated ``concurrency`` field as an alias for both pipelines.
-        When ``concurrency`` is set explicitly (env or kwarg) and the modern
-        per-pipeline fields are at their defaults, mirror it onto both. We
-        only override when the user clearly didn't set the new fields, so
-        ``embed_concurrency=8, concurrency=4`` keeps the explicit 8.
-        """
-        if self.concurrency is None:
-            return self
-        warnings.warn(
-            "CLAUDE_SQL_CONCURRENCY / Settings.concurrency is deprecated. "
-            "Use CLAUDE_SQL_EMBED_CONCURRENCY (default 8) and "
-            "CLAUDE_SQL_LLM_CONCURRENCY (default 2) instead. The single "
-            "knob will be removed in the next release.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        # Only apply the alias to fields still at their default value.
-        if self.embed_concurrency == 8:
-            object.__setattr__(self, "embed_concurrency", self.concurrency)
-        if self.llm_concurrency == 16:
-            object.__setattr__(self, "llm_concurrency", self.concurrency)
-        return self
     @property
     def active_model_id(self) -> str:
         """Return the Bedrock embedding model ID (kept as a property for call-site stability)."""

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/embed_worker.py RENAMED Viewed

@@ -22,9 +22,7 @@ from datetime import UTC, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
-import boto3
 import polars as pl
-from botocore.config import Config as BotoConfig
 from botocore.exceptions import (
     ClientError,
     ConnectionError as BotoConnectionError,
@@ -42,6 +40,7 @@ from tenacity import (
 from claude_sql import lance_store
 from claude_sql.config import Settings
+from claude_sql.llm_shared import _build_bedrock_client
 from claude_sql.logging_setup import loguru_before_sleep
 if TYPE_CHECKING:
@@ -179,31 +178,6 @@ def discover_unembedded(
     return pairs
-def _build_bedrock_client(settings: Settings) -> Any:
-    """Construct a boto3 ``bedrock-runtime`` client from settings.
-    Parameters
-    ----------
-    settings
-        Application settings providing the target AWS region.
-    Returns
-    -------
-    botocore client
-        A low-level ``bedrock-runtime`` client.
-    """
-    # Disable botocore's internal retry layer so tenacity sees throttling
-    # immediately — otherwise botocore silently absorbs 4 retries and our
-    # retry policy never kicks in.  Also bump read_timeout for large batches.
-    boto_cfg = BotoConfig(
-        region_name=settings.region,
-        retries={"max_attempts": 0, "mode": "standard"},
-        read_timeout=60,
-        connect_timeout=10,
-    )
-    return boto3.client("bedrock-runtime", config=boto_cfg)
 @retry(
     # Cohere Embed v4 on Bedrock has a strict TPM bucket that replenishes over
     # tens of seconds; wait up to 60s between attempts and try up to 10 times

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/parquet_shards.py RENAMED Viewed

@@ -40,10 +40,12 @@ Public API
 from __future__ import annotations
 import time
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Any
 import polars as pl
+from loguru import logger
 #: Glob pattern for shard part files within a sharded cache directory.
 PART_GLOB: str = "part-*.parquet"
@@ -162,11 +164,90 @@ def count_rows(target: Path) -> int:
     return total
+def replace_sessions(
+    target: Path,
+    *,
+    key_column: str,
+    session_ids: Iterable[str],
+) -> int:
+    """Drop rows whose ``key_column`` is in ``session_ids`` across every shard.
+    Context
+    -------
+    Workers like :mod:`claude_sql.trajectory_worker` gate computation on a
+    ``(session_id, latest_ts, message_count)`` checkpoint that advances when
+    a session grows. When the checkpoint admits a session for re-scoring,
+    the pipeline rewrites rows it may have already produced under an earlier
+    shard — without this helper those prior rows accumulate and every
+    ``(session_id, prev_uuid, curr_uuid)`` pair duplicates on rerun.
+    Behavior
+    --------
+    * Shards containing *some* rows for ``session_ids`` are rewritten in place
+      with those rows filtered out; other sessions' rows are preserved.
+    * Shards that become empty are unlinked — leaving empty part files causes
+      DuckDB's ``read_parquet`` glob to bind a zero-row parquet and surfaces
+      no other harm, but the file is unreachable as data so we remove it.
+    * A shard with no matching rows is left untouched (cheap footer read).
+    * The legacy single-file branch mirrors the same shape: filter → rewrite
+      (or unlink when the filter empties the file).
+    Returns the total number of rows removed across the cache. Returns 0 on
+    an empty cache, a missing file, or an empty ``session_ids``.
+    """
+    ids = set(session_ids)
+    if not ids:
+        return 0
+    parts = iter_part_files(target)
+    if not parts:
+        return 0
+    removed_total = 0
+    for part in parts:
+        try:
+            df = pl.read_parquet(part)
+        except (OSError, pl.exceptions.ComputeError) as exc:
+            # A truncated or unreadable shard is worth flagging — don't
+            # silently let it block the replace. Leave it on disk; the
+            # caller's next write still lands, and the analytics view will
+            # surface the unreadable file the next time it binds.
+            logger.warning("replace_sessions: unreadable shard {} ({}); skipping", part, exc)
+            continue
+        if key_column not in df.columns or df.height == 0:
+            continue
+        mask = df[key_column].is_in(list(ids))
+        hit_count = int(mask.sum())
+        if hit_count == 0:
+            continue
+        removed_total += hit_count
+        kept = df.filter(~mask)
+        if kept.height == 0:
+            # Nothing left in this shard — remove it so the cache doesn't
+            # accumulate empty part files across reruns.
+            try:
+                part.unlink()
+            except OSError as exc:
+                logger.warning("replace_sessions: failed to unlink empty shard {}: {}", part, exc)
+            continue
+        # Rewrite in place. The legacy single-file branch lands here too
+        # (``iter_part_files`` returns ``[target]`` in that case), which is
+        # correct — we want to overwrite the same file.
+        kept.write_parquet(part)
+    if removed_total:
+        logger.info(
+            "replace_sessions: dropped {} row(s) for {} session(s) under {}",
+            removed_total,
+            len(ids),
+            target,
+        )
+    return removed_total
 __all__ = [
     "PART_GLOB",
     "count_rows",
     "is_sharded_dir",
     "iter_part_files",
     "read_all",
+    "replace_sessions",
     "write_part",
 ]

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/sql_views.py RENAMED Viewed

@@ -57,8 +57,8 @@ SUBAGENT_META_GLOB: str = os.path.expanduser("~/.claude/projects/*/*/subagents/a
 # Business-level views emitted by ``register_views``. Used by the
 # ``claude-sql schema`` subcommand for schema dumps.  Includes the v2
-# analytics view names at the tail so ``describe_all`` can enumerate them
-# once :func:`register_analytics` has populated the corresponding parquets.
+# analytics view names at the tail; the schema dump materializes only
+# rows where :func:`register_analytics` has populated the matching parquets.
 VIEW_NAMES: tuple[str, ...] = (
     "sessions",
     "messages",
@@ -73,7 +73,6 @@ VIEW_NAMES: tuple[str, ...] = (
     "task_creations",
     "task_updates",
     "tasks_state_current",
-    "task_spawns",
     "skill_invocations",
     "subagent_sessions",
     "subagent_messages",
@@ -109,11 +108,11 @@ VIEW_NAMES: tuple[str, ...] = (
 # they're correctly omitted because the source of truth for those views
 # is the parquet, not this dict.
 #
-# Drift is caught by :func:`tests.test_sql_views.test_view_schema_matches_describe_all`,
-# which registers the v1 views over the fixture corpus, runs
-# :func:`describe_all`, and asserts column-level equality with this dict.
-# A contributor who edits view DDL without updating ``VIEW_SCHEMA`` gets
-# a hard CI failure rather than a runtime mystery.
+# Drift is caught by :func:`tests.test_sql_views.test_view_schema_matches_describe_inline`,
+# which registers the v1 views over the fixture corpus, runs ``DESCRIBE``
+# inline per view, and asserts column-level equality with this dict. A
+# contributor who edits view DDL without updating ``VIEW_SCHEMA`` gets a
+# hard CI failure rather than a runtime mystery.
 VIEW_SCHEMA: dict[str, tuple[tuple[str, str], ...]] = {
     "sessions": (
         ("session_id", "VARCHAR"),
@@ -251,16 +250,6 @@ VIEW_SCHEMA: dict[str, tuple[tuple[str, str], ...]] = {
         ("created_at", "TIMESTAMP"),
         ("last_updated_at", "TIMESTAMP"),
     ),
-    "task_spawns": (
-        ("session_id", "VARCHAR"),
-        ("spawned_at", "TIMESTAMP"),
-        ("message_uuid", "VARCHAR"),
-        ("tool_use_id", "VARCHAR"),
-        ("spawn_tool", "VARCHAR"),
-        ("subagent_type", "VARCHAR"),
-        ("description", "VARCHAR"),
-        ("prompt", "VARCHAR"),
-    ),
     "skill_invocations": (
         ("session_id", "VARCHAR"),
         ("ts", "TIMESTAMP"),
@@ -421,7 +410,7 @@ def _sql_str(value: str) -> str:
 # Drift discipline: when a downstream view in :func:`register_views` adds a
 # new top-level field reference, add it here too — otherwise the view will
 # silently return NULL for that column. The
-# ``test_view_schema_matches_describe_all`` drift test catches the column
+# ``test_view_schema_matches_describe_inline`` drift test catches the column
 # disappearing from any of the 18 v1 views.
 _MESSAGE_STRUCT_TYPE: str = (
     "STRUCT("
@@ -610,6 +599,7 @@ def register_raw(
         )
         logger.debug("Registered v_raw_subagent_meta from glob {}", subagent_meta_glob)
     except Exception:
+        # register-or-fail-loud — any DuckDB error must surface to the caller.
         logger.exception("Failed to register raw views")
         raise
@@ -626,8 +616,7 @@ def register_views(con: duckdb.DuckDBPyConnection) -> None:
     ``sessions``, ``messages``, ``content_blocks``, ``messages_text``,
     ``tool_calls``, ``tool_results``, ``todo_events``, ``todo_state_current``,
     ``subagent_spawns``, ``task_creations``, ``task_updates``,
-    ``tasks_state_current``, ``task_spawns`` (deprecated alias),
-    ``subagent_sessions``, ``subagent_messages``.
+    ``tasks_state_current``, ``subagent_sessions``, ``subagent_messages``.
     The split between ``subagent_spawns`` and ``task_creations`` reflects
     the Claude Code v2.1.63 ``Task``→``Agent`` rename and the v2.1.16
@@ -973,30 +962,6 @@ def register_views(con: duckdb.DuckDBPyConnection) -> None:
         )
         logger.debug("Registered view: tasks_state_current")
-        # DEPRECATED: ``task_spawns`` predates the Task→Agent rename (v2.1.63)
-        # and the TodoWrite→TaskCreate split (v2.1.16). It conflated subagent
-        # launchers with task-tracker creation. Kept as a UNION ALL alias for
-        # one release; new analytics should use ``subagent_spawns`` or
-        # ``task_creations`` directly. Removed in the next minor release.
-        con.execute(
-            """
-            CREATE OR REPLACE VIEW task_spawns AS
-            SELECT
-                session_id, spawned_at, message_uuid, tool_use_id,
-                spawn_tool, subagent_type, description, prompt
-            FROM subagent_spawns
-            UNION ALL
-            SELECT
-                session_id, created_at AS spawned_at, message_uuid, tool_use_id,
-                create_tool AS spawn_tool,
-                NULL AS subagent_type,
-                description,
-                NULL AS prompt
-            FROM task_creations;
-            """
-        )
-        logger.debug("Registered view: task_spawns (deprecated)")
         # Every Skill / slash-command invocation observable in the transcripts,
         # unioned across the two shapes they take:
         #
@@ -1108,6 +1073,7 @@ def register_views(con: duckdb.DuckDBPyConnection) -> None:
         )
         logger.debug("Registered view: subagent_messages")
     except Exception:
+        # register-or-fail-loud — any DuckDB error must surface to the caller.
         logger.exception("Failed to register derived views")
         raise
@@ -2148,49 +2114,6 @@ def register_all(
 # ---------------------------------------------------------------------------
-def describe_all(con: duckdb.DuckDBPyConnection) -> dict[str, list[tuple[str, str]]]:
-    """Return the column schema of every business-level view.
-    .. deprecated::
-        Use :data:`VIEW_SCHEMA` for static introspection.  ``describe_all``
-        opens a DuckDB connection and runs ``DESCRIBE`` per view, which on
-        the live corpus takes ~14 s -- prohibitive for the agent-facing
-        ``schema`` command.  Kept for one release as a fallback and as the
-        ground truth used by the ``test_view_schema_matches_describe_all``
-        drift test; it will be removed once that test moves to a different
-        introspection path.
-    Parameters
-    ----------
-    con
-        Open DuckDB connection with views registered.
-    Returns
-    -------
-    dict
-        ``{view_name: [(column_name, column_type), ...]}``. Views that fail to
-        describe (e.g. missing because ``register_views`` was not called) map
-        to an empty list and emit a warning.
-    """
-    import warnings
-    warnings.warn(
-        "describe_all is deprecated; use VIEW_SCHEMA for static "
-        "introspection. Will be removed in a future release.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
-    out: dict[str, list[tuple[str, str]]] = {}
-    for name in VIEW_NAMES:
-        try:
-            rows = con.execute(f"DESCRIBE {name}").fetchall()
-            out[name] = [(str(r[0]), str(r[1])) for r in rows]
-        except duckdb.Error as exc:
-            logger.warning("Could not describe {}: {}", name, exc)
-            out[name] = []
-    return out
 def list_macros(con: duckdb.DuckDBPyConnection) -> list[tuple[str, tuple[str, ...]]]:
     """Return ``(name, params)`` for every registered macro.

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/trajectory_worker.py RENAMED Viewed

@@ -51,7 +51,7 @@ from claude_sql.llm_shared import (
     classify_one,
     pipeline_cache_stats,
 )
-from claude_sql.parquet_shards import iter_part_files, write_part
+from claude_sql.parquet_shards import iter_part_files, replace_sessions, write_part
 from claude_sql.schemas import TRAJECTORY_ARRAY_SCHEMA
 from claude_sql.session_text import session_bounds
@@ -886,8 +886,20 @@ async def _trajectory_async(
             # don't collide on filenames — but we still keep the lock so the
             # in-memory ``written_box`` / ``processed_sessions`` set updates
             # in lockstep with the on-disk write.
+            #
+            # replace_sessions drops any prior rows for ``sid`` still sitting
+            # in the cache from earlier runs. The checkpointer gates
+            # computation on advancing (latest_ts, message_count) bounds but
+            # does NOT touch the parquet cache; without this step a growing
+            # active session duplicates its (prev_uuid, curr_uuid) pairs
+            # on every rerun. See GH #45.
             df = pl.DataFrame(all_rows, schema=_PARQUET_SCHEMA)
             async with write_lock:
+                replace_sessions(
+                    settings.trajectory_parquet_path,
+                    key_column="session_id",
+                    session_ids=[sid],
+                )
                 write_part(settings.trajectory_parquet_path, df)
                 written_box[0] += len(all_rows)
                 processed_sessions.add(sid)

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/__init__.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/binding.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/blind_handover.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/classify_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/cli.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/cluster_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/community_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/conflicts_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/freeze.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/friction_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/home.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/ingest.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/install_source.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/judge_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/judges.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/kappa_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/lance_store.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/llm_shared.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/logging_setup.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/output.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/retry_queue.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/review_sheet_render.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/review_sheet_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/schemas.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/session_text.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/skills_catalog.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/terms_worker.py RENAMED Viewed

File without changes

{claude_sql-1.0.0 → claude_sql-1.0.1}/src/claude_sql/ungrounded_worker.py RENAMED Viewed

File without changes

claude-sql 1.0.0__tar.gz → 1.0.1__tar.gz

claude-sql 1.0.0 (tar.gz) → 1.0.1 (tar.gz)