PyPI - claude-sql - Versions diffs - 0.4.0__py3-none-any.whl - Mend

claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

claude_sql/__init__.py +5 -0
claude_sql/binding.py +740 -0
claude_sql/blind_handover.py +155 -0
claude_sql/checkpointer.py +202 -0
claude_sql/cli.py +2344 -0
claude_sql/cluster_worker.py +208 -0
claude_sql/community_worker.py +306 -0
claude_sql/config.py +380 -0
claude_sql/embed_worker.py +482 -0
claude_sql/freeze.py +189 -0
claude_sql/friction_worker.py +561 -0
claude_sql/install_source.py +77 -0
claude_sql/judge_worker.py +459 -0
claude_sql/judges.py +239 -0
claude_sql/kappa_worker.py +257 -0
claude_sql/llm_worker.py +1760 -0
claude_sql/logging_setup.py +95 -0
claude_sql/output.py +248 -0
claude_sql/parquet_shards.py +172 -0
claude_sql/retry_queue.py +180 -0
claude_sql/review_sheet_render.py +167 -0
claude_sql/review_sheet_worker.py +463 -0
claude_sql/schemas.py +454 -0
claude_sql/session_text.py +387 -0
claude_sql/skills_catalog.py +354 -0
claude_sql/sql_views.py +1751 -0
claude_sql/terms_worker.py +145 -0
claude_sql/ungrounded_worker.py +190 -0
claude_sql-0.4.0.dist-info/METADATA +530 -0
claude_sql-0.4.0.dist-info/RECORD +32 -0
claude_sql-0.4.0.dist-info/WHEEL +4 -0
claude_sql-0.4.0.dist-info/entry_points.txt +3 -0

claude_sql/config.py ADDED Viewed

@@ -0,0 +1,380 @@
+"""Runtime configuration for claude-sql.
+Pydantic v2 ``BaseSettings`` populated from env vars prefixed with ``CLAUDE_SQL_``.
+Defaults are picked for a single-user devbox install pointing at
+``~/.claude/projects/**/*.jsonl``.
+"""
+from __future__ import annotations
+import os
+import warnings
+from pathlib import Path
+from typing import Literal, Self
+from pydantic import Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+def _default_glob() -> str:
+    # Top-level session transcripts only.  Subagent side-files live one level
+    # deeper under ``<session>/subagents/`` and are discovered via SUBAGENT_GLOB.
+    return os.path.expanduser("~/.claude/projects/*/*.jsonl")
+def _default_subagent_glob() -> str:
+    return os.path.expanduser("~/.claude/projects/*/*/subagents/agent-*.jsonl")
+def _default_subagent_meta_glob() -> str:
+    return os.path.expanduser("~/.claude/projects/*/*/subagents/agent-*.meta.json")
+def _default_embeddings_parquet() -> Path:
+    # Sharded cache directory (see ``claude_sql.parquet_shards``).  Writers
+    # drop ``part-<ts_ns>.parquet`` files into it; readers glob the directory.
+    # The field name keeps the ``_parquet_path`` suffix so existing call sites
+    # stay stable — only the *semantics* of the path moved from "single file"
+    # to "directory of parts".
+    return Path(os.path.expanduser("~/.claude/embeddings/"))
+def _default_classifications_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/session_classifications/"))
+def _default_trajectory_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/message_trajectory/"))
+def _default_conflicts_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/session_conflicts/"))
+def _default_clusters_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/clusters.parquet"))
+def _default_cluster_terms_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/cluster_terms.parquet"))
+def _default_communities_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/session_communities.parquet"))
+def _default_user_friction_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/user_friction/"))
+def _default_skills_catalog_parquet() -> Path:
+    return Path(os.path.expanduser("~/.claude/skills_catalog.parquet"))
+def _default_user_skills_dir() -> Path:
+    return Path(os.path.expanduser("~/.claude/skills"))
+def _default_plugins_cache_dir() -> Path:
+    return Path(os.path.expanduser("~/.claude/plugins/cache"))
+def _default_checkpoint_db() -> Path:
+    return Path(os.path.expanduser("~/.claude/claude_sql.duckdb"))
+def _default_hnsw_db() -> Path:
+    return Path(os.path.expanduser("~/.claude/hnsw.duckdb"))
+def _default_duckdb_temp_dir() -> Path:
+    return Path(os.path.expanduser("~/.claude/duckdb_tmp"))
+def _default_duckdb_threads() -> int:
+    return os.cpu_count() or 4
+# Model pricing per 1M tokens, keyed by model name; each value is an
+# ``(in_rate, out_rate)`` pair.  Mirrors claude-mine/transform.py, so keep the
+# two tables in sync when a rate changes.
+# NOTE(review): the unit is presumably USD — confirm, and re-check the rates
+# against current published pricing before relying on them for cost reports.
+DEFAULT_PRICING: dict[str, tuple[float, float]] = {
+    "claude-opus-4-7": (15.0, 75.0),
+    "claude-opus-4-6": (15.0, 75.0),
+    "claude-sonnet-4-6": (3.0, 15.0),
+    "claude-sonnet-4-5": (3.0, 15.0),
+    "claude-haiku-4-5": (0.80, 4.0),
+}
+class Settings(BaseSettings):
+    """Environment-driven settings for claude-sql.
+    All fields are overridable via env vars prefixed ``CLAUDE_SQL_`` (e.g.
+    ``CLAUDE_SQL_REGION=us-west-2``) or via ``.env`` in the working directory.
+    """
+    model_config = SettingsConfigDict(
+        env_prefix="CLAUDE_SQL_",
+        env_file=".env",
+        extra="ignore",
+    )
+    # ------------------------------------------------------------------
+    # Data discovery
+    # ------------------------------------------------------------------
+    default_glob: str = Field(default_factory=_default_glob)
+    subagent_glob: str = Field(default_factory=_default_subagent_glob)
+    subagent_meta_glob: str = Field(default_factory=_default_subagent_meta_glob)
+    #: Team-corpus root.  When set, ``default_glob`` / ``subagent_glob`` /
+    #: ``subagent_meta_glob`` are derived from ``<root>/<author>/projects/*``
+    #: instead of ``~/.claude/projects/*``.  Replaces (does not union with)
+    #: the personal corpus root; an explicit per-glob override always wins.
+    team_corpus_root: Path | None = Field(
+        default=None,
+        description=(
+            "If set, default_glob/subagent_glob/subagent_meta_glob derive from "
+            "<root>/<author>/projects/* instead of ~/.claude/projects/*. "
+            "Replaces (does not union with) the personal corpus root."
+        ),
+    )
+    # ------------------------------------------------------------------
+    # Bedrock / embedding
+    # ------------------------------------------------------------------
+    region: str = "us-east-1"
+    #: Cohere Embed v4 global CRIS profile. Sustained 223 vec/s with zero
+    #: throttling at concurrency=8 in testing; US-only and direct on-demand
+    #: both throttle hard at low TPM. No reason to expose the knob.
+    model_id: str = "global.cohere.embed-v4:0"
+    output_dimension: Literal[256, 512, 1024, 1536] = 1024
+    embedding_type: Literal["int8", "float", "uint8", "binary", "ubinary"] = "int8"
+    #: Parallel Bedrock calls for Cohere Embed v4 on global CRIS. Sustained
+    #: 8 × batch_size 96 in testing without throttling — Cohere's TPM bucket
+    #: is the binding constraint and embed v4 is generous on global CRIS.
+    embed_concurrency: int = 8
+    #: Parallel Bedrock calls for Sonnet 4.6 on global CRIS. 16 is the
+    #: sweet spot once system prompts cross the cache threshold — cache
+    #: reads don't deduct from the per-model TPM bucket, so 16 parallel
+    #: cached calls sustain well below the throttle ceiling. Observed
+    #: ~5 calls/sec at concurrency=8 on trajectory's full backfill;
+    #: concurrency=16 scales that linearly with negligible throttle.
+    #: Drop to 2–4 if a future model has a smaller TPM bucket.
+    llm_concurrency: int = 16
+    #: DEPRECATED: use ``embed_concurrency`` / ``llm_concurrency``. Kept for
+    #: one release as a back-compat alias — when set explicitly (env or
+    #: kwarg), it overrides both. Removed once downstream callers migrate.
+    concurrency: int | None = None
+    batch_size: int = 96
+    embeddings_parquet_path: Path = Field(default_factory=_default_embeddings_parquet)
+    # ------------------------------------------------------------------
+    # VSS / HNSW
+    # ------------------------------------------------------------------
+    hnsw_metric: Literal["cosine", "l2sq", "ip"] = "cosine"
+    hnsw_ef_construction: int = 128
+    hnsw_ef_search: int = 64
+    hnsw_m: int = 16
+    hnsw_m0: int = 32
+    #: Persistent DuckDB file backing the HNSW index. ``register_vss``
+    #: ATTACHes this file (separate from ``checkpoint_db_path`` so a
+    #: corruption in either store recovers in isolation — ``rm
+    #: ~/.claude/hnsw.duckdb`` is the documented HNSW recovery path) and
+    #: rebuilds from the embeddings parquet only when the parquet's mtime
+    #: is newer than the file's. Persistence rides on DuckDB's
+    #: ``hnsw_enable_experimental_persistence`` flag.
+    hnsw_db_path: Path = Field(default_factory=_default_hnsw_db)
+    # ------------------------------------------------------------------
+    # Pricing
+    # ------------------------------------------------------------------
+    pricing: dict[str, tuple[float, float]] = Field(default_factory=lambda: dict(DEFAULT_PRICING))
+    # ------------------------------------------------------------------
+    # v2: LLM classification (Bedrock Sonnet 4.6 + output_config.format)
+    # ------------------------------------------------------------------
+    #: Sonnet 4.6 global CRIS inference profile — CRIS-only, 1M context native,
+    #: no beta header. Supports `output_config.format` GA structured output.
+    sonnet_model_id: str = "global.anthropic.claude-sonnet-4-6"
+    #: (input, output) $/MTok for Sonnet 4.6 on Bedrock us-east-1.
+    sonnet_pricing: tuple[float, float] = (3.0, 15.0)
+    #: Default thinking mode used by the session-level ``classify`` and
+    #: ``conflicts`` pipelines.  ``"adaptive"`` lets Sonnet reason before
+    #: emitting structured output; ``"disabled"`` is the escape hatch when
+    #: Bedrock 400s on thinking + output_config (rare, undocumented).
+    classify_thinking: Literal["adaptive", "disabled"] = "adaptive"
+    #: Per-message trajectory classifier thinking mode. Disabled by
+    #: default — trajectory is a 3-class enum + 1 boolean; reasoning burns
+    #: 5–20× output tokens for no measurable quality gain on this shape.
+    trajectory_thinking: Literal["adaptive", "disabled"] = "disabled"
+    #: Friction classifier thinking mode. Disabled by default for the same
+    #: reason as trajectory: short-message classification doesn't benefit
+    #: from reasoning.  Bumps to ``adaptive`` only if quality regresses
+    #: in real eval data.
+    friction_thinking: Literal["adaptive", "disabled"] = "disabled"
+    #: Max output tokens for a single classification call.
+    classify_max_tokens: int = 2048
+    #: Per-text clip used when assembling session_text — tool_results can be
+    #: arbitrarily large (Bash output, file reads).
+    session_text_tool_result_max_chars: int = 50_000
+    #: Total session_text cap (conservative 800K chars ≈ 200K tokens, leaves
+    #: room for the response under the 1M window).
+    session_text_total_max_chars: int = 800_000
+    # v2 parquet outputs
+    classifications_parquet_path: Path = Field(default_factory=_default_classifications_parquet)
+    trajectory_parquet_path: Path = Field(default_factory=_default_trajectory_parquet)
+    conflicts_parquet_path: Path = Field(default_factory=_default_conflicts_parquet)
+    clusters_parquet_path: Path = Field(default_factory=_default_clusters_parquet)
+    cluster_terms_parquet_path: Path = Field(default_factory=_default_cluster_terms_parquet)
+    communities_parquet_path: Path = Field(default_factory=_default_communities_parquet)
+    #: Output of the user-friction classifier (see ``friction_worker.py``).
+    #: One row per user message flagged as status_ping, unmet_expectation,
+    #: confusion, interruption, correction, frustration, or (sentinel) none.
+    #: Backs the ``user_friction`` view and the ``friction_counts`` /
+    #: ``friction_rate`` analytics macros.
+    user_friction_parquet_path: Path = Field(default_factory=_default_user_friction_parquet)
+    #: Short-message cutoff for the friction classifier candidate filter.
+    #: Friction signals cluster in short messages ("screenshot?", "wait",
+    #: "why?"); long messages are almost always on-topic turns.  300 chars
+    #: captures ~95% of the interesting class without bloating Bedrock cost.
+    friction_max_chars: int = 300
+    #: Catalog of locally-available Skills and slash commands, produced by
+    #: ``claude-sql skills sync`` (see :mod:`claude_sql.skills_catalog`).
+    #: Backs the ``skills_catalog`` view, the ``skill_usage`` enrichment join,
+    #: and the ``unused_skills`` macro.  Walked from :attr:`user_skills_dir`
+    #: and :attr:`plugins_cache_dir`.
+    skills_catalog_parquet_path: Path = Field(default_factory=_default_skills_catalog_parquet)
+    #: Root of user-level skills (each entry has a ``SKILL.md``).
+    user_skills_dir: Path = Field(default_factory=_default_user_skills_dir)
+    #: Root of the plugins cache maintained by Claude Code.  The walker
+    #: expects ``<owner>/<plugin>/<version>/`` underneath, each with a
+    #: ``.claude-plugin/plugin.json`` and ``skills/`` / ``commands/`` subdirs.
+    plugins_cache_dir: Path = Field(default_factory=_default_plugins_cache_dir)
+    #: Per-(session_id, pipeline) checkpoint DuckDB file. See ``checkpointer.py``.
+    checkpoint_db_path: Path = Field(default_factory=_default_checkpoint_db)
+    # ------------------------------------------------------------------
+    # v2: UMAP + HDBSCAN + Louvain hyperparameters
+    # ------------------------------------------------------------------
+    umap_n_components_50: int = 50
+    umap_n_components_2: int = 2
+    umap_n_neighbors: int = 30
+    umap_min_dist_cluster: float = 0.0
+    umap_min_dist_viz: float = 0.1
+    umap_metric: str = "cosine"
+    hdbscan_min_cluster_size: int = 20
+    hdbscan_min_samples: int = 5
+    #: Absolute cosine floor below which a pair is never considered related,
+    #: regardless of the adaptive search.  Kept conservative so the graph
+    #: doesn't collapse into a single giant component on very similar
+    #: corpora.
+    louvain_edge_threshold: float = 0.55
+    #: Target band for the average graph degree.  ``_pick_adaptive_threshold``
+    #: picks the cosine cut that puts average degree in ``[low, high]``.
+    #: 8-15 is the empirically-tested sweet spot for Louvain on session-
+    #: centroid graphs (1K-20K nodes): enough to let community structure
+    #: emerge, not enough to produce a hairball.
+    louvain_target_avg_degree_low: float = 8.0
+    louvain_target_avg_degree_high: float = 15.0
+    #: Louvain communities smaller than this get collapsed into the
+    #: NOISE_COMMUNITY_ID bucket (-1) so reports stay legible.
+    louvain_min_community_size: int = 3
+    louvain_resolution: float = 1.0
+    seed: int = 42
+    # ------------------------------------------------------------------
+    # v2: TF-IDF  # noqa: ERA001 — section header, not commented-out code
+    # ------------------------------------------------------------------
+    tfidf_min_df: int = 2
+    tfidf_max_df: float = 0.95
+    tfidf_ngram_min: int = 1
+    tfidf_ngram_max: int = 2
+    tfidf_top_n_terms: int = 10
+    # ------------------------------------------------------------------
+    # DuckDB engine tuning — applied as PRAGMAs in cli._open_connection.
+    # ------------------------------------------------------------------
+    #: Worker threads. Defaults to ``os.cpu_count()`` so DuckDB uses every
+    #: core; agents and CI runners with limited parallelism can override.
+    duckdb_threads: int = Field(default_factory=_default_duckdb_threads)
+    #: Memory ceiling. ``"70%"`` is permissive for a single-user devbox;
+    #: drop on shared hosts via the env var if it pressures other workloads.
+    duckdb_memory_limit: str = "70%"
+    #: Spill directory. Amazon devboxes ship ``/tmp`` as a 4 GB tmpfs that
+    #: thrashes the host once a clustering run starts spilling — point at
+    #: ``~/.claude/duckdb_tmp`` (real disk) instead.
+    duckdb_temp_dir: Path = Field(default_factory=_default_duckdb_temp_dir)
+    @model_validator(mode="after")
+    def _derive_team_corpus_globs(self) -> Self:
+        """Rewrite the three transcript globs when ``team_corpus_root`` is set.
+        Pattern: ``<root>/<author>/projects/<project>/<sid>.jsonl`` (and the
+        matching ``subagents/`` siblings).  Replaces — does not union with —
+        the personal corpus root, per memo §Coherent Actions #3.
+        Per-glob user pins always win: if any of ``default_glob`` /
+        ``subagent_glob`` / ``subagent_meta_glob`` differ from their factory
+        defaults at validation time, none of them are rewritten (we can't
+        cherry-pick a partial rewrite without smuggling intent).
+        """
+        root = self.team_corpus_root
+        if root is None:
+            return self
+        # Detect "user pinned a glob" by comparing to the factory-provided
+        # default rather than literal string equality, so refactors of
+        # ``_default_glob()`` and friends don't silently break this path.
+        user_pinned = (
+            self.default_glob != _default_glob()
+            or self.subagent_glob != _default_subagent_glob()
+            or self.subagent_meta_glob != _default_subagent_meta_glob()
+        )
+        if user_pinned:
+            return self
+        resolved = root.expanduser().resolve()
+        object.__setattr__(self, "default_glob", f"{resolved}/*/projects/*/*.jsonl")
+        object.__setattr__(
+            self,
+            "subagent_glob",
+            f"{resolved}/*/projects/*/subagents/agent-*.jsonl",
+        )
+        object.__setattr__(
+            self,
+            "subagent_meta_glob",
+            f"{resolved}/*/projects/*/subagents/agent-*.meta.json",
+        )
+        return self
+    @model_validator(mode="after")
+    def _resolve_concurrency_alias(self) -> Settings:
+        """Honor the deprecated ``concurrency`` field as an alias for both pipelines.
+        When ``concurrency`` is set explicitly (env or kwarg) and the modern
+        per-pipeline fields are at their defaults, mirror it onto both. We
+        only override when the user clearly didn't set the new fields, so
+        ``embed_concurrency=8, concurrency=4`` keeps the explicit 8.
+        """
+        if self.concurrency is None:
+            return self
+        warnings.warn(
+            "CLAUDE_SQL_CONCURRENCY / Settings.concurrency is deprecated. "
+            "Use CLAUDE_SQL_EMBED_CONCURRENCY (default 8) and "
+            "CLAUDE_SQL_LLM_CONCURRENCY (default 2) instead. The single "
+            "knob will be removed in the next release.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        # Only apply the alias to fields still at their default value.
+        if self.embed_concurrency == 8:
+            object.__setattr__(self, "embed_concurrency", self.concurrency)
+        if self.llm_concurrency == 16:
+            object.__setattr__(self, "llm_concurrency", self.concurrency)
+        return self
+    @property
+    def active_model_id(self) -> str:
+        """Return the Bedrock embedding model ID (kept as a property for call-site stability)."""
+        return self.model_id