claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
claude_sql/config.py ADDED
@@ -0,0 +1,380 @@
1
+ """Runtime configuration for claude-sql.
2
+
3
+ Pydantic v2 ``BaseSettings`` populated from env vars prefixed with ``CLAUDE_SQL_``.
4
+ Defaults are picked for a single-user devbox install pointing at
5
+ ``~/.claude/projects/**/*.jsonl``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import warnings
12
+ from pathlib import Path
13
+ from typing import Literal, Self
14
+
15
+ from pydantic import Field, model_validator
16
+ from pydantic_settings import BaseSettings, SettingsConfigDict
17
+
18
+
19
def _default_glob() -> str:
    """Return the default glob for top-level session transcripts.

    Only files directly inside a project directory are matched. Subagent
    side-files sit one level deeper under ``<session>/subagents/`` and have
    their own glob.
    """
    pattern = "~/.claude/projects/*/*.jsonl"
    return os.path.expanduser(pattern)
23
+
24
+
25
def _default_subagent_glob() -> str:
    """Return the default glob for subagent transcript files (``agent-*.jsonl``)."""
    pattern = "~/.claude/projects/*/*/subagents/agent-*.jsonl"
    return os.path.expanduser(pattern)
27
+
28
+
29
def _default_subagent_meta_glob() -> str:
    """Return the default glob for subagent metadata files (``agent-*.meta.json``)."""
    pattern = "~/.claude/projects/*/*/subagents/agent-*.meta.json"
    return os.path.expanduser(pattern)
31
+
32
+
33
def _default_embeddings_parquet() -> Path:
    """Return the default embeddings cache location (``~/.claude/embeddings/``).

    This is a sharded cache *directory* (see ``claude_sql.parquet_shards``):
    writers drop ``part-<ts_ns>.parquet`` files into it and readers glob the
    directory. The consuming field still carries the ``_parquet_path`` suffix
    so existing call sites stay stable — only the meaning of the path moved
    from "single file" to "directory of parts".
    """
    location = os.path.expanduser("~/.claude/embeddings/")
    return Path(location)
40
+
41
+
42
def _default_classifications_parquet() -> Path:
    """Return the default ``~/.claude/session_classifications/`` location, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/session_classifications/")
    return Path(location)
44
+
45
+
46
def _default_trajectory_parquet() -> Path:
    """Return the default ``~/.claude/message_trajectory/`` location, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/message_trajectory/")
    return Path(location)
48
+
49
+
50
def _default_conflicts_parquet() -> Path:
    """Return the default ``~/.claude/session_conflicts/`` location, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/session_conflicts/")
    return Path(location)
52
+
53
+
54
def _default_clusters_parquet() -> Path:
    """Return the default ``~/.claude/clusters.parquet`` path, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/clusters.parquet")
    return Path(location)
56
+
57
+
58
def _default_cluster_terms_parquet() -> Path:
    """Return the default ``~/.claude/cluster_terms.parquet`` path, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/cluster_terms.parquet")
    return Path(location)
60
+
61
+
62
def _default_communities_parquet() -> Path:
    """Return the default ``~/.claude/session_communities.parquet`` path, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/session_communities.parquet")
    return Path(location)
64
+
65
+
66
def _default_user_friction_parquet() -> Path:
    """Return the default ``~/.claude/user_friction/`` location, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/user_friction/")
    return Path(location)
68
+
69
+
70
def _default_skills_catalog_parquet() -> Path:
    """Return the default ``~/.claude/skills_catalog.parquet`` path, ``~`` expanded."""
    location = os.path.expanduser("~/.claude/skills_catalog.parquet")
    return Path(location)
72
+
73
+
74
def _default_user_skills_dir() -> Path:
    """Return the default user skills directory (``~/.claude/skills``), ``~`` expanded."""
    location = os.path.expanduser("~/.claude/skills")
    return Path(location)
76
+
77
+
78
def _default_plugins_cache_dir() -> Path:
    """Return the default plugins cache directory (``~/.claude/plugins/cache``), ``~`` expanded."""
    location = os.path.expanduser("~/.claude/plugins/cache")
    return Path(location)
80
+
81
+
82
def _default_checkpoint_db() -> Path:
    """Return the default checkpoint database path (``~/.claude/claude_sql.duckdb``)."""
    location = os.path.expanduser("~/.claude/claude_sql.duckdb")
    return Path(location)
84
+
85
+
86
def _default_hnsw_db() -> Path:
    """Return the default HNSW database path (``~/.claude/hnsw.duckdb``)."""
    location = os.path.expanduser("~/.claude/hnsw.duckdb")
    return Path(location)
88
+
89
+
90
def _default_duckdb_temp_dir() -> Path:
    """Return the default DuckDB temp directory (``~/.claude/duckdb_tmp``), ``~`` expanded."""
    location = os.path.expanduser("~/.claude/duckdb_tmp")
    return Path(location)
92
+
93
+
94
def _default_duckdb_threads() -> int:
    """Return the default DuckDB thread count.

    Uses ``os.cpu_count()``; falls back to 4 when the CPU count cannot be
    determined (``os.cpu_count()`` returns ``None``).
    """
    detected = os.cpu_count()
    return detected if detected else 4
96
+
97
+
98
# Model pricing per 1M tokens, as ``(in_rate, out_rate)`` tuples keyed by
# model name. Mirrors claude-mine/transform.py. Models within a tier share
# one rate pair, so the table is grouped by tier.
DEFAULT_PRICING: dict[str, tuple[float, float]] = {
    # Opus tier
    **dict.fromkeys(("claude-opus-4-7", "claude-opus-4-6"), (15.0, 75.0)),
    # Sonnet tier
    **dict.fromkeys(("claude-sonnet-4-6", "claude-sonnet-4-5"), (3.0, 15.0)),
    # Haiku tier
    "claude-haiku-4-5": (0.80, 4.0),
}
106
+
107
+
108
class Settings(BaseSettings):
    """Environment-driven settings for claude-sql.

    All fields are overridable via env vars prefixed ``CLAUDE_SQL_`` (e.g.
    ``CLAUDE_SQL_REGION=us-west-2``) or via ``.env`` in the working directory.

    Two ``after`` validators post-process the loaded values:

    * ``_derive_team_corpus_globs`` rewrites the three transcript globs when
      ``team_corpus_root`` is set and none of the globs was changed from its
      default.
    * ``_resolve_concurrency_alias`` mirrors the deprecated ``concurrency``
      knob onto ``embed_concurrency`` / ``llm_concurrency`` when those were
      not set explicitly.
    """

    model_config = SettingsConfigDict(
        env_prefix="CLAUDE_SQL_",
        env_file=".env",
        extra="ignore",
    )

    # ------------------------------------------------------------------
    # Data discovery
    # ------------------------------------------------------------------
    default_glob: str = Field(default_factory=_default_glob)
    subagent_glob: str = Field(default_factory=_default_subagent_glob)
    subagent_meta_glob: str = Field(default_factory=_default_subagent_meta_glob)
    #: Team-corpus root. When set, ``default_glob`` / ``subagent_glob`` /
    #: ``subagent_meta_glob`` are derived from ``<root>/<author>/projects/*``
    #: instead of ``~/.claude/projects/*``. Replaces (does not union with)
    #: the personal corpus root; an explicit per-glob override always wins.
    team_corpus_root: Path | None = Field(
        default=None,
        description=(
            "If set, default_glob/subagent_glob/subagent_meta_glob derive from "
            "<root>/<author>/projects/* instead of ~/.claude/projects/*. "
            "Replaces (does not union with) the personal corpus root."
        ),
    )

    # ------------------------------------------------------------------
    # Bedrock / embedding
    # ------------------------------------------------------------------
    region: str = "us-east-1"
    #: Cohere Embed v4 global CRIS profile. Sustained 223 vec/s with zero
    #: throttling at concurrency=8 in testing; US-only and direct on-demand
    #: both throttle hard at low TPM. No reason to expose the knob.
    model_id: str = "global.cohere.embed-v4:0"

    output_dimension: Literal[256, 512, 1024, 1536] = 1024
    embedding_type: Literal["int8", "float", "uint8", "binary", "ubinary"] = "int8"
    #: Parallel Bedrock calls for Cohere Embed v4 on global CRIS. Sustained
    #: 8 × batch_size 96 in testing without throttling — Cohere's TPM bucket
    #: is the binding constraint and embed v4 is generous on global CRIS.
    embed_concurrency: int = 8
    #: Parallel Bedrock calls for Sonnet 4.6 on global CRIS. 16 is the
    #: sweet spot once system prompts cross the cache threshold — cache
    #: reads don't deduct from the per-model TPM bucket, so 16 parallel
    #: cached calls sustain well below the throttle ceiling. Observed
    #: ~5 calls/sec at concurrency=8 on trajectory's full backfill;
    #: concurrency=16 scales that linearly with negligible throttle.
    #: Drop to 2–4 if a future model has a smaller TPM bucket.
    llm_concurrency: int = 16
    #: DEPRECATED: use ``embed_concurrency`` / ``llm_concurrency``. Kept for
    #: one release as a back-compat alias — when set explicitly (env or
    #: kwarg), it is mirrored onto whichever of the two per-pipeline fields
    #: was not itself set explicitly. Removed once downstream callers migrate.
    concurrency: int | None = None
    batch_size: int = 96

    embeddings_parquet_path: Path = Field(default_factory=_default_embeddings_parquet)

    # ------------------------------------------------------------------
    # VSS / HNSW
    # ------------------------------------------------------------------
    hnsw_metric: Literal["cosine", "l2sq", "ip"] = "cosine"
    hnsw_ef_construction: int = 128
    hnsw_ef_search: int = 64
    hnsw_m: int = 16
    hnsw_m0: int = 32
    #: Persistent DuckDB file backing the HNSW index. ``register_vss``
    #: ATTACHes this file (separate from ``checkpoint_db_path`` so a
    #: corruption in either store recovers in isolation — ``rm
    #: ~/.claude/hnsw.duckdb`` is the documented HNSW recovery path) and
    #: rebuilds from the embeddings parquet only when the parquet's mtime
    #: is newer than the file's. Persistence rides on DuckDB's
    #: ``hnsw_enable_experimental_persistence`` flag.
    hnsw_db_path: Path = Field(default_factory=_default_hnsw_db)

    # ------------------------------------------------------------------
    # Pricing
    # ------------------------------------------------------------------
    # Copy the module-level table so per-instance edits never leak into it.
    pricing: dict[str, tuple[float, float]] = Field(default_factory=lambda: dict(DEFAULT_PRICING))

    # ------------------------------------------------------------------
    # v2: LLM classification (Bedrock Sonnet 4.6 + output_config.format)
    # ------------------------------------------------------------------
    #: Sonnet 4.6 global CRIS inference profile — CRIS-only, 1M context native,
    #: no beta header. Supports `output_config.format` GA structured output.
    sonnet_model_id: str = "global.anthropic.claude-sonnet-4-6"
    #: (input, output) $/MTok for Sonnet 4.6 on Bedrock us-east-1.
    sonnet_pricing: tuple[float, float] = (3.0, 15.0)
    #: Default thinking mode used by the session-level ``classify`` and
    #: ``conflicts`` pipelines. ``"adaptive"`` lets Sonnet reason before
    #: emitting structured output; ``"disabled"`` is the escape hatch when
    #: Bedrock 400s on thinking + output_config (rare, undocumented).
    classify_thinking: Literal["adaptive", "disabled"] = "adaptive"
    #: Per-message trajectory classifier thinking mode. Disabled by
    #: default — trajectory is a 3-class enum + 1 boolean; reasoning burns
    #: 5–20× output tokens for no measurable quality gain on this shape.
    trajectory_thinking: Literal["adaptive", "disabled"] = "disabled"
    #: Friction classifier thinking mode. Disabled by default for the same
    #: reason as trajectory: short-message classification doesn't benefit
    #: from reasoning. Bumps to ``adaptive`` only if quality regresses
    #: in real eval data.
    friction_thinking: Literal["adaptive", "disabled"] = "disabled"
    #: Max output tokens for a single classification call.
    classify_max_tokens: int = 2048
    #: Per-text clip used when assembling session_text — tool_results can be
    #: arbitrarily large (Bash output, file reads).
    session_text_tool_result_max_chars: int = 50_000
    #: Total session_text cap (conservative 800K chars ≈ 200K tokens, leaves
    #: room for the response under the 1M window).
    session_text_total_max_chars: int = 800_000

    # v2 parquet outputs
    classifications_parquet_path: Path = Field(default_factory=_default_classifications_parquet)
    trajectory_parquet_path: Path = Field(default_factory=_default_trajectory_parquet)
    conflicts_parquet_path: Path = Field(default_factory=_default_conflicts_parquet)
    clusters_parquet_path: Path = Field(default_factory=_default_clusters_parquet)
    cluster_terms_parquet_path: Path = Field(default_factory=_default_cluster_terms_parquet)
    communities_parquet_path: Path = Field(default_factory=_default_communities_parquet)
    #: Output of the user-friction classifier (see ``friction_worker.py``).
    #: One row per user message flagged as status_ping, unmet_expectation,
    #: confusion, interruption, correction, frustration, or (sentinel) none.
    #: Backs the ``user_friction`` view and the ``friction_counts`` /
    #: ``friction_rate`` analytics macros.
    user_friction_parquet_path: Path = Field(default_factory=_default_user_friction_parquet)
    #: Short-message cutoff for the friction classifier candidate filter.
    #: Friction signals cluster in short messages ("screenshot?", "wait",
    #: "why?"); long messages are almost always on-topic turns. 300 chars
    #: captures ~95% of the interesting class without bloating Bedrock cost.
    friction_max_chars: int = 300

    #: Catalog of locally-available Skills and slash commands, produced by
    #: ``claude-sql skills sync`` (see :mod:`claude_sql.skills_catalog`).
    #: Backs the ``skills_catalog`` view, the ``skill_usage`` enrichment join,
    #: and the ``unused_skills`` macro. Walked from :attr:`user_skills_dir`
    #: and :attr:`plugins_cache_dir`.
    skills_catalog_parquet_path: Path = Field(default_factory=_default_skills_catalog_parquet)
    #: Root of user-level skills (each entry has a ``SKILL.md``).
    user_skills_dir: Path = Field(default_factory=_default_user_skills_dir)
    #: Root of the plugins cache maintained by Claude Code. The walker
    #: expects ``<owner>/<plugin>/<version>/`` underneath, each with a
    #: ``.claude-plugin/plugin.json`` and ``skills/`` / ``commands/`` subdirs.
    plugins_cache_dir: Path = Field(default_factory=_default_plugins_cache_dir)

    #: Per-(session_id, pipeline) checkpoint DuckDB file. See ``checkpointer.py``.
    checkpoint_db_path: Path = Field(default_factory=_default_checkpoint_db)

    # ------------------------------------------------------------------
    # v2: UMAP + HDBSCAN + Louvain hyperparameters
    # ------------------------------------------------------------------
    umap_n_components_50: int = 50
    umap_n_components_2: int = 2
    umap_n_neighbors: int = 30
    umap_min_dist_cluster: float = 0.0
    umap_min_dist_viz: float = 0.1
    umap_metric: str = "cosine"
    hdbscan_min_cluster_size: int = 20
    hdbscan_min_samples: int = 5
    #: Absolute cosine floor below which a pair is never considered related,
    #: regardless of the adaptive search. Kept conservative so the graph
    #: doesn't collapse into a single giant component on very similar
    #: corpora.
    louvain_edge_threshold: float = 0.55
    #: Target band for the average graph degree. ``_pick_adaptive_threshold``
    #: picks the cosine cut that puts average degree in ``[low, high]``.
    #: 8-15 is the empirically-tested sweet spot for Louvain on session-
    #: centroid graphs (1K-20K nodes): enough to let community structure
    #: emerge, not enough to produce a hairball.
    louvain_target_avg_degree_low: float = 8.0
    louvain_target_avg_degree_high: float = 15.0
    #: Louvain communities smaller than this get collapsed into the
    #: NOISE_COMMUNITY_ID bucket (-1) so reports stay legible.
    louvain_min_community_size: int = 3
    louvain_resolution: float = 1.0
    seed: int = 42

    # ------------------------------------------------------------------
    # v2: TF-IDF  # noqa: ERA001 — section header, not commented-out code
    # ------------------------------------------------------------------
    tfidf_min_df: int = 2
    tfidf_max_df: float = 0.95
    tfidf_ngram_min: int = 1
    tfidf_ngram_max: int = 2
    tfidf_top_n_terms: int = 10

    # ------------------------------------------------------------------
    # DuckDB engine tuning — applied as PRAGMAs in cli._open_connection.
    # ------------------------------------------------------------------
    #: Worker threads. Defaults to ``os.cpu_count()`` so DuckDB uses every
    #: core; agents and CI runners with limited parallelism can override.
    duckdb_threads: int = Field(default_factory=_default_duckdb_threads)
    #: Memory ceiling. ``"70%"`` is permissive for a single-user devbox;
    #: drop on shared hosts via the env var if it pressures other workloads.
    duckdb_memory_limit: str = "70%"
    #: Spill directory. Amazon devboxes ship ``/tmp`` as a 4 GB tmpfs that
    #: thrashes the host once a clustering run starts spilling — point at
    #: ``~/.claude/duckdb_tmp`` (real disk) instead.
    duckdb_temp_dir: Path = Field(default_factory=_default_duckdb_temp_dir)

    @model_validator(mode="after")
    def _derive_team_corpus_globs(self) -> Self:
        """Rewrite the three transcript globs when ``team_corpus_root`` is set.

        The personal prefix ``~/.claude/projects/*`` is swapped for
        ``<root>/<author>/projects/*``; everything after the project level
        keeps the same shape as the personal defaults:

        * transcripts: ``<root>/<author>/projects/<project>/<sid>.jsonl``
        * subagents: ``<root>/<author>/projects/<project>/<sid>/subagents/agent-*``

        Replaces — does not union with — the personal corpus root, per memo
        §Coherent Actions #3.

        Per-glob user pins always win: if any of ``default_glob`` /
        ``subagent_glob`` / ``subagent_meta_glob`` differ from their factory
        defaults at validation time, none of them are rewritten (we can't
        cherry-pick a partial rewrite without smuggling intent).

        Returns:
            ``self``, with the three globs rewritten in place when applicable.
        """
        root = self.team_corpus_root
        if root is None:
            return self
        # Detect "user pinned a glob" by comparing to the factory-provided
        # default rather than literal string equality, so refactors of
        # ``_default_glob()`` and friends don't silently break this path.
        user_pinned = (
            self.default_glob != _default_glob()
            or self.subagent_glob != _default_subagent_glob()
            or self.subagent_meta_glob != _default_subagent_meta_glob()
        )
        if user_pinned:
            return self
        # ``<root>/<author>/projects/<project>`` — the team-corpus analogue
        # of ``~/.claude/projects/<project>``.
        project_base = f"{root.expanduser().resolve()}/*/projects/*"
        # object.__setattr__ bypasses pydantic's __setattr__ so the derived
        # values are not recorded as user-set fields.
        object.__setattr__(self, "default_glob", f"{project_base}/*.jsonl")
        # Subagent side-files live under ``<session>/subagents/``, one level
        # below the project directory — hence the extra ``*`` segment, which
        # mirrors the personal-corpus defaults.
        object.__setattr__(
            self,
            "subagent_glob",
            f"{project_base}/*/subagents/agent-*.jsonl",
        )
        object.__setattr__(
            self,
            "subagent_meta_glob",
            f"{project_base}/*/subagents/agent-*.meta.json",
        )
        return self

    @model_validator(mode="after")
    def _resolve_concurrency_alias(self) -> Self:
        """Honor the deprecated ``concurrency`` field as an alias for both pipelines.

        When ``concurrency`` is set explicitly (env or kwarg), it is mirrored
        onto each of ``embed_concurrency`` / ``llm_concurrency`` that the user
        did not set. "Set" is read from ``model_fields_set`` rather than by
        comparing against the default value, so an explicit
        ``embed_concurrency=8, concurrency=4`` keeps the explicit 8 even
        though 8 is also the default.

        Emits a ``DeprecationWarning`` whenever ``concurrency`` is set.

        Returns:
            ``self``, with the per-pipeline fields updated in place.
        """
        if self.concurrency is None:
            return self
        warnings.warn(
            "CLAUDE_SQL_CONCURRENCY / Settings.concurrency is deprecated. "
            "Use CLAUDE_SQL_EMBED_CONCURRENCY (default 8) and "
            "CLAUDE_SQL_LLM_CONCURRENCY (default 16) instead. The single "
            "knob will be removed in the next release.",
            DeprecationWarning,
            stacklevel=2,
        )
        # Only apply the alias to fields the user left unset.
        explicitly_set = self.model_fields_set
        for field_name in ("embed_concurrency", "llm_concurrency"):
            if field_name not in explicitly_set:
                object.__setattr__(self, field_name, self.concurrency)
        return self

    @property
    def active_model_id(self) -> str:
        """Return the Bedrock embedding model ID (kept as a property for call-site stability)."""
        return self.model_id