claude-sql 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_sql/__init__.py +5 -0
- claude_sql/binding.py +740 -0
- claude_sql/blind_handover.py +155 -0
- claude_sql/checkpointer.py +202 -0
- claude_sql/cli.py +2344 -0
- claude_sql/cluster_worker.py +208 -0
- claude_sql/community_worker.py +306 -0
- claude_sql/config.py +380 -0
- claude_sql/embed_worker.py +482 -0
- claude_sql/freeze.py +189 -0
- claude_sql/friction_worker.py +561 -0
- claude_sql/install_source.py +77 -0
- claude_sql/judge_worker.py +459 -0
- claude_sql/judges.py +239 -0
- claude_sql/kappa_worker.py +257 -0
- claude_sql/llm_worker.py +1760 -0
- claude_sql/logging_setup.py +95 -0
- claude_sql/output.py +248 -0
- claude_sql/parquet_shards.py +172 -0
- claude_sql/retry_queue.py +180 -0
- claude_sql/review_sheet_render.py +167 -0
- claude_sql/review_sheet_worker.py +463 -0
- claude_sql/schemas.py +454 -0
- claude_sql/session_text.py +387 -0
- claude_sql/skills_catalog.py +354 -0
- claude_sql/sql_views.py +1751 -0
- claude_sql/terms_worker.py +145 -0
- claude_sql/ungrounded_worker.py +190 -0
- claude_sql-0.4.0.dist-info/METADATA +530 -0
- claude_sql-0.4.0.dist-info/RECORD +32 -0
- claude_sql-0.4.0.dist-info/WHEEL +4 -0
- claude_sql-0.4.0.dist-info/entry_points.txt +3 -0
claude_sql/sql_views.py
ADDED
|
@@ -0,0 +1,1751 @@
|
|
|
1
|
+
"""DuckDB view, macro, and VSS registry for claude-sql.
|
|
2
|
+
|
|
3
|
+
Wires a DuckDB connection to the on-disk ``~/.claude/`` JSONL transcript corpus
|
|
4
|
+
and exposes it as a stable set of zero-copy SQL views, analytical macros, and
|
|
5
|
+
an HNSW-indexed embeddings table. v2 analytics outputs (classifications,
|
|
6
|
+
trajectory, conflicts, clusters, communities) are surfaced as parquet-backed
|
|
7
|
+
views alongside the transcript-derived views.
|
|
8
|
+
|
|
9
|
+
Design notes
|
|
10
|
+
------------
|
|
11
|
+
* Reads are zero-copy via ``read_json(..., filename=true)`` -- no intermediate
|
|
12
|
+
parquet ingestion; the corpus is queried in place. ``filename`` unlocks
|
|
13
|
+
file-level predicate pushdown (DuckDB 1.3+).
|
|
14
|
+
* Nested ``message.content`` is left as JSON and flattened at query time via
|
|
15
|
+
``UNNEST(json_extract(content_json, '$[*]'))``. This keeps views resilient
|
|
16
|
+
to new content block types (``text``, ``tool_use``, ``tool_result``,
|
|
17
|
+
``thinking``, ...).
|
|
18
|
+
* Subagent transcripts live in sibling ``agent-<hex>.jsonl`` files under
|
|
19
|
+
``subagents/`` with ``*.meta.json`` partners; they surface via dedicated
|
|
20
|
+
views so primary-session views stay pure.
|
|
21
|
+
* v2 analytics views (``session_classifications``, ``message_trajectory``,
|
|
22
|
+
``session_conflicts``, ``message_clusters``, ``cluster_terms``,
|
|
23
|
+
``session_communities``, and the derived ``session_goals``) are created by
|
|
24
|
+
:func:`register_analytics` from the corresponding parquet files. Each is
|
|
25
|
+
skipped with a warning when its parquet is missing, so the function is
|
|
26
|
+
idempotent on partially-populated systems.
|
|
27
|
+
* All views use ``CREATE OR REPLACE`` so callers may safely re-register.
|
|
28
|
+
* Globs are inlined into DDL (DuckDB rejects prepared parameters as
|
|
29
|
+
table-function arguments); ``sample_size`` and ``maximum_object_size`` are
|
|
30
|
+
likewise inlined (guarded by Python ``int`` typing).
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import contextlib
|
|
36
|
+
import os
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
|
|
39
|
+
import duckdb
|
|
40
|
+
from loguru import logger
|
|
41
|
+
|
|
42
|
+
from claude_sql.config import DEFAULT_PRICING, Settings
|
|
43
|
+
from claude_sql.parquet_shards import iter_part_files
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Glob constants
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
DEFAULT_GLOB: str = os.path.expanduser("~/.claude/projects/*/*.jsonl")
|
|
50
|
+
SUBAGENT_GLOB: str = os.path.expanduser("~/.claude/projects/*/*/subagents/agent-*.jsonl")
|
|
51
|
+
SUBAGENT_META_GLOB: str = os.path.expanduser("~/.claude/projects/*/*/subagents/agent-*.meta.json")
|
|
52
|
+
|
|
53
|
+
# Business-level views emitted by ``register_views``. Used by the
|
|
54
|
+
# ``claude-sql schema`` subcommand for schema dumps. Includes the v2
|
|
55
|
+
# analytics view names at the tail so ``describe_all`` can enumerate them
|
|
56
|
+
# once :func:`register_analytics` has populated the corresponding parquets.
|
|
57
|
+
VIEW_NAMES: tuple[str, ...] = (
|
|
58
|
+
"sessions",
|
|
59
|
+
"messages",
|
|
60
|
+
"content_blocks",
|
|
61
|
+
"messages_text",
|
|
62
|
+
"tool_calls",
|
|
63
|
+
"tool_results",
|
|
64
|
+
"todo_events",
|
|
65
|
+
"todo_state_current",
|
|
66
|
+
"subagent_spawns",
|
|
67
|
+
"task_creations",
|
|
68
|
+
"task_updates",
|
|
69
|
+
"tasks_state_current",
|
|
70
|
+
"task_spawns",
|
|
71
|
+
"skill_invocations",
|
|
72
|
+
"subagent_sessions",
|
|
73
|
+
"subagent_messages",
|
|
74
|
+
# v2 analytics views (materialize when the matching parquet exists).
|
|
75
|
+
"session_classifications",
|
|
76
|
+
"session_goals",
|
|
77
|
+
"message_trajectory",
|
|
78
|
+
"session_conflicts",
|
|
79
|
+
"message_clusters",
|
|
80
|
+
"cluster_terms",
|
|
81
|
+
"session_communities",
|
|
82
|
+
"user_friction",
|
|
83
|
+
"skills_catalog",
|
|
84
|
+
"skill_usage",
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Analytics-only view names -- the subset of :data:`VIEW_NAMES` backed by v2
|
|
88
|
+
# parquet outputs. Exported so callers (``claude-sql`` subcommands, smoke
|
|
89
|
+
# tests) can enumerate analytics views without needing to filter out the
|
|
90
|
+
# transcript-derived views.
|
|
91
|
+
ANALYTICS_VIEW_NAMES: tuple[str, ...] = (
|
|
92
|
+
"session_classifications",
|
|
93
|
+
"session_goals",
|
|
94
|
+
"message_trajectory",
|
|
95
|
+
"session_conflicts",
|
|
96
|
+
"message_clusters",
|
|
97
|
+
"cluster_terms",
|
|
98
|
+
"session_communities",
|
|
99
|
+
"user_friction",
|
|
100
|
+
"skills_catalog",
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Macro names registered by :func:`register_macros`. The first six are the
|
|
104
|
+
# v1 macros that ship unconditionally, then two skill macros; the remaining ten
|
|
105
|
+
# are the v2 analytics macros, each registered via :func:`_safe_macro` so a
|
|
106
|
+
# missing backing view downgrades to a warning instead of an exception.
|
|
107
|
+
MACRO_NAMES: tuple[str, ...] = (
|
|
108
|
+
"model_used",
|
|
109
|
+
"cost_estimate",
|
|
110
|
+
"tool_rank",
|
|
111
|
+
"todo_velocity",
|
|
112
|
+
"subagent_fanout",
|
|
113
|
+
"semantic_search",
|
|
114
|
+
"skill_rank",
|
|
115
|
+
"skill_source_mix",
|
|
116
|
+
# v2 analytics macros
|
|
117
|
+
"autonomy_trend",
|
|
118
|
+
"work_mix",
|
|
119
|
+
"success_rate_by_work",
|
|
120
|
+
"cluster_top_terms",
|
|
121
|
+
"community_top_topics",
|
|
122
|
+
"sentiment_arc",
|
|
123
|
+
"friction_counts",
|
|
124
|
+
"friction_rate",
|
|
125
|
+
"friction_examples",
|
|
126
|
+
"unused_skills",
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _sql_str(value: str) -> str:
|
|
131
|
+
"""Escape a Python string as a single-quoted SQL literal.
|
|
132
|
+
|
|
133
|
+
Parameters
|
|
134
|
+
----------
|
|
135
|
+
value
|
|
136
|
+
Value to embed in a DDL statement.
|
|
137
|
+
|
|
138
|
+
Returns
|
|
139
|
+
-------
|
|
140
|
+
str
|
|
141
|
+
The value wrapped in single quotes with any embedded quotes doubled.
|
|
142
|
+
"""
|
|
143
|
+
escaped = value.replace("'", "''")
|
|
144
|
+
return f"'{escaped}'"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
# Raw readers
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def register_raw(
    con: duckdb.DuckDBPyConnection,
    *,
    glob: str | None = None,
    subagent_glob: str | None = None,
    subagent_meta_glob: str | None = None,
    sample_size: int = -1,
    maximum_object_size: int = 67_108_864,
) -> None:
    """Register the three raw reader views over the on-disk transcript files.

    Creates ``v_raw_events`` (primary transcripts), ``v_raw_subagents``
    (subagent transcripts) and ``v_raw_subagent_meta`` (sibling
    ``*.meta.json`` files). Each view is a ``read_json`` scan with
    ``filename=true``; the file path is re-exposed as ``source_file`` and the
    identifiers embedded in the path are pulled out with ``regexp_extract``.

    Parameters
    ----------
    con
        Open DuckDB connection.
    glob
        Glob for primary session transcripts. ``None`` selects
        :data:`DEFAULT_GLOB`.
    subagent_glob
        Glob for subagent transcripts. ``None`` selects :data:`SUBAGENT_GLOB`.
    subagent_meta_glob
        Glob for the ``*.meta.json`` partner files. ``None`` selects
        :data:`SUBAGENT_META_GLOB`.
    sample_size
        ``read_json`` schema-inference sample size, inlined into the two JSONL
        readers.
    maximum_object_size
        ``read_json`` maximum object size in bytes, inlined into the two JSONL
        readers.

    Raises
    ------
    duckdb.Error
        If any view DDL fails. The failure is logged via
        ``logger.exception`` and then re-raised.
    """
    if glob is None:
        glob = DEFAULT_GLOB
    if subagent_glob is None:
        subagent_glob = SUBAGENT_GLOB
    if subagent_meta_glob is None:
        subagent_meta_glob = SUBAGENT_META_GLOB

    # These two values are spliced into DDL text; passing them through int()
    # guarantees that only a numeric literal can reach the statement.
    infer_rows = int(sample_size)
    object_cap = int(maximum_object_size)

    # Path patterns shared by the two capture groups of each subagent view:
    # group 1 is the parent session id, group 2 the agent hex suffix.
    transcript_re = "/([0-9a-f-]{36})/subagents/agent-([a-f0-9]+)\\.jsonl$"
    meta_re = "/([0-9a-f-]{36})/subagents/agent-([a-f0-9]+)\\.meta\\.json$"

    events_ddl = f"""
            CREATE OR REPLACE VIEW v_raw_events AS
            SELECT *,
            filename AS source_file,
            regexp_extract(filename, '([^/]+)\\.jsonl$', 1) AS session_id_file
            FROM read_json(
            {_sql_str(glob)},
            format='newline_delimited',
            union_by_name=true,
            filename=true,
            ignore_errors=true,
            sample_size={infer_rows},
            maximum_object_size={object_cap}
            );
            """

    subagents_ddl = f"""
            CREATE OR REPLACE VIEW v_raw_subagents AS
            SELECT *,
            filename AS source_file,
            regexp_extract(
            filename,
            '{transcript_re}',
            1
            ) AS parent_session_id,
            regexp_extract(
            filename,
            '{transcript_re}',
            2
            ) AS agent_hex
            FROM read_json(
            {_sql_str(subagent_glob)},
            format='newline_delimited',
            union_by_name=true,
            filename=true,
            ignore_errors=true,
            sample_size={infer_rows},
            maximum_object_size={object_cap}
            );
            """

    # meta.json files are one object per file (not NDJSON) -> format='auto'.
    meta_ddl = f"""
            CREATE OR REPLACE VIEW v_raw_subagent_meta AS
            SELECT *,
            filename AS source_file,
            regexp_extract(
            filename,
            '{meta_re}',
            1
            ) AS parent_session_id,
            regexp_extract(
            filename,
            '{meta_re}',
            2
            ) AS agent_hex
            FROM read_json(
            {_sql_str(subagent_meta_glob)},
            format='auto',
            union_by_name=true,
            filename=true,
            ignore_errors=true
            );
            """

    try:
        con.execute(events_ddl)
        logger.debug(
            "Registered v_raw_events from glob {} with sample_size={}",
            glob,
            infer_rows,
        )

        con.execute(subagents_ddl)
        logger.debug("Registered v_raw_subagents from glob {}", subagent_glob)

        con.execute(meta_ddl)
        logger.debug("Registered v_raw_subagent_meta from glob {}", subagent_meta_glob)
    except Exception:
        logger.exception("Failed to register raw views")
        raise
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# ---------------------------------------------------------------------------
|
|
284
|
+
# Derived views
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def register_views(con: duckdb.DuckDBPyConnection) -> None:
    """Create logical business-level views on top of the raw readers.

    Must be called after :func:`register_raw`. Creates, in order:
    ``sessions``, ``messages``, ``content_blocks``, ``messages_text``,
    ``tool_calls``, ``tool_results``, ``todo_events``, ``todo_state_current``,
    ``subagent_spawns``, ``task_creations``, ``task_updates``,
    ``tasks_state_current``, ``task_spawns`` (deprecated alias),
    ``skill_invocations``, ``subagent_sessions``, ``subagent_messages``.

    The split between ``subagent_spawns`` and ``task_creations`` reflects
    the Claude Code v2.1.63 ``Task``→``Agent`` rename and the v2.1.16
    (Jan 2026) split of interactive todo tracking from ``TodoWrite`` into
    the ``TaskCreate``/``TaskGet``/``TaskList``/``TaskUpdate`` family.
    Pre-2026 transcripts and Agent-SDK / ``--print`` runs still emit
    ``TodoWrite`` (covered by ``todo_events``).

    ``tasks_state_current`` resolves each task id from, in order: a
    ``Task #N`` match in the tool result text, a ``taskId`` key in the tool
    result JSON, and finally the per-session creation order. The regex step
    is wrapped in ``NULLIF(..., '')`` because DuckDB's ``regexp_extract``
    returns an empty string (not NULL) when nothing matches, which would
    otherwise stop ``COALESCE`` from ever reaching the later fallbacks.

    Parameters
    ----------
    con
        Open DuckDB connection with raw views already registered.

    Raises
    ------
    duckdb.Error
        If any view DDL fails. Logged via ``logger.exception`` before re-raise.
    """
    try:
        con.execute(
            """
            CREATE OR REPLACE VIEW sessions AS
            SELECT
                session_id_file AS session_id,
                any_value(cwd) AS cwd,
                any_value(gitBranch) AS git_branch,
                min(timestamp::TIMESTAMP) AS started_at,
                max(timestamp::TIMESTAMP) AS ended_at,
                count(*) FILTER (WHERE type = 'assistant') AS assistant_messages,
                count(*) AS record_count,
                any_value(source_file) AS transcript_path
            FROM v_raw_events
            WHERE sessionId IS NOT NULL
            GROUP BY session_id_file;
            """
        )
        logger.debug("Registered view: sessions")

        # ``message.content`` is inferred by ``read_json`` as JSON. ``to_json``
        # is defensive: if a future schema infers LIST, ``to_json`` normalizes
        # it back to a JSON-typed column that ``json_extract`` understands.
        con.execute(
            """
            CREATE OR REPLACE VIEW messages AS
            SELECT
                uuid,
                parentUuid AS parent_uuid,
                sessionId AS session_id,
                timestamp::TIMESTAMP AS ts,
                type,
                isSidechain AS is_sidechain,
                message.role AS role,
                message.model AS model,
                message.stop_reason AS stop_reason,
                message.usage.input_tokens AS input_tokens,
                message.usage.output_tokens AS output_tokens,
                message.usage.cache_read_input_tokens AS cache_read,
                message.usage.cache_creation_input_tokens AS cache_write,
                to_json(message.content) AS content_json,
                source_file
            FROM v_raw_events
            WHERE type IN ('user', 'assistant');
            """
        )
        logger.debug("Registered view: messages")

        # One row per content block; every block-type-specific field is
        # extracted for every row and is simply NULL where it does not apply.
        con.execute(
            """
            CREATE OR REPLACE VIEW content_blocks AS
            SELECT
                m.session_id,
                m.uuid AS message_uuid,
                m.ts,
                m.role,
                json_extract_string(block, '$.type') AS block_type,
                json_extract_string(block, '$.text') AS text,
                json_extract_string(block, '$.id') AS tool_use_id_field,
                json_extract_string(block, '$.name') AS tool_name,
                json_extract(block, '$.input') AS tool_input,
                json_extract_string(block, '$.tool_use_id') AS tool_use_id,
                json_extract(block, '$.content') AS tool_result_content,
                json_extract_string(block, '$.thinking') AS thinking
            FROM messages m,
                UNNEST(json_extract(m.content_json, '$[*]')) AS t(block);
            """
        )
        logger.debug("Registered view: content_blocks")

        # One row per *message*, not per text block. Aggregating the text
        # blocks preserves enough context for useful embeddings; the per-block
        # fan-out made semantic search noisy (tiny fragments like
        # "Now run the tests" dominated results). Messages with no text blocks
        # (tool-use-only, tool-result-only) are omitted.
        con.execute(
            """
            CREATE OR REPLACE VIEW messages_text AS
            SELECT
                cb.message_uuid AS uuid,
                any_value(cb.session_id) AS session_id,
                any_value(cb.ts) AS ts,
                any_value(cb.role) AS role,
                string_agg(cb.text, '\n\n') AS text_content
            FROM content_blocks cb
            WHERE cb.block_type = 'text'
                AND cb.text IS NOT NULL
                AND length(cb.text) > 0
            GROUP BY cb.message_uuid
            HAVING length(string_agg(cb.text, '\n\n')) >= 32;
            """
        )
        logger.debug("Registered view: messages_text")

        con.execute(
            """
            CREATE OR REPLACE VIEW tool_calls AS
            SELECT
                cb.message_uuid,
                cb.session_id,
                cb.ts,
                cb.tool_name,
                cb.tool_use_id_field AS tool_use_id,
                cb.tool_input
            FROM content_blocks cb
            WHERE cb.block_type = 'tool_use';
            """
        )
        logger.debug("Registered view: tool_calls")

        con.execute(
            """
            CREATE OR REPLACE VIEW tool_results AS
            SELECT
                cb.message_uuid,
                cb.session_id,
                cb.ts,
                cb.tool_use_id,
                cb.tool_result_content AS content
            FROM content_blocks cb
            WHERE cb.block_type = 'tool_result';
            """
        )
        logger.debug("Registered view: tool_results")

        # DuckDB's ``UNNEST`` requires a LIST. ``json_extract(x, '$.todos')``
        # returns a JSON scalar (potentially an array) that UNNEST rejects.
        # The ``$.todos[*]`` wildcard path yields a ``JSON[]`` that UNNEST
        # accepts natively.
        con.execute(
            """
            CREATE OR REPLACE VIEW todo_events AS
            SELECT
                tc.session_id,
                tc.ts AS written_at,
                tc.message_uuid,
                json_extract_string(todo, '$.content') AS subject,
                json_extract_string(todo, '$.status') AS status,
                json_extract_string(todo, '$.activeForm') AS active_form,
                row_number() OVER (
                    PARTITION BY tc.session_id
                    ORDER BY tc.ts, tc.message_uuid
                ) AS snapshot_ix
            FROM tool_calls tc,
                UNNEST(json_extract(tc.tool_input, '$.todos[*]')) AS t(todo)
            WHERE tc.tool_name = 'TodoWrite';
            """
        )
        logger.debug("Registered view: todo_events")

        # Latest row per (session_id, subject), picked by highest snapshot_ix.
        con.execute(
            """
            CREATE OR REPLACE VIEW todo_state_current AS
            SELECT session_id, subject, status, active_form, written_at
            FROM (
                SELECT *,
                    row_number() OVER (
                        PARTITION BY session_id, subject
                        ORDER BY snapshot_ix DESC
                    ) AS rn
                FROM todo_events
            )
            WHERE rn = 1;
            """
        )
        logger.debug("Registered view: todo_state_current")

        # Subagent launchers: ``Task`` (pre-v2.1.63) and ``Agent`` (v2.1.63+).
        # Input shape: {subagent_type, description, prompt, run_in_background?}.
        con.execute(
            """
            CREATE OR REPLACE VIEW subagent_spawns AS
            SELECT
                session_id,
                ts AS spawned_at,
                message_uuid,
                tool_use_id,
                tool_name AS spawn_tool,
                json_extract_string(tool_input, '$.subagent_type') AS subagent_type,
                json_extract_string(tool_input, '$.description') AS description,
                json_extract_string(tool_input, '$.prompt') AS prompt,
                json_extract_string(tool_input, '$.run_in_background') AS run_in_background
            FROM tool_calls
            WHERE tool_name IN ('Task', 'Agent');
            """
        )
        logger.debug("Registered view: subagent_spawns")

        # Persistent task creation: ``TaskCreate`` (Claude Code v2.1.16+
        # interactive sessions) and the SDK-py mirror ``mcp__tasks__task_create``.
        # Input shape: {subject, description, activeForm?, metadata?}. Distinct
        # from subagent_spawns -- no subagent_type / prompt fields.
        con.execute(
            """
            CREATE OR REPLACE VIEW task_creations AS
            SELECT
                session_id,
                ts AS created_at,
                message_uuid,
                tool_use_id,
                tool_name AS create_tool,
                json_extract_string(tool_input, '$.subject') AS subject,
                json_extract_string(tool_input, '$.description') AS description,
                json_extract_string(tool_input, '$.activeForm') AS active_form,
                json_extract(tool_input, '$.metadata') AS metadata
            FROM tool_calls
            WHERE tool_name IN ('TaskCreate', 'mcp__tasks__task_create');
            """
        )
        logger.debug("Registered view: task_creations")

        # Task lifecycle updates: ``TaskUpdate`` (v2.1.16+) and the SDK-py
        # mirror ``mcp__tasks__task_update``. Native uses ``taskId`` (camel),
        # mcp variant uses ``id`` -- COALESCE to one column.
        con.execute(
            """
            CREATE OR REPLACE VIEW task_updates AS
            SELECT
                session_id,
                ts AS updated_at,
                message_uuid,
                tool_use_id,
                tool_name AS update_tool,
                COALESCE(
                    json_extract_string(tool_input, '$.taskId'),
                    json_extract_string(tool_input, '$.id')
                ) AS task_id,
                json_extract_string(tool_input, '$.status') AS status,
                json_extract(tool_input, '$.addBlockedBy') AS add_blocked_by,
                json_extract_string(tool_input, '$.owner') AS owner
            FROM tool_calls
            WHERE tool_name IN ('TaskUpdate', 'mcp__tasks__task_update');
            """
        )
        logger.debug("Registered view: task_updates")

        # Latest status per (session_id, task_id) by joining task_creations
        # to task_updates. The task_id on TaskCreate isn't carried in the
        # tool_input (the runtime assigns it), so we recover it from the
        # tool_result -- and fall back to row-position when the result is
        # missing. Mirrors ``todo_state_current`` for the v2.1.16+ family.
        con.execute(
            """
            CREATE OR REPLACE VIEW tasks_state_current AS
            WITH creates AS (
                SELECT
                    tc.session_id,
                    tc.created_at,
                    tc.subject,
                    tc.active_form,
                    tc.tool_use_id,
                    -- The runtime returns the assigned task id in tool_result.
                    -- Common shape: text content like "Task #N created..." or
                    -- a JSON {taskId: "N"}. Try both; fall back to per-session
                    -- creation order.
                    -- regexp_extract yields an empty string (not NULL) when
                    -- nothing matches, so NULLIF is required for COALESCE to
                    -- move on to the remaining fallbacks.
                    COALESCE(
                        NULLIF(
                            regexp_extract(
                                CAST(tr.content AS VARCHAR), 'Task #(\\d+)', 1
                            ),
                            ''
                        ),
                        json_extract_string(tr.content, '$.taskId'),
                        CAST(row_number() OVER (
                            PARTITION BY tc.session_id ORDER BY tc.created_at
                        ) AS VARCHAR)
                    ) AS task_id
                FROM task_creations tc
                LEFT JOIN tool_results tr USING (tool_use_id)
            ),
            latest_status AS (
                SELECT session_id, task_id, status, updated_at,
                    row_number() OVER (
                        PARTITION BY session_id, task_id
                        ORDER BY updated_at DESC
                    ) AS rn
                FROM task_updates
                WHERE task_id IS NOT NULL
            )
            SELECT
                c.session_id,
                c.task_id,
                c.subject,
                c.active_form,
                COALESCE(ls.status, 'pending') AS status,
                c.created_at,
                ls.updated_at AS last_updated_at
            FROM creates c
            LEFT JOIN latest_status ls
                ON ls.session_id = c.session_id
                AND ls.task_id = c.task_id
                AND ls.rn = 1;
            """
        )
        logger.debug("Registered view: tasks_state_current")

        # DEPRECATED: ``task_spawns`` predates the Task→Agent rename (v2.1.63)
        # and the TodoWrite→TaskCreate split (v2.1.16). It conflated subagent
        # launchers with task-tracker creation. Kept as a UNION ALL alias for
        # one release; new analytics should use ``subagent_spawns`` or
        # ``task_creations`` directly. Removed in the next minor release.
        con.execute(
            """
            CREATE OR REPLACE VIEW task_spawns AS
            SELECT
                session_id, spawned_at, message_uuid, tool_use_id,
                spawn_tool, subagent_type, description, prompt
            FROM subagent_spawns
            UNION ALL
            SELECT
                session_id, created_at AS spawned_at, message_uuid, tool_use_id,
                create_tool AS spawn_tool,
                NULL AS subagent_type,
                description,
                NULL AS prompt
            FROM task_creations;
            """
        )
        logger.debug("Registered view: task_spawns (deprecated)")

        # Every Skill / slash-command invocation observable in the transcripts,
        # unioned across the two shapes they take:
        #
        # * ``tool`` — the assistant invokes the built-in ``Skill`` tool with
        #   ``tool_input.skill = '<name>'``. Lives in ``tool_calls`` already.
        # * ``slash_command`` — the user types ``/<name>`` in chat, which
        #   Claude Code serializes into the text block as
        #   ``<command-name>/<name></command-name>`` (sometimes paired with
        #   ``<command-message>`` and ``<command-args>``).
        #
        # ``skill_id`` stays raw (``erpaval`` and ``personal-plugins:erpaval``
        # are distinct rows) — the ``skills_catalog`` seed emits both shapes
        # so the enriched ``skill_usage`` view joins cleanly either way.
        # ``<command-name>/<name></command-name>`` slash-command text lands in
        # two shapes across the corpus: inside a ``text`` block of a
        # list-typed ``content`` array (newer transcripts), and as a bare
        # VARCHAR ``message.content`` (older user turns). We scan both
        # so the slash-command surface isn't biased toward one era.
        cmd_name_re = "<command-name>/([A-Za-z0-9_:.-]+)</command-name>"
        args_re = "<command-args>([^<]*)</command-args>"
        con.execute(
            f"""
            CREATE OR REPLACE VIEW skill_invocations AS
            SELECT
                tc.session_id,
                tc.ts,
                tc.message_uuid,
                'tool' AS source,
                json_extract_string(tc.tool_input, '$.skill') AS skill_id,
                json_extract_string(tc.tool_input, '$.args') AS args,
                tc.tool_use_id
            FROM tool_calls tc
            WHERE tc.tool_name = 'Skill'
                AND json_extract_string(tc.tool_input, '$.skill') IS NOT NULL
            UNION ALL
            SELECT
                cb.session_id,
                cb.ts,
                cb.message_uuid,
                'slash_command' AS source,
                regexp_extract(cb.text, '{cmd_name_re}', 1) AS skill_id,
                NULLIF(regexp_extract(cb.text, '{args_re}', 1), '') AS args,
                NULL AS tool_use_id
            FROM content_blocks cb
            WHERE cb.role = 'user'
                AND cb.block_type = 'text'
                AND cb.text LIKE '%<command-name>/%'
                AND regexp_extract(cb.text, '{cmd_name_re}', 1) != ''
            UNION ALL
            SELECT
                m.session_id,
                m.ts,
                m.uuid AS message_uuid,
                'slash_command' AS source,
                regexp_extract(raw.txt, '{cmd_name_re}', 1) AS skill_id,
                NULLIF(regexp_extract(raw.txt, '{args_re}', 1), '') AS args,
                NULL AS tool_use_id
            FROM messages m,
                LATERAL (SELECT json_extract_string(m.content_json, '$') AS txt) raw
            WHERE m.role = 'user'
                AND json_type(m.content_json) = 'VARCHAR'
                AND raw.txt LIKE '%<command-name>/%'
                AND regexp_extract(raw.txt, '{cmd_name_re}', 1) != '';
            """
        )
        logger.debug("Registered view: skill_invocations")

        # One row per (parent session, agent) with the meta.json attributes
        # joined in on the identifiers extracted from the file paths.
        con.execute(
            """
            CREATE OR REPLACE VIEW subagent_sessions AS
            SELECT
                r.parent_session_id,
                r.agent_hex,
                any_value(m.agentType) AS agent_type,
                any_value(m.description) AS description,
                min(r.timestamp::TIMESTAMP) AS started_at,
                max(r.timestamp::TIMESTAMP) AS ended_at,
                count(*) AS message_count,
                any_value(r.source_file) AS transcript_path
            FROM v_raw_subagents r
            LEFT JOIN v_raw_subagent_meta m
                ON m.parent_session_id = r.parent_session_id
                AND m.agent_hex = r.agent_hex
            GROUP BY r.parent_session_id, r.agent_hex;
            """
        )
        logger.debug("Registered view: subagent_sessions")

        con.execute(
            """
            CREATE OR REPLACE VIEW subagent_messages AS
            SELECT
                uuid,
                parentUuid AS parent_uuid,
                sessionId AS session_id,
                parent_session_id,
                agent_hex,
                timestamp::TIMESTAMP AS ts,
                type,
                message.role AS role,
                message.model AS model,
                message.usage.input_tokens AS input_tokens,
                message.usage.output_tokens AS output_tokens,
                to_json(message.content) AS content_json,
                source_file
            FROM v_raw_subagents
            WHERE type IN ('user', 'assistant');
            """
        )
        logger.debug("Registered view: subagent_messages")
    except Exception:
        logger.exception("Failed to register derived views")
        raise
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
# ---------------------------------------------------------------------------
|
|
748
|
+
# Macros
|
|
749
|
+
# ---------------------------------------------------------------------------
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def _pricing_values_clause(pricing: dict[str, tuple[float, float]]) -> str:
|
|
753
|
+
"""Render a pricing dict as an inline SQL ``VALUES`` row list.
|
|
754
|
+
|
|
755
|
+
Parameters
|
|
756
|
+
----------
|
|
757
|
+
pricing
|
|
758
|
+
Mapping of ``model_name -> (input_rate, output_rate)`` per 1M tokens.
|
|
759
|
+
|
|
760
|
+
Returns
|
|
761
|
+
-------
|
|
762
|
+
str
|
|
763
|
+
Comma-separated ``('model', in, out)`` rows. Emits a sentinel row that
|
|
764
|
+
matches no real model if ``pricing`` is empty (DuckDB rejects empty
|
|
765
|
+
``VALUES`` lists).
|
|
766
|
+
"""
|
|
767
|
+
if not pricing:
|
|
768
|
+
return "('__no_pricing__', 0.0, 0.0)"
|
|
769
|
+
rows = [
|
|
770
|
+
f"('{model}', {in_rate}, {out_rate})"
|
|
771
|
+
for model, (in_rate, out_rate) in sorted(pricing.items())
|
|
772
|
+
]
|
|
773
|
+
return ", ".join(rows)
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def _safe_macro(con: duckdb.DuckDBPyConnection, name: str, ddl: str) -> None:
    """Run one macro DDL statement, logging a warning instead of raising.

    Some macros reference views that may not exist yet on this connection.
    Any ``duckdb.Error`` raised while executing ``ddl`` is caught and reported
    through ``logger.warning``; the macro is then simply absent. Other
    exception types are not intercepted.

    Parameters
    ----------
    con
        Open DuckDB connection.
    name
        Macro name, used only for log messages.
    ddl
        Complete ``CREATE OR REPLACE MACRO`` statement.
    """
    try:
        con.execute(ddl)
    except duckdb.Error as exc:
        # Deliberately best-effort: record the reason and carry on.
        logger.warning("Skipped macro {} (backing view missing): {}", name, exc)
    else:
        logger.debug("Registered analytics macro: {}", name)
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
def register_macros(
    con: duckdb.DuckDBPyConnection,
    settings: Settings | None = None,
) -> None:
    """Create the SQL macros used by the CLI and analysts.

    Macros are registered in three ordered groups:

    * Core macros, executed directly so a failure propagates to the caller:
      ``model_used``, ``cost_estimate``, ``tool_rank``, ``todo_velocity``,
      ``subagent_fanout``, ``semantic_search``.
    * Skill macros, created via :func:`_safe_macro`: ``skill_rank``,
      ``skill_source_mix``.
    * Analytics macros, created via :func:`_safe_macro` and therefore skipped
      with a warning when DuckDB rejects the DDL (for example because a
      backing view is missing): ``autonomy_trend``, ``work_mix``,
      ``success_rate_by_work``, ``cluster_top_terms``,
      ``community_top_topics``, ``sentiment_arc``, ``friction_counts``,
      ``friction_rate``, ``friction_examples``, ``unused_skills``.

    ``semantic_search(query_vec, k)`` is a table macro returning the ``k``
    rows of ``message_embeddings`` closest to ``query_vec`` by
    ``array_distance``, along with their cosine similarity.

    Parameters
    ----------
    con
        Open DuckDB connection. The views and the ``message_embeddings``
        relation referenced by the macro bodies are expected to be
        registered already.
    settings
        Optional :class:`Settings`; when given, its ``pricing`` mapping feeds
        ``cost_estimate``. Otherwise ``DEFAULT_PRICING`` is used.
    """
    if settings is None:
        pricing = DEFAULT_PRICING
    else:
        pricing = settings.pricing
    pricing_rows = _pricing_values_clause(pricing)

    # ------------------------------------------------------------------
    # Core macros -- plain ``con.execute`` so errors surface immediately.
    # ------------------------------------------------------------------
    core_statements = (
        # Any non-null model recorded for the session.
        """
        CREATE OR REPLACE MACRO model_used(sid) AS (
            SELECT any_value(model)
            FROM messages
            WHERE session_id = sid AND model IS NOT NULL
        );
        """,
        # The join strips a trailing ``-`` plus eight digits from the message's
        # model id before an equality comparison, so dated ids such as
        # ``claude-haiku-4-5-20251001`` resolve to the base pricing entry
        # ``claude-haiku-4-5``. Rates are per 1M tokens, hence the ``/ 1e6``.
        f"""
        CREATE OR REPLACE MACRO cost_estimate(sid) AS (
            SELECT sum(
                (coalesce(m.input_tokens, 0) + coalesce(m.cache_write, 0)) * p.in_rate
                + coalesce(m.output_tokens, 0) * p.out_rate
            ) / 1e6
            FROM messages m
            JOIN (VALUES {pricing_rows}) p(model, in_rate, out_rate)
              ON regexp_replace(m.model, '-\\d{{8}}$', '') = p.model
            WHERE m.session_id = sid
        );
        """,
        # Tool-call leaderboard over the trailing ``last_n_days`` days.
        """
        CREATE OR REPLACE MACRO tool_rank(last_n_days) AS TABLE (
            SELECT tool_name, count(*) AS n
            FROM tool_calls
            WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
              AND tool_name IS NOT NULL
            GROUP BY 1
            ORDER BY n DESC
        );
        """,
        # Completed rows divided by distinct subjects; NULL when the session
        # has no subjects (``NULLIF`` guards the division).
        """
        CREATE OR REPLACE MACRO todo_velocity(sid) AS (
            SELECT count(*) FILTER (WHERE status = 'completed')::DOUBLE
                   / NULLIF(count(DISTINCT subject), 0)
            FROM todo_state_current
            WHERE session_id = sid
        );
        """,
        # Number of subagent sessions spawned under a parent session.
        """
        CREATE OR REPLACE MACRO subagent_fanout(sid) AS (
            SELECT count(*)
            FROM subagent_sessions
            WHERE parent_session_id = sid
        );
        """,
        # Nearest-neighbour lookup; both cosine similarity and distance are
        # returned for human-readable ranking.
        # NOTE(review): the original comment says ``ORDER BY array_distance``
        # triggers the HNSW index rewrite. The VSS docs appear to pair
        # ``array_distance`` with the ``l2sq`` metric, while ``register_vss``
        # defaults to ``cosine`` -- confirm the index is actually used.
        """
        CREATE OR REPLACE MACRO semantic_search(query_vec, k) AS TABLE (
            SELECT me.uuid,
                   array_cosine_similarity(me.embedding, query_vec) AS sim,
                   array_distance(me.embedding, query_vec) AS distance
            FROM message_embeddings me
            ORDER BY array_distance(me.embedding, query_vec)
            LIMIT k
        );
        """,
    )
    for statement in core_statements:
        con.execute(statement)

    # ------------------------------------------------------------------
    # Skill macros -- both read from ``skill_usage``.
    # ------------------------------------------------------------------
    skill_macros = (
        # Skill / slash-command leaderboard over the last N days.
        (
            "skill_rank",
            """
            CREATE OR REPLACE MACRO skill_rank(last_n_days) AS TABLE (
                SELECT skill_id,
                       skill_name,
                       plugin,
                       is_builtin,
                       count(*) AS n,
                       count(DISTINCT session_id) AS sessions
                FROM skill_usage
                WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
                GROUP BY 1, 2, 3, 4
                ORDER BY n DESC
            );
            """,
        ),
        # Invocation-channel split per skill: ``n_tool`` counts rows with
        # source 'tool', ``n_slash`` rows with source 'slash_command'.
        # Built-in skills are filtered out.
        (
            "skill_source_mix",
            """
            CREATE OR REPLACE MACRO skill_source_mix(last_n_days) AS TABLE (
                SELECT skill_id,
                       skill_name,
                       count(*) FILTER (WHERE source = 'tool') AS n_tool,
                       count(*) FILTER (WHERE source = 'slash_command') AS n_slash,
                       count(*) AS n_total
                FROM skill_usage
                WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
                  AND NOT is_builtin
                GROUP BY 1, 2
                ORDER BY n_total DESC
            );
            """,
        ),
    )
    for macro_name, statement in skill_macros:
        _safe_macro(con, macro_name, statement)

    logger.debug(
        "Registered macros: model_used, cost_estimate, tool_rank, "
        "todo_velocity, subagent_fanout, semantic_search, skill_rank, "
        "skill_source_mix"
    )

    # ------------------------------------------------------------------
    # Analytics macros -- every one goes through _safe_macro, so a DDL
    # that DuckDB rejects is downgraded to a warning.
    # ------------------------------------------------------------------
    analytics_macros = (
        # Weekly counts per autonomy tier inside a trailing window.
        (
            "autonomy_trend",
            """
            CREATE OR REPLACE MACRO autonomy_trend(window_days) AS TABLE (
                SELECT
                    date_trunc('week', classified_at) AS week,
                    autonomy_tier,
                    count(*) AS n
                FROM session_classifications
                WHERE classified_at >= current_timestamp - (window_days * INTERVAL 1 DAY)
                GROUP BY 1, 2
                ORDER BY 1, 2
            );
            """,
        ),
        # Work-category mix in the last N days.
        (
            "work_mix",
            """
            CREATE OR REPLACE MACRO work_mix(since_days) AS TABLE (
                SELECT work_category, count(*) AS n
                FROM session_classifications
                WHERE classified_at >= current_timestamp - (since_days * INTERVAL 1 DAY)
                GROUP BY 1
                ORDER BY n DESC
            );
            """,
        ),
        # Success / failure / partial share per work category.
        (
            "success_rate_by_work",
            """
            CREATE OR REPLACE MACRO success_rate_by_work(since_days) AS TABLE (
                SELECT
                    work_category,
                    count(*) AS sessions,
                    count(*) FILTER (WHERE success = 'success')::DOUBLE
                        / NULLIF(count(*), 0) AS success_rate,
                    count(*) FILTER (WHERE success = 'failure')::DOUBLE
                        / NULLIF(count(*), 0) AS failure_rate,
                    count(*) FILTER (WHERE success = 'partial')::DOUBLE
                        / NULLIF(count(*), 0) AS partial_rate
                FROM session_classifications
                WHERE classified_at >= current_timestamp - (since_days * INTERVAL 1 DAY)
                GROUP BY 1
                ORDER BY sessions DESC
            );
            """,
        ),
        # First ``n`` terms of one cluster, ordered by ``rank``.
        (
            "cluster_top_terms",
            """
            CREATE OR REPLACE MACRO cluster_top_terms(cid, n) AS TABLE (
                SELECT term, weight, rank
                FROM cluster_terms
                WHERE cluster_id = cid
                ORDER BY rank
                LIMIT n
            );
            """,
        ),
        # Clusters contributing the most messages to a community. Clusters
        # with a negative id are excluded, and each row carries the terms
        # with rank <= 5 joined into one string for context.
        (
            "community_top_topics",
            """
            CREATE OR REPLACE MACRO community_top_topics(cid, n) AS TABLE (
                WITH community_msgs AS (
                    SELECT CAST(m.uuid AS VARCHAR) AS uuid
                    FROM messages m
                    JOIN session_communities sc
                      ON CAST(m.session_id AS VARCHAR) = sc.session_id
                    WHERE sc.community_id = cid
                ),
                cluster_counts AS (
                    SELECT mc.cluster_id, count(*) AS n_msgs
                    FROM message_clusters mc
                    JOIN community_msgs cm USING (uuid)
                    WHERE mc.cluster_id >= 0
                    GROUP BY mc.cluster_id
                )
                SELECT cc.cluster_id, cc.n_msgs,
                       (SELECT string_agg(term, ', ' ORDER BY rank)
                        FROM cluster_terms ct
                        WHERE ct.cluster_id = cc.cluster_id
                          AND ct.rank <= 5) AS top_terms
                FROM cluster_counts cc
                ORDER BY n_msgs DESC
                LIMIT n
            );
            """,
        ),
        # Chronological per-message sentiment rows for one session.
        (
            "sentiment_arc",
            """
            CREATE OR REPLACE MACRO sentiment_arc(sid) AS TABLE (
                SELECT m.ts, m.role, mt.sentiment_delta, mt.is_transition, mt.confidence
                FROM messages m
                JOIN message_trajectory mt
                  ON CAST(m.uuid AS VARCHAR) = mt.uuid
                WHERE CAST(m.session_id AS VARCHAR) = sid
                ORDER BY m.ts
            );
            """,
        ),
        # Counts per friction label, windowed on the row's ``ts``. Passing
        # ``NULL`` disables the window and covers every row. The 'none'
        # label is excluded from the output.
        (
            "friction_counts",
            """
            CREATE OR REPLACE MACRO friction_counts(since_days) AS TABLE (
                SELECT label,
                       count(*) AS n,
                       count(DISTINCT session_id) AS sessions,
                       avg(confidence) AS avg_confidence,
                       sum(CASE WHEN source='regex' THEN 1 ELSE 0 END) AS n_regex,
                       sum(CASE WHEN source='llm' THEN 1 ELSE 0 END) AS n_llm
                FROM user_friction
                WHERE label != 'none'
                  AND (since_days IS NULL
                       OR ts >= current_timestamp - (since_days * INTERVAL 1 DAY))
                GROUP BY label
                ORDER BY n DESC
            );
            """,
        ),
        # Per-session friction pressure: non-'none' friction rows versus the
        # count of user-role rows in ``messages_text`` over the same window.
        # Only sessions with at least one friction hit are returned.
        (
            "friction_rate",
            """
            CREATE OR REPLACE MACRO friction_rate(since_days) AS TABLE (
                WITH hits AS (
                    SELECT session_id,
                           count(*) FILTER (WHERE label != 'none') AS n_friction,
                           count(*) FILTER (WHERE label = 'status_ping') AS n_status,
                           count(*) FILTER (WHERE label = 'unmet_expectation') AS n_unmet,
                           count(*) FILTER (WHERE label = 'confusion') AS n_confusion,
                           count(*) FILTER (WHERE label = 'interruption') AS n_interruption,
                           count(*) FILTER (WHERE label = 'correction') AS n_correction,
                           count(*) FILTER (WHERE label = 'frustration') AS n_frustration
                    FROM user_friction
                    WHERE since_days IS NULL
                       OR ts >= current_timestamp - (since_days * INTERVAL 1 DAY)
                    GROUP BY session_id
                ),
                user_msgs AS (
                    SELECT CAST(mt.session_id AS VARCHAR) AS session_id,
                           count(*) AS n_user_msgs
                    FROM messages_text mt
                    WHERE mt.role = 'user'
                      AND (since_days IS NULL
                           OR mt.ts >= current_timestamp - (since_days * INTERVAL 1 DAY))
                    GROUP BY 1
                )
                SELECT h.session_id,
                       h.n_friction,
                       h.n_status, h.n_unmet, h.n_confusion,
                       h.n_interruption, h.n_correction, h.n_frustration,
                       COALESCE(um.n_user_msgs, 0) AS n_user_msgs,
                       h.n_friction::DOUBLE / NULLIF(um.n_user_msgs, 0) AS rate
                FROM hits h
                LEFT JOIN user_msgs um USING (session_id)
                WHERE h.n_friction > 0
                ORDER BY h.n_friction DESC
            );
            """,
        ),
        # Up to ``n`` example rows for one friction label, highest
        # confidence first and newest first within equal confidence.
        (
            "friction_examples",
            """
            CREATE OR REPLACE MACRO friction_examples(label_name, n) AS TABLE (
                SELECT session_id, ts, text_snippet, rationale, source, confidence
                FROM user_friction
                WHERE label = label_name
                ORDER BY confidence DESC, ts DESC
                LIMIT n
            );
            """,
        ),
        # Catalog entries with no invocation in the last N days (anti-join
        # against ``skill_invocations``). Only the three listed
        # ``source_kind`` values are considered.
        (
            "unused_skills",
            """
            CREATE OR REPLACE MACRO unused_skills(last_n_days) AS TABLE (
                SELECT cat.skill_id,
                       cat.name,
                       cat.plugin,
                       cat.plugin_version,
                       cat.source_kind,
                       cat.description
                FROM skills_catalog cat
                LEFT JOIN (
                    SELECT DISTINCT skill_id
                    FROM skill_invocations
                    WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
                ) used USING (skill_id)
                WHERE used.skill_id IS NULL
                  AND cat.source_kind IN ('user-skill', 'plugin-skill', 'plugin-command')
                ORDER BY cat.plugin NULLS FIRST, cat.name
            );
            """,
        ),
    )
    for macro_name, statement in analytics_macros:
        _safe_macro(con, macro_name, statement)
|
|
1209
|
+
|
|
1210
|
+
|
|
1211
|
+
# ---------------------------------------------------------------------------
|
|
1212
|
+
# VSS
|
|
1213
|
+
# ---------------------------------------------------------------------------
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
def _hnsw_rebuild_needed(parquet: Path, hnsw_db: Path) -> bool:
|
|
1217
|
+
"""Decide from filesystem state alone whether the parquet has shifted.
|
|
1218
|
+
|
|
1219
|
+
Handles both legacy single-file caches and sharded directories: for a
|
|
1220
|
+
sharded directory we compare against the *latest* part file's mtime so
|
|
1221
|
+
a brand-new shard invalidates the persisted HNSW even when the
|
|
1222
|
+
directory's own mtime hasn't moved (some filesystems update dir mtime
|
|
1223
|
+
only on add/remove, not on touch of children).
|
|
1224
|
+
|
|
1225
|
+
This is a *necessary* but not sufficient signal — even when the
|
|
1226
|
+
parquet hasn't moved, the attached store might be empty (for instance,
|
|
1227
|
+
DuckDB's ATTACH on a missing path creates a ~12 KB header-only file
|
|
1228
|
+
before any tables exist). Catalog existence is checked separately
|
|
1229
|
+
inside ``register_vss`` after the ATTACH.
|
|
1230
|
+
"""
|
|
1231
|
+
if not hnsw_db.exists():
|
|
1232
|
+
return True
|
|
1233
|
+
parts = iter_part_files(parquet)
|
|
1234
|
+
if not parts:
|
|
1235
|
+
# No source-of-truth on disk yet. The attached store is whatever
|
|
1236
|
+
# was previously persisted; nothing to rebuild from.
|
|
1237
|
+
return False
|
|
1238
|
+
latest_ns = max(p.stat().st_mtime_ns for p in parts)
|
|
1239
|
+
return latest_ns > hnsw_db.stat().st_mtime_ns
|
|
1240
|
+
|
|
1241
|
+
|
|
1242
|
+
def _attached_embeddings_table_present(con: duckdb.DuckDBPyConnection) -> bool:
    """Check the catalog for ``hnsw_store.main.message_embeddings``.

    Queries ``duckdb_tables()`` for a table named ``message_embeddings`` in
    schema ``main`` of the database named ``hnsw_store`` and returns ``True``
    when at least one matching entry is found.
    """
    lookup_sql = """
        SELECT count(*)
        FROM duckdb_tables()
        WHERE database_name = 'hnsw_store'
          AND schema_name = 'main'
          AND table_name = 'message_embeddings';
        """
    hit = con.execute(lookup_sql).fetchone()
    if not hit:
        return False
    return bool(hit[0])
|
|
1254
|
+
|
|
1255
|
+
|
|
1256
|
+
def register_vss(
    con: duckdb.DuckDBPyConnection,
    *,
    embeddings_parquet: Path,
    hnsw_db_path: Path | None = None,
    dim: int = 1024,
    metric: str = "cosine",
    ef_construction: int = 128,
    ef_search: int = 64,
    m: int = 16,
    m0: int = 32,
) -> bool:
    """Install + load VSS and bind ``message_embeddings`` over an HNSW index.

    When ``hnsw_db_path`` is provided, the embeddings table and its HNSW
    index live inside that DuckDB file (ATTACHed under the alias
    ``hnsw_store``) and a top-level ``message_embeddings`` view points at the
    attached table, so callers keep using the unqualified name. The store is
    rebuilt from parquet when :func:`_hnsw_rebuild_needed` reports the parquet
    is newer (or the store file is missing), or when the attached database
    has no ``message_embeddings`` table. A ``duckdb.IOException`` during
    ATTACH unlinks the store file and attaches a fresh one.

    When ``hnsw_db_path`` is ``None`` the table and index are created in the
    connection's main database and are always rebuilt.

    Parameters
    ----------
    con
        Open DuckDB connection.
    embeddings_parquet
        Location of the embeddings parquet; resolved to concrete files via
        ``iter_part_files``.
    hnsw_db_path
        DuckDB file that backs the persisted table and index, or ``None`` to
        keep everything in the connection's main database.
    dim
        Fixed-length embedding dimension; the ``embedding`` column is cast
        to ``FLOAT[dim]``.
    metric
        HNSW distance metric. One of ``cosine``, ``l2sq``, ``ip``.
    ef_construction, ef_search, m, m0
        HNSW tuning knobs passed to ``CREATE INDEX``; ``m`` and ``m0`` map to
        the ``M`` and ``M0`` index options.

    Returns
    -------
    bool
        ``True`` once the table is populated (freshly built or reused) and
        counted. ``False`` when no parquet part files exist; in that case an
        empty ``message_embeddings`` table is still created so dependent
        queries can bind.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported values.

    Notes
    -----
    All numeric knobs are coerced with ``int`` and ``metric`` is validated
    against a fixed set before being interpolated into DDL. File paths are
    rendered through ``_sql_str`` so quotes in a path cannot break the
    statement. Persistence relies on the
    ``hnsw_enable_experimental_persistence`` setting enabled below.
    """
    dim_i = int(dim)
    ef_c_i = int(ef_construction)
    ef_s_i = int(ef_search)
    m_i = int(m)
    m0_i = int(m0)
    if metric not in {"cosine", "l2sq", "ip"}:
        raise ValueError(f"Unsupported HNSW metric: {metric!r}")

    con.execute("INSTALL vss;")
    con.execute("LOAD vss;")
    con.execute("SET hnsw_enable_experimental_persistence = true;")

    use_persistence = hnsw_db_path is not None
    schema_qualifier = ""
    persisted_path: Path | None = hnsw_db_path
    # Top-level alias so unqualified ``message_embeddings`` references keep
    # working when the real table lives in the attached store.
    view_ddl = (
        "CREATE OR REPLACE VIEW message_embeddings AS "
        "SELECT * FROM hnsw_store.main.message_embeddings;"
    )
    if use_persistence and persisted_path is not None:
        persisted_path.parent.mkdir(parents=True, exist_ok=True)
        # Escape the path the same way the parquet paths are escaped below;
        # DDL does not accept prepared parameters.
        attach_sql = f"ATTACH {_sql_str(str(persisted_path))} AS hnsw_store;"
        try:
            con.execute(attach_sql)
        except duckdb.IOException as exc:
            logger.warning(
                "ATTACH on {} failed ({}); unlinking and rebuilding the HNSW store.",
                persisted_path,
                exc,
            )
            with contextlib.suppress(FileNotFoundError):
                persisted_path.unlink()
            con.execute(attach_sql)
        schema_qualifier = "hnsw_store.main."

    parts = iter_part_files(embeddings_parquet)
    if not parts:
        logger.warning(
            "No embeddings parquet at {}; skipping HNSW index build. "
            "Run `claude-sql embed` to backfill.",
            embeddings_parquet,
        )
        # Create an empty, correctly typed table so dependents still bind.
        con.execute(
            f"""
            CREATE OR REPLACE TABLE {schema_qualifier}message_embeddings (
                uuid VARCHAR PRIMARY KEY,
                model VARCHAR,
                dim USMALLINT,
                embedding FLOAT[{dim_i}]
            );
            """
        )
        if use_persistence:
            con.execute(view_ddl)
        return False

    # Without persistence there is nothing to reuse, so always build.
    rebuild = not use_persistence
    if use_persistence and persisted_path is not None:
        # Two reasons to rebuild: the parquet is newer than the on-disk
        # store, or the attached store has no embeddings table yet.
        rebuild = _hnsw_rebuild_needed(
            embeddings_parquet, persisted_path
        ) or not _attached_embeddings_table_present(con)

    if rebuild:
        # Drop any stale table first so CREATE TABLE starts from a clean
        # slate; per the original note, DROP TABLE also removes its indexes.
        con.execute(f"DROP TABLE IF EXISTS {schema_qualifier}message_embeddings;")
        # ``parts`` may be a single legacy file or a list of shard files.
        path_literals = ", ".join(_sql_str(str(p)) for p in parts)
        con.execute(
            f"""
            CREATE TABLE {schema_qualifier}message_embeddings AS
            SELECT
                uuid,
                model,
                dim,
                CAST(embedding AS FLOAT[{dim_i}]) AS embedding
            FROM read_parquet([{path_literals}]);
            """
        )
        con.execute(
            f"""
            CREATE INDEX idx_msg_hnsw
            ON {schema_qualifier}message_embeddings
            USING HNSW (embedding)
            WITH (
                metric='{metric}',
                ef_construction={ef_c_i},
                ef_search={ef_s_i},
                M={m_i},
                M0={m0_i}
            );
            """
        )
        if use_persistence:
            # Flush the freshly built table and index to the store file.
            con.execute("CHECKPOINT hnsw_store;")

    if use_persistence:
        con.execute(view_ddl)

    row = con.execute(f"SELECT count(*) FROM {schema_qualifier}message_embeddings;").fetchone()
    count = int(row[0]) if row else 0
    logger.debug(
        "{} {} embeddings (metric={}, M={}, ef_search={}, persistent={})",
        "Built" if rebuild else "Reused persisted",
        count,
        metric,
        m_i,
        ef_s_i,
        use_persistence,
    )
    return True
|
|
1433
|
+
|
|
1434
|
+
|
|
1435
|
+
# ---------------------------------------------------------------------------
|
|
1436
|
+
# v2 analytics views
|
|
1437
|
+
# ---------------------------------------------------------------------------
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
def _parquet_is_populated(path: Path | None) -> bool:
|
|
1441
|
+
"""Return True when ``path`` has at least one usable parquet under it.
|
|
1442
|
+
|
|
1443
|
+
Handles both legacy single-file caches (``<name>.parquet``) and the
|
|
1444
|
+
sharded directory layout (``<name>/part-<ts>.parquet``). An empty
|
|
1445
|
+
directory or a single zero-byte file both count as "not populated"
|
|
1446
|
+
so an aborted run can't trick view registration into pointing at
|
|
1447
|
+
rubbish.
|
|
1448
|
+
"""
|
|
1449
|
+
if path is None:
|
|
1450
|
+
return False
|
|
1451
|
+
parts = iter_part_files(path)
|
|
1452
|
+
return any(p.stat().st_size > 16 for p in parts)
|
|
1453
|
+
|
|
1454
|
+
|
|
1455
|
+
def register_analytics(
    con: duckdb.DuckDBPyConnection,
    *,
    settings: Settings | None = None,
    classifications_parquet: Path | None = None,
    trajectory_parquet: Path | None = None,
    conflicts_parquet: Path | None = None,
    clusters_parquet: Path | None = None,
    cluster_terms_parquet: Path | None = None,
    communities_parquet: Path | None = None,
    user_friction_parquet: Path | None = None,
    skills_catalog_parquet: Path | None = None,
) -> None:
    """Register v2 analytics parquets as DuckDB views.

    Creates one ``CREATE OR REPLACE VIEW`` per populated parquet source:
    ``session_classifications``, ``message_trajectory``,
    ``session_conflicts``, ``message_clusters``, ``cluster_terms``,
    ``session_communities``, ``user_friction`` and ``skills_catalog``.

    Two derived views are layered on top:

    * ``session_goals`` -- a projection over ``session_classifications``,
      created only when that view was registered in this call.
    * ``skill_usage`` -- always attempted. It left-joins
      ``skill_invocations`` against ``skills_catalog`` when the catalog
      view was registered, and otherwise falls back to a pass-through
      with ``skill_name = skill_id``, NULL catalog columns and
      ``is_builtin = false``.

    A source counts as populated when at least one of its part files is
    larger than 16 bytes. Unpopulated sources are skipped with a
    ``logger.debug`` message (not a warning: missing analytics parquets
    are the normal state until the matching generator has been run), so
    the function can be called before, during, or after an analytics
    pipeline run and picks up whatever is on disk.

    Failures are contained per view: a ``duckdb.Error`` while creating a
    view is logged with ``logger.exception`` and an ``OSError`` while
    scanning a source is logged with ``logger.warning``; in both cases the
    remaining views are still processed and nothing is re-raised.

    Analytics macros (``autonomy_trend`` et al.) are **not** registered here
    -- they belong to :func:`register_macros`, which must be called
    afterwards so macro bodies bind against the just-created views.

    Parameters
    ----------
    con
        Open DuckDB connection.
    settings
        Optional :class:`Settings` whose ``*_parquet_path`` fields drive the
        per-view parquet locations. A default instance is created when
        ``None``. If an explicit per-parquet argument is also supplied,
        the explicit path wins.
    classifications_parquet, trajectory_parquet, conflicts_parquet,
    clusters_parquet, cluster_terms_parquet, communities_parquet,
    user_friction_parquet, skills_catalog_parquet
        Optional explicit paths, useful for tests and ad-hoc wiring. Each
        defaults to the matching ``settings.*_parquet_path`` when not
        provided.
    """
    resolved = settings if settings is not None else Settings()
    view_to_path: dict[str, Path] = {
        "session_classifications": classifications_parquet
        if classifications_parquet is not None
        else resolved.classifications_parquet_path,
        "message_trajectory": trajectory_parquet
        if trajectory_parquet is not None
        else resolved.trajectory_parquet_path,
        "session_conflicts": conflicts_parquet
        if conflicts_parquet is not None
        else resolved.conflicts_parquet_path,
        "message_clusters": clusters_parquet
        if clusters_parquet is not None
        else resolved.clusters_parquet_path,
        "cluster_terms": cluster_terms_parquet
        if cluster_terms_parquet is not None
        else resolved.cluster_terms_parquet_path,
        "session_communities": communities_parquet
        if communities_parquet is not None
        else resolved.communities_parquet_path,
        "user_friction": user_friction_parquet
        if user_friction_parquet is not None
        else resolved.user_friction_parquet_path,
        "skills_catalog": skills_catalog_parquet
        if skills_catalog_parquet is not None
        else resolved.skills_catalog_parquet_path,
    }

    # View projections keyed by view name. A missing or ``None`` projection
    # means ``SELECT *``; a string is spliced in verbatim so the wrapper view
    # can add convenience alias columns (e.g. ``autonomy`` alongside
    # ``autonomy_tier``). These aliases are additive: the original column
    # names continue to work so existing queries never break.
    view_projections: dict[str, str | None] = {
        "session_classifications": (
            "*, autonomy_tier AS autonomy, success AS success_outcome, work_category AS category"
        ),
        "message_trajectory": ("*, sentiment_delta AS sentiment, is_transition AS transition"),
        "session_conflicts": ("*, resolution AS conflict_resolution"),
    }

    registered: set[str] = set()
    for view_name, path in view_to_path.items():
        # Scan each source exactly once. The same list decides whether the
        # view is registered and which files it reads, so the "populated"
        # check and the ``read_parquet`` argument can never disagree.
        # Sharded directories list every part file; legacy single-file paths
        # become a one-element list. ``read_parquet`` accepts both.
        try:
            parts = (
                [p for p in iter_part_files(path) if p.stat().st_size > 16]
                if path is not None
                else []
            )
        except OSError as exc:
            # A shard can disappear between listing and ``stat`` while a
            # generator is rewriting its output; treat that like any other
            # per-view failure instead of aborting the whole registration.
            logger.warning(
                "register_analytics: skipping {} (could not inspect parquet at {}: {})",
                view_name,
                path,
                exc,
            )
            continue

        if not parts:
            # Missing analytics parquets are the default state until the user
            # runs the corresponding generator (classify / cluster / ...), so
            # they belong at DEBUG -- otherwise every query command floods the
            # terminal with warnings about work the user hasn't yet asked for.
            logger.debug(
                "register_analytics: skipping {} (parquet missing at {})",
                view_name,
                path,
            )
            continue

        projection = view_projections.get(view_name) or "*"
        path_literals = ", ".join(_sql_str(str(p)) for p in parts)
        try:
            con.execute(
                f"CREATE OR REPLACE VIEW {view_name} AS "
                f"SELECT {projection} FROM read_parquet([{path_literals}]);"
            )
            logger.debug("Registered analytics view: {} (source={})", view_name, path)
            registered.add(view_name)
        except duckdb.Error:
            logger.exception("Failed to register analytics view {} from {}", view_name, path)

    # ``session_goals`` is a thin projection of ``session_classifications``;
    # only materialize it when the upstream view exists.
    if "session_classifications" in registered:
        try:
            con.execute(
                """
                CREATE OR REPLACE VIEW session_goals AS
                SELECT session_id, goal, confidence, classified_at
                FROM session_classifications;
                """
            )
            logger.debug("Registered analytics view: session_goals")
        except duckdb.Error:
            logger.exception("Failed to register session_goals view")

    # ``skill_usage`` joins ``skill_invocations`` (always-on) against the
    # catalog for human-readable labels + ``is_builtin`` tagging. When the
    # catalog parquet is absent the view still works, but every row gets a
    # ``skill_name = skill_id`` pass-through and ``is_builtin = false``.
    try:
        if "skills_catalog" in registered:
            con.execute(
                """
                CREATE OR REPLACE VIEW skill_usage AS
                SELECT
                    si.session_id,
                    si.ts,
                    si.message_uuid,
                    si.source,
                    si.skill_id,
                    si.args,
                    si.tool_use_id,
                    coalesce(cat.name, si.skill_id) AS skill_name,
                    cat.plugin AS plugin,
                    cat.plugin_version AS plugin_version,
                    cat.description AS description,
                    cat.source_kind AS source_kind,
                    coalesce(cat.source_kind = 'builtin', false) AS is_builtin
                FROM skill_invocations si
                LEFT JOIN skills_catalog cat ON cat.skill_id = si.skill_id;
                """
            )
        else:
            con.execute(
                """
                CREATE OR REPLACE VIEW skill_usage AS
                SELECT
                    si.session_id,
                    si.ts,
                    si.message_uuid,
                    si.source,
                    si.skill_id,
                    si.args,
                    si.tool_use_id,
                    si.skill_id AS skill_name,
                    CAST(NULL AS VARCHAR) AS plugin,
                    CAST(NULL AS VARCHAR) AS plugin_version,
                    CAST(NULL AS VARCHAR) AS description,
                    CAST(NULL AS VARCHAR) AS source_kind,
                    false AS is_builtin
                FROM skill_invocations si;
                """
            )
        logger.debug("Registered analytics view: skill_usage")
    except duckdb.Error:
        logger.exception("Failed to register skill_usage view")
|
|
1636
|
+
|
|
1637
|
+
|
|
1638
|
+
def register_all(
    con: duckdb.DuckDBPyConnection,
    *,
    settings: Settings | None = None,
    include_analytics: bool = True,
) -> None:
    """Run every registration step against ``con`` in dependency order.

    The sequence is: raw views, derived views, VSS, (optionally) analytics
    views, and finally macros.

    Parameters
    ----------
    con
        Open DuckDB connection.
    settings
        Optional :class:`Settings`; a default instance is created when
        absent.
    include_analytics
        When ``True`` (default), :func:`register_analytics` is called
        before :func:`register_macros` so the v2 analytics macros can bind
        against the freshly-registered analytics views. Pass ``False`` to
        skip analytics view registration entirely (useful in tests that
        only exercise v1 macros, or when the caller registers analytics
        views out-of-band).

    Notes
    -----
    The macro step is deliberately last, for two reasons:

    1. ``register_vss`` has to precede it because the ``semantic_search``
       macro body references the ``message_embeddings`` table and macro
       bodies are resolved when the macro is created.
    2. ``register_analytics`` has to precede it so the analytics macros
       (``autonomy_trend``, ``cluster_top_terms``, ...) find their views at
       macro-creation time. How a macro with a missing backing parquet is
       handled is decided inside :func:`register_macros`, not here.
    """
    cfg = settings or Settings()

    register_raw(
        con,
        glob=cfg.default_glob,
        subagent_glob=cfg.subagent_glob,
        subagent_meta_glob=cfg.subagent_meta_glob,
    )
    register_views(con)

    # Gather the embedding/HNSW knobs first so the call itself stays short.
    vss_options = {
        "embeddings_parquet": cfg.embeddings_parquet_path,
        "hnsw_db_path": cfg.hnsw_db_path,
        "dim": int(cfg.output_dimension),
        "metric": cfg.hnsw_metric,
        "ef_construction": cfg.hnsw_ef_construction,
        "ef_search": cfg.hnsw_ef_search,
        "m": cfg.hnsw_m,
        "m0": cfg.hnsw_m0,
    }
    register_vss(con, **vss_options)

    if include_analytics:
        register_analytics(con, settings=cfg)

    # Macros go last: their bodies reference objects created above.
    register_macros(con, settings=cfg)
|
|
1695
|
+
|
|
1696
|
+
|
|
1697
|
+
# ---------------------------------------------------------------------------
|
|
1698
|
+
# Introspection
|
|
1699
|
+
# ---------------------------------------------------------------------------
|
|
1700
|
+
|
|
1701
|
+
|
|
1702
|
+
def describe_all(con: duckdb.DuckDBPyConnection) -> dict[str, list[tuple[str, str]]]:
    """Collect the column schema of each view listed in ``VIEW_NAMES``.

    Parameters
    ----------
    con
        Open DuckDB connection with views registered.

    Returns
    -------
    dict
        ``{view_name: [(column_name, column_type), ...]}``, one entry per
        name in ``VIEW_NAMES``. A view whose ``DESCRIBE`` raises
        ``duckdb.Error`` (for instance because it was never registered)
        maps to an empty list and a warning is logged.
    """
    schemas: dict[str, list[tuple[str, str]]] = {}
    for view in VIEW_NAMES:
        columns: list[tuple[str, str]] = []
        try:
            # DESCRIBE rows start with the column name followed by its type.
            for row in con.execute(f"DESCRIBE {view}").fetchall():
                columns.append((str(row[0]), str(row[1])))
        except duckdb.Error as exc:
            logger.warning("Could not describe {}: {}", view, exc)
            columns = []
        schemas[view] = columns
    return schemas
|
|
1726
|
+
|
|
1727
|
+
|
|
1728
|
+
def list_macros(con: duckdb.DuckDBPyConnection) -> list[str]:
    """List the names of the macros living in this connection's ``main`` schema.

    Parameters
    ----------
    con
        Open DuckDB connection.

    Returns
    -------
    list[str]
        Macro function names, covering both scalar (``macro``) and table
        (``table_macro``) macros. Deduplication and ordering are done by
        the query itself (``SELECT DISTINCT ... ORDER BY function_name``).
    """
    macro_query = """
        SELECT DISTINCT function_name
        FROM duckdb_functions()
        WHERE schema_name = 'main'
          AND function_type IN ('macro', 'table_macro')
        ORDER BY function_name
        """
    records = con.execute(macro_query).fetchall()
    # Each record is a one-column row holding the function name.
    return [str(function_name) for (function_name,) in records]
|