claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1751 @@
1
+ """DuckDB view, macro, and VSS registry for claude-sql.
2
+
3
+ Wires a DuckDB connection to the on-disk ``~/.claude/`` JSONL transcript corpus
4
+ and exposes it as a stable set of zero-copy SQL views, analytical macros, and
5
+ an HNSW-indexed embeddings table. v2 analytics outputs (classifications,
6
+ trajectory, conflicts, clusters, communities) are surfaced as parquet-backed
7
+ views alongside the transcript-derived views.
8
+
9
+ Design notes
10
+ ------------
11
+ * Reads are zero-copy via ``read_json(..., filename=true)`` -- no intermediate
12
+ parquet ingestion; the corpus is queried in place. ``filename`` unlocks
13
+ file-level predicate pushdown (DuckDB 1.3+).
14
+ * Nested ``message.content`` is left as JSON and flattened at query time via
15
+ ``UNNEST(json_extract(content_json, '$[*]'))``. This keeps views resilient
16
+ to new content block types (``text``, ``tool_use``, ``tool_result``,
17
+ ``thinking``, ...).
18
+ * Subagent transcripts live in sibling ``agent-<hex>.jsonl`` files under
19
+ ``subagents/`` with ``*.meta.json`` partners; they surface via dedicated
20
+ views so primary-session views stay pure.
21
+ * v2 analytics views (``session_classifications``, ``message_trajectory``,
22
+ ``session_conflicts``, ``message_clusters``, ``cluster_terms``,
23
+ ``session_communities``, and the derived ``session_goals``) are created by
24
+ :func:`register_analytics` from the corresponding parquet files. Each is
25
+ skipped with a warning when its parquet is missing, so the function is
26
+ idempotent on partially-populated systems.
27
+ * All views use ``CREATE OR REPLACE`` so callers may safely re-register.
28
+ * Globs are inlined into DDL (DuckDB rejects prepared parameters as
29
+ table-function arguments); ``sample_size`` and ``maximum_object_size`` are
30
+ likewise inlined (guarded by Python ``int`` typing).
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import contextlib
36
+ import os
37
+ from pathlib import Path
38
+
39
+ import duckdb
40
+ from loguru import logger
41
+
42
+ from claude_sql.config import DEFAULT_PRICING, Settings
43
+ from claude_sql.parquet_shards import iter_part_files
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Glob constants
47
+ # ---------------------------------------------------------------------------
48
+
49
# All transcript globs hang off the per-user Claude Code projects directory;
# ``~`` is expanded once and the glob suffixes are appended to it.
_PROJECTS_ROOT: str = os.path.expanduser("~/.claude/projects")

# Primary session transcripts: one ``<session>.jsonl`` per project directory.
DEFAULT_GLOB: str = f"{_PROJECTS_ROOT}/*/*.jsonl"
# Subagent transcripts, nested under a per-session ``subagents/`` directory.
SUBAGENT_GLOB: str = f"{_PROJECTS_ROOT}/*/*/subagents/agent-*.jsonl"
# ``*.meta.json`` partner files that sit next to each subagent transcript.
SUBAGENT_META_GLOB: str = f"{_PROJECTS_ROOT}/*/*/subagents/agent-*.meta.json"
52
+
53
+ # Business-level views emitted by ``register_views``. Used by the
54
+ # ``claude-sql schema`` subcommand for schema dumps. Includes the v2
55
+ # analytics view names at the tail so ``describe_all`` can enumerate them
56
+ # once :func:`register_analytics` has populated the corresponding parquets.
57
+ VIEW_NAMES: tuple[str, ...] = (
58
+ "sessions",
59
+ "messages",
60
+ "content_blocks",
61
+ "messages_text",
62
+ "tool_calls",
63
+ "tool_results",
64
+ "todo_events",
65
+ "todo_state_current",
66
+ "subagent_spawns",
67
+ "task_creations",
68
+ "task_updates",
69
+ "tasks_state_current",
70
+ "task_spawns",
71
+ "skill_invocations",
72
+ "subagent_sessions",
73
+ "subagent_messages",
74
+ # v2 analytics views (materialize when the matching parquet exists).
75
+ "session_classifications",
76
+ "session_goals",
77
+ "message_trajectory",
78
+ "session_conflicts",
79
+ "message_clusters",
80
+ "cluster_terms",
81
+ "session_communities",
82
+ "user_friction",
83
+ "skills_catalog",
84
+ "skill_usage",
85
+ )
86
+
87
+ # Analytics-only view names -- the subset of :data:`VIEW_NAMES` backed by v2
88
+ # parquet outputs. Exported so callers (``claude-sql`` subcommands, smoke
89
+ # tests) can enumerate analytics views without needing to filter out the
90
+ # transcript-derived views.
91
+ ANALYTICS_VIEW_NAMES: tuple[str, ...] = (
92
+ "session_classifications",
93
+ "session_goals",
94
+ "message_trajectory",
95
+ "session_conflicts",
96
+ "message_clusters",
97
+ "cluster_terms",
98
+ "session_communities",
99
+ "user_friction",
100
+ "skills_catalog",
101
+ )
102
+
103
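+ # Macro names registered by :func:`register_macros`. The eight entries above
104
+ # the ``v2 analytics macros`` marker are the core macros: the first six are
105
+ # created unconditionally, while ``skill_rank`` and ``skill_source_mix`` go
106
+ # through :func:`_safe_macro`. The ten entries below the marker are the v2 analytics macros, each registered via :func:`_safe_macro` so a missing backing view downgrades to a warning instead of an exception.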
107
+ MACRO_NAMES: tuple[str, ...] = (
108
+ "model_used",
109
+ "cost_estimate",
110
+ "tool_rank",
111
+ "todo_velocity",
112
+ "subagent_fanout",
113
+ "semantic_search",
114
+ "skill_rank",
115
+ "skill_source_mix",
116
+ # v2 analytics macros
117
+ "autonomy_trend",
118
+ "work_mix",
119
+ "success_rate_by_work",
120
+ "cluster_top_terms",
121
+ "community_top_topics",
122
+ "sentiment_arc",
123
+ "friction_counts",
124
+ "friction_rate",
125
+ "friction_examples",
126
+ "unused_skills",
127
+ )
128
+
129
+
130
+ def _sql_str(value: str) -> str:
131
+ """Escape a Python string as a single-quoted SQL literal.
132
+
133
+ Parameters
134
+ ----------
135
+ value
136
+ Value to embed in a DDL statement.
137
+
138
+ Returns
139
+ -------
140
+ str
141
+ The value wrapped in single quotes with any embedded quotes doubled.
142
+ """
143
+ escaped = value.replace("'", "''")
144
+ return f"'{escaped}'"
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Raw readers
149
+ # ---------------------------------------------------------------------------
150
+
151
+
152
def register_raw(
    con: duckdb.DuckDBPyConnection,
    *,
    glob: str | None = None,
    subagent_glob: str | None = None,
    subagent_meta_glob: str | None = None,
    sample_size: int = -1,
    maximum_object_size: int = 67_108_864,
) -> None:
    """Register the three low-level reader views over the transcript corpus.

    Creates ``v_raw_events`` (primary session JSONL), ``v_raw_subagents``
    (subagent JSONL) and ``v_raw_subagent_meta`` (the ``*.meta.json`` partner
    files). Each view is a ``read_json`` scan with ``filename=true``; the
    file path is exposed as ``source_file`` and identifiers are derived from
    it with ``regexp_extract``.

    Parameters
    ----------
    con
        Open DuckDB connection.
    glob
        Glob for primary session transcripts. ``None`` selects
        :data:`DEFAULT_GLOB`.
    subagent_glob
        Glob for subagent transcripts. ``None`` selects :data:`SUBAGENT_GLOB`.
    subagent_meta_glob
        Glob for the sibling ``*.meta.json`` files. ``None`` selects
        :data:`SUBAGENT_META_GLOB`.
    sample_size
        ``read_json`` schema-inference sample size, inlined into the two
        JSONL views. The default is ``-1``.
    maximum_object_size
        ``read_json`` ``maximum_object_size`` option in bytes, inlined into
        the two JSONL views.

    Raises
    ------
    Exception
        Whatever the DDL execution raises. The failure is logged with
        ``logger.exception`` and then re-raised unchanged.
    """
    if glob is None:
        glob = DEFAULT_GLOB
    if subagent_glob is None:
        subagent_glob = SUBAGENT_GLOB
    if subagent_meta_glob is None:
        subagent_meta_glob = SUBAGENT_META_GLOB

    # The numeric options are spliced into the DDL text, so coerce them to
    # ``int`` first: nothing but a plain integer literal can reach the SQL.
    n_sample = int(sample_size)
    max_bytes = int(maximum_object_size)

    # Path patterns applied to ``filename``. Group 1 of the subagent patterns
    # is the 36-character parent session directory, group 2 the agent hex id.
    session_re = r"([^/]+)\.jsonl$"
    agent_re = r"/([0-9a-f-]{36})/subagents/agent-([a-f0-9]+)\.jsonl$"
    meta_re = r"/([0-9a-f-]{36})/subagents/agent-([a-f0-9]+)\.meta\.json$"

    try:
        events_ddl = f"""
            CREATE OR REPLACE VIEW v_raw_events AS
            SELECT *,
            filename AS source_file,
            regexp_extract(filename, '{session_re}', 1) AS session_id_file
            FROM read_json(
            {_sql_str(glob)},
            format='newline_delimited',
            union_by_name=true,
            filename=true,
            ignore_errors=true,
            sample_size={n_sample},
            maximum_object_size={max_bytes}
            );
            """
        con.execute(events_ddl)
        logger.debug(
            "Registered v_raw_events from glob {} with sample_size={}",
            glob,
            n_sample,
        )

        subagents_ddl = f"""
            CREATE OR REPLACE VIEW v_raw_subagents AS
            SELECT *,
            filename AS source_file,
            regexp_extract(
            filename,
            '{agent_re}',
            1
            ) AS parent_session_id,
            regexp_extract(
            filename,
            '{agent_re}',
            2
            ) AS agent_hex
            FROM read_json(
            {_sql_str(subagent_glob)},
            format='newline_delimited',
            union_by_name=true,
            filename=true,
            ignore_errors=true,
            sample_size={n_sample},
            maximum_object_size={max_bytes}
            );
            """
        con.execute(subagents_ddl)
        logger.debug("Registered v_raw_subagents from glob {}", subagent_glob)

        # meta.json files hold one object per file rather than NDJSON, hence
        # ``format='auto'`` and no sampling / object-size options.
        meta_ddl = f"""
            CREATE OR REPLACE VIEW v_raw_subagent_meta AS
            SELECT *,
            filename AS source_file,
            regexp_extract(
            filename,
            '{meta_re}',
            1
            ) AS parent_session_id,
            regexp_extract(
            filename,
            '{meta_re}',
            2
            ) AS agent_hex
            FROM read_json(
            {_sql_str(subagent_meta_glob)},
            format='auto',
            union_by_name=true,
            filename=true,
            ignore_errors=true
            );
            """
        con.execute(meta_ddl)
        logger.debug("Registered v_raw_subagent_meta from glob {}", subagent_meta_glob)
    except Exception:
        logger.exception("Failed to register raw views")
        raise
281
+
282
+
283
+ # ---------------------------------------------------------------------------
284
+ # Derived views
285
+ # ---------------------------------------------------------------------------
286
+
287
+
288
def register_views(con: duckdb.DuckDBPyConnection) -> None:
    """Create logical business-level views on top of the raw readers.

    Must be called after :func:`register_raw`. Creates, in order:
    ``sessions``, ``messages``, ``content_blocks``, ``messages_text``,
    ``tool_calls``, ``tool_results``, ``todo_events``, ``todo_state_current``,
    ``subagent_spawns``, ``task_creations``, ``task_updates``,
    ``tasks_state_current``, ``task_spawns`` (deprecated alias),
    ``skill_invocations``, ``subagent_sessions``, ``subagent_messages``.

    The split between ``subagent_spawns`` and ``task_creations`` reflects
    the Claude Code v2.1.63 ``Task``→``Agent`` rename and the v2.1.16
    (Jan 2026) split of interactive todo tracking from ``TodoWrite`` into
    the ``TaskCreate``/``TaskGet``/``TaskList``/``TaskUpdate`` family.
    Pre-2026 transcripts and Agent-SDK / ``--print`` runs still emit
    ``TodoWrite`` (covered by ``todo_events``).

    Parameters
    ----------
    con
        Open DuckDB connection with raw views already registered.

    Raises
    ------
    duckdb.Error
        If any view DDL fails. Logged via ``logger.exception`` before re-raise.
    """
    try:
        con.execute(
            """
            CREATE OR REPLACE VIEW sessions AS
            SELECT
            session_id_file AS session_id,
            any_value(cwd) AS cwd,
            any_value(gitBranch) AS git_branch,
            min(timestamp::TIMESTAMP) AS started_at,
            max(timestamp::TIMESTAMP) AS ended_at,
            count(*) FILTER (WHERE type = 'assistant') AS assistant_messages,
            count(*) AS record_count,
            any_value(source_file) AS transcript_path
            FROM v_raw_events
            WHERE sessionId IS NOT NULL
            GROUP BY session_id_file;
            """
        )
        logger.debug("Registered view: sessions")

        # ``message.content`` is inferred by ``read_json`` as JSON. ``to_json``
        # is defensive: if a future schema infers LIST, ``to_json`` normalizes
        # it back to a JSON-typed column that ``json_extract`` understands.
        con.execute(
            """
            CREATE OR REPLACE VIEW messages AS
            SELECT
            uuid,
            parentUuid AS parent_uuid,
            sessionId AS session_id,
            timestamp::TIMESTAMP AS ts,
            type,
            isSidechain AS is_sidechain,
            message.role AS role,
            message.model AS model,
            message.stop_reason AS stop_reason,
            message.usage.input_tokens AS input_tokens,
            message.usage.output_tokens AS output_tokens,
            message.usage.cache_read_input_tokens AS cache_read,
            message.usage.cache_creation_input_tokens AS cache_write,
            to_json(message.content) AS content_json,
            source_file
            FROM v_raw_events
            WHERE type IN ('user', 'assistant');
            """
        )
        logger.debug("Registered view: messages")

        con.execute(
            """
            CREATE OR REPLACE VIEW content_blocks AS
            SELECT
            m.session_id,
            m.uuid AS message_uuid,
            m.ts,
            m.role,
            json_extract_string(block, '$.type') AS block_type,
            json_extract_string(block, '$.text') AS text,
            json_extract_string(block, '$.id') AS tool_use_id_field,
            json_extract_string(block, '$.name') AS tool_name,
            json_extract(block, '$.input') AS tool_input,
            json_extract_string(block, '$.tool_use_id') AS tool_use_id,
            json_extract(block, '$.content') AS tool_result_content,
            json_extract_string(block, '$.thinking') AS thinking
            FROM messages m,
            UNNEST(json_extract(m.content_json, '$[*]')) AS t(block);
            """
        )
        logger.debug("Registered view: content_blocks")

        # One row per *message*, not per text block. Aggregating the text
        # blocks preserves enough context for useful embeddings; the per-block
        # fan-out made semantic search noisy (tiny fragments like
        # "Now run the tests" dominated results). Messages with no text blocks
        # (tool-use-only, tool-result-only) are omitted.
        con.execute(
            """
            CREATE OR REPLACE VIEW messages_text AS
            SELECT
            cb.message_uuid AS uuid,
            any_value(cb.session_id) AS session_id,
            any_value(cb.ts) AS ts,
            any_value(cb.role) AS role,
            string_agg(cb.text, '\n\n') AS text_content
            FROM content_blocks cb
            WHERE cb.block_type = 'text'
            AND cb.text IS NOT NULL
            AND length(cb.text) > 0
            GROUP BY cb.message_uuid
            HAVING length(string_agg(cb.text, '\n\n')) >= 32;
            """
        )
        logger.debug("Registered view: messages_text")

        con.execute(
            """
            CREATE OR REPLACE VIEW tool_calls AS
            SELECT
            cb.message_uuid,
            cb.session_id,
            cb.ts,
            cb.tool_name,
            cb.tool_use_id_field AS tool_use_id,
            cb.tool_input
            FROM content_blocks cb
            WHERE cb.block_type = 'tool_use';
            """
        )
        logger.debug("Registered view: tool_calls")

        con.execute(
            """
            CREATE OR REPLACE VIEW tool_results AS
            SELECT
            cb.message_uuid,
            cb.session_id,
            cb.ts,
            cb.tool_use_id,
            cb.tool_result_content AS content
            FROM content_blocks cb
            WHERE cb.block_type = 'tool_result';
            """
        )
        logger.debug("Registered view: tool_results")

        # DuckDB's ``UNNEST`` requires a LIST. ``json_extract(x, '$.todos')``
        # returns a JSON scalar (potentially an array) that UNNEST rejects.
        # The ``$.todos[*]`` wildcard path yields a ``JSON[]`` that UNNEST
        # accepts natively.
        con.execute(
            """
            CREATE OR REPLACE VIEW todo_events AS
            SELECT
            tc.session_id,
            tc.ts AS written_at,
            tc.message_uuid,
            json_extract_string(todo, '$.content') AS subject,
            json_extract_string(todo, '$.status') AS status,
            json_extract_string(todo, '$.activeForm') AS active_form,
            row_number() OVER (
            PARTITION BY tc.session_id
            ORDER BY tc.ts, tc.message_uuid
            ) AS snapshot_ix
            FROM tool_calls tc,
            UNNEST(json_extract(tc.tool_input, '$.todos[*]')) AS t(todo)
            WHERE tc.tool_name = 'TodoWrite';
            """
        )
        logger.debug("Registered view: todo_events")

        con.execute(
            """
            CREATE OR REPLACE VIEW todo_state_current AS
            SELECT session_id, subject, status, active_form, written_at
            FROM (
            SELECT *,
            row_number() OVER (
            PARTITION BY session_id, subject
            ORDER BY snapshot_ix DESC
            ) AS rn
            FROM todo_events
            )
            WHERE rn = 1;
            """
        )
        logger.debug("Registered view: todo_state_current")

        # Subagent launchers: ``Task`` (pre-v2.1.63) and ``Agent`` (v2.1.63+).
        # Input shape: {subagent_type, description, prompt, run_in_background?}.
        con.execute(
            """
            CREATE OR REPLACE VIEW subagent_spawns AS
            SELECT
            session_id,
            ts AS spawned_at,
            message_uuid,
            tool_use_id,
            tool_name AS spawn_tool,
            json_extract_string(tool_input, '$.subagent_type') AS subagent_type,
            json_extract_string(tool_input, '$.description') AS description,
            json_extract_string(tool_input, '$.prompt') AS prompt,
            json_extract_string(tool_input, '$.run_in_background') AS run_in_background
            FROM tool_calls
            WHERE tool_name IN ('Task', 'Agent');
            """
        )
        logger.debug("Registered view: subagent_spawns")

        # Persistent task creation: ``TaskCreate`` (Claude Code v2.1.16+
        # interactive sessions) and the SDK-py mirror ``mcp__tasks__task_create``.
        # Input shape: {subject, description, activeForm?, metadata?}. Distinct
        # from subagent_spawns -- no subagent_type / prompt fields.
        con.execute(
            """
            CREATE OR REPLACE VIEW task_creations AS
            SELECT
            session_id,
            ts AS created_at,
            message_uuid,
            tool_use_id,
            tool_name AS create_tool,
            json_extract_string(tool_input, '$.subject') AS subject,
            json_extract_string(tool_input, '$.description') AS description,
            json_extract_string(tool_input, '$.activeForm') AS active_form,
            json_extract(tool_input, '$.metadata') AS metadata
            FROM tool_calls
            WHERE tool_name IN ('TaskCreate', 'mcp__tasks__task_create');
            """
        )
        logger.debug("Registered view: task_creations")

        # Task lifecycle updates: ``TaskUpdate`` (v2.1.16+) and the SDK-py
        # mirror ``mcp__tasks__task_update``. Native uses ``taskId`` (camel),
        # mcp variant uses ``id`` -- COALESCE to one column.
        con.execute(
            """
            CREATE OR REPLACE VIEW task_updates AS
            SELECT
            session_id,
            ts AS updated_at,
            message_uuid,
            tool_use_id,
            tool_name AS update_tool,
            COALESCE(
            json_extract_string(tool_input, '$.taskId'),
            json_extract_string(tool_input, '$.id')
            ) AS task_id,
            json_extract_string(tool_input, '$.status') AS status,
            json_extract(tool_input, '$.addBlockedBy') AS add_blocked_by,
            json_extract_string(tool_input, '$.owner') AS owner
            FROM tool_calls
            WHERE tool_name IN ('TaskUpdate', 'mcp__tasks__task_update');
            """
        )
        logger.debug("Registered view: task_updates")

        # Latest status per (session_id, task_id) by joining task_creations
        # to task_updates. The task_id on TaskCreate isn't carried in the
        # tool_input (the runtime assigns it), so we recover it from the
        # tool_result -- and fall back to row-position when the result is
        # missing. Mirrors ``todo_state_current`` for the v2.1.16+ family.
        #
        # ``regexp_extract`` yields '' (not NULL) when the pattern does not
        # match, so it is wrapped in NULLIF; otherwise COALESCE would stop at
        # the empty string and never reach the JSON / row-position fallbacks.
        con.execute(
            """
            CREATE OR REPLACE VIEW tasks_state_current AS
            WITH creates AS (
            SELECT
            tc.session_id,
            tc.created_at,
            tc.subject,
            tc.active_form,
            tc.tool_use_id,
            -- The runtime returns the assigned task id in tool_result.
            -- Common shape: text content like "Task #N created..." or
            -- a JSON {taskId: "N"}. Try both; fall back to per-session
            -- creation order.
            COALESCE(
            NULLIF(
            regexp_extract(
            CAST(tr.content AS VARCHAR), 'Task #(\\d+)', 1
            ),
            ''
            ),
            json_extract_string(tr.content, '$.taskId'),
            CAST(row_number() OVER (
            PARTITION BY tc.session_id ORDER BY tc.created_at
            ) AS VARCHAR)
            ) AS task_id
            FROM task_creations tc
            LEFT JOIN tool_results tr USING (tool_use_id)
            ),
            latest_status AS (
            SELECT session_id, task_id, status, updated_at,
            row_number() OVER (
            PARTITION BY session_id, task_id
            ORDER BY updated_at DESC
            ) AS rn
            FROM task_updates
            WHERE task_id IS NOT NULL
            )
            SELECT
            c.session_id,
            c.task_id,
            c.subject,
            c.active_form,
            COALESCE(ls.status, 'pending') AS status,
            c.created_at,
            ls.updated_at AS last_updated_at
            FROM creates c
            LEFT JOIN latest_status ls
            ON ls.session_id = c.session_id
            AND ls.task_id = c.task_id
            AND ls.rn = 1;
            """
        )
        logger.debug("Registered view: tasks_state_current")

        # DEPRECATED: ``task_spawns`` predates the Task→Agent rename (v2.1.63)
        # and the TodoWrite→TaskCreate split (v2.1.16). It conflated subagent
        # launchers with task-tracker creation. Kept as a UNION ALL alias for
        # one release; new analytics should use ``subagent_spawns`` or
        # ``task_creations`` directly. Removed in the next minor release.
        con.execute(
            """
            CREATE OR REPLACE VIEW task_spawns AS
            SELECT
            session_id, spawned_at, message_uuid, tool_use_id,
            spawn_tool, subagent_type, description, prompt
            FROM subagent_spawns
            UNION ALL
            SELECT
            session_id, created_at AS spawned_at, message_uuid, tool_use_id,
            create_tool AS spawn_tool,
            NULL AS subagent_type,
            description,
            NULL AS prompt
            FROM task_creations;
            """
        )
        logger.debug("Registered view: task_spawns (deprecated)")

        # Every Skill / slash-command invocation observable in the transcripts,
        # unioned across the two shapes they take:
        #
        # * ``tool`` — the assistant invokes the built-in ``Skill`` tool with
        #   ``tool_input.skill = '<name>'``. Lives in ``tool_calls`` already.
        # * ``slash_command`` — the user types ``/<name>`` in chat, which
        #   Claude Code serializes into the text block as
        #   ``<command-name>/<name></command-name>`` (sometimes paired with
        #   ``<command-message>`` and ``<command-args>``).
        #
        # ``skill_id`` stays raw (``erpaval`` and ``personal-plugins:erpaval``
        # are distinct rows) — the ``skills_catalog`` seed emits both shapes
        # so the enriched ``skill_usage`` view joins cleanly either way.
        # ``<command-name>/<name></command-name>`` slash-command text lands in
        # two shapes across the corpus: inside a ``text`` block of a
        # list-typed ``content`` array (newer transcripts), and as a bare
        # VARCHAR ``message.content`` (older user turns). We scan both
        # so the slash-command surface isn't biased toward one era.
        cmd_name_re = "<command-name>/([A-Za-z0-9_:.-]+)</command-name>"
        args_re = "<command-args>([^<]*)</command-args>"
        con.execute(
            f"""
            CREATE OR REPLACE VIEW skill_invocations AS
            SELECT
            tc.session_id,
            tc.ts,
            tc.message_uuid,
            'tool' AS source,
            json_extract_string(tc.tool_input, '$.skill') AS skill_id,
            json_extract_string(tc.tool_input, '$.args') AS args,
            tc.tool_use_id
            FROM tool_calls tc
            WHERE tc.tool_name = 'Skill'
            AND json_extract_string(tc.tool_input, '$.skill') IS NOT NULL
            UNION ALL
            SELECT
            cb.session_id,
            cb.ts,
            cb.message_uuid,
            'slash_command' AS source,
            regexp_extract(cb.text, '{cmd_name_re}', 1) AS skill_id,
            NULLIF(regexp_extract(cb.text, '{args_re}', 1), '') AS args,
            NULL AS tool_use_id
            FROM content_blocks cb
            WHERE cb.role = 'user'
            AND cb.block_type = 'text'
            AND cb.text LIKE '%<command-name>/%'
            AND regexp_extract(cb.text, '{cmd_name_re}', 1) != ''
            UNION ALL
            SELECT
            m.session_id,
            m.ts,
            m.uuid AS message_uuid,
            'slash_command' AS source,
            regexp_extract(raw.txt, '{cmd_name_re}', 1) AS skill_id,
            NULLIF(regexp_extract(raw.txt, '{args_re}', 1), '') AS args,
            NULL AS tool_use_id
            FROM messages m,
            LATERAL (SELECT json_extract_string(m.content_json, '$') AS txt) raw
            WHERE m.role = 'user'
            AND json_type(m.content_json) = 'VARCHAR'
            AND raw.txt LIKE '%<command-name>/%'
            AND regexp_extract(raw.txt, '{cmd_name_re}', 1) != '';
            """
        )
        logger.debug("Registered view: skill_invocations")

        con.execute(
            """
            CREATE OR REPLACE VIEW subagent_sessions AS
            SELECT
            r.parent_session_id,
            r.agent_hex,
            any_value(m.agentType) AS agent_type,
            any_value(m.description) AS description,
            min(r.timestamp::TIMESTAMP) AS started_at,
            max(r.timestamp::TIMESTAMP) AS ended_at,
            count(*) AS message_count,
            any_value(r.source_file) AS transcript_path
            FROM v_raw_subagents r
            LEFT JOIN v_raw_subagent_meta m
            ON m.parent_session_id = r.parent_session_id
            AND m.agent_hex = r.agent_hex
            GROUP BY r.parent_session_id, r.agent_hex;
            """
        )
        logger.debug("Registered view: subagent_sessions")

        con.execute(
            """
            CREATE OR REPLACE VIEW subagent_messages AS
            SELECT
            uuid,
            parentUuid AS parent_uuid,
            sessionId AS session_id,
            parent_session_id,
            agent_hex,
            timestamp::TIMESTAMP AS ts,
            type,
            message.role AS role,
            message.model AS model,
            message.usage.input_tokens AS input_tokens,
            message.usage.output_tokens AS output_tokens,
            to_json(message.content) AS content_json,
            source_file
            FROM v_raw_subagents
            WHERE type IN ('user', 'assistant');
            """
        )
        logger.debug("Registered view: subagent_messages")
    except Exception:
        logger.exception("Failed to register derived views")
        raise
745
+
746
+
747
+ # ---------------------------------------------------------------------------
748
+ # Macros
749
+ # ---------------------------------------------------------------------------
750
+
751
+
752
+ def _pricing_values_clause(pricing: dict[str, tuple[float, float]]) -> str:
753
+ """Render a pricing dict as an inline SQL ``VALUES`` row list.
754
+
755
+ Parameters
756
+ ----------
757
+ pricing
758
+ Mapping of ``model_name -> (input_rate, output_rate)`` per 1M tokens.
759
+
760
+ Returns
761
+ -------
762
+ str
763
+ Comma-separated ``('model', in, out)`` rows. Emits a sentinel row that
764
+ matches no real model if ``pricing`` is empty (DuckDB rejects empty
765
+ ``VALUES`` lists).
766
+ """
767
+ if not pricing:
768
+ return "('__no_pricing__', 0.0, 0.0)"
769
+ rows = [
770
+ f"('{model}', {in_rate}, {out_rate})"
771
+ for model, (in_rate, out_rate) in sorted(pricing.items())
772
+ ]
773
+ return ", ".join(rows)
774
+
775
+
776
def _safe_macro(con: duckdb.DuckDBPyConnection, name: str, ddl: str) -> None:
    """Run a macro DDL statement, turning a DuckDB failure into a warning.

    Analytics macros reference views (``session_classifications``,
    ``cluster_terms``, etc.) that only exist once the corresponding parquet
    has been produced. Catching ``duckdb.Error`` here lets
    :func:`register_macros` run on a fresh install: the macro is simply not
    created and a ``logger.warning`` records the skip together with the
    underlying error.

    Parameters
    ----------
    con
        Open DuckDB connection.
    name
        Macro name, used only for log messages.
    ddl
        Complete ``CREATE OR REPLACE MACRO`` statement.
    """
    try:
        con.execute(ddl)
    except duckdb.Error as exc:
        # Best-effort by design: report the skip and keep going.
        logger.warning("Skipped macro {} (backing view missing): {}", name, exc)
    else:
        logger.debug("Registered analytics macro: {}", name)
801
+
802
+
803
+ def register_macros(
804
+ con: duckdb.DuckDBPyConnection,
805
+ settings: Settings | None = None,
806
+ ) -> None:
807
+ """Create SQL macros used by the CLI and analysts.
808
+
809
+ v1 macros (always created): ``model_used``, ``cost_estimate``,
810
+ ``tool_rank``, ``todo_velocity``, ``subagent_fanout``, ``semantic_search``.
811
+
812
+ v2 analytics macros (created via :func:`_safe_macro`, skipped when their
813
+ backing analytics view is missing): ``autonomy_trend``, ``work_mix``,
814
+ ``success_rate_by_work``, ``cluster_top_terms``, ``community_top_topics``,
815
+ ``sentiment_arc``.
816
+
817
+ ``semantic_search(query_vec, k)`` is a table macro that returns the top-k
818
+ uuids by cosine distance to ``query_vec`` using the HNSW index.
819
+ ``query_vec`` must be ``FLOAT[<dim>]`` matching the ``message_embeddings``
820
+ column type.
821
+
822
+ Parameters
823
+ ----------
824
+ con
825
+ Open DuckDB connection with views (and the ``message_embeddings``
826
+ table from :func:`register_vss`) already registered. Analytics views
827
+ should be registered first (via :func:`register_analytics`) so the
828
+ analytics macros bind successfully; if they're not, those macros are
829
+ skipped with a warning.
830
+ settings
831
+ Optional :class:`Settings` for pricing overrides; falls back to
832
+ :data:`claude_sql.config.DEFAULT_PRICING`.
833
+ """
834
+ pricing = settings.pricing if settings is not None else DEFAULT_PRICING
835
+ pricing_rows = _pricing_values_clause(pricing)
836
+
837
+ con.execute(
838
+ """
839
+ CREATE OR REPLACE MACRO model_used(sid) AS (
840
+ SELECT any_value(model)
841
+ FROM messages
842
+ WHERE session_id = sid AND model IS NOT NULL
843
+ );
844
+ """
845
+ )
846
+
847
+ # Pricing join uses a prefix match so dated model IDs like
848
+ # ``claude-haiku-4-5-20251001`` still resolve to the base entry
849
+ # ``claude-haiku-4-5`` in ``DEFAULT_PRICING``.
850
+ con.execute(
851
+ f"""
852
+ CREATE OR REPLACE MACRO cost_estimate(sid) AS (
853
+ SELECT sum(
854
+ (coalesce(m.input_tokens, 0) + coalesce(m.cache_write, 0)) * p.in_rate
855
+ + coalesce(m.output_tokens, 0) * p.out_rate
856
+ ) / 1e6
857
+ FROM messages m
858
+ JOIN (VALUES {pricing_rows}) p(model, in_rate, out_rate)
859
+ ON regexp_replace(m.model, '-\\d{{8}}$', '') = p.model
860
+ WHERE m.session_id = sid
861
+ );
862
+ """
863
+ )
864
+
865
+ con.execute(
866
+ """
867
+ CREATE OR REPLACE MACRO tool_rank(last_n_days) AS TABLE (
868
+ SELECT tool_name, count(*) AS n
869
+ FROM tool_calls
870
+ WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
871
+ AND tool_name IS NOT NULL
872
+ GROUP BY 1
873
+ ORDER BY n DESC
874
+ );
875
+ """
876
+ )
877
+
878
+ con.execute(
879
+ """
880
+ CREATE OR REPLACE MACRO todo_velocity(sid) AS (
881
+ SELECT count(*) FILTER (WHERE status = 'completed')::DOUBLE
882
+ / NULLIF(count(DISTINCT subject), 0)
883
+ FROM todo_state_current
884
+ WHERE session_id = sid
885
+ );
886
+ """
887
+ )
888
+
889
+ con.execute(
890
+ """
891
+ CREATE OR REPLACE MACRO subagent_fanout(sid) AS (
892
+ SELECT count(*)
893
+ FROM subagent_sessions
894
+ WHERE parent_session_id = sid
895
+ );
896
+ """
897
+ )
898
+
899
+ # ``ORDER BY array_distance`` triggers the HNSW index rewrite; cosine
900
+ # similarity and distance are both surfaced for human-readable ranking.
901
+ con.execute(
902
+ """
903
+ CREATE OR REPLACE MACRO semantic_search(query_vec, k) AS TABLE (
904
+ SELECT me.uuid,
905
+ array_cosine_similarity(me.embedding, query_vec) AS sim,
906
+ array_distance(me.embedding, query_vec) AS distance
907
+ FROM message_embeddings me
908
+ ORDER BY array_distance(me.embedding, query_vec)
909
+ LIMIT k
910
+ );
911
+ """
912
+ )
913
+
914
+ # Skill / slash-command leaderboard over the last N days. Resolves
915
+ # against ``skill_usage``, which always exists (with or without the
916
+ # catalog), so this macro is safe to register unconditionally.
917
+ _safe_macro(
918
+ con,
919
+ "skill_rank",
920
+ """
921
+ CREATE OR REPLACE MACRO skill_rank(last_n_days) AS TABLE (
922
+ SELECT skill_id,
923
+ skill_name,
924
+ plugin,
925
+ is_builtin,
926
+ count(*) AS n,
927
+ count(DISTINCT session_id) AS sessions
928
+ FROM skill_usage
929
+ WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
930
+ GROUP BY 1, 2, 3, 4
931
+ ORDER BY n DESC
932
+ );
933
+ """,
934
+ )
935
+
936
+ # How is each skill invoked? ``n_tool`` comes from the ``Skill`` tool,
937
+ # ``n_slash`` from user-typed ``/<name>`` in chat. Built-ins are
938
+ # excluded because they're almost always slash-only and would drown
939
+ # everything else out.
940
+ _safe_macro(
941
+ con,
942
+ "skill_source_mix",
943
+ """
944
+ CREATE OR REPLACE MACRO skill_source_mix(last_n_days) AS TABLE (
945
+ SELECT skill_id,
946
+ skill_name,
947
+ count(*) FILTER (WHERE source = 'tool') AS n_tool,
948
+ count(*) FILTER (WHERE source = 'slash_command') AS n_slash,
949
+ count(*) AS n_total
950
+ FROM skill_usage
951
+ WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
952
+ AND NOT is_builtin
953
+ GROUP BY 1, 2
954
+ ORDER BY n_total DESC
955
+ );
956
+ """,
957
+ )
958
+
959
+ logger.debug(
960
+ "Registered macros: model_used, cost_estimate, tool_rank, "
961
+ "todo_velocity, subagent_fanout, semantic_search, skill_rank, "
962
+ "skill_source_mix"
963
+ )
964
+
965
+ # ------------------------------------------------------------------
966
+ # v2 analytics macros -- each wrapped in _safe_macro so a missing
967
+ # backing view (pre-``claude-sql classify`` run) is a warning, not an
968
+ # exception.
969
+ # ------------------------------------------------------------------
970
+
971
+ # Time series: autonomy tier mix over rolling windows.
972
+ _safe_macro(
973
+ con,
974
+ "autonomy_trend",
975
+ """
976
+ CREATE OR REPLACE MACRO autonomy_trend(window_days) AS TABLE (
977
+ SELECT
978
+ date_trunc('week', classified_at) AS week,
979
+ autonomy_tier,
980
+ count(*) AS n
981
+ FROM session_classifications
982
+ WHERE classified_at >= current_timestamp - (window_days * INTERVAL 1 DAY)
983
+ GROUP BY 1, 2
984
+ ORDER BY 1, 2
985
+ );
986
+ """,
987
+ )
988
+
989
+ # Work-category mix in the last N days.
990
+ _safe_macro(
991
+ con,
992
+ "work_mix",
993
+ """
994
+ CREATE OR REPLACE MACRO work_mix(since_days) AS TABLE (
995
+ SELECT work_category, count(*) AS n
996
+ FROM session_classifications
997
+ WHERE classified_at >= current_timestamp - (since_days * INTERVAL 1 DAY)
998
+ GROUP BY 1
999
+ ORDER BY n DESC
1000
+ );
1001
+ """,
1002
+ )
1003
+
1004
+ # Success / failure / partial rate broken down by work category.
1005
+ _safe_macro(
1006
+ con,
1007
+ "success_rate_by_work",
1008
+ """
1009
+ CREATE OR REPLACE MACRO success_rate_by_work(since_days) AS TABLE (
1010
+ SELECT
1011
+ work_category,
1012
+ count(*) AS sessions,
1013
+ count(*) FILTER (WHERE success = 'success')::DOUBLE
1014
+ / NULLIF(count(*), 0) AS success_rate,
1015
+ count(*) FILTER (WHERE success = 'failure')::DOUBLE
1016
+ / NULLIF(count(*), 0) AS failure_rate,
1017
+ count(*) FILTER (WHERE success = 'partial')::DOUBLE
1018
+ / NULLIF(count(*), 0) AS partial_rate
1019
+ FROM session_classifications
1020
+ WHERE classified_at >= current_timestamp - (since_days * INTERVAL 1 DAY)
1021
+ GROUP BY 1
1022
+ ORDER BY sessions DESC
1023
+ );
1024
+ """,
1025
+ )
1026
+
1027
+ # Top-N TF-IDF terms for a single cluster.
1028
+ _safe_macro(
1029
+ con,
1030
+ "cluster_top_terms",
1031
+ """
1032
+ CREATE OR REPLACE MACRO cluster_top_terms(cid, n) AS TABLE (
1033
+ SELECT term, weight, rank
1034
+ FROM cluster_terms
1035
+ WHERE cluster_id = cid
1036
+ ORDER BY rank
1037
+ LIMIT n
1038
+ );
1039
+ """,
1040
+ )
1041
+
1042
+ # Top cluster_ids within a given community, ranked by the number of
1043
+ # messages each cluster contributes to the community. Each row carries
1044
+ # its top 5 TF-IDF terms for human-readable context.
1045
+ _safe_macro(
1046
+ con,
1047
+ "community_top_topics",
1048
+ """
1049
+ CREATE OR REPLACE MACRO community_top_topics(cid, n) AS TABLE (
1050
+ WITH community_msgs AS (
1051
+ SELECT CAST(m.uuid AS VARCHAR) AS uuid
1052
+ FROM messages m
1053
+ JOIN session_communities sc
1054
+ ON CAST(m.session_id AS VARCHAR) = sc.session_id
1055
+ WHERE sc.community_id = cid
1056
+ ),
1057
+ cluster_counts AS (
1058
+ SELECT mc.cluster_id, count(*) AS n_msgs
1059
+ FROM message_clusters mc
1060
+ JOIN community_msgs cm USING (uuid)
1061
+ WHERE mc.cluster_id >= 0
1062
+ GROUP BY mc.cluster_id
1063
+ )
1064
+ SELECT cc.cluster_id, cc.n_msgs,
1065
+ (SELECT string_agg(term, ', ' ORDER BY rank)
1066
+ FROM cluster_terms ct
1067
+ WHERE ct.cluster_id = cc.cluster_id
1068
+ AND ct.rank <= 5) AS top_terms
1069
+ FROM cluster_counts cc
1070
+ ORDER BY n_msgs DESC
1071
+ LIMIT n
1072
+ );
1073
+ """,
1074
+ )
1075
+
1076
+ # Sentiment arc for a single session: per-message (ts, role, delta,
1077
+ # transition flag, confidence) in chronological order.
1078
+ _safe_macro(
1079
+ con,
1080
+ "sentiment_arc",
1081
+ """
1082
+ CREATE OR REPLACE MACRO sentiment_arc(sid) AS TABLE (
1083
+ SELECT m.ts, m.role, mt.sentiment_delta, mt.is_transition, mt.confidence
1084
+ FROM messages m
1085
+ JOIN message_trajectory mt
1086
+ ON CAST(m.uuid AS VARCHAR) = mt.uuid
1087
+ WHERE CAST(m.session_id AS VARCHAR) = sid
1088
+ ORDER BY m.ts
1089
+ );
1090
+ """,
1091
+ )
1092
+
1093
+ # Counts per friction label, scoped to the last N days by message ``ts``
1094
+ # (the user's actual utterance time, not detected_at). Pass ``NULL`` to
1095
+ # include the full corpus. Excludes label='none' because that is the
1096
+ # majority sentinel class and would swamp the output.
1097
+ _safe_macro(
1098
+ con,
1099
+ "friction_counts",
1100
+ """
1101
+ CREATE OR REPLACE MACRO friction_counts(since_days) AS TABLE (
1102
+ SELECT label,
1103
+ count(*) AS n,
1104
+ count(DISTINCT session_id) AS sessions,
1105
+ avg(confidence) AS avg_confidence,
1106
+ sum(CASE WHEN source='regex' THEN 1 ELSE 0 END) AS n_regex,
1107
+ sum(CASE WHEN source='llm' THEN 1 ELSE 0 END) AS n_llm
1108
+ FROM user_friction
1109
+ WHERE label != 'none'
1110
+ AND (since_days IS NULL
1111
+ OR ts >= current_timestamp - (since_days * INTERVAL 1 DAY))
1112
+ GROUP BY label
1113
+ ORDER BY n DESC
1114
+ );
1115
+ """,
1116
+ )
1117
+
1118
+ # Per-session friction pressure: how many non-'none' friction messages
1119
+ # fired vs the total user message count. A high rate is a strong proxy
1120
+ # for a session where the agent repeatedly fell short of what the user
1121
+ # expected.
1122
+ _safe_macro(
1123
+ con,
1124
+ "friction_rate",
1125
+ """
1126
+ CREATE OR REPLACE MACRO friction_rate(since_days) AS TABLE (
1127
+ WITH hits AS (
1128
+ SELECT session_id,
1129
+ count(*) FILTER (WHERE label != 'none') AS n_friction,
1130
+ count(*) FILTER (WHERE label = 'status_ping') AS n_status,
1131
+ count(*) FILTER (WHERE label = 'unmet_expectation') AS n_unmet,
1132
+ count(*) FILTER (WHERE label = 'confusion') AS n_confusion,
1133
+ count(*) FILTER (WHERE label = 'interruption') AS n_interruption,
1134
+ count(*) FILTER (WHERE label = 'correction') AS n_correction,
1135
+ count(*) FILTER (WHERE label = 'frustration') AS n_frustration
1136
+ FROM user_friction
1137
+ WHERE since_days IS NULL
1138
+ OR ts >= current_timestamp - (since_days * INTERVAL 1 DAY)
1139
+ GROUP BY session_id
1140
+ ),
1141
+ user_msgs AS (
1142
+ SELECT CAST(mt.session_id AS VARCHAR) AS session_id,
1143
+ count(*) AS n_user_msgs
1144
+ FROM messages_text mt
1145
+ WHERE mt.role = 'user'
1146
+ AND (since_days IS NULL
1147
+ OR mt.ts >= current_timestamp - (since_days * INTERVAL 1 DAY))
1148
+ GROUP BY 1
1149
+ )
1150
+ SELECT h.session_id,
1151
+ h.n_friction,
1152
+ h.n_status, h.n_unmet, h.n_confusion,
1153
+ h.n_interruption, h.n_correction, h.n_frustration,
1154
+ COALESCE(um.n_user_msgs, 0) AS n_user_msgs,
1155
+ h.n_friction::DOUBLE / NULLIF(um.n_user_msgs, 0) AS rate
1156
+ FROM hits h
1157
+ LEFT JOIN user_msgs um USING (session_id)
1158
+ WHERE h.n_friction > 0
1159
+ ORDER BY h.n_friction DESC
1160
+ );
1161
+ """,
1162
+ )
1163
+
1164
+ # Top-N example user messages for a given friction label, highest
1165
+ # confidence first. ``label_name`` is a VARCHAR so DuckDB callers
1166
+ # don't have to quote-escape through the macro boundary.
1167
+ _safe_macro(
1168
+ con,
1169
+ "friction_examples",
1170
+ """
1171
+ CREATE OR REPLACE MACRO friction_examples(label_name, n) AS TABLE (
1172
+ SELECT session_id, ts, text_snippet, rationale, source, confidence
1173
+ FROM user_friction
1174
+ WHERE label = label_name
1175
+ ORDER BY confidence DESC, ts DESC
1176
+ LIMIT n
1177
+ );
1178
+ """,
1179
+ )
1180
+
1181
+ # Catalog entries the user has NOT invoked in the last N days. Pure
1182
+ # catalog lookup; ``skills_catalog`` may be missing pre-sync, so this
1183
+ # is wrapped in ``_safe_macro`` and skipped cleanly in that case.
1184
+ # ``source_kind`` filter keeps out the 'builtin' rows (users don't
1185
+ # install or uninstall ``/clear``).
1186
+ _safe_macro(
1187
+ con,
1188
+ "unused_skills",
1189
+ """
1190
+ CREATE OR REPLACE MACRO unused_skills(last_n_days) AS TABLE (
1191
+ SELECT cat.skill_id,
1192
+ cat.name,
1193
+ cat.plugin,
1194
+ cat.plugin_version,
1195
+ cat.source_kind,
1196
+ cat.description
1197
+ FROM skills_catalog cat
1198
+ LEFT JOIN (
1199
+ SELECT DISTINCT skill_id
1200
+ FROM skill_invocations
1201
+ WHERE ts >= current_timestamp - (last_n_days * INTERVAL 1 DAY)
1202
+ ) used USING (skill_id)
1203
+ WHERE used.skill_id IS NULL
1204
+ AND cat.source_kind IN ('user-skill', 'plugin-skill', 'plugin-command')
1205
+ ORDER BY cat.plugin NULLS FIRST, cat.name
1206
+ );
1207
+ """,
1208
+ )
1209
+
1210
+
1211
+ # ---------------------------------------------------------------------------
1212
+ # VSS
1213
+ # ---------------------------------------------------------------------------
1214
+
1215
+
1216
def _hnsw_rebuild_needed(parquet: Path, hnsw_db: Path) -> bool:
    """Report whether the embeddings parquet is newer than the persisted HNSW store.

    Only filesystem metadata is consulted:

    * a missing ``hnsw_db`` always asks for a build;
    * when ``iter_part_files(parquet)`` yields nothing there is no source to
      rebuild from, so whatever was persisted earlier is left alone;
    * otherwise the newest part file's mtime is compared with the store's
      mtime and a strictly newer part asks for a rebuild.

    Comparing against the newest *part file* (rather than the directory)
    covers sharded layouts on filesystems that only bump a directory's mtime
    when entries are added or removed, not when a child is touched.

    A ``False`` answer is necessary but not sufficient for reuse: DuckDB's
    ``ATTACH`` on a missing path creates a small header-only file before any
    table exists, so ``register_vss`` additionally checks the catalog after
    attaching.
    """
    if hnsw_db.exists():
        shard_files = iter_part_files(parquet)
        if shard_files:
            newest_shard_ns = max(shard.stat().st_mtime_ns for shard in shard_files)
            store_ns = hnsw_db.stat().st_mtime_ns
            return newest_shard_ns > store_ns
        # Nothing on disk to rebuild from; keep the previously persisted store.
        return False
    return True
1240
+
1241
+
1242
def _attached_embeddings_table_present(con: duckdb.DuckDBPyConnection) -> bool:
    """Check the catalog for ``hnsw_store.main.message_embeddings``.

    Queries ``duckdb_tables()`` for a table named ``message_embeddings`` in
    schema ``main`` of the database attached as ``hnsw_store`` and returns
    ``True`` when at least one such entry is listed.
    """
    catalog_probe = """
        SELECT count(*)
        FROM duckdb_tables()
        WHERE database_name = 'hnsw_store'
          AND schema_name = 'main'
          AND table_name = 'message_embeddings';
    """
    match = con.execute(catalog_probe).fetchone()
    if not match:
        return False
    return bool(match[0])
1254
+
1255
+
1256
def register_vss(
    con: duckdb.DuckDBPyConnection,
    *,
    embeddings_parquet: Path,
    hnsw_db_path: Path | None = None,
    dim: int = 1024,
    metric: str = "cosine",
    ef_construction: int = 128,
    ef_search: int = 64,
    m: int = 16,
    m0: int = 32,
) -> bool:
    """Install + load VSS and bind ``message_embeddings`` over a persisted HNSW store.

    When ``hnsw_db_path`` is provided the embeddings table and its HNSW
    index live inside that DuckDB file (ATTACHed under the alias
    ``hnsw_store``) so reopening a CLI command reuses the index instead of
    rebuilding it from parquet. The store is rebuilt when the file was
    missing, when the newest embeddings part file is newer than the store
    file, or when the attached store has no ``message_embeddings`` table in
    its catalog. An ``IOException`` during attach unlinks the store file and
    attaches again, which forces a rebuild.

    When ``hnsw_db_path`` is ``None`` (legacy / tests) the table and index
    stay in the connection's main database and are rebuilt on every call.

    Parameters
    ----------
    con
        Open DuckDB connection.
    embeddings_parquet
        Path to the embeddings parquet produced by ``claude-sql embed``
        (resolved to one or more part files via ``iter_part_files``).
    hnsw_db_path
        Persistent DuckDB file that backs the HNSW index, or ``None`` to
        keep everything in the connection's main database.
    dim
        Fixed-length embedding dimension. Must match the parquet's
        ``embedding`` column. Defaults to 1024 (Cohere Embed v4 mid-tier).
    metric
        HNSW distance metric. One of ``cosine``, ``l2sq``, ``ip``.
    ef_construction, ef_search, m, m0
        Standard HNSW tuning knobs. ``m`` and ``m0`` map to DuckDB's ``M``
        and ``M0`` parameters.

    Returns
    -------
    bool
        ``True`` if embeddings part files were found and the table + index
        were built or reused; ``False`` if no part file exists yet. In the
        ``False`` case ``message_embeddings`` is (re)created as an empty
        table so dependents such as ``semantic_search`` can still bind.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported values.

    Notes
    -----
    VSS only supports ``FLOAT`` element type. Embeddings persisted as
    ``DOUBLE[]`` are cast via ``CAST(embedding AS FLOAT[<dim>])``.
    Persistence rides on the experimental
    ``hnsw_enable_experimental_persistence`` flag — when corruption
    surfaces, ``rm`` the file and the next call rebuilds from parquet.
    A reused store is not checked against the current ``dim`` / ``metric``
    / tuning arguments; delete the file after changing them.
    """
    dim_i = int(dim)
    ef_c_i = int(ef_construction)
    ef_s_i = int(ef_search)
    m_i = int(m)
    m0_i = int(m0)
    if metric not in {"cosine", "l2sq", "ip"}:
        raise ValueError(f"Unsupported HNSW metric: {metric!r}")

    con.execute("INSTALL vss;")
    con.execute("LOAD vss;")
    con.execute("SET hnsw_enable_experimental_persistence = true;")

    use_persistence = hnsw_db_path is not None
    schema_qualifier = ""
    if hnsw_db_path is not None:
        hnsw_db_path.parent.mkdir(parents=True, exist_ok=True)
        # Escape the path with the same helper used for parquet paths below:
        # DDL takes no prepared parameters and a raw path containing a single
        # quote would break the statement. ``IF NOT EXISTS`` keeps the call
        # re-entrant when ``hnsw_store`` is already attached on this
        # connection (e.g. a second ``register_all``).
        attach_sql = f"ATTACH IF NOT EXISTS {_sql_str(str(hnsw_db_path))} AS hnsw_store;"
        try:
            con.execute(attach_sql)
        except duckdb.IOException as exc:
            logger.warning(
                "ATTACH on {} failed ({}); unlinking and rebuilding the HNSW store.",
                hnsw_db_path,
                exc,
            )
            with contextlib.suppress(FileNotFoundError):
                hnsw_db_path.unlink()
            con.execute(attach_sql)
        # ``message_embeddings`` lives inside the attached store. Macros and
        # readers reference it via a top-level VIEW so existing call sites
        # (cli.py, the ``semantic_search`` macro) keep working unchanged.
        schema_qualifier = "hnsw_store.main."

    store_table = f"{schema_qualifier}message_embeddings"
    passthrough_view_sql = (
        "CREATE OR REPLACE VIEW message_embeddings AS "
        "SELECT * FROM hnsw_store.main.message_embeddings;"
    )

    parts = iter_part_files(embeddings_parquet)
    if not parts:
        logger.warning(
            "No embeddings parquet at {}; skipping HNSW index build. "
            "Run `claude-sql embed` to backfill.",
            embeddings_parquet,
        )
        # Empty placeholder with the final schema so later macro creation
        # can bind against ``message_embeddings``.
        con.execute(
            f"""
            CREATE OR REPLACE TABLE {store_table} (
                uuid VARCHAR PRIMARY KEY,
                model VARCHAR,
                dim USMALLINT,
                embedding FLOAT[{dim_i}]
            );
            """
        )
        if use_persistence:
            con.execute(passthrough_view_sql)
        return False

    if hnsw_db_path is not None:
        # Two reasons to rebuild: parquet is newer than the on-disk store,
        # or the attached store is empty (newly created header-only file
        # from ``ATTACH`` on a missing path).
        rebuild = _hnsw_rebuild_needed(
            embeddings_parquet, hnsw_db_path
        ) or not _attached_embeddings_table_present(con)
    else:
        # Nothing is persisted in the in-memory layout, so always build.
        rebuild = True

    if rebuild:
        # Drop any stale table+index in the target schema first so
        # CREATE TABLE doesn't trip on an existing index. DROP TABLE
        # cascades to dependent indexes.
        con.execute(f"DROP TABLE IF EXISTS {store_table};")
        # ``parts`` may be a single legacy file or a list of shard files.
        # Inline-escape each path because DDL doesn't accept prepared params.
        path_literals = ", ".join(_sql_str(str(p)) for p in parts)
        con.execute(
            f"""
            CREATE TABLE {store_table} AS
            SELECT
                uuid,
                model,
                dim,
                CAST(embedding AS FLOAT[{dim_i}]) AS embedding
            FROM read_parquet([{path_literals}]);
            """
        )
        con.execute(
            f"""
            CREATE INDEX idx_msg_hnsw
            ON {store_table}
            USING HNSW (embedding)
            WITH (
                metric='{metric}',
                ef_construction={ef_c_i},
                ef_search={ef_s_i},
                M={m_i},
                M0={m0_i}
            );
            """
        )
        if use_persistence:
            # Flush the freshly built table + index into the store file.
            con.execute("CHECKPOINT hnsw_store;")

    if use_persistence:
        con.execute(passthrough_view_sql)

    row = con.execute(f"SELECT count(*) FROM {store_table};").fetchone()
    count = int(row[0]) if row else 0
    logger.debug(
        "{} {} embeddings (metric={}, M={}, ef_search={}, persistent={})",
        "Built" if rebuild else "Reused persisted",
        count,
        metric,
        m_i,
        ef_s_i,
        use_persistence,
    )
    return True
1433
+
1434
+
1435
+ # ---------------------------------------------------------------------------
1436
+ # v2 analytics views
1437
+ # ---------------------------------------------------------------------------
1438
+
1439
+
1440
def _parquet_is_populated(path: Path | None) -> bool:
    """Tell whether ``path`` resolves to at least one non-trivial parquet file.

    ``path`` may be a legacy single-file cache (``<name>.parquet``) or a
    sharded directory (``<name>/part-<ts>.parquet``); both are expanded via
    ``iter_part_files``. A part only counts when it is larger than 16 bytes,
    so ``None``, an empty directory, or a zero-byte file left behind by an
    aborted run are all reported as "not populated".
    """
    if path is None:
        return False
    for candidate in iter_part_files(path):
        if candidate.stat().st_size > 16:
            return True
    return False
1453
+
1454
+
1455
def register_analytics(
    con: duckdb.DuckDBPyConnection,
    *,
    settings: Settings | None = None,
    classifications_parquet: Path | None = None,
    trajectory_parquet: Path | None = None,
    conflicts_parquet: Path | None = None,
    clusters_parquet: Path | None = None,
    cluster_terms_parquet: Path | None = None,
    communities_parquet: Path | None = None,
    user_friction_parquet: Path | None = None,
    skills_catalog_parquet: Path | None = None,
) -> None:
    """Register v2 analytics parquets as DuckDB views.

    Creates one ``CREATE OR REPLACE VIEW`` per parquet that exists on disk:
    ``session_classifications``, ``message_trajectory``, ``session_conflicts``,
    ``message_clusters``, ``cluster_terms``, ``session_communities``,
    ``user_friction`` and ``skills_catalog``. Two derived views follow:
    ``session_goals`` (a projection over ``session_classifications``, only
    when that view was registered) and ``skill_usage`` (``skill_invocations``
    left-joined to ``skills_catalog`` when the catalog was registered,
    otherwise a pass-through with NULL catalog columns).

    Each parquet-backed view is created only from part files larger than an
    empty-file sentinel (>16 bytes). Sources with no such part are skipped
    with a ``logger.debug`` message -- missing analytics are the normal state
    before the generators have run, so they are deliberately not warnings.
    DuckDB errors while creating a view are logged with
    ``logger.exception`` and do not abort the remaining registrations, so
    the function can be called before, during, or after an analytics
    pipeline run and picks up whatever is on disk.

    Analytics macros (``autonomy_trend`` et al.) are **not** registered here
    -- they belong to :func:`register_macros`, which must be called
    afterwards so macro bodies bind against the just-created views.

    Parameters
    ----------
    con
        Open DuckDB connection.
    settings
        Optional :class:`Settings` whose ``*_parquet_path`` fields drive the
        per-view parquet locations. A default instance is created when
        ``None``. If an explicit per-parquet path is also supplied, the
        explicit path wins.
    classifications_parquet, trajectory_parquet, conflicts_parquet,
    clusters_parquet, cluster_terms_parquet, communities_parquet,
    user_friction_parquet, skills_catalog_parquet
        Optional explicit paths, useful for tests and ad-hoc wiring. Each
        defaults to the matching ``settings.*_parquet_path`` when not
        provided.
    """
    resolved = settings if settings is not None else Settings()
    view_to_path: dict[str, Path] = {
        "session_classifications": classifications_parquet
        if classifications_parquet is not None
        else resolved.classifications_parquet_path,
        "message_trajectory": trajectory_parquet
        if trajectory_parquet is not None
        else resolved.trajectory_parquet_path,
        "session_conflicts": conflicts_parquet
        if conflicts_parquet is not None
        else resolved.conflicts_parquet_path,
        "message_clusters": clusters_parquet
        if clusters_parquet is not None
        else resolved.clusters_parquet_path,
        "cluster_terms": cluster_terms_parquet
        if cluster_terms_parquet is not None
        else resolved.cluster_terms_parquet_path,
        "session_communities": communities_parquet
        if communities_parquet is not None
        else resolved.communities_parquet_path,
        "user_friction": user_friction_parquet
        if user_friction_parquet is not None
        else resolved.user_friction_parquet_path,
        "skills_catalog": skills_catalog_parquet
        if skills_catalog_parquet is not None
        else resolved.skills_catalog_parquet_path,
    }

    # View projections keyed by view name. A missing / ``None`` projection
    # means ``SELECT *``; a string is spliced in verbatim so the wrapper view
    # can add convenience alias columns (e.g. ``autonomy`` alongside
    # ``autonomy_tier``). These aliases are additive: the original column
    # names continue to work so existing queries never break.
    view_projections: dict[str, str | None] = {
        "session_classifications": (
            "*, autonomy_tier AS autonomy, success AS success_outcome, work_category AS category"
        ),
        "message_trajectory": ("*, sentiment_delta AS sentiment, is_transition AS transition"),
        "session_conflicts": ("*, resolution AS conflict_resolution"),
    }

    registered: set[str] = set()
    for view_name, path in view_to_path.items():
        # List and size-check the part files once, so the "is it populated?"
        # decision and the list handed to ``read_parquet`` always agree.
        # Sharded directories contribute every usable part file; a legacy
        # single-file path becomes a one-element list. The >16-byte threshold
        # is the same empty-file sentinel ``_parquet_is_populated`` applies.
        parts = (
            [p for p in iter_part_files(path) if p.stat().st_size > 16]
            if path is not None
            else []
        )
        if not parts:
            # Missing analytics parquets are the default state until the user
            # runs the corresponding generator (classify / cluster / ...), so
            # they belong at DEBUG -- otherwise every query command floods the
            # terminal with warnings about work the user hasn't yet asked for.
            logger.debug(
                "register_analytics: skipping {} (parquet missing at {})",
                view_name,
                path,
            )
            continue
        projection = view_projections.get(view_name) or "*"
        # Inline-escaped literals: DDL does not accept prepared parameters.
        path_literals = ", ".join(_sql_str(str(p)) for p in parts)
        try:
            con.execute(
                f"CREATE OR REPLACE VIEW {view_name} AS "
                f"SELECT {projection} FROM read_parquet([{path_literals}]);"
            )
            logger.debug("Registered analytics view: {} (source={})", view_name, path)
            registered.add(view_name)
        except duckdb.Error:
            logger.exception("Failed to register analytics view {} from {}", view_name, path)

    # ``session_goals`` is a thin projection of ``session_classifications``;
    # only materialize it when the upstream view exists.
    if "session_classifications" in registered:
        try:
            con.execute(
                """
                CREATE OR REPLACE VIEW session_goals AS
                SELECT session_id, goal, confidence, classified_at
                FROM session_classifications;
                """
            )
            logger.debug("Registered analytics view: session_goals")
        except duckdb.Error:
            logger.exception("Failed to register session_goals view")

    # ``skill_usage`` joins ``skill_invocations`` against the catalog for
    # human-readable labels + ``is_builtin`` tagging. When the catalog view
    # was not registered the view still works, but every row gets a
    # ``skill_name = skill_id`` pass-through and ``is_builtin = false``.
    try:
        if "skills_catalog" in registered:
            con.execute(
                """
                CREATE OR REPLACE VIEW skill_usage AS
                SELECT
                    si.session_id,
                    si.ts,
                    si.message_uuid,
                    si.source,
                    si.skill_id,
                    si.args,
                    si.tool_use_id,
                    coalesce(cat.name, si.skill_id) AS skill_name,
                    cat.plugin AS plugin,
                    cat.plugin_version AS plugin_version,
                    cat.description AS description,
                    cat.source_kind AS source_kind,
                    coalesce(cat.source_kind = 'builtin', false) AS is_builtin
                FROM skill_invocations si
                LEFT JOIN skills_catalog cat ON cat.skill_id = si.skill_id;
                """
            )
        else:
            con.execute(
                """
                CREATE OR REPLACE VIEW skill_usage AS
                SELECT
                    si.session_id,
                    si.ts,
                    si.message_uuid,
                    si.source,
                    si.skill_id,
                    si.args,
                    si.tool_use_id,
                    si.skill_id AS skill_name,
                    CAST(NULL AS VARCHAR) AS plugin,
                    CAST(NULL AS VARCHAR) AS plugin_version,
                    CAST(NULL AS VARCHAR) AS description,
                    CAST(NULL AS VARCHAR) AS source_kind,
                    false AS is_builtin
                FROM skill_invocations si;
                """
            )
        logger.debug("Registered analytics view: skill_usage")
    except duckdb.Error:
        logger.exception("Failed to register skill_usage view")
1636
+
1637
+
1638
def register_all(
    con: duckdb.DuckDBPyConnection,
    *,
    settings: Settings | None = None,
    include_analytics: bool = True,
) -> None:
    """Run every registrar against ``con`` in dependency order.

    The sequence is: raw views, derived views, VSS, (optionally) analytics
    views, then macros.

    Parameters
    ----------
    con
        Open DuckDB connection.
    settings
        Optional :class:`Settings`; a default instance is created when absent.
    include_analytics
        When ``True`` (default), :func:`register_analytics` is called before
        :func:`register_macros` so the v2 analytics macros can bind against
        the freshly-registered analytics views. Pass ``False`` to skip
        analytics view registration entirely (useful in tests that only
        exercise v1 macros or when the caller registers analytics views
        out-of-band).

    Notes
    -----
    Macros are registered last for two reasons:

    1. The ``semantic_search`` macro body references ``message_embeddings``,
       which ``register_vss`` creates, and DuckDB resolves macro bodies at
       creation time.
    2. The analytics macros (``autonomy_trend``, ``cluster_top_terms``, ...)
       bind against the analytics views at macro-creation time. When a
       backing parquet is missing the macro is skipped with a warning rather
       than raising.
    """
    cfg = settings or Settings()

    register_raw(
        con,
        glob=cfg.default_glob,
        subagent_glob=cfg.subagent_glob,
        subagent_meta_glob=cfg.subagent_meta_glob,
    )
    register_views(con)

    # HNSW wiring is taken straight from the settings object.
    vss_options = {
        "embeddings_parquet": cfg.embeddings_parquet_path,
        "hnsw_db_path": cfg.hnsw_db_path,
        "dim": int(cfg.output_dimension),
        "metric": cfg.hnsw_metric,
        "ef_construction": cfg.hnsw_ef_construction,
        "ef_search": cfg.hnsw_ef_search,
        "m": cfg.hnsw_m,
        "m0": cfg.hnsw_m0,
    }
    register_vss(con, **vss_options)

    if include_analytics:
        register_analytics(con, settings=cfg)

    register_macros(con, settings=cfg)
1695
+
1696
+
1697
+ # ---------------------------------------------------------------------------
1698
+ # Introspection
1699
+ # ---------------------------------------------------------------------------
1700
+
1701
+
1702
def describe_all(con: duckdb.DuckDBPyConnection) -> dict[str, list[tuple[str, str]]]:
    """Collect the column schema of each view listed in ``VIEW_NAMES``.

    Parameters
    ----------
    con
        Open DuckDB connection with views registered.

    Returns
    -------
    dict
        ``{view_name: [(column_name, column_type), ...]}`` built from the
        first two columns of ``DESCRIBE <view>``. A view whose ``DESCRIBE``
        raises ``duckdb.Error`` (e.g. it was never registered) maps to an
        empty list and a warning is logged.
    """
    schemas: dict[str, list[tuple[str, str]]] = {}
    for view in VIEW_NAMES:
        try:
            described = con.execute(f"DESCRIBE {view}").fetchall()
            columns = [(str(col[0]), str(col[1])) for col in described]
        except duckdb.Error as exc:
            logger.warning("Could not describe {}: {}", view, exc)
            columns = []
        schemas[view] = columns
    return schemas
1726
+
1727
+
1728
def list_macros(con: duckdb.DuckDBPyConnection) -> list[str]:
    """List the macros defined in this connection's ``main`` schema.

    Parameters
    ----------
    con
        Open DuckDB connection.

    Returns
    -------
    list[str]
        Macro function names (scalar and table macros alike) as returned by
        a ``SELECT DISTINCT ... ORDER BY function_name`` over
        ``duckdb_functions()``, i.e. deduplicated and sorted by the database.
    """
    macro_query = """
        SELECT DISTINCT function_name
        FROM duckdb_functions()
        WHERE schema_name = 'main'
          AND function_type IN ('macro', 'table_macro')
        ORDER BY function_name
    """
    return [str(record[0]) for record in con.execute(macro_query).fetchall()]