chatstrata 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatstrata/__init__.py +7 -0
- chatstrata/analysis/__init__.py +24 -0
- chatstrata/analysis/cli.py +145 -0
- chatstrata/analysis/queries/activity.sql +12 -0
- chatstrata/analysis/queries/conversations.sql +11 -0
- chatstrata/analysis/queries/example_queries.sql +129 -0
- chatstrata/analysis/queries/models.sql +8 -0
- chatstrata/analysis/queries/projects.sql +11 -0
- chatstrata/analysis/queries/tools.sql +9 -0
- chatstrata/cli.py +584 -0
- chatstrata/core/__init__.py +24 -0
- chatstrata/core/db.py +115 -0
- chatstrata/core/ingest.py +271 -0
- chatstrata/core/migrations/0001_initial.sql +130 -0
- chatstrata/core/migrations/0002_fts_index.sql +11 -0
- chatstrata/core/migrations/0003_conversation_mtime.sql +2 -0
- chatstrata/core/migrations/__init__.py +28 -0
- chatstrata/core/models.py +87 -0
- chatstrata/core/search.py +151 -0
- chatstrata/embed/__init__.py +20 -0
- chatstrata/embed/base.py +26 -0
- chatstrata/embed/cli.py +134 -0
- chatstrata/embed/local_provider.py +30 -0
- chatstrata/embed/search.py +141 -0
- chatstrata/mcp/README.md +155 -0
- chatstrata/mcp/__init__.py +1 -0
- chatstrata/mcp/safety.py +92 -0
- chatstrata/mcp/server.py +247 -0
- chatstrata/redact/__init__.py +20 -0
- chatstrata/redact/base.py +58 -0
- chatstrata/redact/cli.py +310 -0
- chatstrata/redact/presidio_engine.py +183 -0
- chatstrata/redact/recognizers/__init__.py +20 -0
- chatstrata/redact/recognizers/api_keys.py +62 -0
- chatstrata/redact/recognizers/connection_strings.py +20 -0
- chatstrata/redact/recognizers/paths.py +33 -0
- chatstrata/redact/recognizers/tokens.py +29 -0
- chatstrata/sources/__init__.py +5 -0
- chatstrata/sources/base.py +67 -0
- chatstrata/sources/claude_code/__init__.py +3 -0
- chatstrata/sources/claude_code/adapter.py +253 -0
- chatstrata/sources/claude_code/manifest.yaml +10 -0
- chatstrata/sources/claude_export/__init__.py +3 -0
- chatstrata/sources/claude_export/adapter.py +239 -0
- chatstrata/sources/claude_export/manifest.yaml +10 -0
- chatstrata/sources/codex_cli/__init__.py +3 -0
- chatstrata/sources/codex_cli/adapter.py +315 -0
- chatstrata/sources/codex_cli/manifest.yaml +10 -0
- chatstrata/sources/opencode/__init__.py +3 -0
- chatstrata/sources/opencode/adapter.py +350 -0
- chatstrata/sources/opencode/manifest.yaml +10 -0
- chatstrata-0.1.0.dist-info/METADATA +227 -0
- chatstrata-0.1.0.dist-info/RECORD +56 -0
- chatstrata-0.1.0.dist-info/WHEEL +4 -0
- chatstrata-0.1.0.dist-info/entry_points.txt +9 -0
- chatstrata-0.1.0.dist-info/licenses/LICENSE +201 -0
chatstrata/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Analysis query helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
_QUERIES_DIR = Path(__file__).parent / "queries"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def load_query(name: str) -> str:
    """Load a .sql file from the queries directory by name (without extension).

    Args:
        name: Base name of the query file; ``.sql`` is appended to it.

    Returns:
        The full text of the query file.

    Raises:
        FileNotFoundError: If the queries directory has no such file.
    """
    path = _QUERIES_DIR / f"{name}.sql"
    if not path.exists():
        raise FileNotFoundError(f"Query file not found: {path}")
    # Decode explicitly as UTF-8 so the loaded SQL does not depend on the
    # platform's locale encoding.
    return path.read_text(encoding="utf-8")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def build_source_filter(
    source: str | None, *, column: str = "c.source_id"
) -> tuple[str, list]:
    """Build the optional source restriction for a query template.

    The result is a pair: the SQL text to splice into the template and the
    positional parameters that go with it. When ``source`` is ``None`` both
    are empty; otherwise the text is an ``AND <column> = ?`` condition and the
    parameter list holds the source value.
    """
    fragment, params = "", []
    if source is not None:
        fragment = f"AND {column} = ?"
        params = [source]
    return fragment, params
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Analyze subcommands for chatstrata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
|
|
9
|
+
from chatstrata.analysis import build_source_filter, load_query
|
|
10
|
+
from chatstrata.core.db import connect, resolve_db_path
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _output(cols: list[str], rows: list[tuple], as_json: bool) -> None:
    """Print a result set as a JSON array or as a padded text table.

    In JSON mode every row becomes an object keyed by column name, and values
    that ``json`` cannot encode natively are stringified. In table mode an
    empty result prints ``No data.``; otherwise a header line, a dashed rule
    and one line per row are printed, with ``None`` shown as an empty cell.
    """
    if as_json:
        records = []
        for record in rows:
            records.append(dict(zip(cols, record, strict=False)))
        click.echo(json.dumps(records, default=str, indent=2))
        return

    if not rows:
        click.echo("No data.")
        return

    def _cell(value: object) -> str:
        return "" if value is None else str(value)

    # Each column is as wide as its header or its longest rendered cell.
    widths = []
    for idx, heading in enumerate(cols):
        width = len(heading)
        for record in rows:
            width = max(width, len(_cell(record[idx])))
        widths.append(width)

    click.echo(" ".join(heading.ljust(width) for heading, width in zip(cols, widths)))
    click.echo(" ".join("-" * width for width in widths))
    for record in rows:
        click.echo(" ".join(_cell(value).ljust(width) for value, width in zip(record, widths)))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Top-level command group. The subcommands in this module attach themselves to
# it with @analyze.command(); the docstring doubles as the group's --help text.
@click.group()
def analyze() -> None:
    """Analyze your conversation archive."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@analyze.command()
@click.option(
    "--by",
    "granularity",
    type=click.Choice(["day", "week", "month"]),
    default="month",
    help="Time granularity (default: month).",
)
@click.option("--source", default=None, help="Filter to a specific source.")
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def activity(granularity: str, source: str | None, db: str | None, as_json: bool) -> None:
    """Messages over time, grouped by period."""
    template = load_query("activity")
    # The template takes the period unit (limited to the --by choices) and the
    # optional source clause; the source value itself is passed as a bound
    # parameter, not formatted into the SQL text.
    clause, bind_values = build_source_filter(source)
    query = template.format(granularity=granularity, source_filter=clause)

    db_conn = connect(resolve_db_path(db))
    try:
        cursor = db_conn.execute(query, bind_values)
        headers = [column[0] for column in cursor.description]
        records = cursor.fetchall()
        _output(headers, records, as_json)
    finally:
        db_conn.close()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@analyze.command()
@click.option("--source", default=None, help="Filter to a specific source (e.g. claude_code).")
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def tools(source: str | None, db: str | None, as_json: bool) -> None:
    """Tool usage frequency."""
    template = load_query("tools")
    # This query filters on an unqualified source_id column instead of the
    # helper's default c.source_id.
    clause, bind_values = build_source_filter(source, column="source_id")
    query = template.format(source_filter=clause)

    handle = connect(resolve_db_path(db))
    try:
        cursor = handle.execute(query, bind_values)
        headers = []
        for column in cursor.description:
            headers.append(column[0])
        records = cursor.fetchall()
        _output(headers, records, as_json)
    finally:
        handle.close()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@analyze.command()
@click.option(
    "--longest",
    "longest_n",
    type=click.IntRange(min=1),
    default=None,
    help="Show N longest conversations.",
)
@click.option(
    "--shortest",
    "shortest_n",
    type=click.IntRange(min=1),
    default=None,
    help="Show N shortest conversations.",
)
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def conversations(
    longest_n: int | None, shortest_n: int | None, db: str | None, as_json: bool,
) -> None:
    """Conversation length statistics."""
    # The docstring above is kept short because click shows it as help text.
    #
    # Options are compared against None rather than tested for truthiness, so
    # an explicit value is never mistaken for "option not given". IntRange
    # rejects zero and negative counts at the command line, before they can
    # reach the LIMIT clause.
    if longest_n is not None and shortest_n is not None:
        raise click.UsageError("Specify --longest or --shortest, not both.")

    # Default view: the 20 longest conversations.
    if shortest_n is not None:
        order, limit = "ASC", shortest_n
    elif longest_n is not None:
        order, limit = "DESC", longest_n
    else:
        order, limit = "DESC", 20

    # The sort direction and row limit are formatted into the SQL text; both
    # are either fixed strings or integers validated above.
    sql_template = load_query("conversations")
    sql = sql_template.format(order=order, limit=limit)

    conn = connect(resolve_db_path(db))
    try:
        result = conn.execute(sql)
        cols = [d[0] for d in result.description]
        rows = result.fetchall()
        _output(cols, rows, as_json)
    finally:
        conn.close()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@analyze.command()
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def models(db: str | None, as_json: bool) -> None:
    """Model usage breakdown."""
    # The models query has no placeholders, so it is executed as loaded.
    statement = load_query("models")

    database = connect(resolve_db_path(db))
    try:
        cursor = database.execute(statement)
        headers = []
        for entry in cursor.description:
            headers.append(entry[0])
        records = cursor.fetchall()
        _output(headers, records, as_json)
    finally:
        database.close()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@analyze.command()
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def projects(db: str | None, as_json: bool) -> None:
    """Per-project conversation counts (Claude Code)."""
    # The projects query is run verbatim; it takes no template arguments.
    statement = load_query("projects")

    db_handle = connect(resolve_db_path(db))
    try:
        cursor = db_handle.execute(statement)
        column_names = [meta[0] for meta in cursor.description]
        fetched = cursor.fetchall()
        _output(column_names, fetched, as_json)
    finally:
        db_handle.close()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
-- Message and conversation counts per source and time period, newest first.
-- This file is a Python str.format template rather than plain SQL: the caller
-- fills in the date_trunc unit and an optional AND clause on the source.
-- Keep literal curly braces out of this file, comments included.
SELECT
    s.id AS source,
    date_trunc('{granularity}', m.created_at) AS period,
    COUNT(*) AS messages,
    COUNT(DISTINCT m.conversation_id) AS conversations
FROM messages m
JOIN conversations c ON c.id = m.conversation_id
JOIN sources s ON s.id = c.source_id
-- Messages without a timestamp cannot be assigned to a period.
WHERE m.created_at IS NOT NULL
{source_filter}
GROUP BY source, period
ORDER BY period DESC, source
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
-- Example chatstrata queries. Use as: chatstrata query "$(cat path/to/query.sql)"
|
|
2
|
+
-- or paste into a DuckDB shell connected to your chatstrata.duckdb file.
|
|
3
|
+
|
|
4
|
+
-- ============================================================
|
|
5
|
+
-- 1. Activity over time
|
|
6
|
+
-- ============================================================
|
|
7
|
+
|
|
8
|
+
-- Messages per month, by source
|
|
9
|
+
SELECT
|
|
10
|
+
s.id AS source,
|
|
11
|
+
date_trunc('month', m.created_at) AS month,
|
|
12
|
+
COUNT(*) AS messages
|
|
13
|
+
FROM messages m
|
|
14
|
+
JOIN conversations c ON c.id = m.conversation_id
|
|
15
|
+
JOIN sources s ON s.id = c.source_id
|
|
16
|
+
WHERE m.created_at IS NOT NULL
|
|
17
|
+
GROUP BY source, month
|
|
18
|
+
ORDER BY month DESC, source;
|
|
19
|
+
|
|
20
|
+
-- ============================================================
|
|
21
|
+
-- 2. Conversation length distribution
|
|
22
|
+
-- ============================================================
|
|
23
|
+
|
|
24
|
+
SELECT
|
|
25
|
+
CASE
|
|
26
|
+
WHEN message_count < 5 THEN 'very short (< 5)'
|
|
27
|
+
WHEN message_count < 20 THEN 'short (5-19)'
|
|
28
|
+
WHEN message_count < 50 THEN 'medium (20-49)'
|
|
29
|
+
ELSE 'long (50+)'
|
|
30
|
+
END AS length_bucket,
|
|
31
|
+
COUNT(*) AS conversations
|
|
32
|
+
FROM conversations
|
|
33
|
+
GROUP BY length_bucket
|
|
34
|
+
ORDER BY conversations DESC;
|
|
35
|
+
|
|
36
|
+
-- ============================================================
|
|
37
|
+
-- 3. Every bash command via Claude Code, grouped by project
|
|
38
|
+
-- ============================================================
|
|
39
|
+
|
|
40
|
+
SELECT
|
|
41
|
+
c.project,
|
|
42
|
+
cb.payload->'input'->>'command' AS command,
|
|
43
|
+
COUNT(*) AS times_run
|
|
44
|
+
FROM content_blocks cb
|
|
45
|
+
JOIN messages m ON m.id = cb.message_id
|
|
46
|
+
JOIN conversations c ON c.id = m.conversation_id
|
|
47
|
+
WHERE cb.type = 'tool_use'
|
|
48
|
+
AND cb.tool_name = 'Bash'
|
|
49
|
+
AND c.source_id = 'claude_code'
|
|
50
|
+
GROUP BY c.project, command
|
|
51
|
+
ORDER BY times_run DESC
|
|
52
|
+
LIMIT 50;
|
|
53
|
+
|
|
54
|
+
-- ============================================================
|
|
55
|
+
-- 4. Tool usage frequency
|
|
56
|
+
-- ============================================================
|
|
57
|
+
|
|
58
|
+
SELECT
|
|
59
|
+
tool_name,
|
|
60
|
+
COUNT(*) AS calls,
|
|
61
|
+
COUNT(DISTINCT conversation_id) AS conversations
|
|
62
|
+
FROM tool_calls
|
|
63
|
+
GROUP BY tool_name
|
|
64
|
+
ORDER BY calls DESC;
|
|
65
|
+
|
|
66
|
+
-- ============================================================
|
|
67
|
+
-- 5. Word count of user prompts over time
|
|
68
|
+
-- ============================================================
|
|
69
|
+
|
|
70
|
+
-- Rough proxy for "how have I been prompting": this counts the spaces in each prompt, which is roughly one less than its number of words.
|
|
71
|
+
SELECT
|
|
72
|
+
date_trunc('week', m.created_at) AS week,
|
|
73
|
+
AVG(length(cb.text) - length(replace(cb.text, ' ', ''))) AS avg_word_count
|
|
74
|
+
FROM content_blocks cb
|
|
75
|
+
JOIN messages m ON m.id = cb.message_id
|
|
76
|
+
WHERE m.role = 'user'
|
|
77
|
+
AND cb.type = 'text'
|
|
78
|
+
AND cb.text IS NOT NULL
|
|
79
|
+
GROUP BY week
|
|
80
|
+
ORDER BY week;
|
|
81
|
+
|
|
82
|
+
-- ============================================================
|
|
83
|
+
-- 6. Recent conversations matching a keyword
|
|
84
|
+
-- ============================================================
|
|
85
|
+
|
|
86
|
+
SELECT
|
|
87
|
+
c.started_at,
|
|
88
|
+
s.id AS source,
|
|
89
|
+
c.project,
|
|
90
|
+
c.title
|
|
91
|
+
FROM conversations c
|
|
92
|
+
JOIN sources s ON s.id = c.source_id
|
|
93
|
+
WHERE EXISTS (
|
|
94
|
+
SELECT 1 FROM messages m
|
|
95
|
+
JOIN content_blocks cb ON cb.message_id = m.id
|
|
96
|
+
WHERE m.conversation_id = c.id
|
|
97
|
+
AND cb.text ILIKE '%grandma%' -- replace with your keyword
|
|
98
|
+
)
|
|
99
|
+
ORDER BY c.started_at DESC
|
|
100
|
+
LIMIT 20;
|
|
101
|
+
|
|
102
|
+
-- ============================================================
|
|
103
|
+
-- 7. Most-used models
|
|
104
|
+
-- ============================================================
|
|
105
|
+
|
|
106
|
+
SELECT model, COUNT(*) AS messages
|
|
107
|
+
FROM messages
|
|
108
|
+
WHERE model IS NOT NULL
|
|
109
|
+
GROUP BY model
|
|
110
|
+
ORDER BY messages DESC;
|
|
111
|
+
|
|
112
|
+
-- ============================================================
|
|
113
|
+
-- 8. Thinking blocks: how often does the model reason explicitly?
|
|
114
|
+
-- ============================================================
|
|
115
|
+
|
|
116
|
+
SELECT
|
|
117
|
+
date_trunc('month', m.created_at) AS month,
|
|
118
|
+
SUM(CASE WHEN cb.type = 'thinking' THEN 1 ELSE 0 END) AS thinking_blocks,
|
|
119
|
+
COUNT(DISTINCT m.id) AS assistant_messages,
|
|
120
|
+
ROUND(
|
|
121
|
+
100.0 * SUM(CASE WHEN cb.type = 'thinking' THEN 1 ELSE 0 END)
|
|
122
|
+
/ NULLIF(COUNT(DISTINCT m.id), 0),
|
|
123
|
+
1
|
|
124
|
+
) AS pct
|
|
125
|
+
FROM messages m
|
|
126
|
+
LEFT JOIN content_blocks cb ON cb.message_id = m.id
|
|
127
|
+
WHERE m.role = 'assistant'
|
|
128
|
+
GROUP BY month
|
|
129
|
+
ORDER BY month DESC;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
-- Per-project conversation statistics for the claude_code source:
-- conversation count, total messages, and first/last activity timestamps.
-- Conversations with a NULL project are excluded; rows are ordered by
-- conversation count, highest first.
SELECT
    c.project,
    COUNT(*) AS conversations,
    SUM(c.message_count) AS total_messages,
    MIN(c.started_at) AS earliest,
    MAX(c.ended_at) AS latest
FROM conversations c
WHERE c.source_id = 'claude_code'
    AND c.project IS NOT NULL
GROUP BY c.project
ORDER BY conversations DESC
|