chatstrata-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. chatstrata/__init__.py +7 -0
  2. chatstrata/analysis/__init__.py +24 -0
  3. chatstrata/analysis/cli.py +145 -0
  4. chatstrata/analysis/queries/activity.sql +12 -0
  5. chatstrata/analysis/queries/conversations.sql +11 -0
  6. chatstrata/analysis/queries/example_queries.sql +129 -0
  7. chatstrata/analysis/queries/models.sql +8 -0
  8. chatstrata/analysis/queries/projects.sql +11 -0
  9. chatstrata/analysis/queries/tools.sql +9 -0
  10. chatstrata/cli.py +584 -0
  11. chatstrata/core/__init__.py +24 -0
  12. chatstrata/core/db.py +115 -0
  13. chatstrata/core/ingest.py +271 -0
  14. chatstrata/core/migrations/0001_initial.sql +130 -0
  15. chatstrata/core/migrations/0002_fts_index.sql +11 -0
  16. chatstrata/core/migrations/0003_conversation_mtime.sql +2 -0
  17. chatstrata/core/migrations/__init__.py +28 -0
  18. chatstrata/core/models.py +87 -0
  19. chatstrata/core/search.py +151 -0
  20. chatstrata/embed/__init__.py +20 -0
  21. chatstrata/embed/base.py +26 -0
  22. chatstrata/embed/cli.py +134 -0
  23. chatstrata/embed/local_provider.py +30 -0
  24. chatstrata/embed/search.py +141 -0
  25. chatstrata/mcp/README.md +155 -0
  26. chatstrata/mcp/__init__.py +1 -0
  27. chatstrata/mcp/safety.py +92 -0
  28. chatstrata/mcp/server.py +247 -0
  29. chatstrata/redact/__init__.py +20 -0
  30. chatstrata/redact/base.py +58 -0
  31. chatstrata/redact/cli.py +310 -0
  32. chatstrata/redact/presidio_engine.py +183 -0
  33. chatstrata/redact/recognizers/__init__.py +20 -0
  34. chatstrata/redact/recognizers/api_keys.py +62 -0
  35. chatstrata/redact/recognizers/connection_strings.py +20 -0
  36. chatstrata/redact/recognizers/paths.py +33 -0
  37. chatstrata/redact/recognizers/tokens.py +29 -0
  38. chatstrata/sources/__init__.py +5 -0
  39. chatstrata/sources/base.py +67 -0
  40. chatstrata/sources/claude_code/__init__.py +3 -0
  41. chatstrata/sources/claude_code/adapter.py +253 -0
  42. chatstrata/sources/claude_code/manifest.yaml +10 -0
  43. chatstrata/sources/claude_export/__init__.py +3 -0
  44. chatstrata/sources/claude_export/adapter.py +239 -0
  45. chatstrata/sources/claude_export/manifest.yaml +10 -0
  46. chatstrata/sources/codex_cli/__init__.py +3 -0
  47. chatstrata/sources/codex_cli/adapter.py +315 -0
  48. chatstrata/sources/codex_cli/manifest.yaml +10 -0
  49. chatstrata/sources/opencode/__init__.py +3 -0
  50. chatstrata/sources/opencode/adapter.py +350 -0
  51. chatstrata/sources/opencode/manifest.yaml +10 -0
  52. chatstrata-0.1.0.dist-info/METADATA +227 -0
  53. chatstrata-0.1.0.dist-info/RECORD +56 -0
  54. chatstrata-0.1.0.dist-info/WHEEL +4 -0
  55. chatstrata-0.1.0.dist-info/entry_points.txt +9 -0
  56. chatstrata-0.1.0.dist-info/licenses/LICENSE +201 -0
chatstrata/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """chatstrata - a personal, queryable archive of your AI conversations."""
2
+
3
+ from chatstrata.core.migrations import LATEST_VERSION as SCHEMA_VERSION
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ __all__ = ["SCHEMA_VERSION", "__version__"]
chatstrata/analysis/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """Analysis query helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ _QUERIES_DIR = Path(__file__).parent / "queries"
8
+
9
+
10
def load_query(name: str) -> str:
    """Load a packaged ``.sql`` file from the queries directory.

    Args:
        name: Base name of the query file, without the ``.sql`` extension.

    Returns:
        The contents of ``<name>.sql`` as text.

    Raises:
        FileNotFoundError: If ``<name>.sql`` is not a regular file in the
            queries directory.
    """
    path = _QUERIES_DIR / f"{name}.sql"
    # is_file() also rejects a directory that happens to be named "*.sql",
    # which exists() would let through to a less helpful read error.
    if not path.is_file():
        raise FileNotFoundError(f"Query file not found: {path}")
    # Decode explicitly: the default is the locale's preferred encoding,
    # which is not UTF-8 on every platform.
    return path.read_text(encoding="utf-8")
16
+
17
+
18
def build_source_filter(
    source: str | None, *, column: str = "c.source_id"
) -> tuple[str, list]:
    """Build the optional SQL clause that restricts a query to one source.

    When ``source`` is ``None`` the fragment is empty and there are no bound
    parameters. Otherwise the fragment is ``AND <column> = ?`` and ``source``
    is the single bound parameter.
    """
    fragment, params = "", []
    if source is not None:
        fragment = f"AND {column} = ?"
        params.append(source)
    return fragment, params
chatstrata/analysis/cli.py ADDED
@@ -0,0 +1,145 @@
1
+ """Analyze subcommands for chatstrata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ import click
8
+
9
+ from chatstrata.analysis import build_source_filter, load_query
10
+ from chatstrata.core.db import connect, resolve_db_path
11
+
12
+
13
def _output(cols: list[str], rows: list[tuple], as_json: bool) -> None:
    """Print query results as a JSON array or as a padded text table."""
    if as_json:
        records = []
        for row in rows:
            records.append(dict(zip(cols, row, strict=False)))
        click.echo(json.dumps(records, default=str, indent=2))
        return

    if not rows:
        click.echo("No data.")
        return

    def cell(value) -> str:
        # None renders as an empty cell rather than the text "None".
        return "" if value is None else str(value)

    # Each column is as wide as its header or its longest cell.
    widths = []
    for index, header in enumerate(cols):
        longest_cell = max(len(cell(row[index])) for row in rows)
        widths.append(max(len(header), longest_cell))

    click.echo(" ".join(header.ljust(width) for header, width in zip(cols, widths)))
    click.echo(" ".join("-" * width for width in widths))
    for row in rows:
        click.echo(" ".join(cell(value).ljust(width) for value, width in zip(row, widths)))
30
+
31
+
32
@click.group()
def analyze() -> None:
    """Analyze your conversation archive."""
    # Container group only: the subcommands register themselves on it with
    # @analyze.command(). The docstring above is also the --help text.
35
+
36
+
37
@analyze.command()
@click.option(
    "--by",
    "granularity",
    type=click.Choice(["day", "week", "month"]),
    default="month",
    help="Time granularity (default: month).",
)
@click.option("--source", default=None, help="Filter to a specific source.")
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def activity(granularity: str, source: str | None, db: str | None, as_json: bool) -> None:
    """Messages over time, grouped by period."""
    # The docstring above is also the command's --help text.
    template = load_query("activity")
    where_clause, bind_values = build_source_filter(source)
    # granularity is limited to the Choice values above; the source value
    # itself travels as a bound parameter, never as SQL text.
    statement = template.format(granularity=granularity, source_filter=where_clause)

    connection = connect(resolve_db_path(db))
    try:
        cursor = connection.execute(statement, bind_values)
        headers = [column[0] for column in cursor.description]
        _output(headers, cursor.fetchall(), as_json)
    finally:
        connection.close()
59
+
60
+
61
@analyze.command()
@click.option("--source", default=None, help="Filter to a specific source (e.g. claude_code).")
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def tools(source: str | None, db: str | None, as_json: bool) -> None:
    """Tool usage frequency."""
    # The docstring above is also the command's --help text.
    template = load_query("tools")
    # This query filters on an unqualified source_id column, unlike the
    # helper's default of c.source_id.
    where_clause, bind_values = build_source_filter(source, column="source_id")
    statement = template.format(source_filter=where_clause)

    connection = connect(resolve_db_path(db))
    try:
        cursor = connection.execute(statement, bind_values)
        headers = [column[0] for column in cursor.description]
        _output(headers, cursor.fetchall(), as_json)
    finally:
        connection.close()
79
+
80
+
81
@analyze.command()
@click.option(
    "--longest",
    "longest_n",
    type=click.IntRange(min=1),
    default=None,
    help="Show N longest conversations.",
)
@click.option(
    "--shortest",
    "shortest_n",
    type=click.IntRange(min=1),
    default=None,
    help="Show N shortest conversations.",
)
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def conversations(
    longest_n: int | None, shortest_n: int | None, db: str | None, as_json: bool,
) -> None:
    """Conversation length statistics."""
    # The docstring above is also the command's --help text, so the contract
    # is documented here instead:
    #   - with neither flag, the 20 longest conversations are listed;
    #   - --longest N / --shortest N list N rows, longest or shortest first;
    #   - passing both flags is a usage error.
    #
    # Flags are compared against None rather than tested for truthiness, so
    # an explicitly supplied value is never mistaken for "flag not given".
    # IntRange(min=1) rejects 0 and negative counts at parse time, before
    # they can reach the LIMIT clause.
    if longest_n is not None and shortest_n is not None:
        raise click.UsageError("Specify --longest or --shortest, not both.")

    order = "DESC"
    limit = 20
    if shortest_n is not None:
        order = "ASC"
        limit = shortest_n
    elif longest_n is not None:
        limit = longest_n

    # order is one of two literals and limit is a validated int, so
    # formatting them into the statement cannot introduce arbitrary SQL.
    sql = load_query("conversations").format(order=order, limit=limit)

    conn = connect(resolve_db_path(db))
    try:
        result = conn.execute(sql)
        cols = [d[0] for d in result.description]
        rows = result.fetchall()
        _output(cols, rows, as_json)
    finally:
        conn.close()
112
+
113
+
114
@analyze.command()
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def models(db: str | None, as_json: bool) -> None:
    """Model usage breakdown."""
    # The docstring above is also the command's --help text.
    # The query has no template fields, so it runs exactly as loaded.
    statement = load_query("models")

    connection = connect(resolve_db_path(db))
    try:
        cursor = connection.execute(statement)
        headers = [column[0] for column in cursor.description]
        _output(headers, cursor.fetchall(), as_json)
    finally:
        connection.close()
129
+
130
+
131
@analyze.command()
@click.option("--db", default=None, help="Override the database path.")
@click.option("--json", "as_json", is_flag=True, help="Output as JSON.")
def projects(db: str | None, as_json: bool) -> None:
    """Per-project conversation counts (Claude Code)."""
    # The docstring above is also the command's --help text.
    # The query has no template fields, so it runs exactly as loaded.
    statement = load_query("projects")

    connection = connect(resolve_db_path(db))
    try:
        cursor = connection.execute(statement)
        headers = [column[0] for column in cursor.description]
        _output(headers, cursor.fetchall(), as_json)
    finally:
        connection.close()
chatstrata/analysis/queries/activity.sql ADDED
@@ -0,0 +1,12 @@
1
-- Message and distinct-conversation counts per source and time bucket.
-- The bucket size and the optional source condition are template fields
-- that the caller fills in before executing the statement.
SELECT
    s.id AS source,
    date_trunc('{granularity}', m.created_at) AS period,
    COUNT(*) AS messages,
    COUNT(DISTINCT m.conversation_id) AS conversations
FROM messages AS m
INNER JOIN conversations AS c ON m.conversation_id = c.id
INNER JOIN sources AS s ON c.source_id = s.id
WHERE m.created_at IS NOT NULL
    {source_filter}
GROUP BY source, period
ORDER BY period DESC, source
chatstrata/analysis/queries/conversations.sql ADDED
@@ -0,0 +1,11 @@
1
-- Conversations ranked by message count.
-- The sort direction and the row limit are template fields that the caller
-- fills in before executing the statement.
SELECT
    c.title,
    s.id AS source,
    c.project,
    c.message_count,
    c.started_at,
    c.ended_at
FROM conversations AS c
INNER JOIN sources AS s ON c.source_id = s.id
ORDER BY c.message_count {order}
LIMIT {limit}
chatstrata/analysis/queries/example_queries.sql ADDED
@@ -0,0 +1,129 @@
1
+ -- Example chatstrata queries. Use as: chatstrata query "$(cat path/to/query.sql)"
2
+ -- or paste into a DuckDB shell connected to your chatstrata.duckdb file.
3
+
4
-- ============================================================
-- 1. Activity over time
-- ============================================================

-- Messages per month, by source
SELECT
    s.id AS source,
    date_trunc('month', m.created_at) AS month,
    COUNT(*) AS messages
FROM messages AS m
INNER JOIN conversations AS c ON m.conversation_id = c.id
INNER JOIN sources AS s ON c.source_id = s.id
WHERE m.created_at IS NOT NULL
GROUP BY source, month
ORDER BY month DESC, source;

-- ============================================================
-- 2. Conversation length distribution
-- ============================================================

-- Buckets are checked in order, so each conversation lands in the first
-- range whose upper bound it is below.
SELECT
    CASE
        WHEN message_count < 5 THEN 'very short (< 5)'
        WHEN message_count < 20 THEN 'short (5-19)'
        WHEN message_count < 50 THEN 'medium (20-49)'
        ELSE 'long (50+)'
    END AS length_bucket,
    COUNT(*) AS conversations
FROM conversations
GROUP BY length_bucket
ORDER BY conversations DESC;

-- ============================================================
-- 3. Every bash command via Claude Code, grouped by project
-- ============================================================

-- The command text is read from the tool-use block's JSON payload.
SELECT
    c.project,
    cb.payload->'input'->>'command' AS command,
    COUNT(*) AS times_run
FROM content_blocks AS cb
INNER JOIN messages AS m ON cb.message_id = m.id
INNER JOIN conversations AS c ON m.conversation_id = c.id
WHERE c.source_id = 'claude_code'
  AND cb.type = 'tool_use'
  AND cb.tool_name = 'Bash'
GROUP BY c.project, command
ORDER BY times_run DESC
LIMIT 50;

-- ============================================================
-- 4. Tool usage frequency
-- ============================================================

SELECT
    tool_name,
    COUNT(*) AS calls,
    COUNT(DISTINCT conversation_id) AS conversations
FROM tool_calls
GROUP BY tool_name
ORDER BY calls DESC;
65
+
66
-- ============================================================
-- 5. Word count of user prompts over time
-- ============================================================

-- Rough proxy for "how have I been prompting".
-- Words are approximated as space-separated runs: the number of spaces
-- plus one. Counting spaces alone reports one word too few for every
-- non-blank prompt. Blank text counts as zero words. Runs of several
-- spaces and newline-separated words are still not handled exactly.
SELECT
    date_trunc('week', m.created_at) AS week,
    AVG(
        CASE
            WHEN trim(cb.text) = '' THEN 0
            ELSE length(cb.text) - length(replace(cb.text, ' ', '')) + 1
        END
    ) AS avg_word_count
FROM content_blocks cb
JOIN messages m ON m.id = cb.message_id
WHERE m.role = 'user'
  AND cb.type = 'text'
  AND cb.text IS NOT NULL
GROUP BY week
ORDER BY week;
81
+
82
-- ============================================================
-- 6. Recent conversations matching a keyword
-- ============================================================

-- A conversation matches when any content block in any of its messages
-- contains the keyword, compared case-insensitively.
SELECT
    c.started_at,
    s.id AS source,
    c.project,
    c.title
FROM conversations AS c
INNER JOIN sources AS s ON c.source_id = s.id
WHERE EXISTS (
    SELECT 1
    FROM messages AS m
    INNER JOIN content_blocks AS cb ON m.id = cb.message_id
    WHERE m.conversation_id = c.id
      AND cb.text ILIKE '%grandma%'  -- replace with your keyword
)
ORDER BY c.started_at DESC
LIMIT 20;

-- ============================================================
-- 7. Most-used models
-- ============================================================

SELECT
    model,
    COUNT(*) AS messages
FROM messages
WHERE model IS NOT NULL
GROUP BY model
ORDER BY messages DESC;

-- ============================================================
-- 8. Thinking blocks: how often does the model reason explicitly?
-- ============================================================

-- The LEFT JOIN keeps assistant messages that have no content blocks, so
-- they still count in the denominator. pct is thinking blocks per 100
-- assistant messages; NULLIF guards the division against a zero count.
SELECT
    date_trunc('month', m.created_at) AS month,
    SUM(CASE WHEN cb.type = 'thinking' THEN 1 ELSE 0 END) AS thinking_blocks,
    COUNT(DISTINCT m.id) AS assistant_messages,
    ROUND(
        100.0 * SUM(CASE WHEN cb.type = 'thinking' THEN 1 ELSE 0 END)
            / NULLIF(COUNT(DISTINCT m.id), 0),
        1
    ) AS pct
FROM messages AS m
LEFT JOIN content_blocks AS cb ON m.id = cb.message_id
WHERE m.role = 'assistant'
GROUP BY month
ORDER BY month DESC;
chatstrata/analysis/queries/models.sql ADDED
@@ -0,0 +1,8 @@
1
-- Message and distinct-conversation counts per model, most-used first.
-- Messages with no recorded model are left out.
SELECT
    m.model,
    COUNT(*) AS messages,
    COUNT(DISTINCT m.conversation_id) AS conversations
FROM messages AS m
WHERE m.model IS NOT NULL
GROUP BY m.model
ORDER BY messages DESC
chatstrata/analysis/queries/projects.sql ADDED
@@ -0,0 +1,11 @@
1
-- Per-project totals for the claude_code source only.
-- Conversations with no project are left out.
SELECT
    c.project,
    COUNT(*) AS conversations,
    SUM(c.message_count) AS total_messages,
    MIN(c.started_at) AS earliest,
    MAX(c.ended_at) AS latest
FROM conversations AS c
WHERE c.project IS NOT NULL
  AND c.source_id = 'claude_code'
GROUP BY c.project
ORDER BY conversations DESC
chatstrata/analysis/queries/tools.sql ADDED
@@ -0,0 +1,9 @@
1
-- Call and distinct-conversation counts per tool, most-called first.
-- The optional source condition is a template field that the caller fills
-- in before executing the statement.
SELECT
    tool_name,
    COUNT(*) AS calls,
    COUNT(DISTINCT conversation_id) AS conversations
FROM tool_calls
WHERE tool_name IS NOT NULL
    {source_filter}
GROUP BY tool_name
ORDER BY calls DESC