mem-context 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """mem-context — Temporal Memory MCP Server."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,196 @@
1
+ """Capture importer — converts conversation transcripts into memories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sys
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from ..retrieval.embedder import Embedder
12
+ from ..scope import detect_scope
13
+ from ..storage.lance import LanceMemoryStore
14
+ from .formats import parse_transcript
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class CaptureImporter:
    """Import conversations from various sources into the memory store.

    Every entry point normalizes its input to the universal
    ``{"messages": [...], "metadata": {...}}`` dict and delegates to
    :meth:`from_json`, which archives the conversation and stores one
    episodic memory per message chunk.
    """

    def __init__(self, store: LanceMemoryStore, embedder: Embedder):
        self._store = store
        self._embedder = embedder

    async def from_json(self, data: dict[str, Any], scope: str | None = None) -> list[str]:
        """Import from structured JSON (universal format).

        Expected format:
            {
                "messages": [
                    {"role": "user", "content": "..."},
                    {"role": "assistant", "content": "..."}
                ],
                "metadata": {"client": "claude-code", "session_id": "..."}
            }

        Args:
            data: Conversation dict in the format above. A missing or null
                ``metadata`` entry is treated as empty.
            scope: Scope override. Auto-detected if None.

        Returns:
            List of created memory IDs (empty when there are no messages).
        """
        messages = data.get("messages", [])
        if not messages:
            logger.warning("Capture: no messages to import")
            return []

        # ``or {}`` also covers an explicit ``"metadata": null``.
        metadata = data.get("metadata") or {}
        session_id = metadata.get("session_id", "")

        # Scope detection does not depend on the chunk, so run it once
        # instead of once per stored chunk.
        detected = detect_scope()
        if scope is None:
            scope = detected.scope
        scope_name = detected.scope_name

        # 1. Store full conversation in archive
        conv_id = await self._store.put_conversation({
            "scope": scope,
            "date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            "full_text": _messages_to_text(messages),
            "summary": _auto_summary(messages),
            "messages": messages,
        })
        logger.debug("Conversation archived: %s", conv_id)

        # 2. Chunking: one episodic memory per group of messages
        created: list[str] = []
        for chunk in _chunk_by_message_group(messages):
            chunk_text = _messages_to_text(chunk)
            chunk_summary = _auto_summary(chunk)

            # Embed the summary plus the first 1000 characters of the chunk.
            embedding = await self._embedder.embed_query(chunk_summary + " " + chunk_text[:1000])

            mid = await self._store.put({
                "type": "episodic",
                "scope": scope,
                "scope_name": scope_name,
                "summary": chunk_summary,
                "content": chunk_text,
                "weight": 0.5,
                "tags": _detect_tags(chunk_text),
                "embedding": embedding,
                "source_session": session_id,
            })
            created.append(mid)

        logger.info("Capture: %d messages → %d chunks stored (conv=%s)", len(messages), len(created), conv_id)
        return created

    async def from_transcript(self, path: str, client: str = "generic", scope: str | None = None) -> list[str]:
        """Import from a transcript file according to client format.

        Args:
            path: Path to transcript file (read as UTF-8).
            client: Client identifier (claude-code, opencode, generic, json).
            scope: Scope override. Auto-detected if None.

        Returns:
            List of created memory IDs.
        """
        text = Path(path).read_text(encoding="utf-8")
        data = parse_transcript(text, client)
        self._stamp_metadata(data, client=client, source_file=str(path))
        return await self.from_json(data, scope=scope)

    async def from_pipe(self, scope: str | None = None, *, client: str = "generic") -> list[str]:
        """Import from stdin — universal fallback.

        Args:
            scope: Scope override. Auto-detected if None.
            client: Client identifier used to pick the transcript parser.

        Returns:
            List of created memory IDs.
        """
        text = sys.stdin.read()
        data = parse_transcript(text, client)
        self._stamp_metadata(data, client=client, source="pipe")
        return await self.from_json(data, scope=scope)

    @staticmethod
    def _stamp_metadata(data: dict[str, Any], **fields: str) -> None:
        """Write ``fields`` into ``data["metadata"]``, creating it if needed.

        A missing or non-dict ``metadata`` entry (for example JSON ``null``)
        is replaced by a fresh dict so the assignment cannot fail.
        """
        metadata = data.get("metadata")
        if not isinstance(metadata, dict):
            metadata = data["metadata"] = {}
        metadata.update(fields)
118
+
119
+
120
+ # ── Helpers ──
121
+
122
+
123
+ def _messages_to_text(messages: list[dict[str, str]]) -> str:
124
+ """Convert messages to a single text block."""
125
+ parts = []
126
+ for msg in messages:
127
+ role = msg.get("role", "unknown").capitalize()
128
+ content = msg.get("content", "")
129
+ parts.append(f"{role}: {content}")
130
+ return "\n\n".join(parts)
131
+
132
+
133
+ def _auto_summary(messages: list[dict[str, str]], max_len: int = 160) -> str:
134
+ """Generate auto-summary from first user message or combined messages."""
135
+ for msg in messages:
136
+ if msg.get("role") == "user":
137
+ text = msg.get("content", "").replace("\n", " ")
138
+ if len(text) > max_len:
139
+ return text[:max_len - 3] + "..."
140
+ return text
141
+
142
+ # Fallback: combined first exchange
143
+ parts = []
144
+ for msg in messages[:2]:
145
+ text = msg.get("content", "").replace("\n", " ")
146
+ parts.append(text[:80])
147
+
148
+ combined = " | ".join(parts)
149
+ if len(combined) > max_len:
150
+ combined = combined[:max_len - 3] + "..."
151
+ return combined or "Untitled session"
152
+
153
+
154
+ def _chunk_by_message_group(messages: list[dict[str, str]], max_per_chunk: int = 15) -> list[list[dict[str, str]]]:
155
+ """Split messages into chunks by conversation turns.
156
+
157
+ Simple heuristic: group consecutive user+assistant pairs.
158
+ For better chunking, use host model or Ollama via ConsolidationPipeline.
159
+ """
160
+ if len(messages) <= max_per_chunk:
161
+ return [messages]
162
+
163
+ chunks = []
164
+ for i in range(0, len(messages), max_per_chunk):
165
+ chunk = messages[i:i + max_per_chunk]
166
+ chunks.append(chunk)
167
+
168
+ return chunks
169
+
170
+
171
+ def _detect_tags(text: str) -> list[str]:
172
+ """Auto-detect tags from conversation content."""
173
+ tags = []
174
+ text_lower = text.lower()
175
+
176
+ # Simple keyword-based tag detection
177
+ tag_keywords = {
178
+ "architecture": ["architecture", "design pattern", "microservice", "monolith"],
179
+ "debugging": ["debug", "memory leak", "segfault", "stack trace", "error"],
180
+ "testing": ["test", "pytest", "unit test", "integration test", "mock"],
181
+ "git": ["git", "commit", "branch", "merge", "rebase"],
182
+ "database": ["database", "sql", "sqlite", "postgres", "mysql"],
183
+ "api": ["api", "rest", "graphql", "endpoint", "http"],
184
+ "python": ["python", "pip", "venv", "asyncio", "pydantic"],
185
+ "rust": ["rust", "cargo", "borrow", "lifetime", "trait"],
186
+ "c++": ["c++", "cpp", "cmake", "template", "pointer"],
187
+ "refactoring": ["refactor", "clean up", "simplify", "extract method"],
188
+ }
189
+
190
+ for tag, keywords in tag_keywords.items():
191
+ for kw in keywords:
192
+ if kw in text_lower:
193
+ tags.append(tag)
194
+ break
195
+
196
+ return tags[:5] # Max 5 auto-tags
@@ -0,0 +1,319 @@
1
+ """Per-client transcript format parsers.
2
+
3
+ Each parser takes raw transcript text and returns a standardized dict:
4
+ {
5
+ "messages": [{"role": "user"|"assistant", "content": "..."}, ...],
6
+ "metadata": {"client": "claude-code"|"opencode"|"generic", "session_id": "..."},
7
+ }
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import re
14
+ from typing import Any
15
+
16
+
17
def parse_transcript(text: str, client: str) -> dict[str, Any]:
    """Dispatch a raw transcript to the parser registered for ``client``.

    Args:
        text: Raw transcript file content.
        client: Client identifier (claude-code, opencode, generic, json).
            Identifiers missing from ``PARSERS`` use the generic parser.

    Returns:
        Standardized dict with messages and metadata.
    """
    if client in PARSERS:
        return PARSERS[client](text)
    return parse_generic(text)
29
+
30
+
31
def parse_json(text: str) -> dict[str, Any]:
    """Parse JSON transcript (already standardized).

    Args:
        text: JSON document whose top level is an object with a
            ``messages`` key.

    Returns:
        The parsed dict. ``metadata`` is guaranteed to be a dict and
        ``metadata["client"]`` defaults to ``"json"``.

    Raises:
        json.JSONDecodeError: If ``text`` is not valid JSON.
        ValueError: If the document is not an object with a ``messages``
            key. This includes scalars, strings, arrays and ``null``.
    """
    data = json.loads(text)
    # Check the type first: ``"messages" in data`` raises TypeError on
    # numbers/None and silently does a substring or element test on
    # strings and lists.
    if not isinstance(data, dict) or "messages" not in data:
        raise ValueError("JSON transcript must have 'messages' key")

    metadata = data.get("metadata")
    if not isinstance(metadata, dict):
        # Missing or null metadata: start from an empty dict.
        metadata = data["metadata"] = {}
    metadata.setdefault("client", "json")
    return data
39
+
40
+
41
def parse_generic(text: str) -> dict[str, Any]:
    """Generic parser — tries to detect format from content.

    Heuristics:
    1. Try JSON first
    2. Look for "User:" and "Assistant:" prefixes
    3. Fallback: single assistant message with full text

    Args:
        text: Raw transcript content.

    Returns:
        Standardized dict with messages and metadata; ``metadata["pattern"]``
        is ``"role_prefix"`` or ``"fallback"`` for the text-based paths.
    """
    # Try JSON. TypeError/AttributeError are caught as well so that text
    # which is valid JSON but not a transcript object (e.g. "42" or "null")
    # falls through to the text heuristics instead of crashing.
    try:
        return parse_json(text)
    except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
        pass

    # Try "Role: content" pattern
    messages = _parse_role_prefix(text, r"(?m)^(User|Assistant|Human|AI|System):\s*(.*)")
    if len(messages) >= 2:
        return {
            "messages": messages,
            "metadata": {"client": "generic", "pattern": "role_prefix"},
        }

    # Fallback: entire text as one assistant message
    return {
        "messages": [{"role": "assistant", "content": text.strip()}],
        "metadata": {"client": "generic", "pattern": "fallback"},
    }
68
+
69
+
70
def parse_claude_code(text: str) -> dict[str, Any]:
    """Parse Claude Code transcript format.

    Claude Code transcripts are JSONL (one JSON object per line) with
    ``type`` discriminator fields. We extract ``user`` and ``assistant``
    entries and convert them to the standardized message format.
    Falls back to plain JSON, role-prefix, and generic parsing.

    Args:
        text: Raw transcript content.

    Returns:
        Standardized dict with messages and metadata.
    """
    # 1 ── JSONL (native Claude Code transcript) ──
    messages = _try_jsonl(text)
    if messages:
        return {
            "messages": messages,
            "metadata": {"client": "claude-code", "pattern": "jsonl"},
        }

    # 2 ── Plain JSON ──
    try:
        data = json.loads(text)
    except (json.JSONDecodeError, ValueError):
        data = None

    # Type checks come first: a JSON scalar, string or null must fall
    # through to the text parsers rather than raise TypeError.
    if isinstance(data, dict) and "messages" in data:
        if not isinstance(data.get("metadata"), dict):
            data["metadata"] = {}
        data["metadata"]["client"] = "claude-code"
        return data

    # Might be a different JSON structure — try extracting
    if isinstance(data, list):
        messages = []
        for entry in data:
            if not isinstance(entry, dict):
                # Only objects can carry role/content fields.
                continue
            role = entry.get("role", entry.get("type", "assistant"))
            content = entry.get("content", entry.get("text", str(entry)))
            messages.append({"role": _normalize_role(str(role)), "content": str(content)})
        if messages:
            return {"messages": messages, "metadata": {"client": "claude-code"}}

    # 3 ── Role-prefix text ──
    # The role must be a capturing group: _parse_role_prefix reads the role
    # from group 1 and the first content line from group 2.
    messages = _parse_role_prefix(text, r"(?m)^(User|Assistant|Human|Claude):\s*(.*)")
    if len(messages) >= 2:
        return {
            "messages": messages,
            "metadata": {"client": "claude-code", "pattern": "role_prefix"},
        }

    # 4 ── Fallback to generic ──
    return parse_generic(text)
115
+
116
+
117
def parse_opencode(text: str) -> dict[str, Any]:
    """Parse OpenCode transcript format.

    Tries, in order: a JSON object with a ``messages`` key, role-prefixed
    text ("User:", "Assistant:", "System:"), then the generic parser.

    Args:
        text: Raw transcript content.

    Returns:
        Standardized dict with messages and metadata.
    """
    # Try JSON first
    try:
        data = json.loads(text)
    except (json.JSONDecodeError, ValueError):
        data = None

    # Only a JSON object can be a transcript; scalars, strings, arrays and
    # null fall through to the text parsers instead of raising TypeError.
    if isinstance(data, dict) and "messages" in data:
        if not isinstance(data.get("metadata"), dict):
            data["metadata"] = {}
        data["metadata"]["client"] = "opencode"
        return data

    # OpenCode text format. The role must be a capturing group:
    # _parse_role_prefix reads the role from group 1 and the first content
    # line from group 2.
    messages = _parse_role_prefix(text, r"(?m)^(User|Assistant|System):\s*(.*)")
    if len(messages) >= 2:
        return {
            "messages": messages,
            "metadata": {"client": "opencode", "pattern": "role_prefix"},
        }

    return parse_generic(text)
141
+
142
+
143
+ # ── Parser registry ──
144
+
145
# Registry consulted by parse_transcript: client identifier → parser
# function taking the raw transcript text.
PARSERS: dict[str, Any] = dict(
    [
        ("claude-code", parse_claude_code),
        ("opencode", parse_opencode),
        ("generic", parse_generic),
        ("json", parse_json),
    ]
)
151
+
152
+
153
+ # ── Helpers ──
154
+
155
+
156
+ def _try_jsonl(text: str) -> list[dict[str, str]]:
157
+ """Try to parse text as JSONL (one JSON object per line).
158
+
159
+ Detects Claude Code's native transcript format where each line is a
160
+ JSON object with a ``type`` discriminator (``user``, ``assistant``,
161
+ ``system``). Returns standardized message list or empty list.
162
+ """
163
+ lines = [l for l in text.split("\n") if l.strip()]
164
+ if not lines:
165
+ return []
166
+
167
+ # Heuristic: at least 2 lines must parse as JSON for JSONL detection
168
+ parsed = []
169
+ for line in lines:
170
+ try:
171
+ obj = json.loads(line)
172
+ parsed.append(obj)
173
+ except (json.JSONDecodeError, ValueError):
174
+ # Each line must be valid JSON for JSONL format
175
+ return []
176
+
177
+ if len(parsed) < 2:
178
+ return []
179
+
180
+ # Must have at least one recognized message type
181
+ msg_types = {"user", "assistant", "system"}
182
+ if not any(obj.get("type") in msg_types for obj in parsed):
183
+ return []
184
+
185
+ messages: list[dict[str, str]] = []
186
+ for obj in parsed:
187
+ typ = obj.get("type", "")
188
+ if typ not in msg_types:
189
+ continue
190
+
191
+ inner = obj.get("message", {})
192
+ role = _normalize_role(inner.get("role", typ))
193
+ content = _extract_jsonl_content(inner.get("content", ""))
194
+ if not content:
195
+ continue
196
+ messages.append({"role": role, "content": content})
197
+
198
+ return messages
199
+
200
+
201
+ def _extract_jsonl_content(content: Any) -> str:
202
+ """Extract text from Claude Code JSONL content (string or content-block array).
203
+
204
+ Claude Code assistant messages use a content-block array:
205
+ [{"type": "text", "text": "..."}, {"type": "tool_use", ...}, ...]
206
+ User messages use a plain string.
207
+ """
208
+ if isinstance(content, str):
209
+ return content.strip()
210
+
211
+ if isinstance(content, list):
212
+ parts: list[str] = []
213
+ for block in content:
214
+ if not isinstance(block, dict):
215
+ continue
216
+ btype = block.get("type", "")
217
+ if btype == "text":
218
+ t = block.get("text", "")
219
+ if t:
220
+ parts.append(t.strip())
221
+ elif btype == "thinking":
222
+ # Thinking blocks are internal reasoning — include but de-emphasize
223
+ t = block.get("thinking", "")
224
+ if t:
225
+ parts.append(f"[thinking] {t.strip()}")
226
+ elif btype == "tool_use":
227
+ # Summarize tool calls: name + brief input
228
+ name = block.get("name", "tool")
229
+ inp = block.get("input", {})
230
+ if isinstance(inp, dict):
231
+ brief = _summarize_tool_input(name, inp)
232
+ parts.append(f"[tool_call: {brief}]")
233
+ else:
234
+ parts.append(f"[tool_call: {name}]")
235
+ elif btype == "tool_result":
236
+ # Skip large tool results, just note them
237
+ parts.append("[tool_result]")
238
+ # Skip other block types (tool_use, tool_result, etc.)
239
+ return "\n".join(parts) if parts else ""
240
+
241
+ return str(content).strip()
242
+
243
+
244
+ def _summarize_tool_input(name: str, inp: dict) -> str:
245
+ """Create a brief summary of a tool call for embedding."""
246
+ # Keep first key-value pair for context
247
+ brief_parts = [name]
248
+ for k, v in inp.items():
249
+ if isinstance(v, str):
250
+ brief_parts.append(f"{k}={v[:80]}")
251
+ else:
252
+ brief_parts.append(f"{k}=...")
253
+ if len(brief_parts) >= 3:
254
+ break
255
+ return " ".join(brief_parts)
256
+
257
+
258
+ def _parse_role_prefix(text: str, pattern: str) -> list[dict[str, str]]:
259
+ """Parse text where each turn starts with a role prefix.
260
+
261
+ Args:
262
+ text: Raw transcript.
263
+ pattern: Regex with named groups or capturing role+content pairs.
264
+
265
+ Returns:
266
+ List of message dicts.
267
+ """
268
+ messages = []
269
+ current_role = None
270
+ current_content: list[str] = []
271
+
272
+ for line in text.split("\n"):
273
+ match = re.match(pattern, line)
274
+ if match:
275
+ # Save previous message
276
+ if current_role and current_content:
277
+ messages.append({
278
+ "role": _normalize_role(current_role),
279
+ "content": "\n".join(current_content).strip(),
280
+ })
281
+ # Start new message
282
+ if len(match.groups()) >= 2:
283
+ current_role = match.group(1)
284
+ current_content = [match.group(2)]
285
+ elif len(match.groups()) == 1:
286
+ # Pattern matched role only, content in next lines?
287
+ current_role = match.group(1)
288
+ current_content = []
289
+ else:
290
+ current_role = "assistant"
291
+ current_content = [line]
292
+ else:
293
+ if current_role:
294
+ current_content.append(line)
295
+ elif line.strip():
296
+ # No role detected yet — treat as assistant
297
+ current_role = "assistant"
298
+ current_content.append(line)
299
+
300
+ # Save last message
301
+ if current_role and current_content:
302
+ messages.append({
303
+ "role": _normalize_role(current_role),
304
+ "content": "\n".join(current_content).strip(),
305
+ })
306
+
307
+ return messages
308
+
309
+
310
+ def _normalize_role(role: str) -> str:
311
+ """Normalize role to user/assistant/system."""
312
+ role = role.strip().lower()
313
+ if role in ("user", "human", "u"):
314
+ return "user"
315
+ if role in ("assistant", "ai", "claude", "bot", "a"):
316
+ return "assistant"
317
+ if role in ("system", "sys", "s"):
318
+ return "system"
319
+ return "assistant"
@@ -0,0 +1,111 @@
1
+ """mem-context-capture-cc — capture Claude Code transcript into mem-context.
2
+
3
+ Entry point called by Claude Code hooks (Stop, SessionEnd). Locates the
4
+ current session's JSONL transcript and feeds it to ``mem-context capture``.
5
+
6
+ Tries ``$CLAUDE_TRANSCRIPT_FILE`` first. If unset or missing, looks under
7
+ ``~/.claude/projects/<project-slug>/`` for the most recent JSONL transcript.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import shutil
14
+ import subprocess
15
+ import sys
16
+ from pathlib import Path
17
+
18
+
19
+ def _find_mem_context_bin() -> str | None:
20
+ """Return the absolute path to the ``mem-context`` CLI binary.
21
+
22
+ Resolution order:
23
+ 1. ``~/.mem-context/.venv/bin/mem-context`` (standalone install)
24
+ 2. Same directory as this script (pip-installed sibling)
25
+ 3. ``mem-context`` in PATH
26
+ """
27
+ # 1) Standalone install (primary location)
28
+ standalone = Path.home() / ".mem-context" / ".venv" / "bin" / "mem-context"
29
+ if standalone.is_file():
30
+ return str(standalone)
31
+
32
+ # 2) Sibling — same directory as this script.
33
+ # Prefer __file__: when invoked by bare name from PATH, sys.argv[0]
34
+ # is just "mem-context-capture-cc" and Path resolves relative to CWD.
35
+ # When the hook uses an absolute path (set by the installer),
36
+ # sys.argv[0] is absolute — but __file__ is always correct.
37
+ script_dir = Path(__file__).resolve().parent
38
+ sibling = script_dir / "mem-context"
39
+ if sibling.is_file():
40
+ return str(sibling)
41
+
42
+ # 3) PATH
43
+ in_path = shutil.which("mem-context")
44
+ if in_path:
45
+ return in_path
46
+
47
+ return None
48
+
49
+
50
+ def _find_transcript() -> str | None:
51
+ """Locate the Claude Code transcript JSONL file.
52
+
53
+ Returns the path to the transcript, or None if not found.
54
+ """
55
+ # 1) Env-var provided by Claude Code hooks (Stop / SessionEnd)
56
+ transcript = os.environ.get("CLAUDE_TRANSCRIPT_FILE", "")
57
+ if transcript and os.path.isfile(transcript):
58
+ return transcript
59
+
60
+ # 2) Fallback — find latest transcript for current project
61
+ cwd = os.environ.get("CLAUDE_PROJECT_DIR", os.getcwd())
62
+ # Project slug = absolute path with '/' → '-' (bash: ${cwd//\//-})
63
+ slug = cwd.replace("/", "-")
64
+ proj_dir = Path.home() / ".claude" / "projects" / slug
65
+ if proj_dir.is_dir():
66
+ # SessionEnd: use $CLAUDE_SESSION_ID if available
67
+ session_id = os.environ.get("CLAUDE_SESSION_ID", "")
68
+ if session_id:
69
+ session_file = proj_dir / f"{session_id}.jsonl"
70
+ if session_file.is_file():
71
+ return str(session_file)
72
+
73
+ # Last resort: most recent .jsonl (exclude directories)
74
+ jsonl_files = sorted(
75
+ (p for p in proj_dir.glob("*.jsonl") if p.is_file()),
76
+ key=lambda p: p.stat().st_mtime,
77
+ reverse=True,
78
+ )
79
+ if jsonl_files:
80
+ return str(jsonl_files[0])
81
+
82
+ return None
83
+
84
+
85
def main() -> None:
    """Run capture — locate the transcript and hand it to ``mem-context``.

    Runs ``mem-context capture transcript <file> --client claude-code`` as
    a child process that inherits this process's stdout/stderr. Every
    failure is reported on stderr only: the process always exits with
    status 0 so a capture problem never breaks the hook chain.
    """
    mem_bin = _find_mem_context_bin()
    if not mem_bin:
        print("mem-context-capture-cc: mem-context binary not found", file=sys.stderr)
        print("  Install with: pip install --user mem-context", file=sys.stderr)
        print("  Or ensure ~/.mem-context/.venv/bin/ is in PATH", file=sys.stderr)
        sys.exit(0)  # Not a hard error — don't break the hook chain

    transcript = _find_transcript()
    if not transcript:
        print("mem-context-capture-cc: no transcript found", file=sys.stderr)
        sys.exit(0)  # Not a hard error

    try:
        result = subprocess.run(
            [mem_bin, "capture", "transcript", transcript, "--client", "claude-code"],
            check=False,
        )
    except OSError as exc:
        # The binary can be unlaunchable (not executable, removed since it
        # was located). Report it, but keep the exit-0 contract.
        print(f"mem-context-capture-cc: failed to run {mem_bin}: {exc}", file=sys.stderr)
        sys.exit(0)

    if result.returncode != 0:
        print(f"mem-context-capture-cc: capture exited with code {result.returncode}", file=sys.stderr)

    sys.exit(0)  # Always exit 0 — don't break the hook chain
108
+
109
+
110
+ if __name__ == "__main__":
111
+ main()