oghma-0.0.1-py3-none-any.whl → oghma-0.3.0-py3-none-any.whl

oghma/daemon.py ADDED
@@ -0,0 +1,198 @@
+ import logging
+ import os
+ import signal
+ import sys
+ import time
+ from pathlib import Path
+
+ from oghma.config import Config
+ from oghma.extractor import Extractor
+ from oghma.parsers import get_parser_for_file
+ from oghma.storage import Storage
+ from oghma.watcher import Watcher
+
+ logger = logging.getLogger(__name__)
+
+
+ class Daemon:
+     """Main daemon for Oghma memory extraction."""
+
+     def __init__(self, config: Config):
+         self.config = config
+         self.storage = Storage(config=config)
+         self.watcher = Watcher(config, self.storage)
+         self.extractor = Extractor(config)
+         self._setup_logging()
+         self._running = False
+
+     def _setup_logging(self) -> None:
+         """Setup logging configuration."""
+         log_file = self.config["daemon"]["log_file"]
+         log_level = self.config["daemon"]["log_level"]
+         Path(log_file).parent.mkdir(parents=True, exist_ok=True)
+
+         logging.basicConfig(
+             level=getattr(logging, log_level),
+             format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+             handlers=[
+                 logging.FileHandler(log_file),
+                 logging.StreamHandler(sys.stdout),
+             ],
+         )
+
+     def _signal_handler(self, signum: int, frame) -> None:
+         """Handle shutdown signals gracefully."""
+         logger.info(f"Received signal {signum}, shutting down...")
+         self._running = False
+
+     def start(self) -> None:
+         """Start the daemon main loop."""
+         signal.signal(signal.SIGINT, self._signal_handler)
+         signal.signal(signal.SIGTERM, self._signal_handler)
+
+         pid_file = self.config["daemon"]["pid_file"]
+         self._write_pid_file(pid_file)
+
+         try:
+             self._running = True
+             poll_interval = self.config["daemon"]["poll_interval"]
+
+             logger.info("Oghma daemon started")
+             logger.info(f"Poll interval: {poll_interval} seconds")
+
+             while self._running:
+                 try:
+                     self._run_cycle()
+                 except Exception as e:
+                     logger.error(f"Error in extraction cycle: {e}", exc_info=True)
+
+                 for _ in range(poll_interval):
+                     if not self._running:
+                         break
+                     time.sleep(1)
+
+         finally:
+             self._cleanup(pid_file)
+
+     def _run_cycle(self) -> None:
+         """Run one extraction cycle."""
+         logger.debug("Starting extraction cycle")
+         changed_files = self.watcher.get_changed_files()
+
+         if not changed_files:
+             logger.debug("No changed files found")
+             return
+
+         logger.info(f"Processing {len(changed_files)} changed files")
+
+         for file_path in changed_files:
+             self._process_file(file_path)
+
+     def _process_file(self, file_path: Path) -> None:
+         """Process a single file: parse, extract, and save memories."""
+         logger.info(f"Processing file: {file_path}")
+
+         try:
+             parser = get_parser_for_file(file_path)
+             if not parser:
+                 logger.warning(f"No parser found for {file_path}")
+                 return
+
+             messages = parser.parse(file_path)
+
+             if not self.watcher.should_process(messages):
+                 logger.debug(f"Skipping {file_path}: not enough messages")
+                 return
+
+             source_tool = self._get_tool_name(file_path)
+             memories = self.extractor.extract(messages, source_tool)
+
+             mtime = file_path.stat().st_mtime
+             size = file_path.stat().st_size
+             source_session = self._get_session_id(file_path)
+
+             for memory in memories:
+                 self.storage.add_memory(
+                     content=memory.content,
+                     category=memory.category,
+                     source_tool=source_tool,
+                     source_file=str(file_path),
+                     source_session=source_session,
+                     confidence=memory.confidence,
+                 )
+
+             self.storage.update_extraction_state(str(file_path), mtime, size, len(messages))
+
+             self.storage.log_extraction(
+                 source_path=str(file_path),
+                 memories_extracted=len(memories),
+             )
+
+             logger.info(f"Extracted {len(memories)} memories from {file_path}")
+
+         except Exception as e:
+             logger.error(f"Failed to process {file_path}: {e}", exc_info=True)
+             self.storage.log_extraction(source_path=str(file_path), error=str(e))
+
+     def _get_tool_name(self, file_path: Path) -> str:
+         """Extract tool name from file path."""
+         path_str = str(file_path)
+
+         if ".claude" in path_str:
+             return "claude_code"
+         elif ".codex" in path_str:
+             return "codex"
+         elif ".openclaw" in path_str:
+             return "openclaw"
+         elif ".local/share/opencode" in path_str or "opencode" in path_str:
+             return "opencode"
+         else:
+             return "unknown"
+
+     def _get_session_id(self, file_path: Path) -> str | None:
+         """Extract session ID from file path if possible."""
+         parts = file_path.parts
+         for part in reversed(parts):
+             if part.startswith("ses_") or part.startswith("rollout-"):
+                 return part
+         return None
+
+     def _write_pid_file(self, pid_file: str) -> None:
+         """Write current PID to lock file."""
+         Path(pid_file).parent.mkdir(parents=True, exist_ok=True)
+         with open(pid_file, "w") as f:
+             f.write(str(os.getpid()))
+
+     def _cleanup(self, pid_file: str) -> None:
+         """Cleanup on shutdown."""
+         logger.info("Oghma daemon stopped")
+         pid_path = Path(pid_file)
+         if pid_path.exists():
+             pid_path.unlink()
+
+
+ def get_daemon_pid(pid_file: str) -> int | None:
+     """Read PID from lock file, check if process is running."""
+     pid_path = Path(pid_file)
+
+     if not pid_path.exists():
+         return None
+
+     try:
+         with open(pid_path) as f:
+             pid_str = f.read().strip()
+         pid = int(pid_str)
+
+         if pid > 0:
+             try:
+                 os.kill(pid, 0)
+                 return pid
+             except OSError:
+                 pid_path.unlink()
+                 return None
+
+     except (OSError, ValueError):
+         if pid_path.exists():
+             pid_path.unlink()
+
+     return None
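
Usage sketch for the new daemon module (illustrative, not part of the diff): `get_daemon_pid` doubles as a liveness check via `os.kill(pid, 0)`, so a launcher can avoid double-starting. Treating `Config()` as a no-argument constructor is an assumption; the diff only shows that `Config` supports `config["daemon"][...]` lookups.

```python
from oghma.config import Config
from oghma.daemon import Daemon, get_daemon_pid

config = Config()  # assumption: Config() loads defaults; the constructor is not shown in this diff
pid_file = config["daemon"]["pid_file"]

if get_daemon_pid(pid_file) is None:
    Daemon(config).start()  # blocks; SIGINT/SIGTERM trigger a clean shutdown and pid-file removal
else:
    print(f"oghma daemon already running (pid file: {pid_file})")
```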
oghma/embedder.py ADDED
@@ -0,0 +1,107 @@
+ import os
+ import time
+ from abc import ABC, abstractmethod
+ from collections.abc import Mapping
+ from dataclasses import dataclass
+ from typing import Any
+
+ from openai import APIError, OpenAI
+
+
+ @dataclass
+ class EmbedConfig:
+     provider: str = "openai"
+     model: str = "text-embedding-3-small"
+     dimensions: int = 1536
+     batch_size: int = 100
+     rate_limit_delay: float = 0.1
+     max_retries: int = 3
+
+     @classmethod
+     def from_dict(
+         cls,
+         data: Mapping[str, Any] | None = None,
+         **overrides: Any,
+     ) -> "EmbedConfig":
+         values = {
+             "provider": cls.provider,
+             "model": cls.model,
+             "dimensions": cls.dimensions,
+             "batch_size": cls.batch_size,
+             "rate_limit_delay": cls.rate_limit_delay,
+             "max_retries": cls.max_retries,
+         }
+         if data:
+             for key in values:
+                 if key in data:
+                     values[key] = data[key]
+         for key, value in overrides.items():
+             if key in values and value is not None:
+                 values[key] = value
+         return cls(**values)
+
+
+ class Embedder(ABC):
+     def __init__(self, config: EmbedConfig):
+         self.config = config
+
+     @abstractmethod
+     def embed(self, text: str) -> list[float]:
+         """Embed a single text string."""
+
+     @abstractmethod
+     def embed_batch(self, texts: list[str]) -> list[list[float]]:
+         """Embed multiple strings."""
+
+
+ class OpenAIEmbedder(Embedder):
+     BASE_RETRY_DELAY = 1.0
+
+     def __init__(self, config: EmbedConfig):
+         super().__init__(config)
+         api_key = os.environ.get("OPENAI_API_KEY")
+         if not api_key:
+             raise ValueError("OPENAI_API_KEY environment variable not set")
+         self.client = OpenAI(api_key=api_key)
+
+     def embed(self, text: str) -> list[float]:
+         return self.embed_batch([text])[0]
+
+     def embed_batch(self, texts: list[str]) -> list[list[float]]:
+         if not texts:
+             return []
+
+         vectors: list[list[float]] = []
+         for start in range(0, len(texts), self.config.batch_size):
+             chunk = texts[start : start + self.config.batch_size]
+             vectors.extend(self._embed_chunk(chunk))
+             if self.config.rate_limit_delay > 0:
+                 time.sleep(self.config.rate_limit_delay)
+         return vectors
+
+     def _embed_chunk(self, texts: list[str]) -> list[list[float]]:
+         last_error: Exception | None = None
+
+         for attempt in range(self.config.max_retries):
+             try:
+                 response = self.client.embeddings.create(
+                     model=self.config.model,
+                     input=texts,
+                     dimensions=self.config.dimensions,
+                 )
+                 return [list(item.embedding) for item in response.data]
+             except APIError as exc:  # pragma: no cover - covered by retry test
+                 last_error = exc
+                 if attempt < self.config.max_retries - 1:
+                     delay = self.BASE_RETRY_DELAY * (2**attempt)
+                     time.sleep(delay)
+
+         if last_error:
+             raise last_error
+         raise RuntimeError("Embedding request failed")
+
+
+ def create_embedder(config: EmbedConfig) -> Embedder:
+     if config.provider == "openai":
+         return OpenAIEmbedder(config)
+     raise ValueError(f"Unsupported embedding provider: {config.provider}")
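
A short sketch of how the new embedder API fits together (illustrative, based only on what this diff shows): `from_dict` layers a config mapping and keyword overrides over the dataclass defaults, `create_embedder` dispatches on `provider`, and `OpenAIEmbedder` splits inputs into `batch_size` chunks, retrying `APIError` with exponential backoff (1s, 2s, 4s at the default `max_retries=3`).

```python
from oghma.embedder import EmbedConfig, create_embedder

# Dict values override dataclass defaults; keyword overrides win over both.
config = EmbedConfig.from_dict({"batch_size": 50}, rate_limit_delay=0.0)
embedder = create_embedder(config)  # requires OPENAI_API_KEY in the environment

vectors = embedder.embed_batch(["prefers pytest -q", "project uses uv"])
assert len(vectors) == 2 and len(vectors[0]) == config.dimensions
```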
oghma/exporter.py ADDED
@@ -0,0 +1,177 @@
+ import json
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+
+ from oghma.storage import MemoryRecord, Storage
+
+
+ @dataclass
+ class ExportOptions:
+     output_dir: Path
+     format: str = "markdown"
+     group_by: str = "category"
+     include_metadata: bool = True
+
+
+ class Exporter:
+     def __init__(self, storage: Storage, options: ExportOptions):
+         self.storage = storage
+         self.options = options
+
+     def export(self) -> list[Path]:
+         """Export memories to files, returning list of created file paths."""
+         memories = self.storage.get_all_memories(status="active")
+
+         if not memories:
+             return []
+
+         self.options.output_dir.mkdir(parents=True, exist_ok=True)
+
+         if self.options.group_by == "category":
+             return self._export_by_category(memories)
+         elif self.options.group_by == "date":
+             return self._export_by_date(memories)
+         elif self.options.group_by == "source":
+             return self._export_by_source(memories)
+         else:
+             raise ValueError(f"Unsupported group_by: {self.options.group_by}")
+
+     def export_category(self, category: str) -> Path:
+         """Export memories for a single category."""
+         memories = self.storage.get_all_memories(status="active", category=category)
+
+         if not memories:
+             raise ValueError(f"No memories found for category: {category}")
+
+         self.options.output_dir.mkdir(parents=True, exist_ok=True)
+
+         if self.options.format == "markdown":
+             content = self._format_markdown(memories, category)
+             ext = ".md"
+         elif self.options.format == "json":
+             content = self._format_json(memories)
+             ext = ".json"
+         else:
+             raise ValueError(f"Unsupported format: {self.options.format}")
+
+         safe_category = category.replace("/", "_").replace("\\", "_")
+         filename = f"{safe_category}{ext}"
+         file_path = self.options.output_dir / filename
+
+         file_path.write_text(content, encoding="utf-8")
+         return file_path
+
+     def _export_by_category(self, memories: list[MemoryRecord]) -> list[Path]:
+         categories = {m["category"] for m in memories}
+         files = []
+
+         for category in sorted(categories):
+             category_memories = [m for m in memories if m["category"] == category]
+
+             if self.options.format == "markdown":
+                 content = self._format_markdown(category_memories, category)
+                 ext = ".md"
+             elif self.options.format == "json":
+                 content = self._format_json(category_memories)
+                 ext = ".json"
+             else:
+                 raise ValueError(f"Unsupported format: {self.options.format}")
+
+             safe_category = category.replace("/", "_").replace("\\", "_")
+             filename = f"{safe_category}{ext}"
+             file_path = self.options.output_dir / filename
+
+             file_path.write_text(content, encoding="utf-8")
+             files.append(file_path)
+
+         return files
+
+     def _export_by_date(self, memories: list[MemoryRecord]) -> list[Path]:
+         dates = {m["created_at"][:10] for m in memories}
+         files = []
+
+         for date_str in sorted(dates):
+             date_memories = [m for m in memories if m["created_at"].startswith(date_str)]
+
+             if self.options.format == "markdown":
+                 content = self._format_markdown(date_memories, date_str)
+                 ext = ".md"
+             elif self.options.format == "json":
+                 content = self._format_json(date_memories)
+                 ext = ".json"
+             else:
+                 raise ValueError(f"Unsupported format: {self.options.format}")
+
+             filename = f"{date_str}{ext}"
+             file_path = self.options.output_dir / filename
+
+             file_path.write_text(content, encoding="utf-8")
+             files.append(file_path)
+
+         return files
+
+     def _export_by_source(self, memories: list[MemoryRecord]) -> list[Path]:
+         sources = {m["source_tool"] for m in memories}
+         files = []
+
+         for source in sorted(sources):
+             source_memories = [m for m in memories if m["source_tool"] == source]
+
+             if self.options.format == "markdown":
+                 content = self._format_markdown(source_memories, source)
+                 ext = ".md"
+             elif self.options.format == "json":
+                 content = self._format_json(source_memories)
+                 ext = ".json"
+             else:
+                 raise ValueError(f"Unsupported format: {self.options.format}")
+
+             safe_source = source.replace("/", "_").replace("\\", "_")
+             filename = f"{safe_source}{ext}"
+             file_path = self.options.output_dir / filename
+
+             file_path.write_text(content, encoding="utf-8")
+             files.append(file_path)
+
+         return files
+
+     def _format_markdown(self, memories: list[MemoryRecord], title: str) -> str:
+         """Format memories as markdown with YAML frontmatter."""
+         lines = [
+             "---",
+             f"category: {title}",
+             f"exported_at: {datetime.now().isoformat()}",
+             f"count: {len(memories)}",
+             "---",
+             "",
+             f"# {title.title()}",
+             "",
+         ]
+
+         for memory in memories:
+             content_preview = (
+                 memory["content"][:80] + "..." if len(memory["content"]) > 80 else memory["content"]
+             )
+             lines.append(f"## {content_preview}")
+             source_info = (
+                 f"*Source: {memory['source_tool']} | {memory['created_at'][:10]} | "
+                 f"Confidence: {memory['confidence']:.0%}*"
+             )
+             lines.append(source_info)
+             lines.append("")
+             lines.append(memory["content"])
+             lines.append("")
+             lines.append("---")
+             lines.append("")
+
+         return "\n".join(lines)
+
+     def _format_json(self, memories: list[MemoryRecord]) -> str:
+         """Format memories as JSON."""
+         export_data = {
+             "exported_at": datetime.now().isoformat(),
+             "count": len(memories),
+             "memories": memories,
+         }
+         return json.dumps(export_data, indent=2, ensure_ascii=False, default=str)
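
Illustrative export call (not from the package): `Storage(config=...)` mirrors the daemon's usage above, and `Config()` as a no-argument constructor is again an assumption. With `group_by="category"`, each active category becomes one Markdown file with YAML frontmatter in `output_dir`.

```python
from pathlib import Path

from oghma.config import Config
from oghma.exporter import Exporter, ExportOptions
from oghma.storage import Storage

storage = Storage(config=Config())  # assumption: Config() loads defaults
options = ExportOptions(output_dir=Path("memories"), format="markdown", group_by="category")
for path in Exporter(storage, options).export():
    print(f"wrote {path}")
```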
oghma/extractor.py ADDED
@@ -0,0 +1,180 @@
+ import json
+ import logging
+ import os
+ import time
+ from dataclasses import dataclass
+
+ from openai import OpenAI
+
+ from oghma.config import Config
+ from oghma.parsers import Message
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class Memory:
+     content: str
+     category: str
+     confidence: float = 1.0
+
+
+ class Extractor:
+     """Extracts memories from conversations using LLM."""
+
+     MAX_RETRIES = 3
+     BASE_RETRY_DELAY = 1.0
+
+     CATEGORIES = ["learning", "preference", "project_context", "gotcha", "workflow"]
+
+     # Models that require OpenRouter
+     OPENROUTER_PREFIXES = ("google/", "anthropic/", "meta-llama/", "deepseek/", "moonshotai/")
+
+     def __init__(self, config: Config):
+         self.config = config
+         self.model = config.get("extraction", {}).get("model", "gpt-4o-mini")
+         self.max_chars = config.get("extraction", {}).get("max_content_chars", 4000)
+
+         # Determine which API to use based on model name
+         if self.model.startswith(self.OPENROUTER_PREFIXES):
+             api_key = os.environ.get("OPENROUTER_API_KEY")
+             if not api_key:
+                 raise ValueError("OPENROUTER_API_KEY environment variable not set")
+             self.client = OpenAI(
+                 api_key=api_key,
+                 base_url="https://openrouter.ai/api/v1",
+             )
+             self.use_openrouter = True
+         else:
+             api_key = os.environ.get("OPENAI_API_KEY")
+             if not api_key:
+                 raise ValueError("OPENAI_API_KEY environment variable not set")
+             self.client = OpenAI(api_key=api_key)
+             self.use_openrouter = False
+
+     def extract(self, messages: list[Message], source_tool: str) -> list[Memory]:
+         """Extract memories from a list of messages."""
+         if not messages:
+             return []
+
+         prompt = self._build_prompt(messages)
+
+         for attempt in range(self.MAX_RETRIES):
+             try:
+                 response = self._call_openai(prompt)
+                 memories = self._parse_response(response)
+
+                 valid_memories = [
+                     m for m in memories if m.category in self.CATEGORIES and m.confidence >= 0.5
+                 ]
+
+                 logger.info(
+                     f"Extracted {len(valid_memories)} memories from {source_tool} "
+                     f"(attempt {attempt + 1})"
+                 )
+                 return valid_memories
+
+             except Exception as e:
+                 if attempt < self.MAX_RETRIES - 1:
+                     delay = self.BASE_RETRY_DELAY * (2**attempt)
+                     logger.warning(
+                         f"Extraction attempt {attempt + 1} failed: {e}. Retrying in {delay}s..."
+                     )
+                     time.sleep(delay)
+                 else:
+                     logger.error(f"Extraction failed after {self.MAX_RETRIES} attempts: {e}")
+
+         return []
+
+     def _call_openai(self, prompt: str) -> str:
+         """Call LLM API and return the response text."""
+         kwargs = {
+             "model": self.model,
+             "messages": [
+                 {
+                     "role": "system",
+                     "content": "You are a memory extraction system. "
+                     "Always respond with valid JSON only, no markdown.",
+                 },
+                 {"role": "user", "content": prompt},
+             ],
+             "temperature": 0.3,
+             "max_tokens": 1500,
+         }
+
+         # OpenAI models support structured output, OpenRouter models don't
+         if not self.use_openrouter:
+             kwargs["response_format"] = {"type": "json_object"}
+
+         response = self.client.chat.completions.create(**kwargs)
+
+         content = response.choices[0].message.content
+         if not content:
+             raise ValueError("Empty response from LLM API")
+
+         return content
+
+     def _build_prompt(self, messages: list[Message]) -> str:
+         """Build the extraction prompt."""
+         messages_text = ""
+         for msg in messages[:100]:
+             role_label = "User" if msg.role == "user" else "Assistant"
+             messages_text += f"{role_label}: {msg.content}\n\n"
+
+         messages_text = messages_text[: self.max_chars]
+
+         categories_desc = "\n".join(f"- {cat}" for cat in self.CATEGORIES)
+
+         prompt = (
+             "You are a memory extraction system. "
+             "Analyze this conversation and extract key memories.\n\n"
+             f"Categories:\n{categories_desc}\n\n"
+             f"Conversation:\n{messages_text}\n\n"
+             "Extract memories in this JSON format:\n"
+             '[ {"content": "...", "category": "...", "confidence": 0.0-1.0},\n'
+             "  ...\n"
+             "]\n\n"
+             "Only extract clear, specific memories. Skip vague or trivial content.\n"
+             "Return empty array [] if no significant memories found.\n"
+             "Remember: respond with valid JSON only, no markdown formatting."
+         )
+
+         return prompt
+
+     def _parse_response(self, response_text: str) -> list[Memory]:
+         """Parse LLM response into Memory objects."""
+         response_text = response_text.strip()
+
+         if response_text.startswith("```"):
+             lines = response_text.split("\n")
+             response_text = "\n".join(lines[1:-1])
+
+         try:
+             data = json.loads(response_text)
+         except json.JSONDecodeError:
+             logger.warning(f"Failed to parse JSON response: {response_text[:200]}...")
+             return []
+
+         # JSON mode (response_format={"type": "json_object"}) forces a top-level
+         # object rather than the bare array the prompt asks for, so unwrap the
+         # first list value before rejecting the payload.
+         if isinstance(data, dict):
+             data = next((v for v in data.values() if isinstance(v, list)), data)
+
+         if not isinstance(data, list):
+             return []
+
+         memories: list[Memory] = []
+         for item in data:
+             if not isinstance(item, dict):
+                 continue
+
+             content = item.get("content")
+             category = item.get("category")
+             confidence = item.get("confidence", 1.0)
+
+             if not content or not category:
+                 continue
+
+             if not isinstance(confidence, (int, float)):
+                 confidence = 1.0
+
+             confidence = max(0.0, min(1.0, float(confidence)))
+
+             memories.append(Memory(content=content, category=category, confidence=confidence))
+
+         return memories
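
Sketch of driving the extractor directly (illustrative): `Message(role=..., content=...)` is inferred from `_build_prompt`, which reads `msg.role` and `msg.content`; the real dataclass may take more fields, and `Config()` as a no-argument constructor is again assumed.

```python
from oghma.config import Config
from oghma.extractor import Extractor
from oghma.parsers import Message

# Needs OPENAI_API_KEY, or OPENROUTER_API_KEY when the configured model
# carries one of the OPENROUTER_PREFIXES (e.g. "anthropic/...").
extractor = Extractor(Config())
messages = [
    Message(role="user", content="Always run the test suite with pytest -q before committing."),
    Message(role="assistant", content="Understood, I'll run pytest -q first."),
]
for memory in extractor.extract(messages, source_tool="claude_code"):
    print(memory.category, f"{memory.confidence:.2f}", memory.content)
```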