oghma-0.0.1-py3-none-any.whl → oghma-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oghma/__init__.py +1 -3
- oghma/cli.py +342 -0
- oghma/config.py +262 -0
- oghma/daemon.py +198 -0
- oghma/embedder.py +107 -0
- oghma/exporter.py +177 -0
- oghma/extractor.py +180 -0
- oghma/mcp_server.py +112 -0
- oghma/migration.py +63 -0
- oghma/parsers/__init__.py +26 -0
- oghma/parsers/base.py +24 -0
- oghma/parsers/claude_code.py +62 -0
- oghma/parsers/codex.py +84 -0
- oghma/parsers/openclaw.py +64 -0
- oghma/parsers/opencode.py +90 -0
- oghma/storage.py +753 -0
- oghma/watcher.py +97 -0
- oghma-0.3.0.dist-info/METADATA +26 -0
- oghma-0.3.0.dist-info/RECORD +22 -0
- {oghma-0.0.1.dist-info → oghma-0.3.0.dist-info}/WHEEL +2 -1
- oghma-0.3.0.dist-info/entry_points.txt +3 -0
- oghma-0.3.0.dist-info/top_level.txt +1 -0
- oghma-0.0.1.dist-info/METADATA +0 -33
- oghma-0.0.1.dist-info/RECORD +0 -4
oghma/daemon.py
ADDED
@@ -0,0 +1,198 @@
import logging
import os
import signal
import sys
import time
from pathlib import Path

from oghma.config import Config
from oghma.extractor import Extractor
from oghma.parsers import get_parser_for_file
from oghma.storage import Storage
from oghma.watcher import Watcher

logger = logging.getLogger(__name__)


class Daemon:
    """Main daemon for Oghma memory extraction."""

    def __init__(self, config: Config):
        self.config = config
        self.storage = Storage(config=config)
        self.watcher = Watcher(config, self.storage)
        self.extractor = Extractor(config)
        self._setup_logging()
        self._running = False

    def _setup_logging(self) -> None:
        """Setup logging configuration."""
        log_file = self.config["daemon"]["log_file"]
        log_level = self.config["daemon"]["log_level"]
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)

        logging.basicConfig(
            level=getattr(logging, log_level),
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler(sys.stdout),
            ],
        )

    def _signal_handler(self, signum: int, frame) -> None:
        """Handle shutdown signals gracefully."""
        logger.info(f"Received signal {signum}, shutting down...")
        self._running = False

    def start(self) -> None:
        """Start the daemon main loop."""
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

        pid_file = self.config["daemon"]["pid_file"]
        self._write_pid_file(pid_file)

        try:
            self._running = True
            poll_interval = self.config["daemon"]["poll_interval"]

            logger.info("Oghma daemon started")
            logger.info(f"Poll interval: {poll_interval} seconds")

            while self._running:
                try:
                    self._run_cycle()
                except Exception as e:
                    logger.error(f"Error in extraction cycle: {e}", exc_info=True)

                for _ in range(poll_interval):
                    if not self._running:
                        break
                    time.sleep(1)

        finally:
            self._cleanup(pid_file)

    def _run_cycle(self) -> None:
        """Run one extraction cycle."""
        logger.debug("Starting extraction cycle")
        changed_files = self.watcher.get_changed_files()

        if not changed_files:
            logger.debug("No changed files found")
            return

        logger.info(f"Processing {len(changed_files)} changed files")

        for file_path in changed_files:
            self._process_file(file_path)

    def _process_file(self, file_path: Path) -> None:
        """Process a single file: parse, extract, and save memories."""
        logger.info(f"Processing file: {file_path}")

        try:
            parser = get_parser_for_file(file_path)
            if not parser:
                logger.warning(f"No parser found for {file_path}")
                return

            messages = parser.parse(file_path)

            if not self.watcher.should_process(messages):
                logger.debug(f"Skipping {file_path}: not enough messages")
                return

            source_tool = self._get_tool_name(file_path)
            memories = self.extractor.extract(messages, source_tool)

            mtime = file_path.stat().st_mtime
            size = file_path.stat().st_size
            source_session = self._get_session_id(file_path)

            for memory in memories:
                self.storage.add_memory(
                    content=memory.content,
                    category=memory.category,
                    source_tool=source_tool,
                    source_file=str(file_path),
                    source_session=source_session,
                    confidence=memory.confidence,
                )

            self.storage.update_extraction_state(str(file_path), mtime, size, len(messages))

            self.storage.log_extraction(
                source_path=str(file_path),
                memories_extracted=len(memories),
            )

            logger.info(f"Extracted {len(memories)} memories from {file_path}")

        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}", exc_info=True)
            self.storage.log_extraction(source_path=str(file_path), error=str(e))

    def _get_tool_name(self, file_path: Path) -> str:
        """Extract tool name from file path."""
        path_str = str(file_path)

        if ".claude" in path_str:
            return "claude_code"
        elif ".codex" in path_str:
            return "codex"
        elif ".openclaw" in path_str:
            return "openclaw"
        elif ".local/share/opencode" in path_str or "opencode" in path_str:
            return "opencode"
        else:
            return "unknown"

    def _get_session_id(self, file_path: Path) -> str | None:
        """Extract session ID from file path if possible."""
        parts = file_path.parts
        for part in reversed(parts):
            if part.startswith("ses_") or part.startswith("rollout-"):
                return part
        return None

    def _write_pid_file(self, pid_file: str) -> None:
        """Write current PID to lock file."""
        Path(pid_file).parent.mkdir(parents=True, exist_ok=True)
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

    def _cleanup(self, pid_file: str) -> None:
        """Cleanup on shutdown."""
        logger.info("Oghma daemon stopped")
        pid_path = Path(pid_file)
        if pid_path.exists():
            pid_path.unlink()


def get_daemon_pid(pid_file: str) -> int | None:
    """Read PID from lock file, check if process is running."""
    pid_path = Path(pid_file)

    if not pid_path.exists():
        return None

    try:
        with open(pid_path) as f:
            pid_str = f.read().strip()
            pid = int(pid_str)

        if pid > 0:
            try:
                os.kill(pid, 0)
                return pid
            except OSError:
                pid_path.unlink()
                return None

    except (OSError, ValueError):
        if pid_path.exists():
            pid_path.unlink()

    return None
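
Reviewer note: get_daemon_pid uses the classic os.kill(pid, 0) liveness probe (signal 0 delivers nothing but raises OSError for a dead PID) and unlinks stale PID files as a side effect. One caveat: a live process owned by another user also raises OSError (EPERM), which this check would misread as stale. A standalone sketch of the probe; the path below is hypothetical, as the real one comes from config["daemon"]["pid_file"]:

from oghma.daemon import get_daemon_pid

pid = get_daemon_pid("/tmp/oghma/oghma.pid")  # hypothetical PID-file path
if pid is None:
    print("daemon not running (any stale PID file has been removed)")
else:
    print(f"daemon running as PID {pid}")
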
oghma/embedder.py
ADDED
@@ -0,0 +1,107 @@
import os
import time
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any

from openai import APIError, OpenAI


@dataclass
class EmbedConfig:
    provider: str = "openai"
    model: str = "text-embedding-3-small"
    dimensions: int = 1536
    batch_size: int = 100
    rate_limit_delay: float = 0.1
    max_retries: int = 3

    @classmethod
    def from_dict(
        cls,
        data: Mapping[str, Any] | None = None,
        **overrides: Any,
    ) -> "EmbedConfig":
        values = {
            "provider": cls.provider,
            "model": cls.model,
            "dimensions": cls.dimensions,
            "batch_size": cls.batch_size,
            "rate_limit_delay": cls.rate_limit_delay,
            "max_retries": cls.max_retries,
        }
        if data:
            for key in values:
                if key in data:
                    values[key] = data[key]
        for key, value in overrides.items():
            if key in values and value is not None:
                values[key] = value
        return cls(**values)


class Embedder(ABC):
    def __init__(self, config: EmbedConfig):
        self.config = config

    @abstractmethod
    def embed(self, text: str) -> list[float]:
        """Embed a single text string."""

    @abstractmethod
    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Embed multiple strings."""


class OpenAIEmbedder(Embedder):
    BASE_RETRY_DELAY = 1.0

    def __init__(self, config: EmbedConfig):
        super().__init__(config)
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        self.client = OpenAI(api_key=api_key)

    def embed(self, text: str) -> list[float]:
        return self.embed_batch([text])[0]

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        if not texts:
            return []

        vectors: list[list[float]] = []
        for start in range(0, len(texts), self.config.batch_size):
            chunk = texts[start : start + self.config.batch_size]
            vectors.extend(self._embed_chunk(chunk))
            if self.config.rate_limit_delay > 0:
                time.sleep(self.config.rate_limit_delay)
        return vectors

    def _embed_chunk(self, texts: list[str]) -> list[list[float]]:
        last_error: Exception | None = None

        for attempt in range(self.config.max_retries):
            try:
                response = self.client.embeddings.create(
                    model=self.config.model,
                    input=texts,
                    dimensions=self.config.dimensions,
                )
                return [list(item.embedding) for item in response.data]
            except APIError as exc:  # pragma: no cover - covered by retry test
                last_error = exc
                if attempt < self.config.max_retries - 1:
                    delay = self.BASE_RETRY_DELAY * (2**attempt)
                    time.sleep(delay)

        if last_error:
            raise last_error
        raise RuntimeError("Embedding request failed")


def create_embedder(config: EmbedConfig) -> Embedder:
    if config.provider == "openai":
        return OpenAIEmbedder(config)
    raise ValueError(f"Unsupported embedding provider: {config.provider}")
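
Reviewer note: EmbedConfig.from_dict layers dataclass defaults, then the config mapping, then keyword overrides (reading defaults as cls.provider etc. works because dataclass field defaults live on the class), and _embed_chunk retries APIError with exponential backoff before re-raising the last error. A minimal sketch of the intended call path, assuming OPENAI_API_KEY is set; the texts and override values are illustrative:

from oghma.embedder import EmbedConfig, create_embedder

cfg = EmbedConfig.from_dict({"batch_size": 50}, dimensions=256)  # mapping, then overrides
embedder = create_embedder(cfg)  # raises ValueError if OPENAI_API_KEY is unset

vectors = embedder.embed_batch(["prefers ruff over flake8", "project uses uv"])
print(len(vectors), len(vectors[0]))  # 2 256 — text-embedding-3-small honors `dimensions`

Note that a chunk which exhausts max_retries raises rather than returning partial results, so one failed chunk discards any vectors already accumulated in the same embed_batch call.
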
oghma/exporter.py
ADDED
@@ -0,0 +1,177 @@
import json
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

from oghma.storage import MemoryRecord, Storage


@dataclass
class ExportOptions:
    output_dir: Path
    format: str = "markdown"
    group_by: str = "category"
    include_metadata: bool = True


class Exporter:
    def __init__(self, storage: Storage, options: ExportOptions):
        self.storage = storage
        self.options = options

    def export(self) -> list[Path]:
        """Export memories to files, returning list of created file paths."""
        memories = self.storage.get_all_memories(status="active")

        if not memories:
            return []

        self.options.output_dir.mkdir(parents=True, exist_ok=True)

        if self.options.group_by == "category":
            return self._export_by_category(memories)
        elif self.options.group_by == "date":
            return self._export_by_date(memories)
        elif self.options.group_by == "source":
            return self._export_by_source(memories)
        else:
            raise ValueError(f"Unsupported group_by: {self.options.group_by}")

    def export_category(self, category: str) -> Path:
        """Export memories for a single category."""
        memories = self.storage.get_all_memories(status="active", category=category)

        if not memories:
            raise ValueError(f"No memories found for category: {category}")

        self.options.output_dir.mkdir(parents=True, exist_ok=True)

        if self.options.format == "markdown":
            content = self._format_markdown(memories, category)
            ext = ".md"
        elif self.options.format == "json":
            content = self._format_json(memories)
            ext = ".json"
        else:
            raise ValueError(f"Unsupported format: {self.options.format}")

        safe_category = category.replace("/", "_").replace("\\", "_")
        filename = f"{safe_category}{ext}"
        file_path = self.options.output_dir / filename

        file_path.write_text(content, encoding="utf-8")
        return file_path

    def _export_by_category(self, memories: list[MemoryRecord]) -> list[Path]:
        categories = {m["category"] for m in memories}
        files = []

        for category in sorted(categories):
            category_memories = [m for m in memories if m["category"] == category]

            if self.options.format == "markdown":
                content = self._format_markdown(category_memories, category)
                ext = ".md"
            elif self.options.format == "json":
                content = self._format_json(category_memories)
                ext = ".json"
            else:
                raise ValueError(f"Unsupported format: {self.options.format}")

            safe_category = category.replace("/", "_").replace("\\", "_")
            filename = f"{safe_category}{ext}"
            file_path = self.options.output_dir / filename

            file_path.write_text(content, encoding="utf-8")
            files.append(file_path)

        return files

    def _export_by_date(self, memories: list[MemoryRecord]) -> list[Path]:
        dates = {m["created_at"][:10] for m in memories}
        files = []

        for date_str in sorted(dates):
            date_memories = [m for m in memories if m["created_at"].startswith(date_str)]

            if self.options.format == "markdown":
                content = self._format_markdown(date_memories, date_str)
                ext = ".md"
            elif self.options.format == "json":
                content = self._format_json(date_memories)
                ext = ".json"
            else:
                raise ValueError(f"Unsupported format: {self.options.format}")

            filename = f"{date_str}{ext}"
            file_path = self.options.output_dir / filename

            file_path.write_text(content, encoding="utf-8")
            files.append(file_path)

        return files

    def _export_by_source(self, memories: list[MemoryRecord]) -> list[Path]:
        sources = {m["source_tool"] for m in memories}
        files = []

        for source in sorted(sources):
            source_memories = [m for m in memories if m["source_tool"] == source]

            if self.options.format == "markdown":
                content = self._format_markdown(source_memories, source)
                ext = ".md"
            elif self.options.format == "json":
                content = self._format_json(source_memories)
                ext = ".json"
            else:
                raise ValueError(f"Unsupported format: {self.options.format}")

            safe_source = source.replace("/", "_").replace("\\", "_")
            filename = f"{safe_source}{ext}"
            file_path = self.options.output_dir / filename

            file_path.write_text(content, encoding="utf-8")
            files.append(file_path)

        return files

    def _format_markdown(self, memories: list[MemoryRecord], title: str) -> str:
        """Format memories as markdown with YAML frontmatter."""
        lines = [
            "---",
            f"category: {title}",
            f"exported_at: {datetime.now().isoformat()}",
            f"count: {len(memories)}",
            "---",
            "",
            f"# {title.title()}",
            "",
        ]

        for memory in memories:
            content_preview = (
                memory["content"][:80] + "..." if len(memory["content"]) > 80 else memory["content"]
            )
            lines.append(f"## {content_preview}")
            source_info = (
                f"*Source: {memory['source_tool']} | {memory['created_at'][:10]} | "
                f"Confidence: {memory['confidence']:.0%}*"
            )
            lines.append(source_info)
            lines.append("")
            lines.append(memory["content"])
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _format_json(self, memories: list[MemoryRecord]) -> str:
        """Format memories as JSON."""
        export_data = {
            "exported_at": datetime.now().isoformat(),
            "count": len(memories),
            "memories": memories,
        }
        return json.dumps(export_data, indent=2, ensure_ascii=False, default=str)
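
Reviewer note: the three _export_by_* helpers repeat the same format/extension dispatch, and include_metadata is declared on ExportOptions but never read in this module. Since Exporter only calls storage.get_all_memories() and indexes memories like dicts, a duck-typed stand-in is enough to sketch the output; FakeStorage and the sample record below are illustrative, not package API:

from pathlib import Path

from oghma.exporter import Exporter, ExportOptions

class FakeStorage:  # stands in for oghma.storage.Storage in this sketch
    def get_all_memories(self, status="active", category=None):
        return [{
            "content": "Use uv, not pip, in this repo",
            "category": "preference",
            "source_tool": "claude_code",
            "created_at": "2024-05-01T12:00:00",
            "confidence": 0.9,
        }]

options = ExportOptions(output_dir=Path("memory-export"), format="markdown")
print(Exporter(FakeStorage(), options).export())  # e.g. [PosixPath('memory-export/preference.md')]
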
oghma/extractor.py
ADDED
@@ -0,0 +1,180 @@
import json
import logging
import os
import time
from dataclasses import dataclass

from openai import OpenAI

from oghma.config import Config
from oghma.parsers import Message

logger = logging.getLogger(__name__)


@dataclass
class Memory:
    content: str
    category: str
    confidence: float = 1.0


class Extractor:
    """Extracts memories from conversations using LLM."""

    MAX_RETRIES = 3
    BASE_RETRY_DELAY = 1.0

    CATEGORIES = ["learning", "preference", "project_context", "gotcha", "workflow"]

    # Models that require OpenRouter
    OPENROUTER_PREFIXES = ("google/", "anthropic/", "meta-llama/", "deepseek/", "moonshotai/")

    def __init__(self, config: Config):
        self.config = config
        self.model = config.get("extraction", {}).get("model", "gpt-4o-mini")
        self.max_chars = config.get("extraction", {}).get("max_content_chars", 4000)

        # Determine which API to use based on model name
        if self.model.startswith(self.OPENROUTER_PREFIXES):
            api_key = os.environ.get("OPENROUTER_API_KEY")
            if not api_key:
                raise ValueError("OPENROUTER_API_KEY environment variable not set")
            self.client = OpenAI(
                api_key=api_key,
                base_url="https://openrouter.ai/api/v1"
            )
            self.use_openrouter = True
        else:
            api_key = os.environ.get("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_API_KEY environment variable not set")
            self.client = OpenAI(api_key=api_key)
            self.use_openrouter = False

    def extract(self, messages: list[Message], source_tool: str) -> list[Memory]:
        """Extract memories from a list of messages."""
        if not messages:
            return []

        prompt = self._build_prompt(messages)

        for attempt in range(self.MAX_RETRIES):
            try:
                response = self._call_openai(prompt)
                memories = self._parse_response(response)

                valid_memories = [
                    m for m in memories if m.category in self.CATEGORIES and m.confidence >= 0.5
                ]

                logger.info(
                    f"Extracted {len(valid_memories)} memories from {source_tool} "
                    f"(attempt {attempt + 1})"
                )
                return valid_memories

            except Exception as e:
                if attempt < self.MAX_RETRIES - 1:
                    delay = self.BASE_RETRY_DELAY * (2**attempt)
                    logger.warning(
                        f"Extraction attempt {attempt + 1} failed: {e}. Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    logger.error(f"Extraction failed after {self.MAX_RETRIES} attempts: {e}")

        return []

    def _call_openai(self, prompt: str) -> str:
        """Call LLM API and return the response text."""
        kwargs = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a memory extraction system. "
                    "Always respond with valid JSON only, no markdown.",
                },
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.3,
            "max_tokens": 1500,
        }

        # OpenAI models support structured output, OpenRouter models don't
        if not self.use_openrouter:
            kwargs["response_format"] = {"type": "json_object"}

        response = self.client.chat.completions.create(**kwargs)

        content = response.choices[0].message.content
        if not content:
            raise ValueError("Empty response from LLM API")

        return content

    def _build_prompt(self, messages: list[Message]) -> str:
        """Build the extraction prompt."""
        messages_text = ""
        for msg in messages[:100]:
            role_label = "User" if msg.role == "user" else "Assistant"
            messages_text += f"{role_label}: {msg.content}\n\n"

        messages_text = messages_text[: self.max_chars]

        categories_desc = "\n".join(f"- {cat}" for cat in self.CATEGORIES)

        prompt = (
            "You are a memory extraction system. "
            "Analyze this conversation and extract key memories.\n\n"
            f"Categories:\n{categories_desc}\n\n"
            f"Conversation:\n{messages_text}\n\n"
            "Extract memories in this JSON format:\n"
            '[ {"content": "...", "category": "...", "confidence": 0.0-1.0},\n'
            "  ...\n"
            "]\n\n"
            "Only extract clear, specific memories. Skip vague or trivial content.\n"
            "Return empty array [] if no significant memories found.\n"
            "Remember: respond with valid JSON only, no markdown formatting."
        )

        return prompt

    def _parse_response(self, response_text: str) -> list[Memory]:
        """Parse LLM response into Memory objects."""
        response_text = response_text.strip()

        if response_text.startswith("```"):
            lines = response_text.split("\n")
            response_text = "\n".join(lines[1:-1])

        try:
            data = json.loads(response_text)
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse JSON response: {response_text[:200]}...")
            return []

        if not isinstance(data, list):
            return []

        memories: list[Memory] = []
        for item in data:
            if not isinstance(item, dict):
                continue

            content = item.get("content")
            category = item.get("category")
            confidence = item.get("confidence", 1.0)

            if not content or not category:
                continue

            if not isinstance(confidence, (int, float)):
                confidence = 1.0

            confidence = max(0.0, min(1.0, float(confidence)))

            memories.append(Memory(content=content, category=category, confidence=confidence))

        return memories
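
Reviewer note: provider routing is purely prefix-based — a model id starting with google/, anthropic/, meta-llama/, deepseek/, or moonshotai/ goes through OpenRouter, everything else through OpenAI — and response_format={"type": "json_object"} is requested only on the OpenAI path. Two observations: extract() returns [] both when nothing is found and when all retries fail, and on the OpenAI path JSON mode returns a top-level object while _parse_response only accepts a top-level list, so the prompt's array appears to survive only on the OpenRouter path. A sketch of the call shape; the model id and message text are illustrative, a plain dict stands in for Config since only .get() is used, the Message keyword form is assumed (parsers are not shown here), and OPENROUTER_API_KEY must be set for this prefix:

from oghma.extractor import Extractor
from oghma.parsers import Message  # assumed: exposes at least .role and .content

config = {"extraction": {"model": "anthropic/claude-3.5-sonnet", "max_content_chars": 4000}}

extractor = Extractor(config)  # "anthropic/" prefix routes this through OpenRouter
memories = extractor.extract(
    [Message(role="user", content="Always run pytest -q before committing here")],
    source_tool="claude_code",
)
for m in memories:
    print(f"[{m.category} @ {m.confidence:.2f}] {m.content}")
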