headroom-ai 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/memory/fast_wrapper.py
@@ -0,0 +1,311 @@
"""Fast Memory Wrapper - Zero-latency inline extraction + semantic retrieval.

This is the ultimate memory solution:
1. ZERO extra latency - memories extracted as part of LLM response (Letta-style)
2. Semantic retrieval - vector similarity for intelligent memory lookup
3. Local embeddings - sub-50ms retrieval, no API calls needed

Usage:
    from headroom.memory import with_fast_memory

    client = with_fast_memory(OpenAI(), user_id="alice")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "I prefer Python"}]
    )
    # Memory extracted INLINE - zero extra latency!
"""

from __future__ import annotations

import copy
from pathlib import Path
from typing import Any

from headroom.memory.fast_store import (
    FastMemoryStore,
    MemoryChunk,
    create_local_embed_fn,
    create_openai_embed_fn,
)
from headroom.memory.inline_extractor import (
    inject_memory_instruction,
    parse_response_with_memory,
)


class FastMemoryWrapper:
    """Wraps an LLM client with zero-latency inline memory extraction.

    Architecture:
    1. BEFORE: Inject relevant memories into user message (semantic search)
    2. DURING: Memory instruction is in system prompt
    3. AFTER: Parse memory block from response, store extracted memories

    All memory operations happen as part of the normal LLM flow - no extra calls!
    """

    def __init__(
        self,
        client: Any,
        user_id: str,
        db_path: str | Path = "headroom_fast_memory.db",
        top_k: int = 5,
        use_local_embeddings: bool = True,
        embedding_model: str = "all-MiniLM-L6-v2",
        _store: FastMemoryStore | None = None,
    ):
        """Initialize the fast memory wrapper.

        Args:
            client: OpenAI-compatible LLM client
            user_id: User identifier for memory isolation
            db_path: Path to SQLite database
            top_k: Number of memories to inject
            use_local_embeddings: Use local model (fast) or OpenAI API
            embedding_model: Model name for local embeddings
            _store: Override store (for testing)
        """
        self._client = client
        self._user_id = user_id
        self._top_k = top_k

        # Initialize store with appropriate embedding function
        if _store:
            self._store = _store
        elif use_local_embeddings:
            embed_fn = create_local_embed_fn(embedding_model)
            # MiniLM-L6-v2 produces 384-dim embeddings
            self._store = FastMemoryStore(db_path, embed_fn=embed_fn, embedding_dim=384)
        else:
            embed_fn = create_openai_embed_fn(client)
            self._store = FastMemoryStore(db_path, embed_fn=embed_fn)

        # Create wrapped chat interface
        self.chat = _FastWrappedChat(self)

    @property
    def memory(self) -> _FastMemoryAPI:
        """Direct access to memory operations."""
        return _FastMemoryAPI(self._store, self._user_id)

    def _inject_memories(self, messages: list[dict]) -> list[dict]:
        """Inject relevant memories into user message.

        Uses semantic search (vector similarity) to find relevant memories.
        Injects into FIRST user message to preserve system prompt caching.

        Args:
            messages: Original messages list

        Returns:
            New messages with memories injected
        """
        # Find the last user message for search context
        user_content = None
        for msg in reversed(messages):
            if msg.get("role") == "user":
                user_content = msg.get("content", "")
                break

        if not user_content:
            return messages

        # Semantic search for relevant memories
        results = self._store.search(self._user_id, str(user_content), top_k=self._top_k)

        if not results:
            return messages

        # Build context block
        context_lines = ["<context>"]
        for chunk, _score in results:
            context_lines.append(f"- {chunk.text}")
        context_lines.append("</context>")
        context_block = "\n".join(context_lines)

        # Inject into first user message
        new_messages = copy.deepcopy(messages)
        for msg in new_messages:
            if msg.get("role") == "user":
                original = msg.get("content", "")
                msg["content"] = f"{context_block}\n\n{original}"
                break

        return new_messages

    def _store_memories(self, memories: list[dict[str, Any]]) -> None:
        """Store extracted memories.

        Args:
            memories: List of memory dicts from inline extraction
        """
        for mem in memories:
            content = mem.get("content", "")
            category = mem.get("category", "fact")
            if content:
                self._store.add(
                    self._user_id,
                    content,
                    role="memory",
                    metadata={"category": category, "source": "inline_extraction"},
                )


class _FastWrappedChat:
    """Wrapped chat interface."""

    def __init__(self, wrapper: FastMemoryWrapper):
        self._wrapper = wrapper
        self.completions = _FastWrappedCompletions(wrapper)


class _FastWrappedCompletions:
    """Wrapped completions with inline memory extraction."""

    def __init__(self, wrapper: FastMemoryWrapper):
        self._wrapper = wrapper

    def create(self, **kwargs: Any) -> Any:
        """Create chat completion with inline memory extraction.

        Flow:
        1. Search for relevant memories (semantic)
        2. Inject memories into user message
        3. Add memory instruction to system prompt
        4. Forward to LLM
        5. Parse response to extract memories
        6. Store extracted memories
        7. Return clean response (without memory block)
        """
        messages = kwargs.get("messages", [])

        # 1. Inject relevant memories into user message
        enhanced_messages = self._wrapper._inject_memories(messages)

        # 2. Add memory extraction instruction to system prompt
        enhanced_messages = inject_memory_instruction(enhanced_messages, short=True)
        kwargs["messages"] = enhanced_messages

        # 3. Forward to LLM
        response = self._wrapper._client.chat.completions.create(**kwargs)

        # 4. Parse response and extract memories
        raw_content = response.choices[0].message.content
        parsed = parse_response_with_memory(raw_content)

        # 5. Store extracted memories
        if parsed.memories:
            self._wrapper._store_memories(parsed.memories)

        # 6. Return clean response (modify in place)
        response.choices[0].message.content = parsed.content

        return response


class _FastMemoryAPI:
    """Direct API for memory operations."""

    def __init__(self, store: FastMemoryStore, user_id: str):
        self._store = store
        self._user_id = user_id

    def search(self, query: str, top_k: int = 5) -> list[tuple[MemoryChunk, float]]:
        """Semantic search for memories.

        Args:
            query: Search query
            top_k: Max results

        Returns:
            List of (memory, similarity_score) tuples
        """
        return self._store.search(self._user_id, query, top_k)

    def add(self, content: str, category: str = "fact") -> MemoryChunk:
        """Manually add a memory.

        Args:
            content: Memory content
            category: preference, fact, or context

        Returns:
            The created memory chunk
        """
        return self._store.add(
            self._user_id,
            content,
            role="memory",
            metadata={"category": category, "source": "manual"},
        )

    def get_all(self) -> list[MemoryChunk]:
        """Get all memories for this user."""
        return self._store.get_all(self._user_id)

    def clear(self) -> int:
        """Clear all memories for this user."""
        return self._store.clear(self._user_id)

    def stats(self) -> dict:
        """Get memory statistics."""
        return self._store.stats(self._user_id)


def with_fast_memory(
    client: Any,
    user_id: str,
    db_path: str | Path = "headroom_fast_memory.db",
    top_k: int = 5,
    use_local_embeddings: bool = True,
    embedding_model: str = "all-MiniLM-L6-v2",
    **kwargs: Any,
) -> FastMemoryWrapper:
    """Wrap an LLM client with zero-latency inline memory extraction.

    This is the fastest memory solution:
    1. ZERO extra LLM calls - memories extracted inline as part of response
    2. Sub-50ms retrieval - local embeddings, no API calls
    3. Semantic search - finds conceptually related memories

    Args:
        client: OpenAI-compatible LLM client
        user_id: User identifier for memory isolation
        db_path: Path to SQLite database
        top_k: Number of memories to inject per request
        use_local_embeddings: Use local model (True) or OpenAI API (False)
        embedding_model: Model name for local embeddings
        **kwargs: Additional arguments

    Returns:
        Wrapped client with automatic memory

    Example:
        from openai import OpenAI
        from headroom.memory import with_fast_memory

        client = with_fast_memory(OpenAI(), user_id="alice")

        # First conversation - memory extracted INLINE
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": "I prefer Python for backend work"}]
        )

        # Later - memories automatically retrieved
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": "What language should I use?"}]
        )
        # User sees: "Based on your preference for Python..."
    """
    return FastMemoryWrapper(
        client=client,
        user_id=user_id,
        db_path=db_path,
        top_k=top_k,
        use_local_embeddings=use_local_embeddings,
        embedding_model=embedding_model,
        **kwargs,
    )
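For orientation, here is a minimal usage sketch of the wrapper above. It exercises with_fast_memory, the wrapped chat.completions.create call, and the memory property (the _FastMemoryAPI), all of which appear in the module; it assumes an OpenAI-compatible client and that the default local-embedding path (the all-MiniLM-L6-v2 sentence-transformers model) is available. The sample strings and the "memories.db" path are illustrative only.

    from openai import OpenAI

    from headroom.memory import with_fast_memory

    # Wrap the client; memories are stored per user_id in a local SQLite file.
    client = with_fast_memory(OpenAI(), user_id="alice", db_path="memories.db", top_k=3)

    # Normal chat call: relevant memories are injected before the request,
    # and any <memory> block in the response is parsed, stored, and stripped.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "I prefer Python for backend work"}],
    )
    print(response.choices[0].message.content)  # clean text, no memory block

    # Direct memory access via the wrapper's memory property.
    client.memory.add("Prefers typed Python", category="preference")  # manual add
    for chunk, score in client.memory.search("which language does the user like?", top_k=2):
        print(f"{score:.2f}  {chunk.text}")
    print(client.memory.stats())

Note the design choice visible in _inject_memories: retrieved memories are prepended to the first user message rather than the system prompt, which keeps the system prompt byte-stable across turns and therefore preserves provider-side prompt caching.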
headroom/memory/inline_extractor.py
@@ -0,0 +1,229 @@
"""Inline memory extraction - zero extra latency.

Instead of making a separate LLM call to extract memories,
we modify the system prompt so the LLM outputs memories
as part of its response. This is the Letta/MemGPT approach.

Benefits:
- Zero extra latency (memory is part of response)
- Zero extra API cost (already paying for response tokens)
- Higher quality (LLM has full context)
- Intelligent filtering (LLM decides what's relevant)
"""

from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass
from typing import Any

logger = logging.getLogger(__name__)


# Memory extraction instruction to append to system prompt
MEMORY_INSTRUCTION = """

## Memory Instructions
After your response, if there are facts worth remembering about the user/entity for future conversations, output them in a <memory> block:

<memory>
{"memories": [{"content": "fact to remember", "category": "preference|fact|context"}]}
</memory>

Categories:
- preference: likes, dislikes, preferred tools/languages/styles
- fact: identity, role, job, location, constraints
- context: current goals, ongoing tasks, recent events

Only output memories for significant, reusable information. Skip for:
- Greetings, thanks, small talk
- One-time questions
- Information already known

If nothing worth remembering: <memory>{"memories": []}</memory>
"""

# Shorter version for token efficiency
MEMORY_INSTRUCTION_SHORT = """

After responding, output facts to remember: <memory>{"memories": [{"content": "...", "category": "preference|fact|context"}]}</memory>
Skip for greetings/small talk. If nothing: <memory>{"memories": []}</memory>"""


@dataclass
class ParsedResponse:
    """Response with extracted memories."""

    content: str  # The actual response (without memory block)
    memories: list[dict[str, Any]]  # Extracted memories
    raw: str  # Original full response


def inject_memory_instruction(
    messages: list[dict[str, Any]],
    short: bool = True,
) -> list[dict[str, Any]]:
    """Inject memory extraction instruction into system prompt.

    Args:
        messages: Original messages list
        short: Use short instruction (fewer tokens)

    Returns:
        Modified messages with memory instruction
    """
    instruction = MEMORY_INSTRUCTION_SHORT if short else MEMORY_INSTRUCTION
    messages = [m.copy() for m in messages]  # Don't modify original

    # Find or create system message
    has_system = False
    for i, msg in enumerate(messages):
        if msg.get("role") == "system":
            messages[i] = {
                **msg,
                "content": msg.get("content", "") + instruction,
            }
            has_system = True
            break

    if not has_system:
        # Prepend system message
        messages.insert(
            0,
            {
                "role": "system",
                "content": "You are a helpful assistant." + instruction,
            },
        )

    return messages


def parse_response_with_memory(response_text: str) -> ParsedResponse:
    """Parse LLM response to extract memories.

    Args:
        response_text: Raw LLM response

    Returns:
        ParsedResponse with content and memories separated
    """
    memories: list[dict[str, Any]] = []
    content = response_text

    # Extract <memory> block
    memory_pattern = r"<memory>\s*(.*?)\s*</memory>"
    match = re.search(memory_pattern, response_text, re.DOTALL | re.IGNORECASE)

    if match:
        memory_json = match.group(1).strip()

        # Remove the memory block from content
        content = re.sub(memory_pattern, "", response_text, flags=re.DOTALL | re.IGNORECASE).strip()

        # Parse the JSON
        try:
            data = json.loads(memory_json)
            memories = data.get("memories", [])
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse memory JSON: {e}")

    return ParsedResponse(
        content=content,
        memories=memories,
        raw=response_text,
    )


class InlineMemoryWrapper:
    """Wrapper that extracts memories from LLM responses inline.

    This is the zero-latency approach - memories are extracted
    as part of the response, not in a separate call.

    Usage:
        wrapper = InlineMemoryWrapper(openai_client)
        response, memories = wrapper.chat(
            messages=[{"role": "user", "content": "I prefer Python"}],
            model="gpt-4o-mini"
        )
        # response = "Great choice! Python is excellent..."
        # memories = [{"content": "User prefers Python", "category": "preference"}]
    """

    def __init__(self, client: Any):
        """Initialize wrapper.

        Args:
            client: OpenAI-compatible client
        """
        self.client = client

    def chat(
        self,
        messages: list[dict[str, Any]],
        model: str = "gpt-4o-mini",
        short_instruction: bool = True,
        **kwargs: Any,
    ) -> tuple[str, list[dict[str, Any]]]:
        """Send chat request and extract memories inline.

        Args:
            messages: Chat messages
            model: Model to use
            short_instruction: Use shorter memory instruction
            **kwargs: Additional args for chat completion

        Returns:
            Tuple of (response_content, extracted_memories)
        """
        # Inject memory instruction
        modified_messages = inject_memory_instruction(messages, short=short_instruction)

        # Call LLM
        response = self.client.chat.completions.create(
            model=model,
            messages=modified_messages,
            **kwargs,
        )

        raw_content = response.choices[0].message.content

        # Parse response and extract memories
        parsed = parse_response_with_memory(raw_content)

        return parsed.content, parsed.memories

    def chat_with_response(
        self,
        messages: list[dict[str, Any]],
        model: str = "gpt-4o-mini",
        **kwargs: Any,
    ) -> tuple[Any, str, list[dict[str, Any]]]:
        """Send chat request and return full response object.

        Args:
            messages: Chat messages
            model: Model to use
            **kwargs: Additional args for chat completion

        Returns:
            Tuple of (response_object, content, memories)
        """
        modified_messages = inject_memory_instruction(messages)

        response = self.client.chat.completions.create(
            model=model,
            messages=modified_messages,
            **kwargs,
        )

        raw_content = response.choices[0].message.content
        parsed = parse_response_with_memory(raw_content)

        # Modify response to have clean content
        response.choices[0].message.content = parsed.content

        return response, parsed.content, parsed.memories
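A small sketch of the two helpers above in isolation: it only uses functions defined in this module, and the raw response string is fabricated purely to show the <memory> protocol they implement.

    from headroom.memory.inline_extractor import (
        inject_memory_instruction,
        parse_response_with_memory,
    )

    # No system message present, so one is prepended with the short instruction.
    msgs = inject_memory_instruction(
        [{"role": "user", "content": "I prefer Python"}], short=True
    )
    assert msgs[0]["role"] == "system"

    # A hand-written response that follows the <memory> protocol.
    raw = (
        "Great choice! Python is excellent for backend work.\n"
        '<memory>{"memories": [{"content": "User prefers Python", '
        '"category": "preference"}]}</memory>'
    )
    parsed = parse_response_with_memory(raw)
    print(parsed.content)   # "Great choice! Python is excellent for backend work."
    print(parsed.memories)  # [{"content": "User prefers Python", "category": "preference"}]

If the model emits malformed JSON inside the block, parse_response_with_memory logs a warning and returns an empty memories list, while the block is still stripped from content because the removal happens before JSON parsing.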