headroom_ai-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/memory/fast_wrapper.py
@@ -0,0 +1,311 @@
+"""Fast Memory Wrapper - Zero-latency inline extraction + semantic retrieval.
+
+This is the ultimate memory solution:
+1. ZERO extra latency - memories extracted as part of LLM response (Letta-style)
+2. Semantic retrieval - vector similarity for intelligent memory lookup
+3. Local embeddings - sub-50ms retrieval, no API calls needed
+
+Usage:
+    from headroom.memory import with_fast_memory
+
+    client = with_fast_memory(OpenAI(), user_id="alice")
+    response = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[{"role": "user", "content": "I prefer Python"}]
+    )
+    # Memory extracted INLINE - zero extra latency!
+"""
+
+from __future__ import annotations
+
+import copy
+from pathlib import Path
+from typing import Any
+
+from headroom.memory.fast_store import (
+    FastMemoryStore,
+    MemoryChunk,
+    create_local_embed_fn,
+    create_openai_embed_fn,
+)
+from headroom.memory.inline_extractor import (
+    inject_memory_instruction,
+    parse_response_with_memory,
+)
+
+
+class FastMemoryWrapper:
+    """Wraps an LLM client with zero-latency inline memory extraction.
+
+    Architecture:
+    1. BEFORE: Inject relevant memories into user message (semantic search)
+    2. DURING: Memory instruction is in system prompt
+    3. AFTER: Parse memory block from response, store extracted memories
+
+    All memory operations happen as part of the normal LLM flow - no extra calls!
+    """
+
+    def __init__(
+        self,
+        client: Any,
+        user_id: str,
+        db_path: str | Path = "headroom_fast_memory.db",
+        top_k: int = 5,
+        use_local_embeddings: bool = True,
+        embedding_model: str = "all-MiniLM-L6-v2",
+        _store: FastMemoryStore | None = None,
+    ):
+        """Initialize the fast memory wrapper.
+
+        Args:
+            client: OpenAI-compatible LLM client
+            user_id: User identifier for memory isolation
+            db_path: Path to SQLite database
+            top_k: Number of memories to inject
+            use_local_embeddings: Use local model (fast) or OpenAI API
+            embedding_model: Model name for local embeddings
+            _store: Override store (for testing)
+        """
+        self._client = client
+        self._user_id = user_id
+        self._top_k = top_k
+
+        # Initialize store with appropriate embedding function
+        if _store:
+            self._store = _store
+        elif use_local_embeddings:
+            embed_fn = create_local_embed_fn(embedding_model)
+            # MiniLM-L6-v2 produces 384-dim embeddings
+            self._store = FastMemoryStore(db_path, embed_fn=embed_fn, embedding_dim=384)
+        else:
+            embed_fn = create_openai_embed_fn(client)
+            self._store = FastMemoryStore(db_path, embed_fn=embed_fn)
+
+        # Create wrapped chat interface
+        self.chat = _FastWrappedChat(self)
+
+    @property
+    def memory(self) -> _FastMemoryAPI:
+        """Direct access to memory operations."""
+        return _FastMemoryAPI(self._store, self._user_id)
+
+    def _inject_memories(self, messages: list[dict]) -> list[dict]:
+        """Inject relevant memories into user message.
+
+        Uses semantic search (vector similarity) to find relevant memories.
+        Injects into FIRST user message to preserve system prompt caching.
+
+        Args:
+            messages: Original messages list
+
+        Returns:
+            New messages with memories injected
+        """
+        # Find the last user message for search context
+        user_content = None
+        for msg in reversed(messages):
+            if msg.get("role") == "user":
+                user_content = msg.get("content", "")
+                break
+
+        if not user_content:
+            return messages
+
+        # Semantic search for relevant memories
+        results = self._store.search(self._user_id, str(user_content), top_k=self._top_k)
+
+        if not results:
+            return messages
+
+        # Build context block
+        context_lines = ["<context>"]
+        for chunk, _score in results:
+            context_lines.append(f"- {chunk.text}")
+        context_lines.append("</context>")
+        context_block = "\n".join(context_lines)
+
+        # Inject into first user message
+        new_messages = copy.deepcopy(messages)
+        for msg in new_messages:
+            if msg.get("role") == "user":
+                original = msg.get("content", "")
+                msg["content"] = f"{context_block}\n\n{original}"
+                break
+
+        return new_messages
+
+    def _store_memories(self, memories: list[dict[str, Any]]) -> None:
+        """Store extracted memories.
+
+        Args:
+            memories: List of memory dicts from inline extraction
+        """
+        for mem in memories:
+            content = mem.get("content", "")
+            category = mem.get("category", "fact")
+            if content:
+                self._store.add(
+                    self._user_id,
+                    content,
+                    role="memory",
+                    metadata={"category": category, "source": "inline_extraction"},
+                )
+
+
+class _FastWrappedChat:
+    """Wrapped chat interface."""
+
+    def __init__(self, wrapper: FastMemoryWrapper):
+        self._wrapper = wrapper
+        self.completions = _FastWrappedCompletions(wrapper)
+
+
+class _FastWrappedCompletions:
+    """Wrapped completions with inline memory extraction."""
+
+    def __init__(self, wrapper: FastMemoryWrapper):
+        self._wrapper = wrapper
+
+    def create(self, **kwargs: Any) -> Any:
+        """Create chat completion with inline memory extraction.
+
+        Flow:
+        1. Search for relevant memories (semantic)
+        2. Inject memories into user message
+        3. Add memory instruction to system prompt
+        4. Forward to LLM
+        5. Parse response to extract memories
+        6. Store extracted memories
+        7. Return clean response (without memory block)
+        """
+        messages = kwargs.get("messages", [])
+
+        # 1. Inject relevant memories into user message
+        enhanced_messages = self._wrapper._inject_memories(messages)
+
+        # 2. Add memory extraction instruction to system prompt
+        enhanced_messages = inject_memory_instruction(enhanced_messages, short=True)
+        kwargs["messages"] = enhanced_messages
+
+        # 3. Forward to LLM
+        response = self._wrapper._client.chat.completions.create(**kwargs)
+
+        # 4. Parse response and extract memories
+        raw_content = response.choices[0].message.content
+        parsed = parse_response_with_memory(raw_content)
+
+        # 5. Store extracted memories
+        if parsed.memories:
+            self._wrapper._store_memories(parsed.memories)
+
+        # 6. Return clean response (modify in place)
+        response.choices[0].message.content = parsed.content
+
+        return response


+class _FastMemoryAPI:
+    """Direct API for memory operations."""
+
+    def __init__(self, store: FastMemoryStore, user_id: str):
+        self._store = store
+        self._user_id = user_id
+
+    def search(self, query: str, top_k: int = 5) -> list[tuple[MemoryChunk, float]]:
+        """Semantic search for memories.
+
+        Args:
+            query: Search query
+            top_k: Max results
+
+        Returns:
+            List of (memory, similarity_score) tuples
+        """
+        return self._store.search(self._user_id, query, top_k)
+
+    def add(self, content: str, category: str = "fact") -> MemoryChunk:
+        """Manually add a memory.
+
+        Args:
+            content: Memory content
+            category: preference, fact, or context
+
+        Returns:
+            The created memory chunk
+        """
+        return self._store.add(
+            self._user_id,
+            content,
+            role="memory",
+            metadata={"category": category, "source": "manual"},
+        )
+
+    def get_all(self) -> list[MemoryChunk]:
+        """Get all memories for this user."""
+        return self._store.get_all(self._user_id)
+
+    def clear(self) -> int:
+        """Clear all memories for this user."""
+        return self._store.clear(self._user_id)
+
+    def stats(self) -> dict:
+        """Get memory statistics."""
+        return self._store.stats(self._user_id)
+
+
+def with_fast_memory(
+    client: Any,
+    user_id: str,
+    db_path: str | Path = "headroom_fast_memory.db",
+    top_k: int = 5,
+    use_local_embeddings: bool = True,
+    embedding_model: str = "all-MiniLM-L6-v2",
+    **kwargs: Any,
+) -> FastMemoryWrapper:
+    """Wrap an LLM client with zero-latency inline memory extraction.
+
+    This is the fastest memory solution:
+    1. ZERO extra LLM calls - memories extracted inline as part of response
+    2. Sub-50ms retrieval - local embeddings, no API calls
+    3. Semantic search - finds conceptually related memories
+
+    Args:
+        client: OpenAI-compatible LLM client
+        user_id: User identifier for memory isolation
+        db_path: Path to SQLite database
+        top_k: Number of memories to inject per request
+        use_local_embeddings: Use local model (True) or OpenAI API (False)
+        embedding_model: Model name for local embeddings
+        **kwargs: Additional arguments
+
+    Returns:
+        Wrapped client with automatic memory
+
+    Example:
+        from openai import OpenAI
+        from headroom.memory import with_fast_memory
+
+        client = with_fast_memory(OpenAI(), user_id="alice")
+
+        # First conversation - memory extracted INLINE
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[{"role": "user", "content": "I prefer Python for backend work"}]
+        )
+
+        # Later - memories automatically retrieved
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[{"role": "user", "content": "What language should I use?"}]
+        )
+        # User sees: "Based on your preference for Python..."
+    """
+    return FastMemoryWrapper(
+        client=client,
+        user_id=user_id,
+        db_path=db_path,
+        top_k=top_k,
+        use_local_embeddings=use_local_embeddings,
+        embedding_model=embedding_model,
+        **kwargs,
+    )
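
The file above exposes two surfaces: the wrapped chat interface (client.chat.completions.create) and the direct memory API (client.memory). A minimal usage sketch, assuming an OpenAI-compatible client and only the names defined in this file; the db_path value and the stored fact are illustrative:

    from openai import OpenAI
    from headroom.memory import with_fast_memory

    # Wrap the client once; memories are isolated per user_id.
    client = with_fast_memory(OpenAI(), user_id="alice", db_path="alice_memory.db")

    # Normal chat call: relevant memories are injected before the request,
    # and the <memory> block is parsed out of (and stripped from) the response.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "I prefer Python for backend work"}],
    )
    print(response.choices[0].message.content)

    # Direct memory API, no LLM call involved.
    client.memory.add("Works mostly on backend services", category="fact")
    for chunk, score in client.memory.search("what does the user work on?", top_k=3):
        print(f"{score:.2f}  {chunk.text}")
    print(client.memory.stats())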
headroom/memory/inline_extractor.py
@@ -0,0 +1,229 @@
+"""Inline memory extraction - zero extra latency.
+
+Instead of making a separate LLM call to extract memories,
+we modify the system prompt so the LLM outputs memories
+as part of its response. This is the Letta/MemGPT approach.
+
+Benefits:
+- Zero extra latency (memory is part of response)
+- Zero extra API cost (already paying for response tokens)
+- Higher quality (LLM has full context)
+- Intelligent filtering (LLM decides what's relevant)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+# Memory extraction instruction to append to system prompt
+MEMORY_INSTRUCTION = """
+
+## Memory Instructions
+After your response, if there are facts worth remembering about the user/entity for future conversations, output them in a <memory> block:
+
+<memory>
+{"memories": [{"content": "fact to remember", "category": "preference|fact|context"}]}
+</memory>
+
+Categories:
+- preference: likes, dislikes, preferred tools/languages/styles
+- fact: identity, role, job, location, constraints
+- context: current goals, ongoing tasks, recent events
+
+Only output memories for significant, reusable information. Skip for:
+- Greetings, thanks, small talk
+- One-time questions
+- Information already known
+
+If nothing worth remembering: <memory>{"memories": []}</memory>
+"""
+
+# Shorter version for token efficiency
+MEMORY_INSTRUCTION_SHORT = """
+
+After responding, output facts to remember: <memory>{"memories": [{"content": "...", "category": "preference|fact|context"}]}</memory>
+Skip for greetings/small talk. If nothing: <memory>{"memories": []}</memory>"""
+
+
+@dataclass
+class ParsedResponse:
+    """Response with extracted memories."""
+
+    content: str  # The actual response (without memory block)
+    memories: list[dict[str, Any]]  # Extracted memories
+    raw: str  # Original full response
+
+
+def inject_memory_instruction(
+    messages: list[dict[str, Any]],
+    short: bool = True,
+) -> list[dict[str, Any]]:
+    """Inject memory extraction instruction into system prompt.
+
+    Args:
+        messages: Original messages list
+        short: Use short instruction (fewer tokens)
+
+    Returns:
+        Modified messages with memory instruction
+    """
+    instruction = MEMORY_INSTRUCTION_SHORT if short else MEMORY_INSTRUCTION
+    messages = [m.copy() for m in messages]  # Don't modify original
+
+    # Find or create system message
+    has_system = False
+    for i, msg in enumerate(messages):
+        if msg.get("role") == "system":
+            messages[i] = {
+                **msg,
+                "content": msg.get("content", "") + instruction,
+            }
+            has_system = True
+            break
+
+    if not has_system:
+        # Prepend system message
+        messages.insert(
+            0,
+            {
+                "role": "system",
+                "content": "You are a helpful assistant." + instruction,
+            },
+        )
+
+    return messages
+
+
+def parse_response_with_memory(response_text: str) -> ParsedResponse:
+    """Parse LLM response to extract memories.
+
+    Args:
+        response_text: Raw LLM response
+
+    Returns:
+        ParsedResponse with content and memories separated
+    """
+    memories: list[dict[str, Any]] = []
+    content = response_text
+
+    # Extract <memory> block
+    memory_pattern = r"<memory>\s*(.*?)\s*</memory>"
+    match = re.search(memory_pattern, response_text, re.DOTALL | re.IGNORECASE)
+
+    if match:
+        memory_json = match.group(1).strip()
+
+        # Remove the memory block from content
+        content = re.sub(memory_pattern, "", response_text, flags=re.DOTALL | re.IGNORECASE).strip()
+
+        # Parse the JSON
+        try:
+            data = json.loads(memory_json)
+            memories = data.get("memories", [])
+        except json.JSONDecodeError as e:
+            logger.warning(f"Failed to parse memory JSON: {e}")
+
+    return ParsedResponse(
+        content=content,
+        memories=memories,
+        raw=response_text,
+    )
+
+
+class InlineMemoryWrapper:
+    """Wrapper that extracts memories from LLM responses inline.
+
+    This is the zero-latency approach - memories are extracted
+    as part of the response, not in a separate call.
+
+    Usage:
+        wrapper = InlineMemoryWrapper(openai_client)
+        response, memories = wrapper.chat(
+            messages=[{"role": "user", "content": "I prefer Python"}],
+            model="gpt-4o-mini"
+        )
+        # response = "Great choice! Python is excellent..."
+        # memories = [{"content": "User prefers Python", "category": "preference"}]
+    """
+
+    def __init__(self, client: Any):
+        """Initialize wrapper.
+
+        Args:
+            client: OpenAI-compatible client
+        """
+        self.client = client
+
+    def chat(
+        self,
+        messages: list[dict[str, Any]],
+        model: str = "gpt-4o-mini",
+        short_instruction: bool = True,
+        **kwargs: Any,
+    ) -> tuple[str, list[dict[str, Any]]]:
+        """Send chat request and extract memories inline.
+
+        Args:
+            messages: Chat messages
+            model: Model to use
+            short_instruction: Use shorter memory instruction
+            **kwargs: Additional args for chat completion
+
+        Returns:
+            Tuple of (response_content, extracted_memories)
+        """
+        # Inject memory instruction
+        modified_messages = inject_memory_instruction(messages, short=short_instruction)
+
+        # Call LLM
+        response = self.client.chat.completions.create(
+            model=model,
+            messages=modified_messages,
+            **kwargs,
+        )
+
+        raw_content = response.choices[0].message.content
+
+        # Parse response and extract memories
+        parsed = parse_response_with_memory(raw_content)
+
+        return parsed.content, parsed.memories
+
+    def chat_with_response(
+        self,
+        messages: list[dict[str, Any]],
+        model: str = "gpt-4o-mini",
+        **kwargs: Any,
+    ) -> tuple[Any, str, list[dict[str, Any]]]:
+        """Send chat request and return full response object.
+
+        Args:
+            messages: Chat messages
+            model: Model to use
+            **kwargs: Additional args for chat completion
+
+        Returns:
+            Tuple of (response_object, content, memories)
+        """
+        modified_messages = inject_memory_instruction(messages)
+
+        response = self.client.chat.completions.create(
+            model=model,
+            messages=modified_messages,
+            **kwargs,
+        )
+
+        raw_content = response.choices[0].message.content
+        parsed = parse_response_with_memory(raw_content)
+
+        # Modify response to have clean content
+        response.choices[0].message.content = parsed.content
+
+        return response, parsed.content, parsed.memories
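
Both module-level helpers operate on plain message lists and response text, so the prompt/parse round trip can be exercised without any client. A minimal sketch using only the functions defined in this file; the raw response string is a fabricated example of the format the memory instruction asks the model to emit:

    from headroom.memory.inline_extractor import (
        inject_memory_instruction,
        parse_response_with_memory,
    )

    messages = [{"role": "user", "content": "I prefer Python for backend work"}]
    prepared = inject_memory_instruction(messages, short=True)
    # No system message was present, so one is prepended with the short instruction.
    assert prepared[0]["role"] == "system"

    raw = (
        "Python is a solid choice for backend services.\n"
        '<memory>{"memories": [{"content": "Prefers Python for backend work", '
        '"category": "preference"}]}</memory>'
    )
    parsed = parse_response_with_memory(raw)
    print(parsed.content)   # memory block stripped from the visible reply
    print(parsed.memories)  # [{"content": "Prefers Python for backend work", "category": "preference"}]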