claude-mpm 3.3.0__py3-none-any.whl → 3.3.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- claude_mpm/agents/templates/data_engineer.json +1 -1
- claude_mpm/agents/templates/documentation.json +1 -1
- claude_mpm/agents/templates/engineer.json +1 -1
- claude_mpm/agents/templates/ops.json +1 -1
- claude_mpm/agents/templates/pm.json +1 -1
- claude_mpm/agents/templates/qa.json +1 -1
- claude_mpm/agents/templates/research.json +1 -1
- claude_mpm/agents/templates/security.json +1 -1
- claude_mpm/agents/templates/test_integration.json +112 -0
- claude_mpm/agents/templates/version_control.json +1 -1
- claude_mpm/cli/commands/memory.py +575 -25
- claude_mpm/cli/commands/run.py +115 -14
- claude_mpm/cli/parser.py +76 -0
- claude_mpm/constants.py +5 -0
- claude_mpm/core/claude_runner.py +13 -11
- claude_mpm/core/session_manager.py +46 -0
- claude_mpm/core/simple_runner.py +13 -11
- claude_mpm/hooks/claude_hooks/hook_handler.py +2 -26
- claude_mpm/services/agent_memory_manager.py +264 -23
- claude_mpm/services/memory_builder.py +491 -0
- claude_mpm/services/memory_optimizer.py +619 -0
- claude_mpm/services/memory_router.py +445 -0
- claude_mpm/services/socketio_server.py +184 -20
- claude_mpm-3.3.2.dist-info/METADATA +159 -0
- {claude_mpm-3.3.0.dist-info → claude_mpm-3.3.2.dist-info}/RECORD +29 -28
- claude_mpm/agents/templates/test-integration-agent.md +0 -34
- claude_mpm/core/websocket_handler.py +0 -233
- claude_mpm/services/websocket_server.py +0 -376
- claude_mpm-3.3.0.dist-info/METADATA +0 -432
- {claude_mpm-3.3.0.dist-info → claude_mpm-3.3.2.dist-info}/WHEEL +0 -0
- {claude_mpm-3.3.0.dist-info → claude_mpm-3.3.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-3.3.0.dist-info → claude_mpm-3.3.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-3.3.0.dist-info → claude_mpm-3.3.2.dist-info}/top_level.txt +0 -0
claude_mpm/services/memory_builder.py (new file)

@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+"""
+Memory Builder Service
+=====================
+
+Builds agent memories from project documentation by parsing and extracting
+memory-worthy content for appropriate agents.
+
+This service provides:
+- Documentation parsing (CLAUDE.md, QA.md, STRUCTURE.md, etc.)
+- Content extraction and categorization
+- Agent assignment based on content type
+- Concise memory entry creation (< 100 chars)
+- Batch building from multiple docs
+
+WHY: Project documentation contains valuable patterns, guidelines, and knowledge
+that agents should be aware of. This service automatically extracts and assigns
+relevant information to appropriate agents.
+
+DESIGN DECISION: Focuses on extracting actionable insights rather than copying
+documentation verbatim. Creates concise learnings that fit memory constraints
+while preserving essential information.
+"""
+
+import re
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Tuple
+from datetime import datetime
+
+from claude_mpm.core import LoggerMixin
+from claude_mpm.core.config import Config
+from claude_mpm.utils.paths import PathResolver
+from claude_mpm.services.memory_router import MemoryRouter
+
+
+class MemoryBuilder(LoggerMixin):
+    """Builds agent memories from project documentation.
+
+    WHY: Documentation contains patterns and guidelines that agents should know
+    about. Manual memory creation is time-consuming and prone to inconsistency.
+    This service automates the extraction and assignment process.
+
+    DESIGN DECISION: Uses pattern matching and content analysis to extract
+    actionable insights rather than copying raw documentation. Focuses on
+    creating learnings that will actually be useful to agents.
+    """
+
+    # Documentation files to process
+    DOC_FILES = {
+        'CLAUDE.md': {
+            'priority': 'high',
+            'sections': ['development guidelines', 'key components', 'common issues'],
+            'agents': ['pm', 'engineer']
+        },
+        'docs/STRUCTURE.md': {
+            'priority': 'high',
+            'sections': ['file placement', 'design patterns', 'directory structure'],
+            'agents': ['engineer', 'documentation']
+        },
+        'docs/QA.md': {
+            'priority': 'high',
+            'sections': ['testing', 'quality assurance', 'validation'],
+            'agents': ['qa', 'engineer']
+        },
+        'docs/DEPLOY.md': {
+            'priority': 'medium',
+            'sections': ['deployment', 'versioning', 'release'],
+            'agents': ['engineer', 'pm']
+        },
+        'docs/VERSIONING.md': {
+            'priority': 'medium',
+            'sections': ['version management', 'semantic versioning'],
+            'agents': ['engineer', 'pm']
+        }
+    }
+
+    # Patterns for extracting actionable content
+    EXTRACTION_PATTERNS = {
+        'guidelines': [
+            r'(?:must|should|always|never|avoid|ensure|remember to)\s+(.+?)(?:\.|$)',
+            r'(?:important|note|warning|tip):\s*(.+?)(?:\.|$)',
+            r'(?:do not|don\'t)\s+(.+?)(?:\.|$)'
+        ],
+        'patterns': [
+            r'(?:pattern|approach|strategy|method):\s*(.+?)(?:\.|$)',
+            r'(?:use|implement|follow)\s+(.+?)\s+(?:pattern|approach|for)',
+            r'(?:follows|uses|implements)\s+(.+?)\s+(?:pattern|architecture)'
+        ],
+        'mistakes': [
+            r'(?:common\s+)?(?:mistake|error|issue|problem):\s*(.+?)(?:\.|$)',
+            r'(?:avoid|never|don\'t)\s+(.+?)(?:\.|$)',
+            r'(?:pitfall|gotcha|warning):\s*(.+?)(?:\.|$)'
+        ],
+        'architecture': [
+            r'(?:architecture|structure|design):\s*(.+?)(?:\.|$)',
+            r'(?:component|service|module)\s+(.+?)\s+(?:provides|handles|manages)',
+            r'(?:uses|implements|follows)\s+(.+?)\s+(?:architecture|pattern)'
+        ]
+    }
+
+    def __init__(self, config: Optional[Config] = None):
+        """Initialize the memory builder.
+
+        Args:
+            config: Optional Config object
+        """
+        super().__init__()
+        self.config = config or Config()
+        self.project_root = PathResolver.get_project_root()
+        self.memories_dir = self.project_root / ".claude-mpm" / "memories"
+        self.router = MemoryRouter(config)
+
+    def build_from_documentation(self, force_rebuild: bool = False) -> Dict[str, Any]:
+        """Build agent memories from project documentation.
+
+        WHY: Documentation contains project-specific knowledge that agents need.
+        This method extracts and assigns relevant information to appropriate agents.
+
+        Args:
+            force_rebuild: If True, rebuilds even if docs haven't changed
+
+        Returns:
+            Dict containing build results and statistics
+        """
+        try:
+            results = {
+                "success": True,
+                "timestamp": datetime.now().isoformat(),
+                "files_processed": 0,
+                "memories_created": 0,
+                "memories_updated": 0,
+                "agents_affected": set(),
+                "files": {},
+                "errors": []
+            }
+
+            # Process each documentation file
+            for doc_path, doc_config in self.DOC_FILES.items():
+                file_path = self.project_root / doc_path
+
+                if not file_path.exists():
+                    self.logger.debug(f"Documentation file not found: {doc_path}")
+                    continue
+
+                # Check if rebuild is needed
+                if not force_rebuild and not self._needs_rebuild(file_path):
+                    self.logger.debug(f"Skipping {doc_path} - no changes detected")
+                    continue
+
+                file_result = self._process_documentation_file(file_path, doc_config)
+                results["files"][doc_path] = file_result
+
+                # Aggregate results
+                if file_result.get("success"):
+                    results["files_processed"] += 1
+                    results["memories_created"] += file_result.get("memories_created", 0)
+                    results["memories_updated"] += file_result.get("memories_updated", 0)
+                    results["agents_affected"].update(file_result.get("agents_affected", []))
+                else:
+                    results["errors"].append(f"{doc_path}: {file_result.get('error', 'Unknown error')}")
+
+            # Convert set to list for JSON serialization
+            results["agents_affected"] = list(results["agents_affected"])
+            results["total_agents_affected"] = len(results["agents_affected"])
+
+            self.logger.info(f"Built memories from documentation: {results['files_processed']} files, {results['memories_created']} memories created")
+            return results
+
+        except Exception as e:
+            self.logger.error(f"Error building memories from documentation: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "timestamp": datetime.now().isoformat()
+            }
+
+    def extract_from_text(self, text: str, source: str) -> List[Dict[str, Any]]:
+        """Extract memory-worthy content from text.
+
+        WHY: Provides reusable text extraction logic that can be used for
+        custom documentation or other text sources beyond standard files.
+
+        Args:
+            text: Text content to analyze
+            source: Source identifier for context
+
+        Returns:
+            List of extracted memory items with metadata
+        """
+        try:
+            extracted_items = []
+
+            # Process each extraction pattern type
+            for pattern_type, patterns in self.EXTRACTION_PATTERNS.items():
+                for pattern in patterns:
+                    matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
+
+                    for match in matches:
+                        content = match.group(1).strip()
+
+                        # Clean and validate content
+                        content = self._clean_extracted_content(content)
+                        if not self._is_valid_memory_content(content):
+                            continue
+
+                        # Route to appropriate agent
+                        routing_result = self.router.analyze_and_route(content)
+
+                        extracted_item = {
+                            "content": content,
+                            "type": pattern_type,
+                            "source": source,
+                            "target_agent": routing_result.get("target_agent", "pm"),
+                            "section": routing_result.get("section", "Recent Learnings"),
+                            "confidence": routing_result.get("confidence", 0.5),
+                            "pattern_matched": pattern
+                        }
+
+                        extracted_items.append(extracted_item)
+
+            # Remove near-duplicates
+            unique_items = self._deduplicate_extracted_items(extracted_items)
+
+            self.logger.debug(f"Extracted {len(unique_items)} unique items from {source}")
+            return unique_items
+
+        except Exception as e:
+            self.logger.error(f"Error extracting content from text: {e}")
+            return []
+
+    def build_agent_memory_from_items(self, agent_id: str, items: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Build or update agent memory from extracted items.
+
+        WHY: Extracted items need to be properly integrated into agent memory
+        files while respecting existing content and size limits.
+
+        Args:
+            agent_id: Target agent identifier
+            items: List of extracted memory items
+
+        Returns:
+            Dict containing update results
+        """
+        try:
+            from claude_mpm.services.agent_memory_manager import get_memory_manager
+            memory_manager = get_memory_manager(self.config)
+
+            result = {
+                "success": True,
+                "agent_id": agent_id,
+                "items_processed": 0,
+                "items_added": 0,
+                "items_skipped": 0,
+                "sections_updated": set(),
+                "errors": []
+            }
+
+            # Filter items for this agent
+            agent_items = [item for item in items if item.get("target_agent") == agent_id]
+
+            for item in agent_items:
+                result["items_processed"] += 1
+
+                try:
+                    # Add to memory
+                    section = item.get("section", "Recent Learnings")
+                    content = item.get("content", "")
+
+                    success = memory_manager.update_agent_memory(agent_id, section, content)
+
+                    if success:
+                        result["items_added"] += 1
+                        result["sections_updated"].add(section)
+                    else:
+                        result["items_skipped"] += 1
+                        result["errors"].append(f"Failed to add: {content[:50]}...")
+
+                except Exception as e:
+                    result["items_skipped"] += 1
+                    result["errors"].append(f"Error processing item: {str(e)}")
+
+            # Convert set to list
+            result["sections_updated"] = list(result["sections_updated"])
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error building memory for {agent_id}: {e}")
+            return {
+                "success": False,
+                "agent_id": agent_id,
+                "error": str(e)
+            }
+
+    def _process_documentation_file(self, file_path: Path, doc_config: Dict[str, Any]) -> Dict[str, Any]:
+        """Process a single documentation file.
+
+        Args:
+            file_path: Path to documentation file
+            doc_config: Configuration for this file type
+
+        Returns:
+            Processing results
+        """
+        try:
+            # Read file content
+            content = file_path.read_text(encoding='utf-8')
+
+            # Extract memory items
+            extracted_items = self.extract_from_text(content, str(file_path.relative_to(self.project_root)))
+
+            result = {
+                "success": True,
+                "file_path": str(file_path),
+                "content_length": len(content),
+                "items_extracted": len(extracted_items),
+                "memories_created": 0,
+                "memories_updated": 0,
+                "agents_affected": [],
+                "agent_results": {}
+            }
+
+            # Group items by target agent
+            agent_items = {}
+            for item in extracted_items:
+                agent = item.get("target_agent", "pm")
+                if agent not in agent_items:
+                    agent_items[agent] = []
+                agent_items[agent].append(item)
+
+            # Update each agent's memory
+            for agent_id, items in agent_items.items():
+                agent_result = self.build_agent_memory_from_items(agent_id, items)
+                result["agent_results"][agent_id] = agent_result
+
+                if agent_result.get("success"):
+                    result["agents_affected"].append(agent_id)
+                    result["memories_created"] += agent_result.get("items_added", 0)
+
+            # Update last processed timestamp
+            self._update_last_processed(file_path)
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error processing documentation file {file_path}: {e}")
+            return {
+                "success": False,
+                "file_path": str(file_path),
+                "error": str(e)
+            }
+
+    def _needs_rebuild(self, file_path: Path) -> bool:
+        """Check if documentation file needs to be processed.
+
+        Args:
+            file_path: Path to documentation file
+
+        Returns:
+            True if file needs processing
+        """
+        # Check if file was modified since last processing
+        try:
+            last_processed_file = self.memories_dir / ".last_processed.json"
+
+            if not last_processed_file.exists():
+                return True
+
+            import json
+            last_processed = json.loads(last_processed_file.read_text())
+
+            file_key = str(file_path.relative_to(self.project_root))
+            if file_key not in last_processed:
+                return True
+
+            last_processed_time = datetime.fromisoformat(last_processed[file_key])
+            file_modified_time = datetime.fromtimestamp(file_path.stat().st_mtime)
+
+            return file_modified_time > last_processed_time
+
+        except Exception as e:
+            self.logger.debug(f"Error checking rebuild status for {file_path}: {e}")
+            return True  # Default to rebuilding if we can't determine
+
+    def _update_last_processed(self, file_path: Path):
+        """Update last processed timestamp for file.
+
+        Args:
+            file_path: Path to documentation file
+        """
+        try:
+            self.memories_dir.mkdir(parents=True, exist_ok=True)
+            last_processed_file = self.memories_dir / ".last_processed.json"
+
+            # Load existing data
+            if last_processed_file.exists():
+                import json
+                last_processed = json.loads(last_processed_file.read_text())
+            else:
+                last_processed = {}
+
+            # Update timestamp
+            file_key = str(file_path.relative_to(self.project_root))
+            last_processed[file_key] = datetime.now().isoformat()
+
+            # Save back
+            import json
+            last_processed_file.write_text(json.dumps(last_processed, indent=2))
+
+        except Exception as e:
+            self.logger.warning(f"Error updating last processed timestamp: {e}")
+
+    def _clean_extracted_content(self, content: str) -> str:
+        """Clean and normalize extracted content.
+
+        Args:
+            content: Raw extracted content
+
+        Returns:
+            Cleaned content string
+        """
+        # Remove markdown formatting
+        content = re.sub(r'[*_`#]+', '', content)
+
+        # Remove extra whitespace
+        content = re.sub(r'\s+', ' ', content).strip()
+
+        # Remove common prefixes that don't add value
+        content = re.sub(r'^(?:note:|tip:|important:|warning:)\s*', '', content, flags=re.IGNORECASE)
+
+        # Truncate to memory limit (with ellipsis if needed)
+        if len(content) > 95:  # Leave room for ellipsis
+            content = content[:95] + "..."
+
+        return content
+
+    def _is_valid_memory_content(self, content: str) -> bool:
+        """Validate if content is suitable for memory storage.
+
+        Args:
+            content: Content to validate
+
+        Returns:
+            True if content is valid for memory
+        """
+        # Must have minimum length
+        if len(content) < 10:
+            return False
+
+        # Must contain actionable information
+        actionable_words = ['use', 'avoid', 'ensure', 'follow', 'implement', 'check', 'must', 'should', 'never', 'always']
+        if not any(word in content.lower() for word in actionable_words):
+            return False
+
+        # Avoid overly generic content
+        generic_phrases = ['this is', 'this document', 'see above', 'as mentioned', 'for more info']
+        if any(phrase in content.lower() for phrase in generic_phrases):
+            return False
+
+        return True
+
+    def _deduplicate_extracted_items(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Remove near-duplicate extracted items.
+
+        Args:
+            items: List of extracted items
+
+        Returns:
+            Deduplicated list
+        """
+        from difflib import SequenceMatcher
+
+        unique_items = []
+
+        for item in items:
+            content = item.get("content", "")
+            is_duplicate = False
+
+            # Check against existing unique items
+            for unique_item in unique_items:
+                unique_content = unique_item.get("content", "")
+                similarity = SequenceMatcher(None, content.lower(), unique_content.lower()).ratio()
+
+                if similarity > 0.8:  # 80% similarity threshold
+                    is_duplicate = True
+                    break
+
+            if not is_duplicate:
+                unique_items.append(item)
+
+        return unique_items
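
For orientation only (not part of the diff), the sketch below shows how the new builder might be driven, inferred from the public methods added in this file. The sample documentation line and the printed fields are illustrative assumptions, not claude-mpm documentation, and routing behavior depends on the MemoryRouter service also added in this release.

    from claude_mpm.services.memory_builder import MemoryBuilder

    builder = MemoryBuilder()

    # Rebuild memories from CLAUDE.md, docs/STRUCTURE.md, docs/QA.md, etc.,
    # even if the files have not changed since the last run.
    results = builder.build_from_documentation(force_rebuild=True)
    print(results["files_processed"], results["memories_created"], results["errors"])

    # Extract items from arbitrary text; each surviving match is routed to a
    # target agent. The documentation line below is a made-up example that the
    # 'guidelines' pattern would match ("Always ..." with an actionable verb).
    items = builder.extract_from_text(
        "Always use pytest markers to isolate integration tests.",
        source="docs/QA.md",
    )
    for item in items:
        print(item["target_agent"], item["section"], item["content"])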