alma-memory 0.4.0-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. alma/__init__.py +121 -45
  2. alma/confidence/__init__.py +1 -1
  3. alma/confidence/engine.py +92 -58
  4. alma/confidence/types.py +34 -14
  5. alma/config/loader.py +3 -2
  6. alma/consolidation/__init__.py +23 -0
  7. alma/consolidation/engine.py +678 -0
  8. alma/consolidation/prompts.py +84 -0
  9. alma/core.py +136 -28
  10. alma/domains/__init__.py +6 -6
  11. alma/domains/factory.py +12 -9
  12. alma/domains/schemas.py +17 -3
  13. alma/domains/types.py +8 -4
  14. alma/events/__init__.py +75 -0
  15. alma/events/emitter.py +284 -0
  16. alma/events/storage_mixin.py +246 -0
  17. alma/events/types.py +126 -0
  18. alma/events/webhook.py +425 -0
  19. alma/exceptions.py +49 -0
  20. alma/extraction/__init__.py +31 -0
  21. alma/extraction/auto_learner.py +265 -0
  22. alma/extraction/extractor.py +420 -0
  23. alma/graph/__init__.py +106 -0
  24. alma/graph/backends/__init__.py +32 -0
  25. alma/graph/backends/kuzu.py +624 -0
  26. alma/graph/backends/memgraph.py +432 -0
  27. alma/graph/backends/memory.py +236 -0
  28. alma/graph/backends/neo4j.py +417 -0
  29. alma/graph/base.py +159 -0
  30. alma/graph/extraction.py +198 -0
  31. alma/graph/store.py +860 -0
  32. alma/harness/__init__.py +4 -4
  33. alma/harness/base.py +18 -9
  34. alma/harness/domains.py +27 -11
  35. alma/initializer/__init__.py +1 -1
  36. alma/initializer/initializer.py +51 -43
  37. alma/initializer/types.py +25 -17
  38. alma/integration/__init__.py +9 -9
  39. alma/integration/claude_agents.py +32 -20
  40. alma/integration/helena.py +32 -22
  41. alma/integration/victor.py +57 -33
  42. alma/learning/__init__.py +27 -27
  43. alma/learning/forgetting.py +198 -148
  44. alma/learning/heuristic_extractor.py +40 -24
  45. alma/learning/protocols.py +65 -17
  46. alma/learning/validation.py +7 -2
  47. alma/mcp/__init__.py +4 -4
  48. alma/mcp/__main__.py +2 -1
  49. alma/mcp/resources.py +17 -16
  50. alma/mcp/server.py +102 -44
  51. alma/mcp/tools.py +180 -45
  52. alma/observability/__init__.py +84 -0
  53. alma/observability/config.py +302 -0
  54. alma/observability/logging.py +424 -0
  55. alma/observability/metrics.py +583 -0
  56. alma/observability/tracing.py +440 -0
  57. alma/progress/__init__.py +3 -3
  58. alma/progress/tracker.py +26 -20
  59. alma/progress/types.py +8 -12
  60. alma/py.typed +0 -0
  61. alma/retrieval/__init__.py +11 -11
  62. alma/retrieval/cache.py +20 -21
  63. alma/retrieval/embeddings.py +4 -4
  64. alma/retrieval/engine.py +179 -39
  65. alma/retrieval/scoring.py +73 -63
  66. alma/session/__init__.py +2 -2
  67. alma/session/manager.py +5 -5
  68. alma/session/types.py +5 -4
  69. alma/storage/__init__.py +70 -0
  70. alma/storage/azure_cosmos.py +414 -133
  71. alma/storage/base.py +215 -4
  72. alma/storage/chroma.py +1443 -0
  73. alma/storage/constants.py +103 -0
  74. alma/storage/file_based.py +59 -28
  75. alma/storage/migrations/__init__.py +21 -0
  76. alma/storage/migrations/base.py +321 -0
  77. alma/storage/migrations/runner.py +323 -0
  78. alma/storage/migrations/version_stores.py +337 -0
  79. alma/storage/migrations/versions/__init__.py +11 -0
  80. alma/storage/migrations/versions/v1_0_0.py +373 -0
  81. alma/storage/pinecone.py +1080 -0
  82. alma/storage/postgresql.py +1559 -0
  83. alma/storage/qdrant.py +1306 -0
  84. alma/storage/sqlite_local.py +504 -60
  85. alma/testing/__init__.py +46 -0
  86. alma/testing/factories.py +301 -0
  87. alma/testing/mocks.py +389 -0
  88. alma/types.py +62 -14
  89. alma_memory-0.5.1.dist-info/METADATA +939 -0
  90. alma_memory-0.5.1.dist-info/RECORD +93 -0
  91. {alma_memory-0.4.0.dist-info → alma_memory-0.5.1.dist-info}/WHEEL +1 -1
  92. alma_memory-0.4.0.dist-info/METADATA +0 -488
  93. alma_memory-0.4.0.dist-info/RECORD +0 -52
  94. {alma_memory-0.4.0.dist-info → alma_memory-0.5.1.dist-info}/top_level.txt +0 -0
alma/extraction/auto_learner.py
@@ -0,0 +1,265 @@
+ """
+ ALMA Auto-Learning Module.
+
+ Bridges LLM-powered fact extraction with ALMA's learning protocols.
+ Enables Mem0-style automatic learning from conversations.
+ """
+
+ import logging
+ from typing import Any, Dict, List, Optional
+
+ from alma.extraction import (
+     ExtractedFact,
+     FactExtractor,
+     FactType,
+     create_extractor,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class AutoLearner:
+     """
+     Automatic learning from conversations.
+
+     This class bridges the gap between Mem0's automatic extraction
+     and ALMA's explicit learning protocols. It:
+
+     1. Extracts facts from conversations using LLM or rules
+     2. Validates facts against agent scopes
+     3. Deduplicates against existing memories
+     4. Commits valid facts to ALMA storage
+
+     Usage:
+         alma = ALMA.from_config(".alma/config.yaml")
+         auto_learner = AutoLearner(alma)
+
+         # After a conversation
+         results = auto_learner.learn_from_conversation(
+             messages=[
+                 {"role": "user", "content": "Test the login form"},
+                 {"role": "assistant", "content": "I tested using incremental validation..."},
+             ],
+             agent="helena",
+         )
+     """
+
+     def __init__(
+         self,
+         alma,  # ALMA instance - avoid circular import
+         extractor: Optional[FactExtractor] = None,
+         auto_commit: bool = True,
+         min_confidence: float = 0.5,
+     ):
+         """
+         Initialize AutoLearner.
+
+         Args:
+             alma: ALMA instance for storage and retrieval
+             extractor: Custom extractor, or None for auto-detection
+             auto_commit: Whether to automatically commit extracted facts
+             min_confidence: Minimum confidence threshold for facts
+         """
+         self.alma = alma
+         self.extractor = extractor or create_extractor()
+         self.auto_commit = auto_commit
+         self.min_confidence = min_confidence
+
+     def learn_from_conversation(
+         self,
+         messages: List[Dict[str, str]],
+         agent: str,
+         user_id: Optional[str] = None,
+         commit: Optional[bool] = None,
+     ) -> Dict[str, Any]:
+         """
+         Extract and learn from a conversation.
+
+         Args:
+             messages: Conversation messages
+             agent: Agent that had the conversation
+             user_id: Optional user ID for preferences
+             commit: Override auto_commit setting
+
+         Returns:
+             Dict with extraction results and commit status
+         """
+         should_commit = commit if commit is not None else self.auto_commit
+
+         # Get agent scope for context
+         scope = self.alma.scopes.get(agent)
+         agent_context = None
+         if scope:
+             agent_context = f"Agent '{agent}' can learn: {scope.can_learn}. Cannot learn: {scope.cannot_learn}"
+
+         # Get existing facts to avoid duplicates
+         existing_memories = self.alma.retrieve(
+             task=" ".join(m["content"] for m in messages[-3:]),  # Recent context
+             agent=agent,
+             top_k=20,
+         )
+         existing_facts = []
+         for h in existing_memories.heuristics:
+             existing_facts.append(f"{h.condition}: {h.strategy}")
+         for ap in existing_memories.anti_patterns:
+             existing_facts.append(f"AVOID: {ap.pattern}")
+         for dk in existing_memories.domain_knowledge:
+             existing_facts.append(dk.fact)
+
+         # Extract facts
+         extraction_result = self.extractor.extract(
+             messages=messages,
+             agent_context=agent_context,
+             existing_facts=existing_facts if existing_facts else None,
+         )
+
+         # Filter by confidence and scope
+         valid_facts = []
+         rejected_facts = []
+
+         for fact in extraction_result.facts:
+             # Check confidence
+             if fact.confidence < self.min_confidence:
+                 rejected_facts.append(
+                     {
+                         "fact": fact,
+                         "reason": f"Low confidence: {fact.confidence} < {self.min_confidence}",
+                     }
+                 )
+                 continue
+
+             # Check scope for heuristics and anti-patterns
+             if scope and fact.fact_type in (FactType.HEURISTIC, FactType.ANTI_PATTERN):
+                 # Infer domain from content
+                 inferred_domain = self._infer_domain(fact.content)
+                 if inferred_domain and not scope.is_allowed(inferred_domain):
+                     rejected_facts.append(
+                         {
+                             "fact": fact,
+                             "reason": f"Outside agent scope: {inferred_domain}",
+                         }
+                     )
+                     continue
+
+             valid_facts.append(fact)
+
+         # Commit if enabled
+         committed = []
+         if should_commit:
+             for fact in valid_facts:
+                 try:
+                     result = self._commit_fact(fact, agent, user_id)
+                     if result:
+                         committed.append({"fact": fact, "id": result})
+                 except Exception as e:
+                     logger.error(f"Failed to commit fact: {e}")
+                     rejected_facts.append(
+                         {
+                             "fact": fact,
+                             "reason": f"Commit failed: {str(e)}",
+                         }
+                     )
+
+         return {
+             "extracted_count": len(extraction_result.facts),
+             "valid_count": len(valid_facts),
+             "committed_count": len(committed),
+             "rejected_count": len(rejected_facts),
+             "extraction_time_ms": extraction_result.extraction_time_ms,
+             "tokens_used": extraction_result.tokens_used,
+             "committed": committed,
+             "rejected": rejected_facts,
+             "valid_facts": valid_facts,
+         }
+
+     def _commit_fact(
+         self,
+         fact: ExtractedFact,
+         agent: str,
+         user_id: Optional[str],
+     ) -> Optional[str]:
+         """Commit a single fact to ALMA storage."""
+
+         if fact.fact_type == FactType.HEURISTIC:
+             # Use learning protocol for heuristics
+             return self.alma.learning.add_heuristic_direct(
+                 agent=agent,
+                 project_id=self.alma.project_id,
+                 condition=fact.condition or fact.content,
+                 strategy=fact.strategy or fact.content,
+                 confidence=fact.confidence,
+                 metadata={"source": "auto_extraction"},
+             )
+
+         elif fact.fact_type == FactType.ANTI_PATTERN:
+             return self.alma.learning.add_anti_pattern(
+                 agent=agent,
+                 project_id=self.alma.project_id,
+                 pattern=fact.content,
+                 why_bad=fact.condition,
+                 better_alternative=fact.strategy,
+             )
+
+         elif fact.fact_type == FactType.PREFERENCE:
+             if user_id:
+                 pref = self.alma.add_user_preference(
+                     user_id=user_id,
+                     category=fact.category or "general",
+                     preference=fact.content,
+                     source="auto_extraction",
+                 )
+                 return pref.id
+
+         elif fact.fact_type == FactType.DOMAIN_KNOWLEDGE:
+             # add_domain_knowledge now raises ScopeViolationError instead of returning None
+             knowledge = self.alma.add_domain_knowledge(
+                 agent=agent,
+                 domain=fact.domain or "general",
+                 fact=fact.content,
+                 source="auto_extraction",
+             )
+             return knowledge.id
+
+         elif fact.fact_type == FactType.OUTCOME:
+             # Outcomes need success/failure info we don't have
+             # Store as domain knowledge instead
+             knowledge = self.alma.add_domain_knowledge(
+                 agent=agent,
+                 domain="outcomes",
+                 fact=fact.content,
+                 source="auto_extraction",
+             )
+             return knowledge.id
+
+         return None
+
+     def _infer_domain(self, content: str) -> Optional[str]:
+         """Infer domain from fact content using keywords."""
+         content_lower = content.lower()
+
+         domain_keywords = {
+             "testing": ["test", "assert", "selenium", "playwright", "cypress"],
+             "frontend": ["css", "html", "react", "vue", "ui", "button", "form"],
+             "backend": ["api", "database", "sql", "server", "endpoint"],
+             "security": ["auth", "token", "password", "encrypt", "csrf"],
+             "performance": ["latency", "cache", "optimize", "slow", "fast"],
+         }
+
+         for domain, keywords in domain_keywords.items():
+             if any(kw in content_lower for kw in keywords):
+                 return domain
+
+         return None
+
+
+ def add_auto_learning_to_alma(alma) -> AutoLearner:
+     """
+     Convenience function to add auto-learning to an ALMA instance.
+
+     Usage:
+         alma = ALMA.from_config(".alma/config.yaml")
+         auto_learner = add_auto_learning_to_alma(alma)
+
+         # Now use auto_learner.learn_from_conversation()
+     """
+     return AutoLearner(alma)
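
For reference, a minimal usage sketch of the auto-learning entry point, pieced together from the docstrings in the diff above. The import path, config file location, and agent name are illustrative assumptions, not confirmed package API:

    from alma import ALMA
    from alma.extraction.auto_learner import AutoLearner  # assumed module path

    alma = ALMA.from_config(".alma/config.yaml")  # hypothetical config location
    learner = AutoLearner(alma, auto_commit=False, min_confidence=0.6)

    results = learner.learn_from_conversation(
        messages=[
            {"role": "user", "content": "Test the login form"},
            {"role": "assistant", "content": "Incremental validation caught the bug..."},
        ],
        agent="helena",
    )

    # With auto_commit=False nothing is stored yet; inspect first.
    for fact in results["valid_facts"]:
        print(fact.fact_type, fact.confidence, fact.content)
    print(results["rejected_count"], "facts rejected")

Passing commit=True on a later call would route each valid fact through _commit_fact, which dispatches on FactType as shown in the diff.
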
alma/extraction/extractor.py
@@ -0,0 +1,420 @@
+ """
+ ALMA Fact Extraction Module.
+
+ LLM-powered extraction of facts, preferences, and learnings from conversations.
+ This bridges the gap between Mem0's automatic extraction and ALMA's explicit learning.
+ """
+
+ import logging
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Any, Dict, List, Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ class FactType(Enum):
+     """Types of facts that can be extracted from conversations."""
+
+     HEURISTIC = "heuristic"  # Strategy that worked
+     ANTI_PATTERN = "anti_pattern"  # What NOT to do
+     PREFERENCE = "preference"  # User preference
+     DOMAIN_KNOWLEDGE = "domain_knowledge"  # Factual information
+     OUTCOME = "outcome"  # Task result
+
+
+ @dataclass
+ class ExtractedFact:
+     """A fact extracted from conversation."""
+
+     fact_type: FactType
+     content: str
+     confidence: float  # 0.0 to 1.0
+     source_text: str  # Original text this was extracted from
+     metadata: Optional[Dict[str, Any]] = None
+
+     # For heuristics/anti-patterns
+     condition: Optional[str] = None  # When does this apply?
+     strategy: Optional[str] = None  # What to do?
+
+     # For preferences
+     category: Optional[str] = None
+
+     # For domain knowledge
+     domain: Optional[str] = None
+
+
+ @dataclass
+ class ExtractionResult:
+     """Result of fact extraction from a conversation."""
+
+     facts: List[ExtractedFact]
+     raw_response: str  # LLM's raw response for debugging
+     tokens_used: int
+     extraction_time_ms: int
+
+
+ class FactExtractor(ABC):
+     """Abstract base class for fact extraction."""
+
+     @abstractmethod
+     def extract(
+         self,
+         messages: List[Dict[str, str]],
+         agent_context: Optional[str] = None,
+         existing_facts: Optional[List[str]] = None,
+     ) -> ExtractionResult:
+         """
+         Extract facts from a conversation.
+
+         Args:
+             messages: List of {"role": "user"|"assistant", "content": "..."}
+             agent_context: Optional context about the agent's domain
+             existing_facts: Optional list of already-known facts to avoid duplicates
+
+         Returns:
+             ExtractionResult with extracted facts
+         """
+         pass
+
+
+ class LLMFactExtractor(FactExtractor):
+     """
+     LLM-powered fact extraction.
+
+     Uses structured prompting to extract facts, preferences, and learnings
+     from conversations. Supports OpenAI, Anthropic, and local models.
+     """
+
+     EXTRACTION_PROMPT = """You are a fact extraction system for an AI agent memory architecture.
+
+ Analyze the following conversation and extract facts worth remembering.
+
+ IMPORTANT: Only extract facts that are:
+ 1. Specific and actionable (not vague observations)
+ 2. Likely to be useful in future similar situations
+ 3. Not already in the existing facts list
+
+ Categorize each fact as one of:
+ - HEURISTIC: A strategy or approach that worked well
+ - ANTI_PATTERN: Something that failed or should be avoided
+ - PREFERENCE: A user preference or constraint
+ - DOMAIN_KNOWLEDGE: A factual piece of information about the domain
+ - OUTCOME: The result of a specific task
+
+ For HEURISTIC and ANTI_PATTERN, also extract:
+ - condition: When does this apply?
+ - strategy: What to do (or not do)?
+
+ For PREFERENCE, extract:
+ - category: What type of preference (communication, code_style, workflow, etc.)
+
+ For DOMAIN_KNOWLEDGE, extract:
+ - domain: What knowledge domain this belongs to
+
+ {agent_context}
+
+ {existing_facts_section}
+
+ CONVERSATION:
+ {conversation}
+
+ Respond in JSON format:
+ ```json
+ {{
+     "facts": [
+         {{
+             "fact_type": "HEURISTIC|ANTI_PATTERN|PREFERENCE|DOMAIN_KNOWLEDGE|OUTCOME",
+             "content": "The main fact statement",
+             "confidence": 0.0-1.0,
+             "condition": "optional - when this applies",
+             "strategy": "optional - what to do",
+             "category": "optional - preference category",
+             "domain": "optional - knowledge domain"
+         }}
+     ]
+ }}
+ ```
+
+ If no facts worth extracting, return: {{"facts": []}}
+ """
+
+     def __init__(
+         self,
+         provider: str = "openai",
+         model: str = "gpt-4o-mini",
+         api_key: Optional[str] = None,
+         temperature: float = 0.1,
+     ):
+         """
+         Initialize LLM fact extractor.
+
+         Args:
+             provider: "openai", "anthropic", or "local"
+             model: Model name/identifier
+             api_key: API key (or use environment variable)
+             temperature: LLM temperature for extraction
+         """
+         self.provider = provider
+         self.model = model
+         self.api_key = api_key
+         self.temperature = temperature
+         self._client = None
+
+     def _get_client(self):
+         """Lazy initialization of LLM client."""
+         if self._client is None:
+             if self.provider == "openai":
+                 from openai import OpenAI
+
+                 self._client = OpenAI(api_key=self.api_key)
+             elif self.provider == "anthropic":
+                 from anthropic import Anthropic
+
+                 self._client = Anthropic(api_key=self.api_key)
+             else:
+                 raise ValueError(f"Unsupported provider: {self.provider}")
+         return self._client
+
+     def extract(
+         self,
+         messages: List[Dict[str, str]],
+         agent_context: Optional[str] = None,
+         existing_facts: Optional[List[str]] = None,
+     ) -> ExtractionResult:
+         """Extract facts from conversation using LLM."""
+         import time
+
+         start_time = time.time()
+
+         # Format conversation
+         conversation = "\n".join(
+             f"{msg['role'].upper()}: {msg['content']}" for msg in messages
+         )
+
+         # Build prompt
+         agent_context_section = ""
+         if agent_context:
+             agent_context_section = f"\nAGENT CONTEXT:\n{agent_context}\n"
+
+         existing_facts_section = ""
+         if existing_facts:
+             facts_list = "\n".join(f"- {f}" for f in existing_facts)
+             existing_facts_section = (
+                 f"\nEXISTING FACTS (do not duplicate):\n{facts_list}\n"
+             )
+
+         prompt = self.EXTRACTION_PROMPT.format(
+             agent_context=agent_context_section,
+             existing_facts_section=existing_facts_section,
+             conversation=conversation,
+         )
+
+         # Call LLM
+         client = self._get_client()
+         tokens_used = 0
+
+         if self.provider == "openai":
+             response = client.chat.completions.create(
+                 model=self.model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=self.temperature,
+             )
+             raw_response = response.choices[0].message.content
+             tokens_used = response.usage.total_tokens if response.usage else 0
+
+         elif self.provider == "anthropic":
+             response = client.messages.create(
+                 model=self.model,
+                 max_tokens=2000,
+                 messages=[{"role": "user", "content": prompt}],
+             )
+             raw_response = response.content[0].text
+             tokens_used = response.usage.input_tokens + response.usage.output_tokens
+
+         # Parse response
+         facts = self._parse_response(raw_response, conversation)
+
+         extraction_time_ms = int((time.time() - start_time) * 1000)
+
+         return ExtractionResult(
+             facts=facts,
+             raw_response=raw_response,
+             tokens_used=tokens_used,
+             extraction_time_ms=extraction_time_ms,
+         )
+
+     def _parse_response(
+         self,
+         raw_response: str,
+         source_text: str,
+     ) -> List[ExtractedFact]:
+         """Parse LLM response into ExtractedFact objects."""
+         import json
+         import re
+
+         # Extract JSON from response (handle markdown code blocks)
+         json_match = re.search(r"```json\s*(.*?)\s*```", raw_response, re.DOTALL)
+         if json_match:
+             json_str = json_match.group(1)
+         else:
+             # Try to find raw JSON
+             json_match = re.search(r"\{.*\}", raw_response, re.DOTALL)
+             if json_match:
+                 json_str = json_match.group(0)
+             else:
+                 logger.warning(
+                     f"Could not parse JSON from response: {raw_response[:200]}"
+                 )
+                 return []
+
+         try:
+             data = json.loads(json_str)
+         except json.JSONDecodeError as e:
+             logger.warning(f"JSON parse error: {e}")
+             return []
+
+         facts = []
+         for item in data.get("facts", []):
+             try:
+                 fact_type = FactType[item["fact_type"].upper()]
+                 facts.append(
+                     ExtractedFact(
+                         fact_type=fact_type,
+                         content=item["content"],
+                         confidence=float(item.get("confidence", 0.7)),
+                         source_text=source_text[:500],  # Truncate for storage
+                         condition=item.get("condition"),
+                         strategy=item.get("strategy"),
+                         category=item.get("category"),
+                         domain=item.get("domain"),
+                     )
+                 )
+             except (KeyError, ValueError) as e:
+                 logger.warning(f"Could not parse fact: {item}, error: {e}")
+                 continue
+
+         return facts
+
+
+ class RuleBasedExtractor(FactExtractor):
+     """
+     Rule-based fact extraction for offline/free usage.
+
+     Uses pattern matching and heuristics instead of LLM calls.
+     Less accurate but free and fast.
+     """
+
+     # Patterns that indicate different fact types
+     HEURISTIC_PATTERNS = [
+         r"(?:worked|succeeded|fixed|solved|helped).*(?:by|using|with)",
+         r"(?:better|best|good)\s+(?:to|approach|way|strategy)",
+         r"(?:should|always|recommend).*(?:use|try|do)",
+     ]
+
+     ANTI_PATTERN_PATTERNS = [
+         r"(?:don't|do not|never|avoid).*(?:use|do|try)",
+         r"(?:failed|broke|caused|error).*(?:because|when|due)",
+         r"(?:bad|wrong|incorrect)\s+(?:to|approach|way)",
+     ]
+
+     PREFERENCE_PATTERNS = [
+         r"(?:i|user)\s+(?:prefer|like|want|need)",
+         r"(?:always|never).*(?:for me|i want)",
+     ]
+
+     def extract(
+         self,
+         messages: List[Dict[str, str]],
+         agent_context: Optional[str] = None,
+         existing_facts: Optional[List[str]] = None,
+     ) -> ExtractionResult:
+         """Extract facts using pattern matching."""
+         import re
+         import time
+
+         start_time = time.time()
+         facts = []
+
+         for msg in messages:
+             content = msg["content"].lower()
+
+             # Check for heuristics
+             for pattern in self.HEURISTIC_PATTERNS:
+                 if re.search(pattern, content, re.IGNORECASE):
+                     facts.append(
+                         ExtractedFact(
+                             fact_type=FactType.HEURISTIC,
+                             content=msg["content"][:200],
+                             confidence=0.5,  # Lower confidence for rule-based
+                             source_text=msg["content"],
+                         )
+                     )
+                     break
+
+             # Check for anti-patterns
+             for pattern in self.ANTI_PATTERN_PATTERNS:
+                 if re.search(pattern, content, re.IGNORECASE):
+                     facts.append(
+                         ExtractedFact(
+                             fact_type=FactType.ANTI_PATTERN,
+                             content=msg["content"][:200],
+                             confidence=0.5,
+                             source_text=msg["content"],
+                         )
+                     )
+                     break
+
+             # Check for preferences
+             for pattern in self.PREFERENCE_PATTERNS:
+                 if re.search(pattern, content, re.IGNORECASE):
+                     facts.append(
+                         ExtractedFact(
+                             fact_type=FactType.PREFERENCE,
+                             content=msg["content"][:200],
+                             confidence=0.5,
+                             source_text=msg["content"],
+                         )
+                     )
+                     break
+
+         extraction_time_ms = int((time.time() - start_time) * 1000)
+
+         return ExtractionResult(
+             facts=facts,
+             raw_response="rule-based extraction",
+             tokens_used=0,
+             extraction_time_ms=extraction_time_ms,
+         )
+
+
+ def create_extractor(
+     provider: str = "auto",
+     **kwargs,
+ ) -> FactExtractor:
+     """
+     Factory function to create appropriate extractor.
+
+     Args:
+         provider: "openai", "anthropic", "local", "rule-based", or "auto"
+         **kwargs: Additional arguments for the extractor
+
+     Returns:
+         Configured FactExtractor instance
+     """
+     if provider == "auto":
+         # Try to use LLM if API key is available
+         import os
+
+         if os.environ.get("OPENAI_API_KEY"):
+             provider = "openai"
+         elif os.environ.get("ANTHROPIC_API_KEY"):
+             provider = "anthropic"
+         else:
+             provider = "rule-based"
+
+     if provider == "rule-based":
+         return RuleBasedExtractor()
+     else:
+         return LLMFactExtractor(provider=provider, **kwargs)
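
As a reference point, a minimal sketch of driving the extractor directly via the rule-based fallback, which needs no API key; the sample message is invented for illustration:

    from alma.extraction import create_extractor

    extractor = create_extractor(provider="rule-based")
    result = extractor.extract(
        messages=[
            {"role": "assistant",
             "content": "Never use sleep() in tests; it caused flaky failures."},
        ]
    )
    # "Never use ..." matches ANTI_PATTERN_PATTERNS, so one
    # FactType.ANTI_PATTERN fact comes back with confidence 0.5.
    for fact in result.facts:
        print(fact.fact_type.name, fact.confidence, fact.content)
    print(result.tokens_used, "tokens,", result.extraction_time_ms, "ms")

With provider="auto", create_extractor would instead pick the OpenAI or Anthropic backend when OPENAI_API_KEY or ANTHROPIC_API_KEY is set, falling back to the rule-based extractor otherwise.
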