bot-knows 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. bot_knows/__init__.py +70 -0
  2. bot_knows/config.py +115 -0
  3. bot_knows/domain/__init__.py +5 -0
  4. bot_knows/domain/chat.py +62 -0
  5. bot_knows/domain/message.py +64 -0
  6. bot_knows/domain/relation.py +56 -0
  7. bot_knows/domain/topic.py +132 -0
  8. bot_knows/domain/topic_evidence.py +55 -0
  9. bot_knows/importers/__init__.py +12 -0
  10. bot_knows/importers/base.py +116 -0
  11. bot_knows/importers/chatgpt.py +154 -0
  12. bot_knows/importers/claude.py +172 -0
  13. bot_knows/importers/generic_json.py +272 -0
  14. bot_knows/importers/registry.py +125 -0
  15. bot_knows/infra/__init__.py +5 -0
  16. bot_knows/infra/llm/__init__.py +6 -0
  17. bot_knows/infra/llm/anthropic_provider.py +172 -0
  18. bot_knows/infra/llm/openai_provider.py +195 -0
  19. bot_knows/infra/mongo/__init__.py +5 -0
  20. bot_knows/infra/mongo/client.py +145 -0
  21. bot_knows/infra/mongo/repositories.py +348 -0
  22. bot_knows/infra/neo4j/__init__.py +5 -0
  23. bot_knows/infra/neo4j/client.py +152 -0
  24. bot_knows/infra/neo4j/graph_repository.py +329 -0
  25. bot_knows/infra/redis/__init__.py +6 -0
  26. bot_knows/infra/redis/cache.py +198 -0
  27. bot_knows/infra/redis/client.py +193 -0
  28. bot_knows/interfaces/__init__.py +18 -0
  29. bot_knows/interfaces/embedding.py +55 -0
  30. bot_knows/interfaces/graph.py +194 -0
  31. bot_knows/interfaces/llm.py +70 -0
  32. bot_knows/interfaces/recall.py +92 -0
  33. bot_knows/interfaces/storage.py +225 -0
  34. bot_knows/logging.py +101 -0
  35. bot_knows/models/__init__.py +22 -0
  36. bot_knows/models/chat.py +55 -0
  37. bot_knows/models/ingest.py +70 -0
  38. bot_knows/models/message.py +49 -0
  39. bot_knows/models/recall.py +58 -0
  40. bot_knows/models/topic.py +100 -0
  41. bot_knows/orchestrator.py +398 -0
  42. bot_knows/py.typed +0 -0
  43. bot_knows/services/__init__.py +24 -0
  44. bot_knows/services/chat_processing.py +182 -0
  45. bot_knows/services/dedup_service.py +161 -0
  46. bot_knows/services/graph_service.py +217 -0
  47. bot_knows/services/message_builder.py +135 -0
  48. bot_knows/services/recall_service.py +296 -0
  49. bot_knows/services/tasks.py +128 -0
  50. bot_knows/services/topic_extraction.py +199 -0
  51. bot_knows/utils/__init__.py +22 -0
  52. bot_knows/utils/hashing.py +126 -0
  53. bot_knows-0.1.0.dist-info/METADATA +294 -0
  54. bot_knows-0.1.0.dist-info/RECORD +56 -0
  55. bot_knows-0.1.0.dist-info/WHEEL +4 -0
  56. bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,296 @@
1
+ """Recall service for bot_knows.
2
+
3
+ This module provides the evidence-weighted recall service with
4
+ spaced repetition-style decay and reinforcement.
5
+ """
6
+
7
+ import math
8
+ import time
9
+ from typing import Literal
10
+
11
+ from bot_knows.interfaces.graph import GraphServiceInterface
12
+ from bot_knows.interfaces.storage import StorageInterface
13
+ from bot_knows.logging import get_logger
14
+ from bot_knows.models.recall import RecallItemDTO, TopicRecallStateDTO
15
+
16
# Public API of this module.
__all__ = [
    "CONTEXT_WEIGHTS",
    "RecallService",
]

logger = get_logger(__name__)

# Context weights for reinforcement: how strongly each kind of access
# reinforces a topic's recall strength (used by RecallService.reinforce).
CONTEXT_WEIGHTS: dict[str, float] = {
    "passive": 0.2,  # Passive read (background access)
    "active": 0.6,  # Active query (user explicitly asked)
    "recall": 1.0,  # Recall prompt (spaced repetition review)
}
29
+
30
+
31
class RecallService:
    """Evidence-weighted recall service.

    Implements a spaced repetition-inspired recall system with:
    - Time-based decay: strength *= exp(-Δt / (stability * 86400))
    - Reinforcement: strength += confidence * novelty * context_weight
    - Stability growth: stability += k * confidence
    - Semantic reinforcement: boost related topics

    Example:
        service = RecallService(storage, graph)

        # Reinforce when topic is accessed
        state = await service.reinforce(topic_id, confidence=0.9, context="active")

        # Get topics due for review
        due_topics = await service.get_due_topics(threshold=0.3)
    """

    def __init__(
        self,
        storage: StorageInterface,
        graph: GraphServiceInterface,
        stability_k: float = 0.1,
        semantic_boost: float = 0.1,
    ) -> None:
        """Initialize service with dependencies.

        Args:
            storage: Storage interface for recall states
            graph: Graph interface for related topics
            stability_k: Factor for stability growth on reinforcement
            semantic_boost: Factor for boosting related topics
        """
        self._storage = storage
        self._graph = graph
        self._stability_k = stability_k
        self._semantic_boost = semantic_boost

    async def reinforce(
        self,
        topic_id: str,
        confidence: float,
        novelty_factor: float = 1.0,
        context: Literal["passive", "active", "recall"] = "passive",
    ) -> TopicRecallStateDTO:
        """Reinforce a topic's recall strength.

        Formulas:
            delta = confidence * novelty_factor * context_weight
            strength = min(1.0, strength + delta)
            stability += k * confidence

        Also boosts semantically related topics.

        Args:
            topic_id: Topic to reinforce
            confidence: Evidence confidence (0.0-1.0)
            novelty_factor: How novel this reinforcement is
            context: Interaction context (passive/active/recall)

        Returns:
            Updated TopicRecallStateDTO
        """
        now = int(time.time())

        # Get or create state; a previously unseen topic starts at zero
        # strength with the baseline stability of 1.0 (one day).
        state = await self._storage.get_recall_state(topic_id)
        if not state:
            state = TopicRecallStateDTO(
                topic_id=topic_id,
                strength=0.0,
                last_seen=now,
                last_updated=now,
                stability=1.0,
            )

        # Apply decay first so reinforcement builds on the current
        # (time-adjusted) strength rather than a stale stored value.
        state = self._apply_decay(state, now)

        # Calculate reinforcement; an unrecognized context falls back to
        # the "passive" weight (0.2).
        context_weight = CONTEXT_WEIGHTS.get(context, 0.2)
        delta = confidence * novelty_factor * context_weight
        new_strength = min(1.0, state.strength + delta)
        new_stability = state.stability + self._stability_k * confidence

        # Create updated state (DTOs are immutable-by-convention here;
        # a fresh instance is built rather than mutating in place).
        new_state = TopicRecallStateDTO(
            topic_id=topic_id,
            strength=new_strength,
            last_seen=now,
            last_updated=now,
            stability=new_stability,
        )

        # Save state
        await self._storage.save_recall_state(new_state)

        # Boost related topics.
        # NOTE(review): the boost is driven by the decayed,
        # pre-reinforcement strength (state.strength). If the freshly
        # reinforced strength is intended, pass new_strength instead —
        # confirm with the author.
        await self._boost_related_topics(topic_id, state.strength)

        logger.debug(
            "topic_reinforced",
            topic_id=topic_id,
            old_strength=state.strength,
            new_strength=new_strength,
            context=context,
        )

        return new_state

    async def apply_decay(
        self,
        topic_id: str,
        current_time: int | None = None,
    ) -> TopicRecallStateDTO | None:
        """Apply time-based decay to a topic.

        Formula: strength *= exp(-Δt / (stability * 86400))

        Args:
            topic_id: Topic to decay
            current_time: Current time (epoch seconds), default: now

        Returns:
            Updated state or None if topic has no state
        """
        state = await self._storage.get_recall_state(topic_id)
        if not state:
            return None

        # Explicit None check: the previous `current_time or ...` form
        # silently discarded a legitimate timestamp of 0 (the epoch).
        now = int(time.time()) if current_time is None else current_time
        new_state = self._apply_decay(state, now)

        await self._storage.save_recall_state(new_state)
        return new_state

    async def batch_decay_update(self) -> int:
        """Apply decay to all topics (scheduled task).

        Returns:
            Number of topics updated
        """
        now = int(time.time())
        states = await self._storage.get_all_recall_states()

        updated = 0
        for state in states:
            new_state = self._apply_decay(state, now)
            # Only persist when decay actually changed the strength;
            # _apply_decay returns the same object when Δt <= 0.
            if new_state.strength != state.strength:
                await self._storage.save_recall_state(new_state)
                updated += 1

        logger.info("batch_decay_completed", updated_count=updated)
        return updated

    async def get_due_topics(
        self,
        threshold: float = 0.3,
        limit: int = 10,
    ) -> list[RecallItemDTO]:
        """Get topics due for recall review.

        Topics with strength below threshold are considered due.
        Results are sorted by due_score (higher = more urgent).

        Args:
            threshold: Strength threshold for being "due"
            limit: Maximum number of topics

        Returns:
            List of RecallItemDTO sorted by priority
        """
        states = await self._storage.get_due_topics(threshold)

        items: list[RecallItemDTO] = []
        for state in states[:limit]:
            topic = await self._storage.get_topic(state.topic_id)
            if not topic:
                # Recall state without a topic record: skip silently.
                continue

            # Get related topics (graph returns (topic_id, weight) pairs).
            related = await self._graph.get_related_topics(state.topic_id, limit=5)

            # Calculate due score
            due_score = self._calculate_due_score(state)

            items.append(
                RecallItemDTO(
                    topic=topic,
                    recall_state=state,
                    due_score=due_score,
                    related_topics=[r[0] for r in related],
                )
            )

        # Sort by due_score descending
        items.sort(key=lambda x: x.due_score, reverse=True)
        return items

    def _apply_decay(
        self,
        state: TopicRecallStateDTO,
        current_time: int,
    ) -> TopicRecallStateDTO:
        """Apply time-based decay to state.

        Formula: strength *= exp(-Δt / (stability * 86400))

        Returns the input state unchanged when no time has elapsed.
        """
        delta_t = current_time - state.last_updated
        if delta_t <= 0:
            return state

        # stability is in days, so multiply by seconds per day
        decay_factor = math.exp(-delta_t / (state.stability * 86400))
        new_strength = state.strength * decay_factor

        return TopicRecallStateDTO(
            topic_id=state.topic_id,
            strength=new_strength,
            last_seen=state.last_seen,
            last_updated=current_time,
            stability=state.stability,
        )

    async def _boost_related_topics(
        self,
        topic_id: str,
        source_strength: float,
    ) -> None:
        """Boost strength of related topics.

        Formula: strength += source_strength * edge_weight * semantic_boost

        Topics without an existing recall state are skipped (a boost
        never creates state).
        """
        related = await self._graph.get_related_topics(topic_id)

        for related_id, edge_weight in related:
            state = await self._storage.get_recall_state(related_id)
            if not state:
                continue

            boost = source_strength * edge_weight * self._semantic_boost
            new_strength = min(1.0, state.strength + boost)

            # Persist only on an actual increase; last_updated is kept so
            # a boost does not reset the decay clock.
            if new_strength > state.strength:
                new_state = TopicRecallStateDTO(
                    topic_id=related_id,
                    strength=new_strength,
                    last_seen=state.last_seen,
                    last_updated=state.last_updated,
                    stability=state.stability,
                )
                await self._storage.save_recall_state(new_state)

    def _calculate_due_score(self, state: TopicRecallStateDTO) -> float:
        """Calculate priority score for recall review.

        Higher score = more urgent for review.
        Factors: lower strength, older last_seen
        """
        now = time.time()
        age_days = (now - state.last_seen) / 86400

        # Lower strength = higher priority
        # Older = higher priority
        return (1.0 - state.strength) * (1.0 + age_days * 0.1)
@@ -0,0 +1,128 @@
1
+ """Task orchestration for bot_knows.
2
+
3
+ This module provides Taskiq-based background task definitions
4
+ for async processing of imports, extractions, and scheduled jobs.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from bot_knows.logging import get_logger
10
+
11
# Public API of this module.
__all__ = [
    "create_broker",
    "create_scheduler",
]

logger = get_logger(__name__)
17
+
18
+
19
def create_broker(redis_url: str) -> Any:
    """Create Taskiq broker for background tasks.

    Args:
        redis_url: Redis URL for task queue

    Returns:
        Configured ``RedisStreamBroker``, or ``None`` when the optional
        ``taskiq_redis`` dependency is not installed.

    Example:
        broker = create_broker("redis://localhost:6379")

        @broker.task
        async def my_task():
            ...
    """
    # Keep the try body minimal: only the optional import should be
    # excused by ImportError. Previously a spurious ImportError raised by
    # broker construction or logging would also be swallowed here.
    try:
        from taskiq_redis import RedisStreamBroker
    except ImportError:
        logger.warning("taskiq_redis_not_installed")
        return None

    broker = RedisStreamBroker(url=redis_url)
    logger.info("taskiq_broker_created", url=redis_url)
    return broker
44
+
45
+
46
def create_scheduler(broker: Any) -> Any:
    """Build a Taskiq scheduler on top of *broker* for periodic tasks.

    Args:
        broker: Taskiq broker instance (may be ``None``)

    Returns:
        A configured ``TaskiqScheduler``, or ``None`` when no broker is
        available or ``taskiq`` is not installed.
    """
    if broker is None:
        return None

    try:
        from taskiq import TaskiqScheduler

        periodic = TaskiqScheduler(broker)
        logger.info("taskiq_scheduler_created")
        return periodic
    except ImportError:
        logger.warning("taskiq_not_installed")
        return None
67
+
68
+
69
+ # Task definitions (to be registered with broker)
70
+ # These are placeholder implementations - actual implementation
71
+ # requires a running broker instance.
72
+
73
+
74
async def process_chat_import_task(
    source: str,
    raw_export: dict[str, Any],
) -> dict[str, Any]:
    """Background task to process chat import.

    Placeholder: the actual orchestration requires dependency injection
    of services and is wired up with a running broker instance.

    Args:
        source: Import source identifier
        raw_export: Raw export data

    Returns:
        Import result summary
    """
    logger.info("process_chat_import_task", source=source)
    summary: dict[str, Any] = {"status": "completed", "source": source}
    return summary
91
+
92
+
93
async def extract_topics_task(
    message_id: str,
) -> dict[str, Any]:
    """Background task to extract topics from message.

    Placeholder: real extraction is delegated to the service layer once
    a broker is running.

    Args:
        message_id: Message ID to process

    Returns:
        Extraction result summary
    """
    logger.info("extract_topics_task", message_id=message_id)
    result: dict[str, Any] = {"status": "completed", "message_id": message_id}
    return result
106
+
107
+
108
async def batch_decay_task() -> dict[str, Any]:
    """Scheduled task to update decay for all topics.

    This should be scheduled to run periodically (e.g., daily).
    Placeholder: the real implementation would invoke
    RecallService.batch_decay_update().

    Returns:
        Decay update summary
    """
    logger.info("batch_decay_task_started")
    outcome: dict[str, Any] = {"status": "completed"}
    return outcome
119
+
120
+
121
+ # Example of how to set up scheduled tasks with a broker:
122
+ #
123
+ # broker = create_broker("redis://localhost:6379")
124
+ # scheduler = create_scheduler(broker)
125
+ #
126
+ # if scheduler:
127
+ # # Run decay update every 24 hours at midnight
128
+ # scheduler.schedule(batch_decay_task, cron="0 0 * * *")
@@ -0,0 +1,199 @@
1
+ """Topic extraction service for bot_knows.
2
+
3
+ This module provides the service for extracting topics from messages.
4
+ """
5
+
6
+ import time
7
+
8
+ from bot_knows.interfaces.embedding import EmbeddingServiceInterface
9
+ from bot_knows.interfaces.llm import LLMInterface
10
+ from bot_knows.logging import get_logger
11
+ from bot_knows.models.message import MessageDTO
12
+ from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO
13
+ from bot_knows.utils.hashing import generate_evidence_id, generate_topic_id
14
+
15
# Public API of this module.
__all__ = [
    "TopicCandidate",
    "TopicExtractionService",
]

logger = get_logger(__name__)
21
+
22
+
23
class TopicCandidate:
    """Represents a candidate topic extracted from a message.

    This is an intermediate representation before deduplication.
    """

    def __init__(
        self,
        extracted_name: str,
        confidence: float,
        embedding: list[float],
        source_message_id: str,
    ) -> None:
        """Store the candidate's extraction data.

        Args:
            extracted_name: Raw topic name as extracted by the LLM
            confidence: Extraction confidence (presumably 0.0-1.0 —
                confirm with the LLM provider's contract)
            embedding: Embedding vector for semantic matching
            source_message_id: ID of the message the topic came from
        """
        self.extracted_name = extracted_name
        self.confidence = confidence
        self.embedding = embedding
        self.source_message_id = source_message_id

    def __repr__(self) -> str:
        """Debug-friendly representation (embedding elided for brevity)."""
        return (
            f"{type(self).__name__}("
            f"extracted_name={self.extracted_name!r}, "
            f"confidence={self.confidence!r}, "
            f"source_message_id={self.source_message_id!r})"
        )
40
+
41
+
42
class TopicExtractionService:
    """Service for extracting topics from messages.

    Uses LLM for topic extraction and embedding service for
    generating embeddings for semantic matching.

    Example:
        service = TopicExtractionService(llm, embedding_service)
        candidates = await service.extract(message)
    """

    def __init__(
        self,
        llm: LLMInterface,
        embedding_service: EmbeddingServiceInterface,
    ) -> None:
        """Initialize service with dependencies.

        Args:
            llm: LLM interface for topic extraction
            embedding_service: Embedding service for vector generation
        """
        self._llm = llm
        self._embedding = embedding_service

    async def extract(self, message: MessageDTO) -> list[TopicCandidate]:
        """Extract topic candidates from a message.

        Args:
            message: Message to extract topics from

        Returns:
            List of TopicCandidate objects (empty on empty input or any
            extraction/embedding failure — this is a best-effort path)
        """
        if message.is_empty:
            return []

        # Ask the LLM for (name, confidence) pairs; failures are logged
        # and treated as "no topics" rather than propagated.
        try:
            extracted = await self._llm.extract_topics(
                message.user_content,
                message.assistant_content,
            )
        except Exception as exc:
            logger.warning("topic_extraction_failed", error=str(exc))
            return []

        if not extracted:
            return []

        # Embed every extracted name for later semantic matching.
        names = [pair[0] for pair in extracted]
        try:
            vectors = await self._embedding.embed_batch(names)
        except Exception as exc:
            logger.warning("embedding_generation_failed", error=str(exc))
            return []

        # Pair names with their vectors; strict=False silently stops at
        # the shorter sequence if the lengths ever disagree.
        candidates = [
            TopicCandidate(
                extracted_name=name,
                confidence=confidence,
                embedding=vector,
                source_message_id=message.message_id,
            )
            for (name, confidence), vector in zip(extracted, vectors, strict=False)
        ]

        logger.debug(
            "topics_extracted",
            message_id=message.message_id,
            count=len(candidates),
        )

        return candidates

    async def create_topic_and_evidence(
        self,
        candidate: TopicCandidate,
        canonical_name: str | None = None,
    ) -> tuple[TopicDTO, TopicEvidenceDTO]:
        """Create new TopicDTO and TopicEvidenceDTO from candidate.

        Args:
            candidate: Topic candidate
            canonical_name: Normalized name (if None, uses extracted_name)

        Returns:
            Tuple of (TopicDTO, TopicEvidenceDTO)
        """
        # Fall back to LLM normalization when no (truthy) canonical name
        # was supplied.
        name = canonical_name
        if not name:
            name = await self._llm.normalize_topic_name(candidate.extracted_name)
        now = int(time.time())

        topic_id = generate_topic_id(name, candidate.source_message_id)
        evidence_id = generate_evidence_id(
            topic_id=topic_id,
            extracted_name=candidate.extracted_name,
            source_message_id=candidate.source_message_id,
            timestamp=now,
        )

        evidence = TopicEvidenceDTO(
            evidence_id=evidence_id,
            topic_id=topic_id,
            extracted_name=candidate.extracted_name,
            source_message_id=candidate.source_message_id,
            confidence=candidate.confidence,
            timestamp=now,
        )

        topic = TopicDTO(
            topic_id=topic_id,
            canonical_name=name,
            centroid_embedding=candidate.embedding,
            evidence_count=1,
            importance=candidate.confidence * 0.1,  # Initial importance
            recall_strength=0.0,
        )

        return topic, evidence

    async def create_evidence_for_existing(
        self,
        candidate: TopicCandidate,
        existing_topic: TopicDTO,
    ) -> tuple[TopicDTO, TopicEvidenceDTO]:
        """Create evidence for existing topic and update centroid.

        Args:
            candidate: Topic candidate
            existing_topic: Existing topic to link to

        Returns:
            Tuple of (updated TopicDTO, new TopicEvidenceDTO)
        """
        now = int(time.time())

        evidence = TopicEvidenceDTO(
            evidence_id=generate_evidence_id(
                topic_id=existing_topic.topic_id,
                extracted_name=candidate.extracted_name,
                source_message_id=candidate.source_message_id,
                timestamp=now,
            ),
            topic_id=existing_topic.topic_id,
            extracted_name=candidate.extracted_name,
            source_message_id=candidate.source_message_id,
            confidence=candidate.confidence,
            timestamp=now,
        )

        # Fold the candidate's embedding into the topic's centroid.
        updated_topic = existing_topic.with_updated_centroid(candidate.embedding)

        return updated_topic, evidence
@@ -0,0 +1,22 @@
1
"""Utility functions for bot_knows.

This module contains internal utility functions; the hashing helpers
below are re-exported as the package's public utility API.
"""

from bot_knows.utils.hashing import (
    generate_chat_id,
    generate_evidence_id,
    generate_message_id,
    generate_topic_id,
    hash_text,
    stable_hash,
)

# Public API of this package (mirrors the hashing re-exports above).
__all__ = [
    "generate_chat_id",
    "generate_evidence_id",
    "generate_message_id",
    "generate_topic_id",
    "hash_text",
    "stable_hash",
]