bot-knows 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bot_knows/__init__.py +70 -0
- bot_knows/config.py +115 -0
- bot_knows/domain/__init__.py +5 -0
- bot_knows/domain/chat.py +62 -0
- bot_knows/domain/message.py +64 -0
- bot_knows/domain/relation.py +56 -0
- bot_knows/domain/topic.py +132 -0
- bot_knows/domain/topic_evidence.py +55 -0
- bot_knows/importers/__init__.py +12 -0
- bot_knows/importers/base.py +116 -0
- bot_knows/importers/chatgpt.py +154 -0
- bot_knows/importers/claude.py +172 -0
- bot_knows/importers/generic_json.py +272 -0
- bot_knows/importers/registry.py +125 -0
- bot_knows/infra/__init__.py +5 -0
- bot_knows/infra/llm/__init__.py +6 -0
- bot_knows/infra/llm/anthropic_provider.py +172 -0
- bot_knows/infra/llm/openai_provider.py +195 -0
- bot_knows/infra/mongo/__init__.py +5 -0
- bot_knows/infra/mongo/client.py +145 -0
- bot_knows/infra/mongo/repositories.py +348 -0
- bot_knows/infra/neo4j/__init__.py +5 -0
- bot_knows/infra/neo4j/client.py +152 -0
- bot_knows/infra/neo4j/graph_repository.py +329 -0
- bot_knows/infra/redis/__init__.py +6 -0
- bot_knows/infra/redis/cache.py +198 -0
- bot_knows/infra/redis/client.py +193 -0
- bot_knows/interfaces/__init__.py +18 -0
- bot_knows/interfaces/embedding.py +55 -0
- bot_knows/interfaces/graph.py +194 -0
- bot_knows/interfaces/llm.py +70 -0
- bot_knows/interfaces/recall.py +92 -0
- bot_knows/interfaces/storage.py +225 -0
- bot_knows/logging.py +101 -0
- bot_knows/models/__init__.py +22 -0
- bot_knows/models/chat.py +55 -0
- bot_knows/models/ingest.py +70 -0
- bot_knows/models/message.py +49 -0
- bot_knows/models/recall.py +58 -0
- bot_knows/models/topic.py +100 -0
- bot_knows/orchestrator.py +398 -0
- bot_knows/py.typed +0 -0
- bot_knows/services/__init__.py +24 -0
- bot_knows/services/chat_processing.py +182 -0
- bot_knows/services/dedup_service.py +161 -0
- bot_knows/services/graph_service.py +217 -0
- bot_knows/services/message_builder.py +135 -0
- bot_knows/services/recall_service.py +296 -0
- bot_knows/services/tasks.py +128 -0
- bot_knows/services/topic_extraction.py +199 -0
- bot_knows/utils/__init__.py +22 -0
- bot_knows/utils/hashing.py +126 -0
- bot_knows-0.1.0.dist-info/METADATA +294 -0
- bot_knows-0.1.0.dist-info/RECORD +56 -0
- bot_knows-0.1.0.dist-info/WHEEL +4 -0
- bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Recall service for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module provides the evidence-weighted recall service with
|
|
4
|
+
spaced repetition-style decay and reinforcement.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
import time
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
from bot_knows.interfaces.graph import GraphServiceInterface
|
|
12
|
+
from bot_knows.interfaces.storage import StorageInterface
|
|
13
|
+
from bot_knows.logging import get_logger
|
|
14
|
+
from bot_knows.models.recall import RecallItemDTO, TopicRecallStateDTO
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"CONTEXT_WEIGHTS",
|
|
18
|
+
"RecallService",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
logger = get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
# Context weights for reinforcement: multiplier applied to a reinforcement
# event depending on how the topic was accessed. A deliberate spaced-repetition
# review counts the most; a background read counts the least. Unknown context
# strings fall back to the "passive" weight (see RecallService.reinforce).
CONTEXT_WEIGHTS: dict[str, float] = {
    "passive": 0.2,  # Passive read (background access)
    "active": 0.6,  # Active query (user explicitly asked)
    "recall": 1.0,  # Recall prompt (spaced repetition review)
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RecallService:
    """Evidence-weighted recall service.

    Implements a spaced repetition-inspired recall system with:
    - Time-based decay: strength *= exp(-Δt / (stability * 86400))
    - Reinforcement: strength += confidence * novelty * context_weight
    - Stability growth: stability += k * confidence
    - Semantic reinforcement: boost related topics

    Example:
        service = RecallService(storage, graph)

        # Reinforce when topic is accessed
        state = await service.reinforce(topic_id, confidence=0.9, context="active")

        # Get topics due for review
        due_topics = await service.get_due_topics(threshold=0.3)
    """

    def __init__(
        self,
        storage: StorageInterface,
        graph: GraphServiceInterface,
        stability_k: float = 0.1,
        semantic_boost: float = 0.1,
    ) -> None:
        """Initialize service with dependencies.

        Args:
            storage: Storage interface for recall states
            graph: Graph interface for related topics
            stability_k: Factor for stability growth on reinforcement
            semantic_boost: Factor for boosting related topics
        """
        self._storage = storage
        self._graph = graph
        self._stability_k = stability_k
        self._semantic_boost = semantic_boost

    async def reinforce(
        self,
        topic_id: str,
        confidence: float,
        novelty_factor: float = 1.0,
        context: Literal["passive", "active", "recall"] = "passive",
    ) -> TopicRecallStateDTO:
        """Reinforce a topic's recall strength.

        Formulas:
            delta = confidence * novelty_factor * context_weight
            strength = min(1.0, strength + delta)
            stability += k * confidence

        Also boosts semantically related topics.

        Args:
            topic_id: Topic to reinforce
            confidence: Evidence confidence (0.0-1.0)
            novelty_factor: How novel this reinforcement is
            context: Interaction context (passive/active/recall)

        Returns:
            Updated TopicRecallStateDTO
        """
        now = int(time.time())

        # Get existing state, or bootstrap a fresh one at zero strength.
        state = await self._storage.get_recall_state(topic_id)
        if not state:
            state = TopicRecallStateDTO(
                topic_id=topic_id,
                strength=0.0,
                last_seen=now,
                last_updated=now,
                stability=1.0,
            )

        # Decay first so reinforcement applies on top of the current
        # (time-adjusted) strength, not a stale stored value.
        state = self._apply_decay(state, now)

        # Unknown context strings fall back to the "passive" weight.
        context_weight = CONTEXT_WEIGHTS.get(context, 0.2)
        delta = confidence * novelty_factor * context_weight
        new_strength = min(1.0, state.strength + delta)
        new_stability = state.stability + self._stability_k * confidence

        # States are immutable DTOs, so build a fresh one.
        new_state = TopicRecallStateDTO(
            topic_id=topic_id,
            strength=new_strength,
            last_seen=now,
            last_updated=now,
            stability=new_stability,
        )

        await self._storage.save_recall_state(new_state)

        # NOTE(review): related topics are boosted from the pre-reinforcement
        # (decayed) strength, not new_strength. Possibly deliberate to damp
        # cascade amplification — confirm before changing.
        await self._boost_related_topics(topic_id, state.strength)

        logger.debug(
            "topic_reinforced",
            topic_id=topic_id,
            old_strength=state.strength,
            new_strength=new_strength,
            context=context,
        )

        return new_state

    async def apply_decay(
        self,
        topic_id: str,
        current_time: int | None = None,
    ) -> TopicRecallStateDTO | None:
        """Apply time-based decay to a topic.

        Formula: strength *= exp(-Δt / (stability * 86400))

        Args:
            topic_id: Topic to decay
            current_time: Current time (epoch seconds), default: now

        Returns:
            Updated state or None if topic has no state
        """
        state = await self._storage.get_recall_state(topic_id)
        if not state:
            return None

        # Explicit None check: `current_time or ...` would wrongly discard a
        # legitimate current_time of 0 (the epoch).
        now = current_time if current_time is not None else int(time.time())
        new_state = self._apply_decay(state, now)

        await self._storage.save_recall_state(new_state)
        return new_state

    async def batch_decay_update(self) -> int:
        """Apply decay to all topics (scheduled task).

        Returns:
            Number of topics updated
        """
        now = int(time.time())
        states = await self._storage.get_all_recall_states()

        updated = 0
        for state in states:
            new_state = self._apply_decay(state, now)
            # Only persist states whose strength actually changed.
            if new_state.strength != state.strength:
                await self._storage.save_recall_state(new_state)
                updated += 1

        logger.info("batch_decay_completed", updated_count=updated)
        return updated

    async def get_due_topics(
        self,
        threshold: float = 0.3,
        limit: int = 10,
    ) -> list[RecallItemDTO]:
        """Get topics due for recall review.

        Topics with strength below threshold are considered due.
        Results are sorted by due_score (higher = more urgent).

        Args:
            threshold: Strength threshold for being "due"
            limit: Maximum number of topics

        Returns:
            List of RecallItemDTO sorted by priority
        """
        states = await self._storage.get_due_topics(threshold)

        # NOTE(review): truncation happens before due_score is computed, so if
        # storage returns states unordered, the most urgent topics may be cut
        # off here — verify storage's ordering contract.
        items: list[RecallItemDTO] = []
        for state in states[:limit]:
            topic = await self._storage.get_topic(state.topic_id)
            if not topic:
                # Orphaned state (topic deleted); skip silently.
                continue

            related = await self._graph.get_related_topics(state.topic_id, limit=5)

            due_score = self._calculate_due_score(state)

            items.append(
                RecallItemDTO(
                    topic=topic,
                    recall_state=state,
                    due_score=due_score,
                    # get_related_topics yields (topic_id, edge_weight) pairs;
                    # only the ids are surfaced here.
                    related_topics=[r[0] for r in related],
                )
            )

        # Most urgent first.
        items.sort(key=lambda x: x.due_score, reverse=True)
        return items

    def _apply_decay(
        self,
        state: TopicRecallStateDTO,
        current_time: int,
    ) -> TopicRecallStateDTO:
        """Apply time-based decay to state.

        Formula: strength *= exp(-Δt / (stability * 86400))

        Returns the state unchanged when no time has elapsed (or the clock
        appears to have gone backwards).
        """
        delta_t = current_time - state.last_updated
        if delta_t <= 0:
            return state

        # stability is in days, so multiply by seconds per day
        decay_factor = math.exp(-delta_t / (state.stability * 86400))
        new_strength = state.strength * decay_factor

        return TopicRecallStateDTO(
            topic_id=state.topic_id,
            strength=new_strength,
            last_seen=state.last_seen,
            last_updated=current_time,
            stability=state.stability,
        )

    async def _boost_related_topics(
        self,
        topic_id: str,
        source_strength: float,
    ) -> None:
        """Boost strength of related topics.

        Formula: strength += source_strength * edge_weight * semantic_boost

        Topics without an existing recall state are skipped (no state is
        created for them here).
        """
        related = await self._graph.get_related_topics(topic_id)

        for related_id, edge_weight in related:
            state = await self._storage.get_recall_state(related_id)
            if not state:
                continue

            boost = source_strength * edge_weight * self._semantic_boost
            new_strength = min(1.0, state.strength + boost)

            # Only write when the boost actually moved the needle; note that
            # last_updated is intentionally left untouched so decay timing of
            # the related topic is unaffected.
            if new_strength > state.strength:
                new_state = TopicRecallStateDTO(
                    topic_id=related_id,
                    strength=new_strength,
                    last_seen=state.last_seen,
                    last_updated=state.last_updated,
                    stability=state.stability,
                )
                await self._storage.save_recall_state(new_state)

    def _calculate_due_score(self, state: TopicRecallStateDTO) -> float:
        """Calculate priority score for recall review.

        Higher score = more urgent for review.
        Factors: lower strength, older last_seen
        """
        now = time.time()
        age_days = (now - state.last_seen) / 86400

        # Lower strength = higher priority
        # Older = higher priority
        return (1.0 - state.strength) * (1.0 + age_days * 0.1)
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Task orchestration for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module provides Taskiq-based background task definitions
|
|
4
|
+
for async processing of imports, extractions, and scheduled jobs.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from bot_knows.logging import get_logger
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"create_broker",
|
|
13
|
+
"create_scheduler",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def create_broker(redis_url: str) -> Any:
    """Create Taskiq broker for background tasks.

    Args:
        redis_url: Redis URL for task queue

    Returns:
        Configured RedisStreamBroker, or None when the optional
        ``taskiq_redis`` dependency is not installed

    Example:
        broker = create_broker("redis://localhost:6379")

        @broker.task
        async def my_task():
            ...
    """
    # Keep the try body minimal: only the import should map to the
    # "not installed" fallback. An ImportError raised while constructing
    # the broker itself is a real error and must propagate, not be
    # misreported as a missing dependency.
    try:
        from taskiq_redis import RedisStreamBroker
    except ImportError:
        logger.warning("taskiq_redis_not_installed")
        return None

    broker = RedisStreamBroker(url=redis_url)
    logger.info("taskiq_broker_created", url=redis_url)
    return broker
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def create_scheduler(broker: Any) -> Any:
    """Create Taskiq scheduler for periodic tasks.

    Args:
        broker: Taskiq broker instance (may be None when broker creation
            failed, e.g. taskiq_redis is not installed)

    Returns:
        Configured TaskiqScheduler, or None when no broker is given or
        taskiq is not installed
    """
    # Guard: a None broker (failed create_broker) means no scheduler either.
    if broker is None:
        return None

    # Keep the try body minimal: only the import should map to the
    # "not installed" fallback; scheduler-construction errors propagate.
    try:
        from taskiq import TaskiqScheduler
    except ImportError:
        logger.warning("taskiq_not_installed")
        return None

    scheduler = TaskiqScheduler(broker)
    logger.info("taskiq_scheduler_created")
    return scheduler
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Task definitions (to be registered with broker)
|
|
70
|
+
# These are placeholder implementations - actual implementation
|
|
71
|
+
# requires a running broker instance.
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
async def process_chat_import_task(
    source: str,
    raw_export: dict[str, Any],
) -> dict[str, Any]:
    """Background task to process chat import.

    Placeholder implementation: logs the request and reports success.
    The real orchestration requires dependency injection of the import
    services and will be wired up against a running broker.

    Args:
        source: Import source identifier
        raw_export: Raw export data

    Returns:
        Import result summary
    """
    logger.info("process_chat_import_task", source=source)
    return {"status": "completed", "source": source}
|
91
|
+
|
|
92
|
+
|
|
93
|
+
async def extract_topics_task(
    message_id: str,
) -> dict[str, Any]:
    """Background task to extract topics from message.

    Placeholder implementation: logs the request and reports success;
    actual extraction needs injected services and a running broker.

    Args:
        message_id: Message ID to process

    Returns:
        Extraction result summary
    """
    logger.info("extract_topics_task", message_id=message_id)
    return {"status": "completed", "message_id": message_id}
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def batch_decay_task() -> dict[str, Any]:
    """Scheduled task to update decay for all topics.

    Placeholder: the real implementation would invoke
    ``RecallService.batch_decay_update()``. Intended to be scheduled
    periodically (e.g. daily).

    Returns:
        Decay update summary
    """
    logger.info("batch_decay_task_started")
    return {"status": "completed"}
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Example of how to set up scheduled tasks with a broker:
|
|
122
|
+
#
|
|
123
|
+
# broker = create_broker("redis://localhost:6379")
|
|
124
|
+
# scheduler = create_scheduler(broker)
|
|
125
|
+
#
|
|
126
|
+
# if scheduler:
|
|
127
|
+
# # Run decay update every 24 hours at midnight
|
|
128
|
+
# scheduler.schedule(batch_decay_task, cron="0 0 * * *")
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Topic extraction service for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module provides the service for extracting topics from messages.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from bot_knows.interfaces.embedding import EmbeddingServiceInterface
|
|
9
|
+
from bot_knows.interfaces.llm import LLMInterface
|
|
10
|
+
from bot_knows.logging import get_logger
|
|
11
|
+
from bot_knows.models.message import MessageDTO
|
|
12
|
+
from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO
|
|
13
|
+
from bot_knows.utils.hashing import generate_evidence_id, generate_topic_id
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"TopicCandidate",
|
|
17
|
+
"TopicExtractionService",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TopicCandidate:
    """Represents a candidate topic extracted from a message.

    This is an intermediate representation before deduplication.

    Attributes:
        extracted_name: Topic name exactly as extracted (pre-normalization).
        confidence: Extraction confidence score.
        embedding: Embedding vector used for semantic matching.
        source_message_id: ID of the message the topic was extracted from.
    """

    def __init__(
        self,
        extracted_name: str,
        confidence: float,
        embedding: list[float],
        source_message_id: str,
    ) -> None:
        self.extracted_name = extracted_name
        self.confidence = confidence
        self.embedding = embedding
        self.source_message_id = source_message_id

    def __repr__(self) -> str:
        # Debug-friendly repr; the embedding vector is omitted because it
        # can be hundreds of floats long.
        return (
            f"{type(self).__name__}("
            f"extracted_name={self.extracted_name!r}, "
            f"confidence={self.confidence!r}, "
            f"source_message_id={self.source_message_id!r})"
        )
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class TopicExtractionService:
    """Service for extracting topics from messages.

    Delegates raw topic extraction to an LLM and uses an embedding
    service to produce vectors for semantic matching.

    Example:
        service = TopicExtractionService(llm, embedding_service)
        candidates = await service.extract(message)
    """

    def __init__(
        self,
        llm: LLMInterface,
        embedding_service: EmbeddingServiceInterface,
    ) -> None:
        """Initialize service with dependencies.

        Args:
            llm: LLM interface for topic extraction
            embedding_service: Embedding service for vector generation
        """
        self._llm = llm
        self._embedding = embedding_service

    async def extract(self, message: MessageDTO) -> list[TopicCandidate]:
        """Extract topic candidates from a message.

        Failures of the LLM or embedding calls are logged and swallowed;
        the method then returns an empty list rather than raising.

        Args:
            message: Message to extract topics from

        Returns:
            List of TopicCandidate objects
        """
        if message.is_empty:
            return []

        # Ask the LLM for (name, confidence) pairs.
        try:
            extracted = await self._llm.extract_topics(
                message.user_content,
                message.assistant_content,
            )
        except Exception as exc:
            logger.warning("topic_extraction_failed", error=str(exc))
            return []

        if not extracted:
            return []

        # Embed every extracted name in a single batch call.
        names = [pair[0] for pair in extracted]
        try:
            vectors = await self._embedding.embed_batch(names)
        except Exception as exc:
            logger.warning("embedding_generation_failed", error=str(exc))
            return []

        # Pair each (name, confidence) with its vector. NOTE(review):
        # strict=False silently truncates if embed_batch returns a
        # different number of vectors than names — assumed intentional.
        candidates = [
            TopicCandidate(
                extracted_name=name,
                confidence=score,
                embedding=vector,
                source_message_id=message.message_id,
            )
            for (name, score), vector in zip(extracted, vectors, strict=False)
        ]

        logger.debug(
            "topics_extracted",
            message_id=message.message_id,
            count=len(candidates),
        )

        return candidates

    async def create_topic_and_evidence(
        self,
        candidate: TopicCandidate,
        canonical_name: str | None = None,
    ) -> tuple[TopicDTO, TopicEvidenceDTO]:
        """Create new TopicDTO and TopicEvidenceDTO from candidate.

        Args:
            candidate: Topic candidate
            canonical_name: Pre-normalized name; when None (or empty),
                the extracted name is normalized via the LLM instead

        Returns:
            Tuple of (TopicDTO, TopicEvidenceDTO)
        """
        if canonical_name:
            resolved_name = canonical_name
        else:
            # Only hit the LLM when no usable canonical name was supplied.
            resolved_name = await self._llm.normalize_topic_name(candidate.extracted_name)
        created_at = int(time.time())

        new_topic_id = generate_topic_id(resolved_name, candidate.source_message_id)
        new_evidence_id = generate_evidence_id(
            topic_id=new_topic_id,
            extracted_name=candidate.extracted_name,
            source_message_id=candidate.source_message_id,
            timestamp=created_at,
        )

        topic = TopicDTO(
            topic_id=new_topic_id,
            canonical_name=resolved_name,
            centroid_embedding=candidate.embedding,
            evidence_count=1,
            importance=candidate.confidence * 0.1,  # Initial importance
            recall_strength=0.0,
        )

        evidence = TopicEvidenceDTO(
            evidence_id=new_evidence_id,
            topic_id=new_topic_id,
            extracted_name=candidate.extracted_name,
            source_message_id=candidate.source_message_id,
            confidence=candidate.confidence,
            timestamp=created_at,
        )

        return topic, evidence

    async def create_evidence_for_existing(
        self,
        candidate: TopicCandidate,
        existing_topic: TopicDTO,
    ) -> tuple[TopicDTO, TopicEvidenceDTO]:
        """Create evidence for existing topic and update centroid.

        Args:
            candidate: Topic candidate
            existing_topic: Existing topic to link to

        Returns:
            Tuple of (updated TopicDTO, new TopicEvidenceDTO)
        """
        observed_at = int(time.time())

        evidence = TopicEvidenceDTO(
            evidence_id=generate_evidence_id(
                topic_id=existing_topic.topic_id,
                extracted_name=candidate.extracted_name,
                source_message_id=candidate.source_message_id,
                timestamp=observed_at,
            ),
            topic_id=existing_topic.topic_id,
            extracted_name=candidate.extracted_name,
            source_message_id=candidate.source_message_id,
            confidence=candidate.confidence,
            timestamp=observed_at,
        )

        # Fold the candidate's embedding into the topic's centroid.
        updated_topic = existing_topic.with_updated_centroid(candidate.embedding)

        return updated_topic, evidence
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Utility functions for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module contains internal utility functions.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from bot_knows.utils.hashing import (
|
|
7
|
+
generate_chat_id,
|
|
8
|
+
generate_evidence_id,
|
|
9
|
+
generate_message_id,
|
|
10
|
+
generate_topic_id,
|
|
11
|
+
hash_text,
|
|
12
|
+
stable_hash,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"generate_chat_id",
|
|
17
|
+
"generate_evidence_id",
|
|
18
|
+
"generate_message_id",
|
|
19
|
+
"generate_topic_id",
|
|
20
|
+
"hash_text",
|
|
21
|
+
"stable_hash",
|
|
22
|
+
]
|