bot-knows 0.1.0 (bot_knows-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bot_knows/__init__.py +70 -0
- bot_knows/config.py +115 -0
- bot_knows/domain/__init__.py +5 -0
- bot_knows/domain/chat.py +62 -0
- bot_knows/domain/message.py +64 -0
- bot_knows/domain/relation.py +56 -0
- bot_knows/domain/topic.py +132 -0
- bot_knows/domain/topic_evidence.py +55 -0
- bot_knows/importers/__init__.py +12 -0
- bot_knows/importers/base.py +116 -0
- bot_knows/importers/chatgpt.py +154 -0
- bot_knows/importers/claude.py +172 -0
- bot_knows/importers/generic_json.py +272 -0
- bot_knows/importers/registry.py +125 -0
- bot_knows/infra/__init__.py +5 -0
- bot_knows/infra/llm/__init__.py +6 -0
- bot_knows/infra/llm/anthropic_provider.py +172 -0
- bot_knows/infra/llm/openai_provider.py +195 -0
- bot_knows/infra/mongo/__init__.py +5 -0
- bot_knows/infra/mongo/client.py +145 -0
- bot_knows/infra/mongo/repositories.py +348 -0
- bot_knows/infra/neo4j/__init__.py +5 -0
- bot_knows/infra/neo4j/client.py +152 -0
- bot_knows/infra/neo4j/graph_repository.py +329 -0
- bot_knows/infra/redis/__init__.py +6 -0
- bot_knows/infra/redis/cache.py +198 -0
- bot_knows/infra/redis/client.py +193 -0
- bot_knows/interfaces/__init__.py +18 -0
- bot_knows/interfaces/embedding.py +55 -0
- bot_knows/interfaces/graph.py +194 -0
- bot_knows/interfaces/llm.py +70 -0
- bot_knows/interfaces/recall.py +92 -0
- bot_knows/interfaces/storage.py +225 -0
- bot_knows/logging.py +101 -0
- bot_knows/models/__init__.py +22 -0
- bot_knows/models/chat.py +55 -0
- bot_knows/models/ingest.py +70 -0
- bot_knows/models/message.py +49 -0
- bot_knows/models/recall.py +58 -0
- bot_knows/models/topic.py +100 -0
- bot_knows/orchestrator.py +398 -0
- bot_knows/py.typed +0 -0
- bot_knows/services/__init__.py +24 -0
- bot_knows/services/chat_processing.py +182 -0
- bot_knows/services/dedup_service.py +161 -0
- bot_knows/services/graph_service.py +217 -0
- bot_knows/services/message_builder.py +135 -0
- bot_knows/services/recall_service.py +296 -0
- bot_knows/services/tasks.py +128 -0
- bot_knows/services/topic_extraction.py +199 -0
- bot_knows/utils/__init__.py +22 -0
- bot_knows/utils/hashing.py +126 -0
- bot_knows-0.1.0.dist-info/METADATA +294 -0
- bot_knows-0.1.0.dist-info/RECORD +56 -0
- bot_knows-0.1.0.dist-info/WHEEL +4 -0
- bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
bot_knows/orchestrator.py
ADDED
@@ -0,0 +1,398 @@
```python
"""BotKnows orchestrator for high-level knowledge base operations.

This module provides the main entry point for the bot_knows package,
orchestrating all services for chat ingestion and knowledge retrieval.
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from bot_knows.config import BotKnowsConfig
from bot_knows.importers.base import ChatImportAdapter
from bot_knows.interfaces.embedding import EmbeddingServiceInterface
from bot_knows.interfaces.graph import GraphServiceInterface
from bot_knows.interfaces.llm import LLMInterface
from bot_knows.interfaces.storage import StorageInterface
from bot_knows.logging import get_logger
from bot_knows.models.ingest import ChatIngest
from bot_knows.models.message import MessageDTO
from bot_knows.models.recall import TopicRecallStateDTO
from bot_knows.models.topic import TopicEvidenceDTO
from bot_knows.services.chat_processing import ChatProcessingService
from bot_knows.services.dedup_service import DedupAction, DedupService
from bot_knows.services.graph_service import GraphService
from bot_knows.services.message_builder import MessageBuilder
from bot_knows.services.recall_service import RecallService
from bot_knows.services.topic_extraction import TopicExtractionService

__all__ = ["BotKnows", "InsertResult"]

logger = get_logger(__name__)


@dataclass
class InsertResult:
    """Statistics from chat insertion."""

    chats_processed: int = 0
    chats_new: int = 0
    chats_skipped: int = 0
    messages_created: int = 0
    topics_created: int = 0
    topics_merged: int = 0
    topics_soft_matched: int = 0
    errors: list[str] = field(default_factory=list)


class BotKnows:
    """Main orchestrator and retriever for the bot_knows knowledge base.

    Accepts implementation classes. Config is loaded from .env automatically.
    For custom implementations, set config_class = None and pass a custom_config dict.

    Example:
        async with BotKnows(
            storage_class=MongoStorageRepository,
            graphdb_class=Neo4jGraphRepository,
            llm_class=OpenAIProvider,
        ) as bk:
            result = await bk.insert_chats("export.json", ChatGPTAdapter)
    """

    def __init__(
        self,
        storage_class: type[StorageInterface],
        graphdb_class: type[GraphServiceInterface],
        llm_class: type[LLMInterface],
        embedding_class: type[EmbeddingServiceInterface] | None = None,
        *,
        storage_custom_config: dict[str, Any] | None = None,
        graphdb_custom_config: dict[str, Any] | None = None,
        llm_custom_config: dict[str, Any] | None = None,
        embedding_custom_config: dict[str, Any] | None = None,
    ) -> None:
        """Initialize BotKnows with implementation classes.

        Args:
            storage_class: Storage implementation class
            graphdb_class: Graph DB implementation class
            llm_class: LLM implementation class
            embedding_class: Embedding implementation class (defaults to llm_class)
            storage_custom_config: Custom config dict if storage_class.config_class is None
            graphdb_custom_config: Custom config dict if graphdb_class.config_class is None
            llm_custom_config: Custom config dict if llm_class.config_class is None
            embedding_custom_config: Custom config dict if embedding_class.config_class is None
        """
        self._config = BotKnowsConfig()  # Loads from .env

        self._storage_class = storage_class
        self._graphdb_class = graphdb_class
        self._llm_class = llm_class
        self._embedding_class = embedding_class or llm_class

        self._storage_custom_config = storage_custom_config
        self._graphdb_custom_config = graphdb_custom_config
        self._llm_custom_config = llm_custom_config
        self._embedding_custom_config = embedding_custom_config

        # Instances (created on connect)
        self._storage: StorageInterface | None = None
        self._graph: GraphServiceInterface | None = None
        self._llm: LLMInterface | None = None
        self._embedding: EmbeddingServiceInterface | None = None

        # Services (wired on connect)
        self._chat_processor: ChatProcessingService | None = None
        self._message_builder = MessageBuilder()
        self._topic_extractor: TopicExtractionService | None = None
        self._dedup_service: DedupService | None = None
        self._graph_service: GraphService | None = None
        self._recall_service: RecallService | None = None

        self._connected = False

    async def _instantiate_class(
        self,
        cls: type,
        custom_config: dict[str, Any] | None,
    ) -> Any:
        """Instantiate an implementation class.

        If cls.config_class is set, instantiate config (loads from .env).
        If cls.config_class is None, use custom_config dict.
        """
        config_class = getattr(cls, "config_class", None)

        if config_class is None:
            # Custom implementation - use dict
            if custom_config is None:
                raise ValueError(
                    f"{cls.__name__} has config_class=None but no custom_config provided"
                )
            return await cls.from_dict(custom_config)
        else:
            # Standard implementation - instantiate settings (loads from .env)
            config = config_class()
            return await cls.from_config(config)

    async def _connect(self) -> None:
        """Initialize connections and services."""
        if self._connected:
            return

        # Instantiate implementations
        self._storage = await self._instantiate_class(
            self._storage_class, self._storage_custom_config
        )
        self._graph = await self._instantiate_class(
            self._graphdb_class, self._graphdb_custom_config
        )
        self._llm = await self._instantiate_class(self._llm_class, self._llm_custom_config)

        if self._embedding_class is self._llm_class:
            self._embedding = self._llm  # type: ignore[assignment]
        else:
            self._embedding = await self._instantiate_class(
                self._embedding_class, self._embedding_custom_config
            )

        # Wire services
        self._chat_processor = ChatProcessingService(self._storage, self._llm)
        self._topic_extractor = TopicExtractionService(self._llm, self._embedding)
        self._dedup_service = DedupService(
            self._embedding,
            self._storage,
            high_threshold=self._config.dedup_high_threshold,
            low_threshold=self._config.dedup_low_threshold,
        )
        self._graph_service = GraphService(self._graph)
        self._recall_service = RecallService(
            self._storage,
            self._graph,
            stability_k=self._config.recall_stability_k,
            semantic_boost=self._config.recall_semantic_boost,
        )

        self._connected = True
        logger.info("bot_knows_connected")

    async def _disconnect(self) -> None:
        """Close all connections."""
        if self._storage and hasattr(self._storage, "close"):
            await self._storage.close()
        if self._graph and hasattr(self._graph, "close"):
            await self._graph.close()
        if self._llm and hasattr(self._llm, "close"):
            await self._llm.close()
        if (
            self._embedding
            and self._embedding is not self._llm
            and hasattr(self._embedding, "close")
        ):
            await self._embedding.close()

        self._connected = False
        logger.info("bot_knows_disconnected")

    async def __aenter__(self) -> "BotKnows":
        """Async context manager entry - connects automatically."""
        await self._connect()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: Any,
    ) -> None:
        """Async context manager exit - disconnects automatically."""
        await self._disconnect()

    def _ensure_connected(self) -> None:
        if not self._connected:
            raise RuntimeError("BotKnows not connected. Use 'async with BotKnows(...) as bk:'")

    # === MAIN WORKFLOW ===

    async def insert_chats(
        self,
        chats: dict[str, Any] | str | Path,
        adapter_class: type[ChatImportAdapter],
    ) -> InsertResult:
        """Ingest chats through the complete processing pipeline.

        Args:
            chats: Raw export data (dict), JSON string, or path to JSON file
            adapter_class: Import adapter class to use for parsing

        Returns:
            InsertResult with statistics
        """
        self._ensure_connected()

        # Parse input
        adapter = adapter_class()
        chat_ingests: list[ChatIngest]

        if isinstance(chats, Path):
            chat_ingests = adapter.parse_file(chats)
        elif isinstance(chats, str):
            path = Path(chats)
            if path.exists():
                chat_ingests = adapter.parse_file(path)
            else:
                chat_ingests = adapter.parse_string(chats)
        else:
            chat_ingests = adapter.parse(chats)

        result = InsertResult()

        for chat_ingest in chat_ingests:
            try:
                await self._process_single_chat(chat_ingest, result)
            except Exception as e:
                logger.error("chat_processing_failed", error=str(e))
                result.errors.append(f"Chat '{chat_ingest.title}': {e}")

        logger.info(
            "insert_chats_completed",
            chats_processed=result.chats_processed,
            chats_new=result.chats_new,
            topics_created=result.topics_created,
        )

        return result

    async def _process_single_chat(
        self,
        chat_ingest: ChatIngest,
        result: InsertResult,
    ) -> None:
        """Process a single chat through the pipeline."""
        assert self._chat_processor is not None
        assert self._storage is not None
        assert self._graph_service is not None

        result.chats_processed += 1

        # Step 1: Process chat (identity, classification, persistence)
        chat, is_new = await self._chat_processor.process(chat_ingest)

        if not is_new:
            result.chats_skipped += 1
            return

        result.chats_new += 1

        # Step 2: Build and save messages
        messages = self._message_builder.build(chat_ingest.messages, chat.id)
        for message in messages:
            await self._storage.save_message(message)
            result.messages_created += 1

        # Step 3: Create graph nodes for chat and messages
        await self._graph_service.add_chat_with_messages(chat, messages)

        # Step 4: Process each message for topics
        for message in messages:
            await self._process_message_topics(message, result)

    async def _process_message_topics(
        self,
        message: MessageDTO,
        result: InsertResult,
    ) -> None:
        """Extract and process topics from a message."""
        assert self._topic_extractor is not None
        assert self._dedup_service is not None
        assert self._storage is not None
        assert self._graph_service is not None
        assert self._recall_service is not None

        candidates = await self._topic_extractor.extract(message)

        for candidate in candidates:
            dedup_result = await self._dedup_service.check_duplicate(candidate.embedding)

            if dedup_result.action == DedupAction.MERGE:
                assert dedup_result.existing_topic is not None
                topic, evidence = await self._topic_extractor.create_evidence_for_existing(
                    candidate, dedup_result.existing_topic
                )
                await self._storage.update_topic(topic)
                await self._storage.append_evidence(evidence)
                await self._graph_service.add_evidence_to_existing_topic(topic, evidence)
                result.topics_merged += 1

            elif dedup_result.action == DedupAction.SOFT_MATCH:
                assert dedup_result.existing_topic is not None
                topic, evidence = await self._topic_extractor.create_topic_and_evidence(candidate)
                await self._storage.save_topic(topic)
                await self._storage.append_evidence(evidence)
                await self._graph_service.add_topic_with_evidence(topic, evidence)
                await self._graph_service.create_potential_duplicate_link(
                    topic.topic_id,
                    dedup_result.existing_topic.topic_id,
                    dedup_result.similarity,
                )
                result.topics_soft_matched += 1
                result.topics_created += 1

            else:  # NEW
                topic, evidence = await self._topic_extractor.create_topic_and_evidence(candidate)
                await self._storage.save_topic(topic)
                await self._storage.append_evidence(evidence)
                await self._graph_service.add_topic_with_evidence(topic, evidence)
                result.topics_created += 1

            # Reinforce recall state
            await self._recall_service.reinforce(
                topic.topic_id,
                confidence=candidate.confidence,
                context="passive",
            )

    # === RETRIEVAL METHODS ===

    async def get_messages_for_chat(self, chat_id: str) -> list[MessageDTO]:
        """Get all messages for a chat."""
        self._ensure_connected()
        assert self._storage is not None
        return await self._storage.get_messages_for_chat(chat_id)

    async def get_related_topics(self, topic_id: str, limit: int = 10) -> list[tuple[str, float]]:
        """Get topics related to a given topic."""
        self._ensure_connected()
        assert self._graph is not None
        return await self._graph.get_related_topics(topic_id, limit)

    async def get_topic_evidence(self, topic_id: str) -> list[TopicEvidenceDTO]:
        """Get all evidence for a topic."""
        self._ensure_connected()
        assert self._storage is not None
        return await self._storage.get_evidence_for_topic(topic_id)

    async def get_chat_topics(self, chat_id: str) -> list[str]:
        """Get all topic IDs associated with a chat's messages."""
        self._ensure_connected()
        assert self._graph is not None
        return await self._graph.get_chat_topics(chat_id)

    async def get_recall_state(self, topic_id: str) -> TopicRecallStateDTO | None:
        """Get recall state for a topic."""
        self._ensure_connected()
        assert self._storage is not None
        return await self._storage.get_recall_state(topic_id)

    async def get_due_topics(self, threshold: float = 0.3) -> list[TopicRecallStateDTO]:
        """Get topics due for recall review."""
        self._ensure_connected()
        assert self._storage is not None
        return await self._storage.get_due_topics(threshold)

    async def get_all_recall_states(self) -> list[TopicRecallStateDTO]:
        """Get all recall states."""
        self._ensure_connected()
        assert self._storage is not None
        return await self._storage.get_all_recall_states()
```
bot_knows/py.typed
ADDED
File without changes
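
`_instantiate_class` above implies a small factory protocol for custom backends: declare `config_class = None`, receive settings through an async `from_dict` classmethod, and optionally expose `close()` so `_disconnect()` can shut it down. A minimal sketch, assuming a hypothetical in-memory backend and eliding the `StorageInterface` methods this diff does not show:

```python
from typing import Any

from bot_knows.interfaces.storage import StorageInterface


class InMemoryStorage(StorageInterface):
    """Hypothetical custom backend; the StorageInterface methods are elided."""

    config_class = None  # tells BotKnows._instantiate_class to use custom_config

    def __init__(self, namespace: str) -> None:
        self._namespace = namespace
        self._data: dict[str, Any] = {}

    @classmethod
    async def from_dict(cls, config: dict[str, Any]) -> "InMemoryStorage":
        # BotKnows._instantiate_class awaits this when config_class is None.
        return cls(namespace=config["namespace"])

    async def close(self) -> None:
        # Optional: _disconnect() only calls close() when it exists.
        self._data.clear()

    # ... save_chat, get_chat, save_message, etc. elided ...
```

It would then be wired in as `storage_class=InMemoryStorage, storage_custom_config={"namespace": "test"}`; omitting the dict trips the `ValueError` raised in `_instantiate_class`.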
bot_knows/services/__init__.py
ADDED
@@ -0,0 +1,24 @@
```python
"""Service layer for bot_knows.

This module exports the main service entry points.
"""

from bot_knows.services.chat_processing import ChatProcessingService
from bot_knows.services.dedup_service import DedupAction, DedupResult, DedupService
from bot_knows.services.graph_service import GraphService
from bot_knows.services.message_builder import MessageBuilder
from bot_knows.services.recall_service import CONTEXT_WEIGHTS, RecallService
from bot_knows.services.topic_extraction import TopicCandidate, TopicExtractionService

__all__ = [
    "CONTEXT_WEIGHTS",
    "ChatProcessingService",
    "DedupAction",
    "DedupResult",
    "DedupService",
    "GraphService",
    "MessageBuilder",
    "RecallService",
    "TopicCandidate",
    "TopicExtractionService",
]
```
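
Everything re-exported here can also be wired by hand, without the `BotKnows` context manager. In the sketch below the constructor signatures mirror what `_connect()` does, but the `storage`, `graph`, `llm`, and `embedding` instances are placeholders you would supply, and the numeric thresholds stand in for the `BotKnowsConfig` settings:

```python
from bot_knows.services import (
    ChatProcessingService,
    DedupService,
    GraphService,
    MessageBuilder,
    RecallService,
    TopicExtractionService,
)

storage = ...    # a connected StorageInterface implementation
graph = ...      # a connected GraphServiceInterface implementation
llm = ...        # a connected LLMInterface implementation
embedding = llm  # or a separate EmbeddingServiceInterface implementation

chat_processor = ChatProcessingService(storage, llm)
message_builder = MessageBuilder()
topic_extractor = TopicExtractionService(llm, embedding)
dedup_service = DedupService(
    embedding,
    storage,
    high_threshold=0.92,  # placeholder for BotKnowsConfig.dedup_high_threshold
    low_threshold=0.80,   # placeholder for BotKnowsConfig.dedup_low_threshold
)
graph_service = GraphService(graph)
recall_service = RecallService(
    storage,
    graph,
    stability_k=1.0,      # placeholder for BotKnowsConfig.recall_stability_k
    semantic_boost=0.1,   # placeholder for BotKnowsConfig.recall_semantic_boost
)
```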
bot_knows/services/chat_processing.py
ADDED
@@ -0,0 +1,182 @@
```python
"""Chat processing service for bot_knows.

This module provides the main service for chat creation and classification.
"""

from bot_knows.interfaces.llm import LLMInterface
from bot_knows.interfaces.storage import StorageInterface
from bot_knows.logging import get_logger
from bot_knows.models.chat import ChatCategory, ChatDTO
from bot_knows.models.ingest import ChatIngest, IngestMessage
from bot_knows.utils.hashing import generate_chat_id

__all__ = [
    "ChatProcessingService",
]

logger = get_logger(__name__)


class ChatProcessingService:
    """Service for chat creation and classification.

    Processes ChatIngest objects into ChatDTO objects, handling:
    - Chat identity resolution (deterministic ID generation)
    - Title resolution (from import or first message)
    - One-time classification (only for new chats)

    Example:
        service = ChatProcessingService(storage, llm)
        chat, is_new = await service.process(chat_ingest)
    """

    def __init__(
        self,
        storage: StorageInterface,
        llm: LLMInterface,
    ) -> None:
        """Initialize service with dependencies.

        Args:
            storage: Storage interface for persistence
            llm: LLM interface for classification
        """
        self._storage = storage
        self._llm = llm

    async def process(self, chat_ingest: ChatIngest) -> tuple[ChatDTO, bool]:
        """Process chat ingest into ChatDTO.

        If chat already exists (by ID), returns existing chat.
        Classification only runs for new chats.

        Args:
            chat_ingest: Ingested chat data

        Returns:
            Tuple of (ChatDTO, is_new) where is_new indicates
            if this was a newly created chat
        """
        # Resolve title first (needed for ID)
        title = self._resolve_title(chat_ingest)

        # Generate deterministic chat ID
        chat_id = generate_chat_id(
            title=title,
            source=chat_ingest.source,
            timestamp=chat_ingest.imported_chat_timestamp,
        )

        # Check if already exists (idempotency)
        existing = await self._storage.get_chat(chat_id)
        if existing:
            logger.debug("chat_already_exists", chat_id=chat_id)
            return existing, False

        # Classify new chat
        category, tags = await self._classify(chat_ingest)

        # Create ChatDTO
        chat = ChatDTO(
            id=chat_id,
            title=title,
            source=chat_ingest.source,
            category=category,
            tags=tags,
            created_on=chat_ingest.imported_chat_timestamp,
        )

        # Persist
        await self._storage.save_chat(chat)

        logger.info(
            "chat_created",
            chat_id=chat_id,
            title=title[:50],
            category=category.value,
            message_count=len(chat_ingest.messages),
        )

        return chat, True

    def _resolve_title(self, chat_ingest: ChatIngest) -> str:
        """Resolve chat title from ingest or first message.

        Priority:
        1. Title from import
        2. First sentence of first message
        3. "Untitled Chat" fallback
        """
        if chat_ingest.title:
            return chat_ingest.title

        # Use first sentence of first message
        for msg in chat_ingest.messages:
            if msg.content:
                # Extract first sentence (up to first period or 100 chars)
                content = msg.content.strip()
                period_idx = content.find(".")
                first_sentence = content[:100]
                if period_idx > 0:
                    first_sentence = content[:period_idx]
                if first_sentence:
                    return first_sentence.strip()

        return "Untitled Chat"

    async def _classify(
        self,
        chat_ingest: ChatIngest,
    ) -> tuple[ChatCategory, list[str]]:
        """Classify chat using LLM.

        Uses first and last user-assistant pairs for classification.
        """
        messages = chat_ingest.messages

        # Find first user-assistant pair
        first_pair = self._find_pair(messages, from_start=True)

        # Find last user-assistant pair
        last_pair = self._find_pair(messages, from_start=False)

        if not first_pair:
            return ChatCategory.GENERAL, []

        # Use first pair for both if no distinct last pair
        if not last_pair or last_pair == first_pair:
            last_pair = first_pair

        try:
            return await self._llm.classify_chat(first_pair, last_pair)
        except Exception as e:
            logger.warning("classification_failed", error=str(e))
            return ChatCategory.GENERAL, []

    def _find_pair(
        self,
        messages: list[IngestMessage],
        from_start: bool,
    ) -> tuple[str, str] | None:
        """Find user-assistant pair from start or end of messages.

        Args:
            messages: List of ingest messages
            from_start: If True, search from start; else from end

        Returns:
            (user_content, assistant_content) tuple or None
        """
        if not messages:
            return None

        msg_iter = messages if from_start else reversed(messages)
        user_content: str | None = None

        for msg in msg_iter:
            if msg.role == "user" and user_content is None:
                user_content = msg.content
            elif msg.role == "assistant" and user_content is not None:
                return (user_content, msg.content)

        return None
```