bot-knows 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. bot_knows/__init__.py +70 -0
  2. bot_knows/config.py +115 -0
  3. bot_knows/domain/__init__.py +5 -0
  4. bot_knows/domain/chat.py +62 -0
  5. bot_knows/domain/message.py +64 -0
  6. bot_knows/domain/relation.py +56 -0
  7. bot_knows/domain/topic.py +132 -0
  8. bot_knows/domain/topic_evidence.py +55 -0
  9. bot_knows/importers/__init__.py +12 -0
  10. bot_knows/importers/base.py +116 -0
  11. bot_knows/importers/chatgpt.py +154 -0
  12. bot_knows/importers/claude.py +172 -0
  13. bot_knows/importers/generic_json.py +272 -0
  14. bot_knows/importers/registry.py +125 -0
  15. bot_knows/infra/__init__.py +5 -0
  16. bot_knows/infra/llm/__init__.py +6 -0
  17. bot_knows/infra/llm/anthropic_provider.py +172 -0
  18. bot_knows/infra/llm/openai_provider.py +195 -0
  19. bot_knows/infra/mongo/__init__.py +5 -0
  20. bot_knows/infra/mongo/client.py +145 -0
  21. bot_knows/infra/mongo/repositories.py +348 -0
  22. bot_knows/infra/neo4j/__init__.py +5 -0
  23. bot_knows/infra/neo4j/client.py +152 -0
  24. bot_knows/infra/neo4j/graph_repository.py +329 -0
  25. bot_knows/infra/redis/__init__.py +6 -0
  26. bot_knows/infra/redis/cache.py +198 -0
  27. bot_knows/infra/redis/client.py +193 -0
  28. bot_knows/interfaces/__init__.py +18 -0
  29. bot_knows/interfaces/embedding.py +55 -0
  30. bot_knows/interfaces/graph.py +194 -0
  31. bot_knows/interfaces/llm.py +70 -0
  32. bot_knows/interfaces/recall.py +92 -0
  33. bot_knows/interfaces/storage.py +225 -0
  34. bot_knows/logging.py +101 -0
  35. bot_knows/models/__init__.py +22 -0
  36. bot_knows/models/chat.py +55 -0
  37. bot_knows/models/ingest.py +70 -0
  38. bot_knows/models/message.py +49 -0
  39. bot_knows/models/recall.py +58 -0
  40. bot_knows/models/topic.py +100 -0
  41. bot_knows/orchestrator.py +398 -0
  42. bot_knows/py.typed +0 -0
  43. bot_knows/services/__init__.py +24 -0
  44. bot_knows/services/chat_processing.py +182 -0
  45. bot_knows/services/dedup_service.py +161 -0
  46. bot_knows/services/graph_service.py +217 -0
  47. bot_knows/services/message_builder.py +135 -0
  48. bot_knows/services/recall_service.py +296 -0
  49. bot_knows/services/tasks.py +128 -0
  50. bot_knows/services/topic_extraction.py +199 -0
  51. bot_knows/utils/__init__.py +22 -0
  52. bot_knows/utils/hashing.py +126 -0
  53. bot_knows-0.1.0.dist-info/METADATA +294 -0
  54. bot_knows-0.1.0.dist-info/RECORD +56 -0
  55. bot_knows-0.1.0.dist-info/WHEEL +4 -0
  56. bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,348 @@
1
+ """MongoDB repositories for bot_knows.
2
+
3
+ This module provides repository implementations for MongoDB storage.
4
+ """
5
+
6
+ from typing import Any, Self
7
+
8
+ import numpy as np
9
+
10
+ from bot_knows.config import MongoSettings
11
+ from bot_knows.infra.mongo.client import MongoClient
12
+ from bot_knows.interfaces.storage import StorageInterface
13
+ from bot_knows.logging import get_logger
14
+ from bot_knows.models.chat import ChatCategory, ChatDTO
15
+ from bot_knows.models.message import MessageDTO
16
+ from bot_knows.models.recall import TopicRecallStateDTO
17
+ from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO
18
+
19
+ __all__ = [
20
+ "MongoStorageRepository",
21
+ ]
22
+
23
+ logger = get_logger(__name__)
24
+
25
+
26
class MongoStorageRepository(StorageInterface):
    """MongoDB implementation of StorageInterface.

    Provides CRUD operations for chats, messages, topics,
    evidence, and recall states. Every DTO is converted to and from a
    plain dict by the ``_*_to_doc`` / ``_doc_to_*`` static helpers at the
    bottom of the class.
    """

    config_class = MongoSettings

    def __init__(self, client: MongoClient) -> None:
        """Initialize repository with MongoDB client.

        The repository does not take ownership of ``client``; ``close()``
        leaves it connected unless the instance was built by
        ``from_config`` / ``from_dict``.

        Args:
            client: Connected MongoClient instance
        """
        self._client = client
        self._owns_client = False

    @classmethod
    async def from_config(cls, config: MongoSettings) -> Self:
        """Factory method for BotKnows instantiation.

        Creates a MongoClient, connects, creates indexes, and returns repository.
        The returned repository owns the client and disconnects it in ``close()``.

        Args:
            config: MongoDB settings

        Returns:
            Connected MongoStorageRepository instance
        """
        client = MongoClient(config)
        await client.connect()
        try:
            await client.create_indexes()
        except BaseException:
            # Nobody else holds a reference to this client yet, so it must
            # be released here or the connection leaks.
            await client.disconnect()
            raise

        instance = cls(client)
        instance._owns_client = True
        return instance

    @classmethod
    async def from_dict(cls, config: dict[str, Any]) -> Self:
        """Factory method for custom config dict.

        Args:
            config: Dictionary with MongoDB settings, passed as keyword
                arguments to ``MongoSettings``

        Returns:
            Connected MongoStorageRepository instance
        """
        settings = MongoSettings(**config)
        return await cls.from_config(settings)

    async def close(self) -> None:
        """Close owned resources.

        Disconnects the client only when this repository created it.
        """
        if self._owns_client and self._client:
            await self._client.disconnect()

    # Chat operations
    async def save_chat(self, chat: ChatDTO) -> str:
        """Save or update a chat (upsert keyed on ``id``).

        Returns:
            The chat ID
        """
        doc = self._chat_to_doc(chat)
        await self._client.chats.replace_one(
            {"id": chat.id},
            doc,
            upsert=True,
        )
        return chat.id

    async def get_chat(self, chat_id: str) -> ChatDTO | None:
        """Get a chat by ID, or None when no document matches."""
        doc = await self._client.chats.find_one({"id": chat_id})
        return self._doc_to_chat(doc) if doc else None

    async def chat_exists(self, chat_id: str) -> bool:
        """Check if a chat exists."""
        # limit=1 lets the server stop counting at the first match.
        count = await self._client.chats.count_documents({"id": chat_id}, limit=1)
        return count > 0

    async def find_chats_by_source(self, source: str) -> list[ChatDTO]:
        """Find all chats from a source."""
        cursor = self._client.chats.find({"source": source})
        return [self._doc_to_chat(doc) async for doc in cursor]

    # Message operations
    async def save_message(self, message: MessageDTO) -> str:
        """Save or update a message (upsert keyed on ``message_id``).

        Returns:
            The message ID
        """
        doc = self._message_to_doc(message)
        await self._client.messages.replace_one(
            {"message_id": message.message_id},
            doc,
            upsert=True,
        )
        return message.message_id

    async def get_message(self, message_id: str) -> MessageDTO | None:
        """Get a message by ID, or None when no document matches."""
        doc = await self._client.messages.find_one({"message_id": message_id})
        return self._doc_to_message(doc) if doc else None

    async def get_messages_for_chat(self, chat_id: str) -> list[MessageDTO]:
        """Get all messages for a chat, ordered by ``created_on`` ascending."""
        cursor = self._client.messages.find({"chat_id": chat_id}).sort("created_on", 1)
        return [self._doc_to_message(doc) async for doc in cursor]

    # Topic operations
    async def save_topic(self, topic: TopicDTO) -> str:
        """Save or update a topic (upsert keyed on ``topic_id``).

        Returns:
            The topic ID
        """
        doc = self._topic_to_doc(topic)
        await self._client.topics.replace_one(
            {"topic_id": topic.topic_id},
            doc,
            upsert=True,
        )
        return topic.topic_id

    async def get_topic(self, topic_id: str) -> TopicDTO | None:
        """Get a topic by ID, or None when no document matches."""
        doc = await self._client.topics.find_one({"topic_id": topic_id})
        return self._doc_to_topic(doc) if doc else None

    async def update_topic(self, topic: TopicDTO) -> None:
        """Update an existing topic.

        Delegates to ``save_topic``, so a topic that does not exist yet is
        inserted rather than rejected.
        """
        await self.save_topic(topic)

    async def find_similar_topics(
        self,
        embedding: list[float],
        threshold: float,
    ) -> list[tuple[TopicDTO, float]]:
        """Find topics with similar embeddings.

        Uses cosine similarity comparison against all topics.
        For production, consider using MongoDB Atlas Vector Search
        or a dedicated vector database.

        Args:
            embedding: Query embedding vector
            threshold: Minimum cosine similarity (inclusive) for a match

        Returns:
            ``(topic, similarity)`` pairs sorted by similarity descending.
            Empty when the query vector has zero norm (including an empty
            embedding). Topics whose stored centroid has zero norm or a
            different dimension than the query are skipped.
        """
        results: list[tuple[TopicDTO, float]] = []
        query_vec = np.array(embedding)
        query_norm = np.linalg.norm(query_vec)

        # Cosine similarity is undefined for a zero vector.
        if query_norm == 0:
            return results

        # Fetch all topics with embeddings
        cursor = self._client.topics.find({"centroid_embedding": {"$exists": True, "$ne": []}})

        async for doc in cursor:
            topic = self._doc_to_topic(doc)
            if not topic.centroid_embedding:
                continue

            doc_vec = np.array(topic.centroid_embedding)

            # A centroid of a different dimension cannot be compared;
            # without this guard np.dot raises and one bad document aborts
            # the whole search.
            if doc_vec.shape != query_vec.shape:
                logger.warning(
                    "topic_embedding_dimension_mismatch",
                    topic_id=topic.topic_id,
                    expected=int(query_vec.size),
                    actual=int(doc_vec.size),
                )
                continue

            doc_norm = np.linalg.norm(doc_vec)
            if doc_norm == 0:
                continue

            # Calculate cosine similarity
            similarity = float(np.dot(query_vec, doc_vec) / (query_norm * doc_norm))

            if similarity >= threshold:
                results.append((topic, similarity))

        # Sort by similarity descending
        results.sort(key=lambda x: x[1], reverse=True)
        return results

    async def get_all_topics(self, limit: int = 1000) -> list[TopicDTO]:
        """Get all topics, capped at ``limit`` documents."""
        cursor = self._client.topics.find().limit(limit)
        return [self._doc_to_topic(doc) async for doc in cursor]

    # Evidence operations
    async def append_evidence(self, evidence: TopicEvidenceDTO) -> str:
        """Append evidence record (never update).

        Returns:
            The evidence ID
        """
        doc = self._evidence_to_doc(evidence)
        await self._client.evidence.insert_one(doc)
        return evidence.evidence_id

    async def get_evidence_for_topic(self, topic_id: str) -> list[TopicEvidenceDTO]:
        """Get all evidence for a topic, ordered by ``timestamp`` ascending."""
        cursor = self._client.evidence.find({"topic_id": topic_id}).sort("timestamp", 1)
        return [self._doc_to_evidence(doc) async for doc in cursor]

    # Recall state operations
    async def save_recall_state(self, state: TopicRecallStateDTO) -> None:
        """Save or update recall state (upsert keyed on ``topic_id``)."""
        doc = self._recall_state_to_doc(state)
        await self._client.recall_states.replace_one(
            {"topic_id": state.topic_id},
            doc,
            upsert=True,
        )

    async def get_recall_state(self, topic_id: str) -> TopicRecallStateDTO | None:
        """Get recall state for a topic, or None when no document matches."""
        doc = await self._client.recall_states.find_one({"topic_id": topic_id})
        return self._doc_to_recall_state(doc) if doc else None

    async def get_due_topics(self, threshold: float) -> list[TopicRecallStateDTO]:
        """Get topics due for recall (strength below threshold).

        Results are ordered by strength ascending, weakest first.
        """
        cursor = self._client.recall_states.find({"strength": {"$lt": threshold}}).sort(
            "strength", 1
        )
        return [self._doc_to_recall_state(doc) async for doc in cursor]

    async def get_all_recall_states(self) -> list[TopicRecallStateDTO]:
        """Get all recall states."""
        cursor = self._client.recall_states.find()
        return [self._doc_to_recall_state(doc) async for doc in cursor]

    # Document conversion helpers
    @staticmethod
    def _chat_to_doc(chat: ChatDTO) -> dict[str, Any]:
        """Convert a ChatDTO to its stored document; category is stored as its enum value."""
        return {
            "id": chat.id,
            "title": chat.title,
            "source": chat.source,
            "category": chat.category.value,
            "tags": chat.tags,
            "created_on": chat.created_on,
            "schema_version": chat.schema_version,
        }

    @staticmethod
    def _doc_to_chat(doc: dict[str, Any]) -> ChatDTO:
        """Build a ChatDTO from a stored document.

        ``tags`` defaults to an empty list and ``schema_version`` to 1 when
        missing; the remaining keys are required.
        """
        return ChatDTO(
            id=doc["id"],
            title=doc["title"],
            source=doc["source"],
            category=ChatCategory(doc["category"]),
            tags=doc.get("tags", []),
            created_on=doc["created_on"],
            schema_version=doc.get("schema_version", 1),
        )

    @staticmethod
    def _message_to_doc(message: MessageDTO) -> dict[str, Any]:
        """Convert a MessageDTO to its stored document."""
        return {
            "message_id": message.message_id,
            "chat_id": message.chat_id,
            "user_content": message.user_content,
            "assistant_content": message.assistant_content,
            "created_on": message.created_on,
            "schema_version": message.schema_version,
        }

    @staticmethod
    def _doc_to_message(doc: dict[str, Any]) -> MessageDTO:
        """Build a MessageDTO from a stored document.

        Missing content fields default to an empty string and
        ``schema_version`` to 1; the remaining keys are required.
        """
        return MessageDTO(
            message_id=doc["message_id"],
            chat_id=doc["chat_id"],
            user_content=doc.get("user_content", ""),
            assistant_content=doc.get("assistant_content", ""),
            created_on=doc["created_on"],
            schema_version=doc.get("schema_version", 1),
        )

    @staticmethod
    def _topic_to_doc(topic: TopicDTO) -> dict[str, Any]:
        """Convert a TopicDTO to its stored document."""
        return {
            "topic_id": topic.topic_id,
            "canonical_name": topic.canonical_name,
            "centroid_embedding": topic.centroid_embedding,
            "evidence_count": topic.evidence_count,
            "importance": topic.importance,
            "recall_strength": topic.recall_strength,
            "schema_version": topic.schema_version,
        }

    @staticmethod
    def _doc_to_topic(doc: dict[str, Any]) -> TopicDTO:
        """Build a TopicDTO from a stored document.

        Only ``topic_id`` and ``canonical_name`` are required; every other
        field falls back to an empty/zero default (``schema_version`` to 1).
        """
        return TopicDTO(
            topic_id=doc["topic_id"],
            canonical_name=doc["canonical_name"],
            centroid_embedding=doc.get("centroid_embedding", []),
            evidence_count=doc.get("evidence_count", 0),
            importance=doc.get("importance", 0.0),
            recall_strength=doc.get("recall_strength", 0.0),
            schema_version=doc.get("schema_version", 1),
        )

    @staticmethod
    def _evidence_to_doc(evidence: TopicEvidenceDTO) -> dict[str, Any]:
        """Convert a TopicEvidenceDTO to its stored document."""
        return {
            "evidence_id": evidence.evidence_id,
            "topic_id": evidence.topic_id,
            "extracted_name": evidence.extracted_name,
            "source_message_id": evidence.source_message_id,
            "confidence": evidence.confidence,
            "timestamp": evidence.timestamp,
            "schema_version": evidence.schema_version,
        }

    @staticmethod
    def _doc_to_evidence(doc: dict[str, Any]) -> TopicEvidenceDTO:
        """Build a TopicEvidenceDTO from a stored document.

        All keys are required except ``schema_version``, which defaults to 1.
        """
        return TopicEvidenceDTO(
            evidence_id=doc["evidence_id"],
            topic_id=doc["topic_id"],
            extracted_name=doc["extracted_name"],
            source_message_id=doc["source_message_id"],
            confidence=doc["confidence"],
            timestamp=doc["timestamp"],
            schema_version=doc.get("schema_version", 1),
        )

    @staticmethod
    def _recall_state_to_doc(state: TopicRecallStateDTO) -> dict[str, Any]:
        """Convert a TopicRecallStateDTO to its stored document."""
        return {
            "topic_id": state.topic_id,
            "strength": state.strength,
            "last_seen": state.last_seen,
            "last_updated": state.last_updated,
            "stability": state.stability,
            "schema_version": state.schema_version,
        }

    @staticmethod
    def _doc_to_recall_state(doc: dict[str, Any]) -> TopicRecallStateDTO:
        """Build a TopicRecallStateDTO from a stored document.

        ``stability`` defaults to 1.0 and ``schema_version`` to 1 when
        missing; the remaining keys are required.
        """
        return TopicRecallStateDTO(
            topic_id=doc["topic_id"],
            strength=doc["strength"],
            last_seen=doc["last_seen"],
            last_updated=doc["last_updated"],
            stability=doc.get("stability", 1.0),
            schema_version=doc.get("schema_version", 1),
        )
@@ -0,0 +1,5 @@
1
+ """Neo4j infrastructure for bot_knows."""
2
+
3
+ from bot_knows.infra.neo4j.client import Neo4jClient
4
+
5
+ __all__ = ["Neo4jClient"]
@@ -0,0 +1,152 @@
1
+ """Neo4j client for bot_knows.
2
+
3
+ This module provides an async Neo4j client wrapper.
4
+ """
5
+
6
+ from typing import Any
7
+
8
+ from neo4j import AsyncDriver, AsyncGraphDatabase
9
+
10
+ from bot_knows.config import Neo4jSettings
11
+ from bot_knows.logging import get_logger
12
+
13
+ __all__ = [
14
+ "Neo4jClient",
15
+ ]
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
class Neo4jClient:
    """Async Neo4j client wrapper.

    Provides connection management and query execution
    for the bot_knows knowledge graph.

    Example:
        client = Neo4jClient(settings)
        await client.connect()

        result = await client.execute_query(
            "MATCH (n:Chat) RETURN n LIMIT 10"
        )

        await client.disconnect()
    """

    def __init__(self, settings: Neo4jSettings) -> None:
        """Initialize client with settings.

        No connection is opened here; call ``connect()`` or use the client
        as an async context manager.

        Args:
            settings: Neo4j connection settings
        """
        self._settings = settings
        self._driver: AsyncDriver | None = None

    async def connect(self) -> None:
        """Initialize connection to Neo4j.

        Does nothing when a driver is already held. The driver is stored on
        the client only after connectivity has been verified; if verification
        fails the driver is closed and the error is re-raised, so a later
        ``connect()`` call starts from scratch.
        """
        if self._driver is not None:
            return

        driver = AsyncGraphDatabase.driver(
            self._settings.uri,
            auth=(
                self._settings.username,
                self._settings.password.get_secret_value(),
            ),
        )

        # Verify connection before publishing the driver. Otherwise a failed
        # attempt would leave an unverified, unclosed driver cached on the
        # client and every later connect() would return early.
        try:
            await driver.verify_connectivity()
        except BaseException:
            await driver.close()
            raise

        self._driver = driver
        logger.info("connected_to_neo4j", uri=self._settings.uri)

    async def disconnect(self) -> None:
        """Close connection to Neo4j.

        Safe to call when not connected.
        """
        if self._driver:
            await self._driver.close()
            self._driver = None
            logger.info("disconnected_from_neo4j")

    @property
    def driver(self) -> AsyncDriver:
        """Get driver instance.

        Raises:
            RuntimeError: If not connected
        """
        if self._driver is None:
            raise RuntimeError("Neo4jClient not connected. Call connect() first.")
        return self._driver

    async def execute_query(
        self,
        query: str,
        parameters: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Execute a Cypher query and return results.

        Args:
            query: Cypher query string
            parameters: Query parameters

        Returns:
            List of result records as dicts

        Raises:
            RuntimeError: If not connected
        """
        async with self.driver.session() as session:
            result = await session.run(query, parameters or {})
            # data() reads every record while the session is still open.
            records = await result.data()
            return records

    async def execute_write(
        self,
        query: str,
        parameters: dict[str, Any] | None = None,
    ) -> None:
        """Execute a write query (CREATE, MERGE, etc.).

        Args:
            query: Cypher query string
            parameters: Query parameters

        Raises:
            RuntimeError: If not connected
        """
        async with self.driver.session() as session:
            result = await session.run(query, parameters or {})
            # run() fetches the result lazily; consuming it here makes sure
            # the statement has fully executed and that any server-side
            # error is raised to the caller rather than left to session
            # teardown.
            await result.consume()

    async def create_indexes(self) -> None:
        """Create indexes for the knowledge graph.

        Each statement uses ``IF NOT EXISTS``, so repeated calls are expected
        to be harmless.
        """
        indexes = [
            "CREATE INDEX chat_id_idx IF NOT EXISTS FOR (c:Chat) ON (c.id)",
            "CREATE INDEX message_id_idx IF NOT EXISTS FOR (m:Message) ON (m.message_id)",
            "CREATE INDEX message_chat_idx IF NOT EXISTS FOR (m:Message) ON (m.chat_id)",
            "CREATE INDEX topic_id_idx IF NOT EXISTS FOR (t:Topic) ON (t.topic_id)",
        ]

        for index_query in indexes:
            await self.execute_write(index_query)

        logger.info("created_neo4j_indexes")

    async def create_constraints(self) -> None:
        """Create uniqueness constraints.

        Best-effort: a statement that fails is logged at debug level and
        skipped, and the remaining constraints are still attempted.
        """
        constraints = [
            "CREATE CONSTRAINT chat_id_unique IF NOT EXISTS FOR (c:Chat) REQUIRE c.id IS UNIQUE",
            "CREATE CONSTRAINT message_id_unique IF NOT EXISTS FOR (m:Message) REQUIRE m.message_id IS UNIQUE",
            "CREATE CONSTRAINT topic_id_unique IF NOT EXISTS FOR (t:Topic) REQUIRE t.topic_id IS UNIQUE",
        ]

        for constraint_query in constraints:
            try:
                await self.execute_write(constraint_query)
            except Exception as e:
                # Deliberately tolerant: the constraint may already exist.
                # NOTE(review): these target the same properties that
                # create_indexes() indexes, which may also make creation
                # fail -- confirm against the Neo4j version in use.
                logger.debug("constraint_creation_skipped", error=str(e))

        logger.info("created_neo4j_constraints")

    async def __aenter__(self) -> "Neo4jClient":
        """Async context manager entry: connect and return the client."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Async context manager exit: disconnect; exceptions are not suppressed."""
        await self.disconnect()