bot-knows 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. bot_knows/__init__.py +70 -0
  2. bot_knows/config.py +115 -0
  3. bot_knows/domain/__init__.py +5 -0
  4. bot_knows/domain/chat.py +62 -0
  5. bot_knows/domain/message.py +64 -0
  6. bot_knows/domain/relation.py +56 -0
  7. bot_knows/domain/topic.py +132 -0
  8. bot_knows/domain/topic_evidence.py +55 -0
  9. bot_knows/importers/__init__.py +12 -0
  10. bot_knows/importers/base.py +116 -0
  11. bot_knows/importers/chatgpt.py +154 -0
  12. bot_knows/importers/claude.py +172 -0
  13. bot_knows/importers/generic_json.py +272 -0
  14. bot_knows/importers/registry.py +125 -0
  15. bot_knows/infra/__init__.py +5 -0
  16. bot_knows/infra/llm/__init__.py +6 -0
  17. bot_knows/infra/llm/anthropic_provider.py +172 -0
  18. bot_knows/infra/llm/openai_provider.py +195 -0
  19. bot_knows/infra/mongo/__init__.py +5 -0
  20. bot_knows/infra/mongo/client.py +145 -0
  21. bot_knows/infra/mongo/repositories.py +348 -0
  22. bot_knows/infra/neo4j/__init__.py +5 -0
  23. bot_knows/infra/neo4j/client.py +152 -0
  24. bot_knows/infra/neo4j/graph_repository.py +329 -0
  25. bot_knows/infra/redis/__init__.py +6 -0
  26. bot_knows/infra/redis/cache.py +198 -0
  27. bot_knows/infra/redis/client.py +193 -0
  28. bot_knows/interfaces/__init__.py +18 -0
  29. bot_knows/interfaces/embedding.py +55 -0
  30. bot_knows/interfaces/graph.py +194 -0
  31. bot_knows/interfaces/llm.py +70 -0
  32. bot_knows/interfaces/recall.py +92 -0
  33. bot_knows/interfaces/storage.py +225 -0
  34. bot_knows/logging.py +101 -0
  35. bot_knows/models/__init__.py +22 -0
  36. bot_knows/models/chat.py +55 -0
  37. bot_knows/models/ingest.py +70 -0
  38. bot_knows/models/message.py +49 -0
  39. bot_knows/models/recall.py +58 -0
  40. bot_knows/models/topic.py +100 -0
  41. bot_knows/orchestrator.py +398 -0
  42. bot_knows/py.typed +0 -0
  43. bot_knows/services/__init__.py +24 -0
  44. bot_knows/services/chat_processing.py +182 -0
  45. bot_knows/services/dedup_service.py +161 -0
  46. bot_knows/services/graph_service.py +217 -0
  47. bot_knows/services/message_builder.py +135 -0
  48. bot_knows/services/recall_service.py +296 -0
  49. bot_knows/services/tasks.py +128 -0
  50. bot_knows/services/topic_extraction.py +199 -0
  51. bot_knows/utils/__init__.py +22 -0
  52. bot_knows/utils/hashing.py +126 -0
  53. bot_knows-0.1.0.dist-info/METADATA +294 -0
  54. bot_knows-0.1.0.dist-info/RECORD +56 -0
  55. bot_knows-0.1.0.dist-info/WHEEL +4 -0
  56. bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,161 @@
1
+ """Semantic deduplication service for bot_knows.
2
+
3
+ This module provides the service for deduplicating topics based on
4
+ semantic similarity using embedding vectors.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ from enum import StrEnum
9
+
10
+ from bot_knows.interfaces.embedding import EmbeddingServiceInterface
11
+ from bot_knows.interfaces.storage import StorageInterface
12
+ from bot_knows.logging import get_logger
13
+ from bot_knows.models.topic import TopicDTO
14
+
15
+ __all__ = [
16
+ "DedupAction",
17
+ "DedupResult",
18
+ "DedupService",
19
+ ]
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
+ class DedupAction(StrEnum):
25
+ """Actions resulting from deduplication check."""
26
+
27
+ MERGE = "merge"
28
+ """Similarity >= high_threshold: same topic, merge evidence"""
29
+
30
+ SOFT_MATCH = "soft_match"
31
+ """Similarity between low and high threshold: new topic + POTENTIALLY_DUPLICATE_OF edge"""
32
+
33
+ NEW = "new"
34
+ """Similarity < low_threshold: completely new topic"""
35
+
36
+
37
@dataclass
class DedupResult:
    """Outcome of a single deduplication check.

    Attributes:
        action: The DedupAction the check resolved to.
        existing_topic: The stored topic that was matched; None by default.
        similarity: Similarity score of the match; 0.0 by default.
    """

    action: DedupAction
    existing_topic: TopicDTO | None = None
    similarity: float = 0.0
44
+
45
+
46
class DedupService:
    """Semantic deduplication service.

    Looks up stored topics similar to a candidate embedding and classifies
    the best match against two thresholds (default values shown):

    - MERGE (>= 0.92): Same topic, link evidence to existing
    - SOFT_MATCH (0.80-0.92): Create new topic with POTENTIALLY_DUPLICATE_OF edge
    - NEW (< 0.80): Create completely new topic

    Example:
        service = DedupService(embedding_service, storage)
        result = await service.check_duplicate(embedding)
        if result.action == DedupAction.MERGE:
            ...  # Link evidence to result.existing_topic
    """

    def __init__(
        self,
        embedding_service: EmbeddingServiceInterface,
        storage: StorageInterface,
        high_threshold: float = 0.92,
        low_threshold: float = 0.80,
    ) -> None:
        """Initialize service with dependencies.

        Args:
            embedding_service: Embedding service for similarity calculation.
                It is stored on the instance; none of the methods in this
                class read it.
            storage: Storage interface used for the similar-topic lookup
            high_threshold: Similarity threshold for MERGE (default: 0.92)
            low_threshold: Similarity threshold for SOFT_MATCH (default: 0.80)
        """
        self._embedding = embedding_service
        self._storage = storage
        self._high_threshold = high_threshold
        self._low_threshold = low_threshold

    @staticmethod
    def _best_match(
        matches: list[tuple[TopicDTO, float]],
    ) -> tuple[TopicDTO, float]:
        """Return the (topic, similarity) pair with the highest similarity."""
        # Do not rely on the storage backend returning results best-first.
        # max() returns the first of several equal maxima, so for a list that
        # is already sorted best-first this is the same element as matches[0].
        return max(matches, key=lambda match: match[1])

    async def check_duplicate(
        self,
        candidate_embedding: list[float],
    ) -> DedupResult:
        """Check if candidate embedding matches existing topics.

        Args:
            candidate_embedding: Embedding vector for candidate topic

        Returns:
            DedupResult with action and matched topic (if any). When storage
            returns no similar topics the result is NEW with default fields.
        """
        # Only topics at or above the low threshold are requested, so any
        # result is at least a soft match.
        similar_topics = await self._storage.find_similar_topics(
            embedding=candidate_embedding,
            threshold=self._low_threshold,
        )

        if not similar_topics:
            return DedupResult(action=DedupAction.NEW)

        best_topic, best_similarity = self._best_match(similar_topics)

        if best_similarity >= self._high_threshold:
            logger.debug(
                "dedup_merge",
                topic_id=best_topic.topic_id,
                similarity=best_similarity,
            )
            return DedupResult(
                action=DedupAction.MERGE,
                existing_topic=best_topic,
                similarity=best_similarity,
            )

        # Between thresholds: soft match
        logger.debug(
            "dedup_soft_match",
            topic_id=best_topic.topic_id,
            similarity=best_similarity,
        )
        return DedupResult(
            action=DedupAction.SOFT_MATCH,
            existing_topic=best_topic,
            similarity=best_similarity,
        )

    async def find_best_match(
        self,
        candidate_embedding: list[float],
        min_similarity: float = 0.5,
    ) -> tuple[TopicDTO, float] | None:
        """Find the best matching topic above minimum similarity.

        Args:
            candidate_embedding: Embedding vector to match
            min_similarity: Minimum similarity threshold passed to storage

        Returns:
            (TopicDTO, similarity) tuple with the highest similarity, or
            None if storage returns no match
        """
        similar_topics = await self._storage.find_similar_topics(
            embedding=candidate_embedding,
            threshold=min_similarity,
        )

        if not similar_topics:
            return None
        return self._best_match(similar_topics)

    @property
    def high_threshold(self) -> float:
        """Get high similarity threshold (MERGE)."""
        return self._high_threshold

    @property
    def low_threshold(self) -> float:
        """Get low similarity threshold (SOFT_MATCH)."""
        return self._low_threshold
@@ -0,0 +1,217 @@
1
+ """Graph service for bot_knows.
2
+
3
+ This module provides the service for managing the knowledge graph.
4
+ """
5
+
6
+ from bot_knows.interfaces.graph import GraphServiceInterface
7
+ from bot_knows.logging import get_logger
8
+ from bot_knows.models.chat import ChatDTO
9
+ from bot_knows.models.message import MessageDTO
10
+ from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO
11
+
12
+ __all__ = [
13
+ "GraphService",
14
+ ]
15
+
16
+ logger = get_logger(__name__)
17
+
18
+
19
class GraphService:
    """Higher-level operations on top of the graph interface.

    Every method delegates to the wrapped graph interface; the write
    helpers that emit a debug log do so after the graph calls complete.

    Graph structure:
        - (Message)-[:IS_PART_OF]->(Chat)
        - (Message)-[:FOLLOWS_AFTER]->(Message)
        - (Topic)-[:IS_SUPPORTED_BY {evidence}]->(Message)
        - (Topic)-[:POTENTIALLY_DUPLICATE_OF {similarity}]->(Topic)
        - (Topic)-[:RELATES_TO {type, weight}]->(Topic)

    Example:
        service = GraphService(graph_interface)
        await service.add_chat_with_messages(chat, messages)
    """

    def __init__(self, graph: GraphServiceInterface) -> None:
        """Store the graph interface implementation to delegate to.

        Args:
            graph: Graph interface implementation
        """
        self._graph = graph

    async def add_chat_with_messages(
        self,
        chat: ChatDTO,
        messages: list[MessageDTO],
    ) -> None:
        """Write a chat and its messages into the graph.

        Creates, in this order: the Chat node, then for each message its
        Message node, its IS_PART_OF edge to the chat, and (for every
        message after the first) a FOLLOWS_AFTER edge to the message
        handled just before it.

        Args:
            chat: Chat to add
            messages: Messages to add (should be ordered)
        """
        graph = self._graph
        await graph.create_chat_node(chat)

        # ID of the message handled on the previous iteration, if any.
        prior_id: str | None = None

        for current in messages:
            current_id = current.message_id

            await graph.create_message_node(current)
            await graph.create_is_part_of_edge(
                message_id=current_id,
                chat_id=chat.id,
            )

            # No predecessor exists for the first message.
            if prior_id:
                await graph.create_follows_after_edge(
                    message_id=current_id,
                    previous_message_id=prior_id,
                )

            prior_id = current_id

        logger.debug(
            "chat_added_to_graph",
            chat_id=chat.id,
            message_count=len(messages),
        )

    async def add_topic_with_evidence(
        self,
        topic: TopicDTO,
        evidence: TopicEvidenceDTO,
    ) -> None:
        """Create a topic node and link it to its supporting message.

        Creates the Topic node first, then an IS_SUPPORTED_BY edge that
        carries the evidence.

        Args:
            topic: Topic to add
            evidence: Evidence linking topic to message
        """
        graph = self._graph
        await graph.create_topic_node(topic)
        await graph.create_is_supported_by_edge(
            topic_id=topic.topic_id,
            message_id=evidence.source_message_id,
            evidence=evidence,
        )

        logger.debug(
            "topic_added_to_graph",
            topic_id=topic.topic_id,
            message_id=evidence.source_message_id,
        )

    async def add_evidence_to_existing_topic(
        self,
        topic: TopicDTO,
        evidence: TopicEvidenceDTO,
    ) -> None:
        """Attach new evidence to a topic that is already in the graph.

        Updates the topic node first, then creates an IS_SUPPORTED_BY edge
        that carries the evidence.

        Args:
            topic: Updated topic (with new centroid)
            evidence: New evidence
        """
        graph = self._graph
        await graph.update_topic_node(topic)
        await graph.create_is_supported_by_edge(
            topic_id=topic.topic_id,
            message_id=evidence.source_message_id,
            evidence=evidence,
        )

        logger.debug(
            "evidence_added_to_topic",
            topic_id=topic.topic_id,
            evidence_id=evidence.evidence_id,
        )

    async def create_potential_duplicate_link(
        self,
        new_topic_id: str,
        existing_topic_id: str,
        similarity: float,
    ) -> None:
        """Create a POTENTIALLY_DUPLICATE_OF edge from the new topic.

        Args:
            new_topic_id: New topic ID
            existing_topic_id: Existing similar topic ID
            similarity: Similarity score
        """
        edge_args = {
            "topic_id": new_topic_id,
            "existing_topic_id": existing_topic_id,
            "similarity": similarity,
        }
        await self._graph.create_potentially_duplicate_of_edge(**edge_args)

        logger.debug(
            "potential_duplicate_link_created",
            new_topic=new_topic_id,
            existing_topic=existing_topic_id,
            similarity=similarity,
        )

    async def create_topic_relation(
        self,
        topic_id: str,
        related_topic_id: str,
        relation_type: str,
        weight: float,
    ) -> None:
        """Create a RELATES_TO edge between two topics.

        Args:
            topic_id: Source topic ID
            related_topic_id: Related topic ID
            relation_type: Type of relationship
            weight: Relationship strength (0.0-1.0)
        """
        edge_args = {
            "topic_id": topic_id,
            "related_topic_id": related_topic_id,
            "relation_type": relation_type,
            "weight": weight,
        }
        await self._graph.create_relates_to_edge(**edge_args)

    async def get_related_topics(
        self,
        topic_id: str,
        limit: int = 10,
    ) -> list[tuple[str, float]]:
        """Fetch the topics related to the given topic.

        Args:
            topic_id: Topic to find relations for
            limit: Maximum results

        Returns:
            List of (topic_id, weight) tuples
        """
        related = await self._graph.get_related_topics(topic_id, limit)
        return related
@@ -0,0 +1,135 @@
1
+ """Message builder service for bot_knows.
2
+
3
+ This module provides the service for building MessageDTOs from IngestMessages.
4
+ """
5
+
6
+ from bot_knows.logging import get_logger
7
+ from bot_knows.models.ingest import IngestMessage
8
+ from bot_knows.models.message import MessageDTO
9
+ from bot_knows.utils.hashing import generate_message_id
10
+
11
+ __all__ = [
12
+ "MessageBuilder",
13
+ ]
14
+
15
+ logger = get_logger(__name__)
16
+
17
+
18
class MessageBuilder:
    """Turn a list of IngestMessages into MessageDTOs.

    The builder:
    - pairs each user message with the assistant message that follows it
    - generates a deterministic message ID for every MessageDTO
    - emits system messages and unpaired user/assistant messages on their own

    Example:
        builder = MessageBuilder()
        messages = builder.build(ingest_messages, chat_id)
    """

    def build(
        self,
        ingest_messages: list[IngestMessage],
        chat_id: str,
    ) -> list[MessageDTO]:
        """Build MessageDTOs from IngestMessages.

        Messages are processed in timestamp order. A user message is held
        until the next assistant message, and the pair becomes one
        MessageDTO carrying the user message's timestamp. A user message
        that is followed by another user message, or that is still held at
        the end, is emitted with empty assistant_content. An assistant
        message with no held user message is emitted with empty
        user_content. System messages are emitted immediately with empty
        user_content and their text as assistant_content. Messages with any
        other role produce no output.

        Args:
            ingest_messages: List of ingested messages
            chat_id: Parent chat ID

        Returns:
            List of MessageDTO objects
        """
        if not ingest_messages:
            return []

        # Work on a timestamp-ordered copy; list.sort is stable, so messages
        # with equal timestamps keep their input order.
        ordered = list(ingest_messages)
        ordered.sort(key=lambda item: item.timestamp)

        built: list[MessageDTO] = []
        waiting_user: IngestMessage | None = None

        for entry in ordered:
            if entry.role == "system":
                # System text goes into assistant_content of a standalone DTO.
                built.append(
                    self._create_message(
                        chat_id=chat_id,
                        user_content="",
                        assistant_content=entry.content,
                        timestamp=entry.timestamp,
                    )
                )
                continue

            if entry.role == "user":
                # A second user message in a row flushes the held one.
                if waiting_user:
                    built.append(self._user_only_message(chat_id, waiting_user))
                waiting_user = entry
                continue

            if entry.role == "assistant":
                # Pair with the held user message when there is one; the
                # pair takes the user message's timestamp.
                if waiting_user:
                    question = waiting_user.content
                    stamp = waiting_user.timestamp
                else:
                    question = ""
                    stamp = entry.timestamp

                built.append(
                    self._create_message(
                        chat_id=chat_id,
                        user_content=question,
                        assistant_content=entry.content,
                        timestamp=stamp,
                    )
                )
                waiting_user = None

        # A user message still held at the end never received a reply.
        if waiting_user:
            built.append(self._user_only_message(chat_id, waiting_user))

        logger.debug(
            "messages_built",
            chat_id=chat_id,
            input_count=len(ingest_messages),
            output_count=len(built),
        )

        return built

    def _user_only_message(
        self,
        chat_id: str,
        user_message: IngestMessage,
    ) -> MessageDTO:
        """Create a MessageDTO for a user message with no assistant reply."""
        return self._create_message(
            chat_id=chat_id,
            user_content=user_message.content,
            assistant_content="",
            timestamp=user_message.timestamp,
        )

    def _create_message(
        self,
        chat_id: str,
        user_content: str,
        assistant_content: str,
        timestamp: int,
    ) -> MessageDTO:
        """Create a MessageDTO whose ID is derived from its own fields."""
        return MessageDTO(
            message_id=generate_message_id(
                chat_id=chat_id,
                user_content=user_content,
                assistant_content=assistant_content,
                timestamp=timestamp,
            ),
            chat_id=chat_id,
            user_content=user_content,
            assistant_content=assistant_content,
            created_on=timestamp,
        )