bot-knows 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. bot_knows/__init__.py +70 -0
  2. bot_knows/config.py +115 -0
  3. bot_knows/domain/__init__.py +5 -0
  4. bot_knows/domain/chat.py +62 -0
  5. bot_knows/domain/message.py +64 -0
  6. bot_knows/domain/relation.py +56 -0
  7. bot_knows/domain/topic.py +132 -0
  8. bot_knows/domain/topic_evidence.py +55 -0
  9. bot_knows/importers/__init__.py +12 -0
  10. bot_knows/importers/base.py +116 -0
  11. bot_knows/importers/chatgpt.py +154 -0
  12. bot_knows/importers/claude.py +172 -0
  13. bot_knows/importers/generic_json.py +272 -0
  14. bot_knows/importers/registry.py +125 -0
  15. bot_knows/infra/__init__.py +5 -0
  16. bot_knows/infra/llm/__init__.py +6 -0
  17. bot_knows/infra/llm/anthropic_provider.py +172 -0
  18. bot_knows/infra/llm/openai_provider.py +195 -0
  19. bot_knows/infra/mongo/__init__.py +5 -0
  20. bot_knows/infra/mongo/client.py +145 -0
  21. bot_knows/infra/mongo/repositories.py +348 -0
  22. bot_knows/infra/neo4j/__init__.py +5 -0
  23. bot_knows/infra/neo4j/client.py +152 -0
  24. bot_knows/infra/neo4j/graph_repository.py +329 -0
  25. bot_knows/infra/redis/__init__.py +6 -0
  26. bot_knows/infra/redis/cache.py +198 -0
  27. bot_knows/infra/redis/client.py +193 -0
  28. bot_knows/interfaces/__init__.py +18 -0
  29. bot_knows/interfaces/embedding.py +55 -0
  30. bot_knows/interfaces/graph.py +194 -0
  31. bot_knows/interfaces/llm.py +70 -0
  32. bot_knows/interfaces/recall.py +92 -0
  33. bot_knows/interfaces/storage.py +225 -0
  34. bot_knows/logging.py +101 -0
  35. bot_knows/models/__init__.py +22 -0
  36. bot_knows/models/chat.py +55 -0
  37. bot_knows/models/ingest.py +70 -0
  38. bot_knows/models/message.py +49 -0
  39. bot_knows/models/recall.py +58 -0
  40. bot_knows/models/topic.py +100 -0
  41. bot_knows/orchestrator.py +398 -0
  42. bot_knows/py.typed +0 -0
  43. bot_knows/services/__init__.py +24 -0
  44. bot_knows/services/chat_processing.py +182 -0
  45. bot_knows/services/dedup_service.py +161 -0
  46. bot_knows/services/graph_service.py +217 -0
  47. bot_knows/services/message_builder.py +135 -0
  48. bot_knows/services/recall_service.py +296 -0
  49. bot_knows/services/tasks.py +128 -0
  50. bot_knows/services/topic_extraction.py +199 -0
  51. bot_knows/utils/__init__.py +22 -0
  52. bot_knows/utils/hashing.py +126 -0
  53. bot_knows-0.1.0.dist-info/METADATA +294 -0
  54. bot_knows-0.1.0.dist-info/RECORD +56 -0
  55. bot_knows-0.1.0.dist-info/WHEEL +4 -0
  56. bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,225 @@
1
+ """Storage interface for bot_knows.
2
+
3
+ This module defines the Protocol for persistent storage operations.
4
+ """
5
+
6
+ from typing import ClassVar, Protocol, runtime_checkable
7
+
8
+ from bot_knows.models.chat import ChatDTO
9
+ from bot_knows.models.message import MessageDTO
10
+ from bot_knows.models.recall import TopicRecallStateDTO
11
+ from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO
12
+
13
+ __all__ = [
14
+ "StorageInterface",
15
+ ]
16
+
17
+
18
@runtime_checkable
class StorageInterface(Protocol):
    """Structural contract that every persistence backend must satisfy.

    A conforming implementation exposes asynchronous operations for the
    five record families handled by bot_knows: chats, messages, topics,
    topic evidence, and per-topic recall state.
    """

    # Class-level attribute, ``None`` unless an implementation overrides it.
    # NOTE(review): presumably the settings class used to build the
    # implementation -- confirm against the concrete backends.
    config_class: ClassVar[type | None] = None

    # ------------------------------------------------------------------
    # Chats
    # ------------------------------------------------------------------
    async def save_chat(self, chat: ChatDTO) -> str:
        """Persist a chat.

        Args:
            chat: The chat record to store.

        Returns:
            The ID of the stored chat.
        """
        ...

    async def get_chat(self, chat_id: str) -> ChatDTO | None:
        """Look up a single chat.

        Args:
            chat_id: ID of the chat to fetch.

        Returns:
            The matching ChatDTO, or None when no such chat is stored.
        """
        ...

    async def chat_exists(self, chat_id: str) -> bool:
        """Report whether a chat is already stored.

        Args:
            chat_id: ID of the chat to test.

        Returns:
            True when the chat exists, False otherwise.
        """
        ...

    async def find_chats_by_source(self, source: str) -> list[ChatDTO]:
        """List every chat that came from one import source.

        Args:
            source: Import source used as the filter.

        Returns:
            The chats whose source matches.
        """
        ...

    # ------------------------------------------------------------------
    # Messages
    # ------------------------------------------------------------------
    async def save_message(self, message: MessageDTO) -> str:
        """Persist a message.

        Args:
            message: The message record to store.

        Returns:
            The ID of the stored message.
        """
        ...

    async def get_message(self, message_id: str) -> MessageDTO | None:
        """Look up a single message.

        Args:
            message_id: ID of the message to fetch.

        Returns:
            The matching MessageDTO, or None when it is not stored.
        """
        ...

    async def get_messages_for_chat(self, chat_id: str) -> list[MessageDTO]:
        """Fetch every message that belongs to a chat.

        Args:
            chat_id: ID of the parent chat.

        Returns:
            The chat's messages, ordered by timestamp.
        """
        ...

    # ------------------------------------------------------------------
    # Topics
    # ------------------------------------------------------------------
    async def save_topic(self, topic: TopicDTO) -> str:
        """Persist a topic.

        Args:
            topic: The topic record to store.

        Returns:
            The ID of the stored topic.
        """
        ...

    async def get_topic(self, topic_id: str) -> TopicDTO | None:
        """Look up a single topic.

        Args:
            topic_id: ID of the topic to fetch.

        Returns:
            The matching TopicDTO, or None when it is not stored.
        """
        ...

    async def update_topic(self, topic: TopicDTO) -> None:
        """Overwrite a topic that is already stored.

        Args:
            topic: The topic carrying the new field values.
        """
        ...

    async def find_similar_topics(
        self,
        embedding: list[float],
        threshold: float,
    ) -> list[tuple[TopicDTO, float]]:
        """Search for topics whose embedding is close to a query vector.

        Args:
            embedding: The query embedding vector.
            threshold: Lowest similarity a topic may have to be returned.

        Returns:
            ``(TopicDTO, similarity)`` pairs, highest similarity first.
        """
        ...

    async def get_all_topics(self, limit: int = 1000) -> list[TopicDTO]:
        """Fetch topics in bulk (intended for batch operations).

        Args:
            limit: Upper bound on the number of topics returned.

        Returns:
            The retrieved topics.
        """
        ...

    # ------------------------------------------------------------------
    # Evidence (append-only)
    # ------------------------------------------------------------------
    async def append_evidence(self, evidence: TopicEvidenceDTO) -> str:
        """Add an evidence record; evidence is never updated or deleted.

        Args:
            evidence: The evidence record to append.

        Returns:
            The ID of the appended evidence.
        """
        ...

    async def get_evidence_for_topic(self, topic_id: str) -> list[TopicEvidenceDTO]:
        """Fetch every evidence record attached to a topic.

        Args:
            topic_id: ID of the topic whose evidence is wanted.

        Returns:
            The topic's evidence records.
        """
        ...

    # ------------------------------------------------------------------
    # Recall state
    # ------------------------------------------------------------------
    async def save_recall_state(self, state: TopicRecallStateDTO) -> None:
        """Create or replace the recall state of a topic.

        Args:
            state: The recall state to store.
        """
        ...

    async def get_recall_state(self, topic_id: str) -> TopicRecallStateDTO | None:
        """Look up the recall state of a topic.

        Args:
            topic_id: ID of the topic whose state is wanted.

        Returns:
            The matching TopicRecallStateDTO, or None when none is stored.
        """
        ...

    async def get_due_topics(self, threshold: float) -> list[TopicRecallStateDTO]:
        """Fetch the recall states of topics that are due for review.

        Args:
            threshold: Strength cut-off; topics below it count as due.

        Returns:
            The recall states of the due topics.
        """
        ...

    async def get_all_recall_states(self) -> list[TopicRecallStateDTO]:
        """Fetch every stored recall state (used for batch decay updates).

        Returns:
            All recall states.
        """
        ...
bot_knows/logging.py ADDED
@@ -0,0 +1,101 @@
1
+ """Structured logging for bot_knows.
2
+
3
+ This module provides a configured structlog logger with JSON output
4
+ for production and pretty console output for development.
5
+ """
6
+
7
+ import logging
8
+ import sys
9
+ from typing import Any
10
+
11
+ import structlog
12
+
13
+ __all__ = [
14
+ "configure_logging",
15
+ "get_logger",
16
+ ]
17
+
18
+
19
def configure_logging(
    level: int = logging.INFO,
    json_output: bool = False,
    add_timestamp: bool = True,
) -> None:
    """Configure structlog and the standard library logging backend.

    Args:
        level: Logging level handed to ``logging.basicConfig`` (default: INFO).
        json_output: If True, render events as JSON; if False, render them
            with the coloured development console renderer.
        add_timestamp: If True, prepend an ISO-formatted timestamp processor.

    Note:
        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so ``level`` and the output stream only take effect the
        first time the root logger is configured in the process.
    """
    # The timestamp processor, when enabled, runs before everything else.
    processors: list[Any] = []
    if add_timestamp:
        processors.append(structlog.processors.TimeStamper(fmt="iso"))

    # Processors shared by both output modes.
    processors.extend(
        [
            structlog.contextvars.merge_contextvars,
            structlog.stdlib.add_log_level,
            structlog.stdlib.add_logger_name,
            structlog.stdlib.PositionalArgumentsFormatter(),
            structlog.processors.StackInfoRenderer(),
            structlog.processors.UnicodeDecoder(),
        ]
    )

    if json_output:
        # JSONRenderer does not format exceptions itself: without this
        # processor ``logger.exception(...)`` would serialise a bare
        # ``"exc_info": true`` and the traceback would be lost.
        processors.append(structlog.processors.format_exc_info)
        processors.append(structlog.processors.JSONRenderer())
    else:
        # ConsoleRenderer formats exc_info on its own via exception_formatter.
        processors.append(
            structlog.dev.ConsoleRenderer(
                colors=True,
                exception_formatter=structlog.dev.plain_traceback,
            )
        )

    # Configure structlog
    structlog.configure(
        processors=processors,
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )

    # Configure standard library logging. structlog has already rendered the
    # final line, so the stdlib format is just the bare message.
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=level,
    )

    # Set levels for noisy third-party loggers
    for noisy_logger in ("httpx", "httpcore", "motor", "neo4j"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
75
+
76
+
77
def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Return a structlog logger for the given name.

    Args:
        name: Name for the logger; callers normally pass their module's
            ``__name__``. May be omitted.

    Returns:
        The logger produced by ``structlog.get_logger``.
    """
    bound_logger = structlog.get_logger(name)
    return bound_logger
87
+
88
+
89
# Import-time convenience: apply the default logging configuration once.
# The flag records whether that default configuration has already been applied.
_configured = False


def _ensure_configured() -> None:
    """Apply the default logging configuration unless it was already applied."""
    global _configured
    if _configured:
        return
    configure_logging()
    _configured = True


_ensure_configured()
@@ -0,0 +1,22 @@
1
+ """Public DTO models for bot_knows.
2
+
3
+ This module exports all public data transfer objects.
4
+ """
5
+
6
+ from bot_knows.models.chat import ChatCategory, ChatDTO
7
+ from bot_knows.models.ingest import ChatIngest, IngestMessage
8
+ from bot_knows.models.message import MessageDTO
9
+ from bot_knows.models.recall import RecallItemDTO, TopicRecallStateDTO
10
+ from bot_knows.models.topic import TopicDTO, TopicEvidenceDTO
11
+
12
+ __all__ = [
13
+ "ChatCategory",
14
+ "ChatDTO",
15
+ "ChatIngest",
16
+ "IngestMessage",
17
+ "MessageDTO",
18
+ "RecallItemDTO",
19
+ "TopicDTO",
20
+ "TopicEvidenceDTO",
21
+ "TopicRecallStateDTO",
22
+ ]
@@ -0,0 +1,55 @@
1
+ """Chat models for bot_knows.
2
+
3
+ These models represent processed chats in the knowledge base.
4
+ """
5
+
6
+ from enum import StrEnum
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ __all__ = [
11
+ "ChatCategory",
12
+ "ChatDTO",
13
+ ]
14
+
15
+
16
+ class ChatCategory(StrEnum):
17
+ """Categories for chat classification.
18
+
19
+ Used by the LLM-based classifier to categorize chats
20
+ based on their content and purpose.
21
+ """
22
+
23
+ CODING = "coding"
24
+ RESEARCH = "research"
25
+ WRITING = "writing"
26
+ BRAINSTORMING = "brainstorming"
27
+ DEBUGGING = "debugging"
28
+ LEARNING = "learning"
29
+ GENERAL = "general"
30
+ OTHER = "other"
31
+
32
+
33
class ChatDTO(BaseModel, frozen=True):
    """Immutable metadata record for a processed chat.

    Only metadata is held here; the message content of the chat is
    stored separately.

    Attributes:
        id: Deterministic chat ID (SHA256 of title + source + timestamp)
        title: Title of the chat (taken from the import or first message)
        source: Identifier of the import source
        category: Category assigned by LLM classification
        tags: Free-form classification tags
        created_on: Creation time of the chat, in epoch seconds
        schema_version: Version marker kept for forward compatibility
    """

    id: str = Field(description="SHA256 hash of title + source + timestamp")
    title: str
    source: str = Field(description="Import source (chatgpt, claude, etc.)")
    category: ChatCategory = ChatCategory.GENERAL
    tags: list[str] = Field(default_factory=list)
    created_on: int = Field(description="Epoch seconds")
    schema_version: int = 1
@@ -0,0 +1,70 @@
1
+ """Ingestion boundary models for bot_knows.
2
+
3
+ These frozen Pydantic models define the contract between import adapters
4
+ and the domain processing layer. They are immutable and validated at creation.
5
+ """
6
+
7
+ from typing import Literal
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ __all__ = [
12
+ "ChatIngest",
13
+ "IngestMessage",
14
+ ]
15
+
16
+
17
class IngestMessage(BaseModel, frozen=True):
    """One raw message exactly as delivered by an import source.

    The model is frozen, so instances cannot be modified after creation.

    Attributes:
        role: Author of the message: "user", "assistant", or "system"
        content: Text of the message
        timestamp: Time of the message, in epoch seconds
        chat_id: Chat/conversation identifier assigned by the provider
        schema_version: Version marker kept for forward compatibility
    """

    role: Literal["user", "assistant", "system"]
    content: str
    timestamp: int = Field(description="Epoch seconds")
    chat_id: str = Field(description="Provider's chat identifier")
    schema_version: int = 1
36
+
37
+
38
class ChatIngest(BaseModel, frozen=True):
    """One whole conversation as delivered by an import source.

    The model is frozen, so instances cannot be modified after creation.
    It is the unit handed from an import adapter to domain processing.

    Attributes:
        source: Identifier of the import source (e.g., "chatgpt", "claude")
        imported_chat_timestamp: Creation/import time in epoch seconds
        title: Title of the chat; None when the source supplies none
        messages: Messages of the chat, ordered by timestamp
        provider: Name of the original provider (kept for provenance)
        conversation_id: Conversation ID assigned by the provider
        schema_version: Version marker kept for forward compatibility
    """

    source: str = Field(description="Import source (chatgpt, claude, etc.)")
    imported_chat_timestamp: int = Field(description="Epoch seconds")
    title: str | None = None
    messages: list[IngestMessage] = Field(default_factory=list)
    provider: str | None = Field(default=None, description="Original provider")
    conversation_id: str | None = Field(default=None, description="Provider's conversation ID")
    schema_version: int = 1

    @property
    def message_count(self) -> int:
        """Number of messages held by this chat."""
        total = len(self.messages)
        return total

    @property
    def has_messages(self) -> bool:
        """True when the chat holds at least one message."""
        return bool(self.messages)
@@ -0,0 +1,49 @@
1
+ """Message models for bot_knows.
2
+
3
+ These models represent processed messages in the knowledge base.
4
+ """
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ __all__ = [
9
+ "MessageDTO",
10
+ ]
11
+
12
+
13
class MessageDTO(BaseModel, frozen=True):
    """A user message together with the assistant reply to it.

    Storage works on user-assistant pairs instead of single messages,
    which mirrors the conversational shape of chat data and keeps topic
    extraction simple.

    Attributes:
        message_id: Deterministic, hash-based message ID
        chat_id: ID of the chat this pair belongs to
        user_content: Text written by the user (may be empty)
        assistant_content: Text of the assistant's reply (may be empty)
        created_on: Time of the message, in epoch seconds
        schema_version: Version marker kept for forward compatibility
    """

    message_id: str = Field(description="Hash-based message ID")
    chat_id: str = Field(description="Parent chat ID")
    user_content: str = Field(default="", description="User's message")
    assistant_content: str = Field(default="", description="Assistant's response")
    created_on: int = Field(description="Epoch seconds")
    schema_version: int = 1

    @property
    def combined_content(self) -> str:
        """Both sides of the pair as one labelled text block for processing."""
        labelled_sides = (
            ("User", self.user_content),
            ("Assistant", self.assistant_content),
        )
        # Empty sides are left out; the rest are separated by a blank line.
        return "\n\n".join(
            f"{speaker}: {text}" for speaker, text in labelled_sides if text
        )

    @property
    def is_empty(self) -> bool:
        """True when neither the user side nor the assistant side has text."""
        return not (self.user_content or self.assistant_content)
@@ -0,0 +1,58 @@
1
+ """Recall models for bot_knows.
2
+
3
+ These models represent the recall/spaced repetition state
4
+ for topics in the knowledge base.
5
+ """
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ from bot_knows.models.topic import TopicDTO
10
+
11
+ __all__ = [
12
+ "RecallItemDTO",
13
+ "TopicRecallStateDTO",
14
+ ]
15
+
16
+
17
class TopicRecallStateDTO(BaseModel, frozen=True):
    """Stored spaced-repetition state of a single topic.

    Holds the topic's recall strength, its stability, and the
    timestamps of the last access and the last decay update.

    Attributes:
        topic_id: ID of the topic the state belongs to
        strength: Recall strength right now (0.0 - 1.0)
        last_seen: When the topic was last accessed (epoch seconds)
        last_updated: When decay was last applied (epoch seconds)
        stability: Decay rate factor (higher = slower decay)
        schema_version: Version marker kept for forward compatibility
    """

    topic_id: str = Field(description="Topic ID")
    strength: float = Field(default=0.0, ge=0.0, le=1.0, description="Recall strength")
    last_seen: int = Field(description="Epoch seconds")
    last_updated: int = Field(description="Epoch seconds")
    stability: float = Field(default=1.0, ge=0.0, description="Decay rate factor")
    schema_version: int = 1
38
+
39
+
40
class RecallItemDTO(BaseModel, frozen=True):
    """A topic that is due for review, bundled with its context.

    Carries the topic itself, its recall state, a priority score, and
    the IDs of related topics.

    Attributes:
        topic: Topic that should be reviewed
        recall_state: Recall state of that topic
        due_score: Recall priority (higher = more due)
        related_topics: IDs of topics that are semantically related
        schema_version: Version marker kept for forward compatibility
    """

    topic: TopicDTO
    recall_state: TopicRecallStateDTO
    due_score: float = Field(ge=0.0, description="Priority for recall")
    related_topics: list[str] = Field(default_factory=list, description="Related topic IDs")
    schema_version: int = 1
@@ -0,0 +1,100 @@
1
+ """Topic models for bot_knows.
2
+
3
+ These models represent canonical topics and their evidence
4
+ in the knowledge base.
5
+ """
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ __all__ = [
10
+ "TopicDTO",
11
+ "TopicEvidenceDTO",
12
+ ]
13
+
14
+
15
class TopicDTO(BaseModel, frozen=True):
    """Canonical semantic topic.

    Topics are deduplicated semantic concepts extracted from messages.
    Each topic has a running centroid embedding for similarity matching.

    Attributes:
        topic_id: Deterministic topic ID
        canonical_name: Canonical/normalized topic name
        centroid_embedding: Running centroid of all evidence embeddings
        evidence_count: Number of evidence records (for centroid updates)
        importance: Topic importance score (0.0 - 1.0)
        recall_strength: Current recall strength (0.0 - 1.0)
        schema_version: Schema version for forward compatibility
    """

    topic_id: str = Field(description="Hash-based topic ID")
    canonical_name: str = Field(description="Canonical topic name")
    centroid_embedding: list[float] = Field(
        default_factory=list, description="Running centroid embedding"
    )
    evidence_count: int = Field(default=0, description="Number of evidence records")
    importance: float = Field(default=0.0, ge=0.0, le=1.0)
    recall_strength: float = Field(default=0.0, ge=0.0, le=1.0)
    schema_version: int = Field(default=1)

    def with_updated_centroid(
        self,
        new_embedding: list[float],
    ) -> "TopicDTO":
        """Create a new TopicDTO with updated centroid embedding.

        Uses incremental centroid update formula:
            new_centroid = (old_centroid * n + new_embedding) / (n + 1)

        Args:
            new_embedding: New embedding to incorporate

        Returns:
            New TopicDTO with updated centroid and evidence_count

        Raises:
            ValueError: If a centroid already exists and its dimension
                differs from that of ``new_embedding``.
        """
        n = self.evidence_count
        current = self.centroid_embedding

        if n == 0 or not current:
            # Nothing to average with yet: the new embedding becomes the
            # centroid. This also covers a topic that has evidence recorded
            # but no stored centroid, which would otherwise stay empty forever.
            new_centroid = list(new_embedding)
        else:
            if len(current) != len(new_embedding):
                # A silent zip() truncation here would permanently shrink the
                # centroid to the shorter dimension; fail loudly instead.
                raise ValueError(
                    "Embedding dimension mismatch: centroid has "
                    f"{len(current)} values, new embedding has {len(new_embedding)}"
                )
            # Incremental update
            new_centroid = [
                (old * n + new) / (n + 1)
                for old, new in zip(current, new_embedding, strict=True)
            ]

        return TopicDTO(
            topic_id=self.topic_id,
            canonical_name=self.canonical_name,
            centroid_embedding=new_centroid,
            evidence_count=n + 1,
            importance=self.importance,
            recall_strength=self.recall_strength,
            schema_version=self.schema_version,
        )
76
+
77
+
78
class TopicEvidenceDTO(BaseModel, frozen=True):
    """Append-only record tying one extraction to a topic.

    Evidence is never modified or deleted once written, so the records
    form a complete audit trail of topic extractions.

    Attributes:
        evidence_id: Deterministic evidence ID
        topic_id: ID of the topic the evidence belongs to
        extracted_name: Topic name as extracted, before normalization
        source_message_id: ID of the message the extraction came from
        confidence: Confidence score of the extraction (0.0 - 1.0)
        timestamp: Time of the extraction, in epoch seconds
        schema_version: Version marker kept for forward compatibility
    """

    evidence_id: str = Field(description="Hash-based evidence ID")
    topic_id: str = Field(description="Parent topic ID")
    extracted_name: str = Field(description="Raw extracted name")
    source_message_id: str = Field(description="Source message ID")
    confidence: float = Field(ge=0.0, le=1.0, description="Extraction confidence")
    timestamp: int = Field(description="Epoch seconds")
    schema_version: int = 1