bot-knows 0.1.0 (bot_knows-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bot_knows/__init__.py +70 -0
- bot_knows/config.py +115 -0
- bot_knows/domain/__init__.py +5 -0
- bot_knows/domain/chat.py +62 -0
- bot_knows/domain/message.py +64 -0
- bot_knows/domain/relation.py +56 -0
- bot_knows/domain/topic.py +132 -0
- bot_knows/domain/topic_evidence.py +55 -0
- bot_knows/importers/__init__.py +12 -0
- bot_knows/importers/base.py +116 -0
- bot_knows/importers/chatgpt.py +154 -0
- bot_knows/importers/claude.py +172 -0
- bot_knows/importers/generic_json.py +272 -0
- bot_knows/importers/registry.py +125 -0
- bot_knows/infra/__init__.py +5 -0
- bot_knows/infra/llm/__init__.py +6 -0
- bot_knows/infra/llm/anthropic_provider.py +172 -0
- bot_knows/infra/llm/openai_provider.py +195 -0
- bot_knows/infra/mongo/__init__.py +5 -0
- bot_knows/infra/mongo/client.py +145 -0
- bot_knows/infra/mongo/repositories.py +348 -0
- bot_knows/infra/neo4j/__init__.py +5 -0
- bot_knows/infra/neo4j/client.py +152 -0
- bot_knows/infra/neo4j/graph_repository.py +329 -0
- bot_knows/infra/redis/__init__.py +6 -0
- bot_knows/infra/redis/cache.py +198 -0
- bot_knows/infra/redis/client.py +193 -0
- bot_knows/interfaces/__init__.py +18 -0
- bot_knows/interfaces/embedding.py +55 -0
- bot_knows/interfaces/graph.py +194 -0
- bot_knows/interfaces/llm.py +70 -0
- bot_knows/interfaces/recall.py +92 -0
- bot_knows/interfaces/storage.py +225 -0
- bot_knows/logging.py +101 -0
- bot_knows/models/__init__.py +22 -0
- bot_knows/models/chat.py +55 -0
- bot_knows/models/ingest.py +70 -0
- bot_knows/models/message.py +49 -0
- bot_knows/models/recall.py +58 -0
- bot_knows/models/topic.py +100 -0
- bot_knows/orchestrator.py +398 -0
- bot_knows/py.typed +0 -0
- bot_knows/services/__init__.py +24 -0
- bot_knows/services/chat_processing.py +182 -0
- bot_knows/services/dedup_service.py +161 -0
- bot_knows/services/graph_service.py +217 -0
- bot_knows/services/message_builder.py +135 -0
- bot_knows/services/recall_service.py +296 -0
- bot_knows/services/tasks.py +128 -0
- bot_knows/services/topic_extraction.py +199 -0
- bot_knows/utils/__init__.py +22 -0
- bot_knows/utils/hashing.py +126 -0
- bot_knows-0.1.0.dist-info/METADATA +294 -0
- bot_knows-0.1.0.dist-info/RECORD +56 -0
- bot_knows-0.1.0.dist-info/WHEEL +4 -0
- bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
bot_knows/importers/registry.py
@@ -0,0 +1,125 @@

"""Import adapter registry for bot_knows.

This module provides a registry for dynamically registering
and looking up chat import adapters.
"""

from bot_knows.importers.base import ChatImportAdapter

__all__ = [
    "ImportAdapterRegistry",
]


class ImportAdapterRegistry:
    """Registry for chat import adapters.

    Provides a centralized registry for import adapters, allowing
    dynamic registration and lookup by source name.

    Example:
        # Register an adapter (as decorator)
        @ImportAdapterRegistry.register
        class MyAdapter(ChatImportAdapter):
            ...

        # Or register manually
        ImportAdapterRegistry.register(MyAdapter)

        # Create an adapter instance
        adapter = ImportAdapterRegistry.create("my_source")
        chats = adapter.parse(raw_data)
    """

    _adapters: dict[str, type[ChatImportAdapter]] = {}

    @classmethod
    def register(
        cls,
        adapter_cls: type[ChatImportAdapter],
    ) -> type[ChatImportAdapter]:
        """Register an adapter class.

        Can be used as a decorator or called directly.

        Args:
            adapter_cls: Adapter class to register

        Returns:
            The adapter class (for use as decorator)

        Raises:
            ValueError: If adapter source_name is already registered
        """
        # Create instance to get source_name
        instance = adapter_cls()
        source_name = instance.source_name

        if source_name in cls._adapters:
            raise ValueError(f"Adapter already registered for source: {source_name}")

        cls._adapters[source_name] = adapter_cls
        return adapter_cls

    @classmethod
    def get(cls, source_name: str) -> type[ChatImportAdapter]:
        """Get adapter class by source name.

        Args:
            source_name: Source identifier to look up

        Returns:
            Adapter class

        Raises:
            KeyError: If no adapter registered for source
        """
        if source_name not in cls._adapters:
            available = ", ".join(cls._adapters.keys()) or "none"
            raise KeyError(
                f"No adapter registered for source: {source_name}. Available: {available}"
            )
        return cls._adapters[source_name]

    @classmethod
    def create(cls, source_name: str, **kwargs: object) -> ChatImportAdapter:
        """Create adapter instance by source name.

        Args:
            source_name: Source identifier to look up
            **kwargs: Arguments to pass to adapter constructor

        Returns:
            Adapter instance
        """
        adapter_cls = cls.get(source_name)
        return adapter_cls(**kwargs)

    @classmethod
    def list_sources(cls) -> list[str]:
        """List all registered source names.

        Returns:
            List of registered source identifiers
        """
        return list(cls._adapters.keys())

    @classmethod
    def is_registered(cls, source_name: str) -> bool:
        """Check if a source has a registered adapter.

        Args:
            source_name: Source identifier to check

        Returns:
            True if registered, False otherwise
        """
        return source_name in cls._adapters

    @classmethod
    def clear(cls) -> None:
        """Clear all registered adapters.

        Primarily for testing purposes.
        """
        cls._adapters.clear()
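
For orientation, a minimal sketch of how this registry appears intended to be used, based only on the docstrings above. The `source_name` property and the `parse` signature are assumptions about `ChatImportAdapter` (defined in bot_knows/importers/base.py, which is not shown in this diff), and `SlackExportAdapter` is a hypothetical adapter:

    from bot_knows.importers.base import ChatImportAdapter
    from bot_knows.importers.registry import ImportAdapterRegistry

    @ImportAdapterRegistry.register
    class SlackExportAdapter(ChatImportAdapter):  # hypothetical adapter
        """Parse a hypothetical Slack export payload."""

        @property
        def source_name(self) -> str:  # assumed ChatImportAdapter interface
            return "slack"

        def parse(self, raw_data: dict) -> list:  # assumed signature
            return []  # a real adapter would map raw_data to chat models

    assert ImportAdapterRegistry.is_registered("slack")
    adapter = ImportAdapterRegistry.create("slack")
    chats = adapter.parse({"channels": []})

Note that `register` instantiates the class with no arguments just to read `source_name`, so every registered adapter needs a zero-argument constructor, even though `create` forwards `**kwargs`.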
bot_knows/infra/llm/anthropic_provider.py
@@ -0,0 +1,172 @@

"""Anthropic LLM provider for bot_knows.

This module provides the Anthropic implementation of the LLM interface.
Note: Anthropic does not provide embeddings, so this provider
requires a separate embedding service.
"""

import json
from typing import Any, Self

from anthropic import AsyncAnthropic

from bot_knows.config import LLMSettings
from bot_knows.interfaces.llm import LLMInterface
from bot_knows.logging import get_logger
from bot_knows.models.chat import ChatCategory

__all__ = [
    "AnthropicProvider",
]

logger = get_logger(__name__)


class AnthropicProvider(LLMInterface):
    """Anthropic implementation of the LLM interface.

    Provides chat classification and topic extraction using
    Anthropic's Claude API.

    Note: This provider does NOT implement embedding generation.
    Use OpenAI or another embedding provider for embeddings.
    """

    config_class = LLMSettings

    def __init__(self, settings: LLMSettings) -> None:
        """Initialize Anthropic provider.

        Args:
            settings: LLM configuration settings
        """
        self._settings = settings
        api_key = settings.api_key.get_secret_value() if settings.api_key else None
        self._client = AsyncAnthropic(api_key=api_key)
        self._model = settings.model or "claude-sonnet-4-20250514"

    @classmethod
    async def from_config(cls, config: LLMSettings) -> Self:
        """Factory method for BotKnows instantiation.

        Args:
            config: LLM settings

        Returns:
            AnthropicProvider instance
        """
        return cls(config)

    @classmethod
    async def from_dict(cls, config: dict[str, Any]) -> Self:
        """Factory method for a custom config dict.

        Args:
            config: Dictionary with LLM settings

        Returns:
            AnthropicProvider instance
        """
        settings = LLMSettings(**config)
        return cls(settings)

    async def close(self) -> None:
        """Close resources (no-op for Anthropic)."""
        pass

    async def classify_chat(
        self,
        first_pair: tuple[str, str],
        last_pair: tuple[str, str],
    ) -> tuple[ChatCategory, list[str]]:
        """Classify chat and extract tags."""
        system_prompt = """You are a chat classifier.
Analyze the conversation samples, then classify the chat and assign tags.

Categories: coding, research, writing, brainstorming, debugging, learning, general, other
Tags: no strict list - each tag should be a subcategory of the category.
Respond with JSON only:
{"category": "category_name", "tags": ["tag1", "tag2"]}"""

        user_content = f"""First exchange:
User: {first_pair[0][:500]}
Assistant: {first_pair[1][:500]}

Last exchange:
User: {last_pair[0][:500]}
Assistant: {last_pair[1][:500]}"""

        try:
            response = await self._client.messages.create(
                model=self._model,
                max_tokens=256,
                system=system_prompt,
                messages=[{"role": "user", "content": user_content}],
            )
            content = response.content[0].text if response.content else "{}"
            # Extract JSON from response
            result = self._parse_json_response(content)
            category_str = result.get("category", "general").lower()
            try:
                category = ChatCategory(category_str)
            except ValueError:
                category = ChatCategory.GENERAL
            tags = result.get("tags", [])
            return category, tags[:5] if isinstance(tags, list) else []
        except Exception as e:
            logger.warning("chat_classification_failed", error=str(e))
            return ChatCategory.GENERAL, []

    async def extract_topics(
        self,
        user_content: str,
        assistant_content: str,
    ) -> list[tuple[str, float]]:
        """Extract topic candidates from message pair."""
        system_prompt = """Extract key topics from this conversation. Use concise canonical names.

Respond with JSON only:
{"topics": [{"name": "topic_name", "confidence": 0.9}]}

Extract 0-5 topics."""

        user_prompt = f"User: {user_content[:1000]}\n\nAssistant: {assistant_content[:1000]}"

        try:
            response = await self._client.messages.create(
                model=self._model,
                max_tokens=256,
                system=system_prompt,
                messages=[{"role": "user", "content": user_prompt}],
            )
            content = response.content[0].text if response.content else "{}"
            result = self._parse_json_response(content)
            topics = result.get("topics", [])
            return [
                (t["name"], float(t.get("confidence", 0.5)))
                for t in topics
                if isinstance(t, dict) and "name" in t
            ][:5]
        except Exception as e:
            logger.warning("topic_extraction_failed", error=str(e))
            return []

    async def normalize_topic_name(self, extracted_name: str) -> str:
        """Normalize topic name to canonical form."""
        normalized = extracted_name.strip().lower()
        normalized = " ".join(word.capitalize() for word in normalized.split())
        return normalized[:100]

    @staticmethod
    def _parse_json_response(content: str) -> dict:
        """Parse JSON from response, handling markdown code blocks."""
        content = content.strip()
        # Remove markdown code blocks if present
        if content.startswith("```"):
            lines = content.split("\n")
            # Remove first and last lines (```json and ```)
            content = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            return {}
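
A minimal driving sketch, assuming `LLMSettings` accepts an `api_key` keyword (the field is read as `settings.api_key` above, but the model itself lives in bot_knows/config.py, outside this diff):

    import asyncio

    from bot_knows.infra.llm.anthropic_provider import AnthropicProvider

    async def main() -> None:
        # "api_key" is an assumed LLMSettings field name
        provider = await AnthropicProvider.from_dict({"api_key": "sk-ant-..."})
        category, tags = await provider.classify_chat(
            first_pair=("How do I profile an asyncio app?", "Start with cProfile..."),
            last_pair=("Thanks, that worked.", "Glad it helped!"),
        )
        print(category, tags)
        await provider.close()

    asyncio.run(main())

Unlike the OpenAI provider below, no structured-output option is passed to `messages.create`, which is why `_parse_json_response` strips markdown fences before calling `json.loads`.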
bot_knows/infra/llm/openai_provider.py
@@ -0,0 +1,195 @@

"""OpenAI LLM provider for bot_knows.

This module provides the OpenAI implementation of the LLM and embedding interfaces.
"""

import json
from typing import Any, Self

import numpy as np
from openai import AsyncOpenAI

from bot_knows.config import LLMSettings
from bot_knows.interfaces.embedding import EmbeddingServiceInterface
from bot_knows.interfaces.llm import LLMInterface
from bot_knows.logging import get_logger
from bot_knows.models.chat import ChatCategory

__all__ = [
    "OpenAIProvider",
]

logger = get_logger(__name__)


class OpenAIProvider(LLMInterface, EmbeddingServiceInterface):
    """OpenAI implementation of the LLM and embedding interfaces.

    Provides chat classification, topic extraction, and embedding
    generation using OpenAI's API.
    """

    config_class = LLMSettings

    def __init__(self, settings: LLMSettings) -> None:
        """Initialize OpenAI provider.

        Args:
            settings: LLM configuration settings
        """
        self._settings = settings
        api_key = settings.api_key.get_secret_value() if settings.api_key else None
        self._client = AsyncOpenAI(api_key=api_key)
        self._model = settings.model
        self._embedding_model = settings.embedding_model
        self._embedding_dimensions = settings.embedding_dimensions

    @classmethod
    async def from_config(cls, config: LLMSettings) -> Self:
        """Factory method for BotKnows instantiation.

        Args:
            config: LLM settings

        Returns:
            OpenAIProvider instance
        """
        return cls(config)

    @classmethod
    async def from_dict(cls, config: dict[str, Any]) -> Self:
        """Factory method for a custom config dict.

        Args:
            config: Dictionary with LLM settings

        Returns:
            OpenAIProvider instance
        """
        settings = LLMSettings(**config)
        return cls(settings)

    async def close(self) -> None:
        """Close resources (no-op for OpenAI)."""
        pass

    # Embedding interface
    async def embed(self, text: str) -> list[float]:
        """Generate embedding for text."""
        response = await self._client.embeddings.create(
            model=self._embedding_model,
            input=text,
        )
        return response.data[0].embedding

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts."""
        if not texts:
            return []
        response = await self._client.embeddings.create(
            model=self._embedding_model,
            input=texts,
        )
        sorted_data = sorted(response.data, key=lambda x: x.index)
        return [item.embedding for item in sorted_data]

    async def similarity(self, embedding1: list[float], embedding2: list[float]) -> float:
        """Compute cosine similarity between embeddings."""
        vec1 = np.array(embedding1)
        vec2 = np.array(embedding2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / (norm1 * norm2))

    # LLM interface
    async def classify_chat(
        self,
        first_pair: tuple[str, str],
        last_pair: tuple[str, str],
    ) -> tuple[ChatCategory, list[str]]:
        """Classify chat and extract tags."""
        system_prompt = """You are a chat classifier.
Analyze the conversation samples, then classify the chat and assign tags.

Categories: coding, research, writing, brainstorming, debugging, learning, general, other
Tags: no strict list - each tag should be a subcategory of the category.
Respond with JSON only:
{"category": "category_name", "tags": ["tag1", "tag2"]}"""

        user_content = f"""First exchange:
User: {first_pair[0][:500]}
Assistant: {first_pair[1][:500]}

Last exchange:
User: {last_pair[0][:500]}
Assistant: {last_pair[1][:500]}"""

        try:
            response = await self._client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_content},
                ],
                response_format={"type": "json_object"},
                temperature=0.3,
            )
            content = response.choices[0].message.content or "{}"
            result = json.loads(content)
            category_str = result.get("category", "general").lower()
            try:
                category = ChatCategory(category_str)
            except ValueError:
                category = ChatCategory.GENERAL
            tags = result.get("tags", [])
            return category, tags[:5] if isinstance(tags, list) else []
        except Exception as e:
            logger.warning("chat_classification_failed", error=str(e))
            return ChatCategory.GENERAL, []

    async def extract_topics(
        self,
        user_content: str,
        assistant_content: str,
    ) -> list[tuple[str, float]]:
        """Extract topic candidates from message pair."""
        system_prompt = """Extract key topics from this conversation. Use concise canonical names.

Respond with JSON only:
{"topics": [{"name": "topic_name", "confidence": 0.9}]}

Extract 0-5 topics."""

        user_prompt = f"User: {user_content[:1000]}\n\nAssistant: {assistant_content[:1000]}"

        try:
            response = await self._client.chat.completions.create(
                model=self._model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                response_format={"type": "json_object"},
                temperature=0.3,
            )
            content = response.choices[0].message.content or "{}"
            result = json.loads(content)
            topics = result.get("topics", [])
            return [
                (t["name"], float(t.get("confidence", 0.5)))
                for t in topics
                if isinstance(t, dict) and "name" in t
            ][:5]
        except Exception as e:
            logger.warning("topic_extraction_failed", error=str(e))
            return []

    async def normalize_topic_name(self, extracted_name: str) -> str:
        """Normalize topic name to canonical form."""
        # Simple normalization: lowercase, strip, limit length
        normalized = extracted_name.strip().lower()
        # Capitalize first letter of each word
        normalized = " ".join(word.capitalize() for word in normalized.split())
        return normalized[:100]
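
`similarity` is plain cosine similarity. A quick self-contained check of the same formula (no API key required):

    import numpy as np

    def cosine(a: list[float], b: list[float]) -> float:
        v1, v2 = np.array(a), np.array(b)
        n1, n2 = np.linalg.norm(v1), np.linalg.norm(v2)
        if n1 == 0 or n2 == 0:
            return 0.0  # mirrors the provider's zero-vector guard
        return float(np.dot(v1, v2) / (n1 * n2))

    assert cosine([1.0, 0.0], [1.0, 0.0]) == 1.0  # parallel vectors
    assert cosine([1.0, 0.0], [0.0, 1.0]) == 0.0  # orthogonal vectors
    assert cosine([1.0, 2.0], [0.0, 0.0]) == 0.0  # zero vector falls back to 0.0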
bot_knows/infra/mongo/client.py
@@ -0,0 +1,145 @@

"""MongoDB client for bot_knows.

This module provides an async MongoDB client wrapper using Motor.
"""

from typing import Any

from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection, AsyncIOMotorDatabase

from bot_knows.config import MongoSettings
from bot_knows.logging import get_logger

__all__ = [
    "MongoClient",
]

logger = get_logger(__name__)


class MongoClient:
    """Async MongoDB client wrapper.

    Provides a connection manager and collection accessors
    for the bot_knows MongoDB database.

    Example:
        client = MongoClient(settings)
        await client.connect()

        # Access collections
        await client.chats.insert_one(chat_data)

        await client.disconnect()
    """

    def __init__(self, settings: MongoSettings) -> None:
        """Initialize client with settings.

        Args:
            settings: MongoDB connection settings
        """
        self._settings = settings
        self._client: AsyncIOMotorClient[dict[str, Any]] | None = None
        self._db: AsyncIOMotorDatabase[dict[str, Any]] | None = None

    async def connect(self) -> None:
        """Initialize connection to MongoDB."""
        if self._client is not None:
            return

        uri = self._settings.uri.get_secret_value()
        self._client = AsyncIOMotorClient(uri)
        self._db = self._client[self._settings.database]

        # Verify connection
        await self._client.admin.command("ping")
        logger.info(
            "connected_to_mongodb",
            database=self._settings.database,
        )

    async def disconnect(self) -> None:
        """Close connection to MongoDB."""
        if self._client:
            self._client.close()
            self._client = None
            self._db = None
            logger.info("disconnected_from_mongodb")

    @property
    def db(self) -> AsyncIOMotorDatabase[dict[str, Any]]:
        """Get database instance.

        Raises:
            RuntimeError: If not connected
        """
        if self._db is None:
            raise RuntimeError("MongoClient not connected. Call connect() first.")
        return self._db

    def _collection(self, name: str) -> AsyncIOMotorCollection[dict[str, Any]]:
        """Get collection with optional prefix."""
        full_name = f"{self._settings.collection_prefix}{name}"
        return self.db[full_name]

    @property
    def chats(self) -> AsyncIOMotorCollection[dict[str, Any]]:
        """Get chats collection."""
        return self._collection("chats")

    @property
    def messages(self) -> AsyncIOMotorCollection[dict[str, Any]]:
        """Get messages collection."""
        return self._collection("messages")

    @property
    def topics(self) -> AsyncIOMotorCollection[dict[str, Any]]:
        """Get topics collection."""
        return self._collection("topics")

    @property
    def evidence(self) -> AsyncIOMotorCollection[dict[str, Any]]:
        """Get topic_evidence collection."""
        return self._collection("topic_evidence")

    @property
    def recall_states(self) -> AsyncIOMotorCollection[dict[str, Any]]:
        """Get recall_states collection."""
        return self._collection("recall_states")

    async def create_indexes(self) -> None:
        """Create indexes for all collections."""
        # Chats indexes
        await self.chats.create_index("id", unique=True)
        await self.chats.create_index("source")
        await self.chats.create_index("created_on")

        # Messages indexes
        await self.messages.create_index("message_id", unique=True)
        await self.messages.create_index("chat_id")
        await self.messages.create_index("created_on")

        # Topics indexes
        await self.topics.create_index("topic_id", unique=True)
        await self.topics.create_index("canonical_name")

        # Evidence indexes
        await self.evidence.create_index("evidence_id", unique=True)
        await self.evidence.create_index("topic_id")
        await self.evidence.create_index("source_message_id")

        # Recall states indexes
        await self.recall_states.create_index("topic_id", unique=True)
        await self.recall_states.create_index("strength")

        logger.info("created_mongodb_indexes")

    async def __aenter__(self) -> "MongoClient":
        """Async context manager entry."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Async context manager exit."""
        await self.disconnect()
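
A minimal sketch of the context-manager flow. The `MongoSettings` field names (`uri`, `database`, `collection_prefix`) are inferred from the attribute reads above; the model itself is defined in bot_knows/config.py, outside this diff:

    import asyncio

    from bot_knows.config import MongoSettings
    from bot_knows.infra.mongo.client import MongoClient

    async def main() -> None:
        settings = MongoSettings(
            uri="mongodb://localhost:27017",  # stored as a secret; read via get_secret_value()
            database="bot_knows",
            collection_prefix="bk_",
        )
        async with MongoClient(settings) as client:  # __aenter__ calls connect()
            await client.create_indexes()
            doc = await client.chats.find_one({"source": "chatgpt"})
            print(doc)

    asyncio.run(main())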