bot-knows 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. bot_knows/__init__.py +70 -0
  2. bot_knows/config.py +115 -0
  3. bot_knows/domain/__init__.py +5 -0
  4. bot_knows/domain/chat.py +62 -0
  5. bot_knows/domain/message.py +64 -0
  6. bot_knows/domain/relation.py +56 -0
  7. bot_knows/domain/topic.py +132 -0
  8. bot_knows/domain/topic_evidence.py +55 -0
  9. bot_knows/importers/__init__.py +12 -0
  10. bot_knows/importers/base.py +116 -0
  11. bot_knows/importers/chatgpt.py +154 -0
  12. bot_knows/importers/claude.py +172 -0
  13. bot_knows/importers/generic_json.py +272 -0
  14. bot_knows/importers/registry.py +125 -0
  15. bot_knows/infra/__init__.py +5 -0
  16. bot_knows/infra/llm/__init__.py +6 -0
  17. bot_knows/infra/llm/anthropic_provider.py +172 -0
  18. bot_knows/infra/llm/openai_provider.py +195 -0
  19. bot_knows/infra/mongo/__init__.py +5 -0
  20. bot_knows/infra/mongo/client.py +145 -0
  21. bot_knows/infra/mongo/repositories.py +348 -0
  22. bot_knows/infra/neo4j/__init__.py +5 -0
  23. bot_knows/infra/neo4j/client.py +152 -0
  24. bot_knows/infra/neo4j/graph_repository.py +329 -0
  25. bot_knows/infra/redis/__init__.py +6 -0
  26. bot_knows/infra/redis/cache.py +198 -0
  27. bot_knows/infra/redis/client.py +193 -0
  28. bot_knows/interfaces/__init__.py +18 -0
  29. bot_knows/interfaces/embedding.py +55 -0
  30. bot_knows/interfaces/graph.py +194 -0
  31. bot_knows/interfaces/llm.py +70 -0
  32. bot_knows/interfaces/recall.py +92 -0
  33. bot_knows/interfaces/storage.py +225 -0
  34. bot_knows/logging.py +101 -0
  35. bot_knows/models/__init__.py +22 -0
  36. bot_knows/models/chat.py +55 -0
  37. bot_knows/models/ingest.py +70 -0
  38. bot_knows/models/message.py +49 -0
  39. bot_knows/models/recall.py +58 -0
  40. bot_knows/models/topic.py +100 -0
  41. bot_knows/orchestrator.py +398 -0
  42. bot_knows/py.typed +0 -0
  43. bot_knows/services/__init__.py +24 -0
  44. bot_knows/services/chat_processing.py +182 -0
  45. bot_knows/services/dedup_service.py +161 -0
  46. bot_knows/services/graph_service.py +217 -0
  47. bot_knows/services/message_builder.py +135 -0
  48. bot_knows/services/recall_service.py +296 -0
  49. bot_knows/services/tasks.py +128 -0
  50. bot_knows/services/topic_extraction.py +199 -0
  51. bot_knows/utils/__init__.py +22 -0
  52. bot_knows/utils/hashing.py +126 -0
  53. bot_knows-0.1.0.dist-info/METADATA +294 -0
  54. bot_knows-0.1.0.dist-info/RECORD +56 -0
  55. bot_knows-0.1.0.dist-info/WHEEL +4 -0
  56. bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,154 @@
1
+ """ChatGPT import adapter for bot_knows.
2
+
3
+ This module provides an adapter for parsing ChatGPT export files.
4
+ """
5
+
6
+ from typing import Any, override
7
+
8
+ from bot_knows.importers.base import ChatImportAdapter
9
+ from bot_knows.importers.registry import ImportAdapterRegistry
10
+ from bot_knows.models.ingest import ChatIngest, IngestMessage
11
+
12
+
13
@ImportAdapterRegistry.register
class ChatGPTAdapter(ChatImportAdapter):
    """Adapter for ChatGPT export format (conversations.json).

    Parses the JSON export from ChatGPT's "Export data" feature.
    The export contains a list of conversations, each with a mapping
    of message nodes.

    Export format example:
        [
            {
                "id": "conversation-id",
                "title": "Chat Title",
                "create_time": 1704067200,
                "mapping": {
                    "node-id": {
                        "message": {
                            "author": {"role": "user"},
                            "content": {"parts": ["Hello"]},
                            "create_time": 1704067200
                        }
                    }
                }
            }
        ]
    """

    @property
    @override
    def source_name(self) -> str:
        """Return the identifier of this import source (``"chatgpt"``)."""
        return "chatgpt"

    @override
    def parse(self, raw_export: dict[str, Any]) -> list[ChatIngest]:
        """Parse ChatGPT export format.

        Args:
            raw_export: Raw JSON data (list of conversations or dict with
                'conversations' key)

        Returns:
            List of ChatIngest objects. Entries that cannot be parsed and
            conversations whose ``has_messages`` is falsy are left out.
        """
        # Handle both formats: direct list or dict with 'conversations' key.
        if isinstance(raw_export, list):
            conversations = raw_export
        else:
            # ``or []`` also covers an explicit ``"conversations": null``.
            conversations = raw_export.get("conversations") or []

        chats: list[ChatIngest] = []
        for conv in conversations:
            chat_ingest = self._parse_conversation(conv)
            if chat_ingest and chat_ingest.has_messages:
                chats.append(chat_ingest)

        return chats

    def _parse_conversation(self, conv: dict[str, Any]) -> ChatIngest | None:
        """Parse a single conversation.

        Args:
            conv: One conversation object from the export.

        Returns:
            A ChatIngest with its messages sorted by timestamp, or None when
            ``conv`` is not a JSON object.
        """
        if not isinstance(conv, dict):
            return None

        conv_id = conv.get("id", "")
        title = conv.get("title")
        create_time = self._parse_timestamp(conv.get("create_time", 0))

        # A missing, null or malformed mapping is treated as "no messages".
        mapping = conv.get("mapping")
        nodes = mapping.values() if isinstance(mapping, dict) else ()

        messages: list[IngestMessage] = []
        for node in nodes:
            msg = self._parse_message_node(node, conv_id, create_time)
            if msg:
                messages.append(msg)

        # list.sort is stable, so messages that share a timestamp (for example
        # those that fell back to the conversation timestamp) keep mapping order.
        messages.sort(key=lambda m: m.timestamp)

        return ChatIngest(
            source="chatgpt",
            imported_chat_timestamp=create_time,
            title=title,
            messages=messages,
            provider="chatgpt",
            conversation_id=conv_id,
        )

    def _parse_message_node(
        self,
        node: dict[str, Any],
        conv_id: str,
        default_timestamp: int,
    ) -> IngestMessage | None:
        """Parse a message node from the mapping.

        Args:
            node: One value of the conversation's ``mapping`` object.
            conv_id: Conversation ID stored on the message as ``chat_id``.
            default_timestamp: Timestamp used when the message has none.

        Returns:
            An IngestMessage, or None when the node has no message, the role
            is not user/assistant/system, or no text content can be extracted.
        """
        if not isinstance(node, dict):
            return None

        msg = node.get("message")
        if not isinstance(msg, dict) or not msg:
            return None

        # Get role ("author" may be present but null).
        author = msg.get("author")
        role = author.get("role", "") if isinstance(author, dict) else ""
        if role not in ("user", "assistant", "system"):
            return None

        # Get content
        content = self._extract_content(msg)
        if not content:
            return None

        # Get timestamp; 0 means "unknown", so fall back to the conversation's.
        timestamp = self._parse_timestamp(msg.get("create_time"))
        if timestamp == 0:
            timestamp = default_timestamp

        return IngestMessage(
            role=role,  # type: ignore[arg-type]
            content=content,
            timestamp=timestamp,
            chat_id=conv_id,
        )

    def _extract_content(self, msg: dict[str, Any]) -> str:
        """Extract text content from message.

        Args:
            msg: The ``message`` object of a mapping node.

        Returns:
            The content string itself, or the truthy str/int/float entries of
            ``content["parts"]`` joined with single spaces and stripped.
            Returns an empty string for any other content shape.
        """
        content_obj = msg.get("content", {})

        # Handle different content formats
        if isinstance(content_obj, str):
            return content_obj

        if isinstance(content_obj, dict):
            # ``or []`` guards against an explicit ``"parts": null``.
            parts = content_obj.get("parts") or []
            # Filter and join text parts; non-text parts (dicts etc.) are dropped.
            text_parts = [str(p) for p in parts if p and isinstance(p, (str, int, float))]
            return " ".join(text_parts).strip()

        return ""

    def _parse_timestamp(self, value: Any) -> int:
        """Parse timestamp to epoch seconds.

        Args:
            value: An int, float, numeric string or ISO 8601 string.

        Returns:
            Epoch seconds as an int, or 0 when the value is missing or cannot
            be interpreted. ISO strings without a UTC offset are read as UTC.
        """
        # bool is a subclass of int, but True/False are not timestamps.
        if value is None or isinstance(value, bool):
            return 0
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            try:
                return int(value)
            except (ValueError, OverflowError):
                # NaN and +/-infinity cannot be converted to int.
                return 0
        if isinstance(value, str):
            from datetime import datetime, timezone

            text = value.strip()
            try:
                # Try numeric string
                return int(float(text))
            except (ValueError, OverflowError):
                pass
            try:
                # Try ISO format
                parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
                if parsed.tzinfo is None:
                    # Avoid the local-timezone assumption made for naive values.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return int(parsed.timestamp())
            except (ValueError, OverflowError):
                pass
        return 0
@@ -0,0 +1,172 @@
1
+ """Claude import adapter for bot_knows.
2
+
3
+ This module provides an adapter for parsing Claude export files.
4
+ """
5
+
6
+ from typing import Any, override
7
+
8
+ from bot_knows.importers.base import ChatImportAdapter
9
+ from bot_knows.importers.registry import ImportAdapterRegistry
10
+ from bot_knows.models.ingest import ChatIngest, IngestMessage
11
+
12
+
13
@ImportAdapterRegistry.register
class ClaudeAdapter(ChatImportAdapter):
    """Adapter for Claude export format.

    Parses the JSON export from Claude's export feature.
    The exact format may vary based on Claude version.

    Expected format:
        {
            "conversations": [
                {
                    "uuid": "conversation-id",
                    "name": "Chat Title",
                    "created_at": "2024-01-01T00:00:00Z",
                    "chat_messages": [
                        {
                            "uuid": "message-id",
                            "sender": "human",
                            "text": "Hello",
                            "created_at": "2024-01-01T00:00:00Z"
                        }
                    ]
                }
            ]
        }
    """

    @property
    @override
    def source_name(self) -> str:
        """Return the identifier of this import source (``"claude"``)."""
        return "claude"

    @override
    def parse(self, raw_export: dict[str, Any]) -> list[ChatIngest]:
        """Parse Claude export format.

        Args:
            raw_export: Raw JSON data (list of conversations or dict with
                'conversations' key)

        Returns:
            List of ChatIngest objects. Entries that cannot be parsed and
            conversations whose ``has_messages`` is falsy are left out.
        """
        # Handle different possible formats
        if isinstance(raw_export, list):
            conversations = raw_export
        else:
            # ``or []`` also covers an explicit ``"conversations": null``.
            conversations = raw_export.get("conversations") or []

        chats: list[ChatIngest] = []
        for conv in conversations:
            chat_ingest = self._parse_conversation(conv)
            if chat_ingest and chat_ingest.has_messages:
                chats.append(chat_ingest)

        return chats

    def _parse_conversation(self, conv: dict[str, Any]) -> ChatIngest | None:
        """Parse a single conversation.

        Accepts the alternative key names ``id``, ``title``, ``create_time``
        and ``messages`` when the primary ones (``uuid``, ``name``,
        ``created_at``, ``chat_messages``) are absent.

        Args:
            conv: One conversation object from the export.

        Returns:
            A ChatIngest with its messages sorted by timestamp, or None when
            ``conv`` is not a JSON object.
        """
        if not isinstance(conv, dict):
            return None

        conv_id = conv.get("uuid", conv.get("id", ""))
        title = conv.get("name", conv.get("title"))

        # Parse timestamp
        created_at = conv.get("created_at", conv.get("create_time", 0))
        create_time = self._parse_timestamp(created_at)

        # Get messages; a null or non-list value is treated as "no messages".
        raw_messages = conv.get("chat_messages", conv.get("messages", []))
        if not isinstance(raw_messages, list):
            raw_messages = []

        messages: list[IngestMessage] = []
        for raw_msg in raw_messages:
            msg = self._parse_message(raw_msg, conv_id, create_time)
            if msg:
                messages.append(msg)

        # list.sort is stable, so messages with equal timestamps keep export order.
        messages.sort(key=lambda m: m.timestamp)

        return ChatIngest(
            source="claude",
            imported_chat_timestamp=create_time,
            title=title,
            messages=messages,
            provider="claude",
            conversation_id=conv_id,
        )

    def _parse_message(
        self,
        raw_msg: dict[str, Any],
        conv_id: str,
        default_timestamp: int,
    ) -> IngestMessage | None:
        """Parse a single message.

        Args:
            raw_msg: One message object from the conversation.
            conv_id: Conversation ID stored on the message as ``chat_id``.
            default_timestamp: Timestamp used when the message has none.

        Returns:
            An IngestMessage, or None when the entry is not a JSON object, the
            sender/role is not recognised, or the message has no content.
        """
        if not isinstance(raw_msg, dict):
            return None

        # Get role (Claude uses 'sender' with 'human'/'assistant')
        sender = raw_msg.get("sender", raw_msg.get("role", ""))
        role = self._normalize_role(sender)
        if not role:
            return None

        # Get content. Fall back to "content" when "text" is missing or empty,
        # so a message carrying only content blocks is not dropped.
        content = raw_msg.get("text") or raw_msg.get("content", "")
        if isinstance(content, list):
            # Handle content blocks format
            content = self._extract_text_from_blocks(content)
        if not content:
            return None

        # Get timestamp; 0 means "unknown", so fall back to the conversation's.
        created_at = raw_msg.get("created_at", raw_msg.get("timestamp"))
        timestamp = self._parse_timestamp(created_at)
        if timestamp == 0:
            timestamp = default_timestamp

        return IngestMessage(
            role=role,  # type: ignore[arg-type]
            content=content,
            timestamp=timestamp,
            chat_id=conv_id,
        )

    def _normalize_role(self, sender: str) -> str | None:
        """Normalize sender/role to standard roles.

        Args:
            sender: Raw sender/role value; matched case-insensitively.

        Returns:
            ``"user"``, ``"assistant"`` or ``"system"``, or None when the
            value is not one of the recognised names.
        """
        # str() keeps a null or non-string sender from raising on .lower().
        sender_lower = str(sender).lower()
        if sender_lower in ("human", "user"):
            return "user"
        if sender_lower in ("assistant", "ai", "claude"):
            return "assistant"
        if sender_lower == "system":
            return "system"
        return None

    def _extract_text_from_blocks(self, blocks: list[Any]) -> str:
        """Extract text from content blocks format.

        Args:
            blocks: List whose items are plain strings or block objects.

        Returns:
            The strings and the ``text`` of every ``{"type": "text"}`` block,
            joined with newlines and stripped. Other items are ignored.
        """
        texts: list[str] = []
        for block in blocks:
            if isinstance(block, str):
                texts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                text = block.get("text", "")
                # A null/non-string "text" would make str.join raise.
                texts.append(text if isinstance(text, str) else "")
        return "\n".join(texts).strip()

    def _parse_timestamp(self, value: Any) -> int:
        """Parse timestamp to epoch seconds.

        Args:
            value: An int, float, ISO 8601 string or numeric string.

        Returns:
            Epoch seconds as an int, or 0 when the value is missing or cannot
            be interpreted. ISO strings without a UTC offset are read as UTC.
        """
        # bool is a subclass of int, but True/False are not timestamps.
        if value is None or isinstance(value, bool):
            return 0
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            try:
                return int(value)
            except (ValueError, OverflowError):
                # NaN and +/-infinity cannot be converted to int.
                return 0
        if isinstance(value, str):
            from datetime import datetime, timezone

            text = value.strip()
            try:
                # Try to parse ISO format
                parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
                if parsed.tzinfo is None:
                    # Naive values would otherwise be converted using the
                    # machine's local timezone, giving host-dependent results.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return int(parsed.timestamp())
            except (ValueError, OverflowError):
                pass
            try:
                # Fall back to a numeric string such as "1704067200".
                return int(float(text))
            except (ValueError, OverflowError):
                pass
        return 0
@@ -0,0 +1,272 @@
1
+ """Generic JSON import adapter for bot_knows.
2
+
3
+ This module provides a flexible adapter for custom JSON formats.
4
+ """
5
+
6
+ from collections.abc import Callable
7
+ from typing import Any, override
8
+
9
+ from bot_knows.importers.base import ChatImportAdapter
10
+ from bot_knows.importers.registry import ImportAdapterRegistry
11
+ from bot_knows.models.ingest import ChatIngest, IngestMessage
12
+
13
+
14
@ImportAdapterRegistry.register
class GenericJSONAdapter(ChatImportAdapter):
    """Flexible adapter for custom JSON formats.

    This adapter can handle various JSON structures by allowing
    customization of field mappings.

    Expected default format:
        {
            "chats": [
                {
                    "id": "chat-id",
                    "title": "Chat Title",
                    "timestamp": 1704067200,
                    "messages": [
                        {
                            "role": "user",
                            "content": "Hello",
                            "timestamp": 1704067200
                        }
                    ]
                }
            ]
        }

    Or simpler flat format:
        [
            {
                "role": "user",
                "content": "Hello",
                "timestamp": 1704067200,
                "chat_id": "chat-1"
            }
        ]
    """

    def __init__(
        self,
        chats_key: str = "chats",
        messages_key: str = "messages",
        role_key: str = "role",
        content_key: str = "content",
        timestamp_key: str = "timestamp",
        chat_id_key: str = "id",
        title_key: str = "title",
        role_mapping: dict[str, str] | None = None,
        content_extractor: Callable[[Any], str] | None = None,
    ) -> None:
        """Initialize adapter with field mappings.

        Args:
            chats_key: Key for chats array in root object
            messages_key: Key for messages array in chat object
            role_key: Key for role in message object
            content_key: Key for content in message object
            timestamp_key: Key for timestamp in message/chat object
            chat_id_key: Key for chat ID in chat object
            title_key: Key for title in chat object
            role_mapping: Custom role mapping (e.g., {"human": "user"});
                keys are matched case-insensitively
            content_extractor: Custom function to extract content from message
        """
        self._chats_key = chats_key
        self._messages_key = messages_key
        self._role_key = role_key
        self._content_key = content_key
        self._timestamp_key = timestamp_key
        self._chat_id_key = chat_id_key
        self._title_key = title_key
        # Roles are lowercased before lookup, so the keys must be lowercased
        # too; otherwise a mapping such as {"Human": "user"} could never match.
        self._role_mapping = {
            str(key).lower(): target for key, target in (role_mapping or {}).items()
        }
        self._content_extractor = content_extractor

    @property
    @override
    def source_name(self) -> str:
        """Return the identifier of this import source (``"generic_json"``)."""
        return "generic_json"

    @override
    def parse(self, raw_export: dict[str, Any]) -> list[ChatIngest]:
        """Parse generic JSON format.

        Supports:
        1. Object with 'chats' array containing chat objects with 'messages'
        2. Direct array of chat objects
        3. Flat array of messages with chat_id field

        An object without the chats key is interpreted as a single chat.

        Args:
            raw_export: Raw JSON data

        Returns:
            List of ChatIngest objects
        """
        # Try different formats
        if isinstance(raw_export, list):
            return self._parse_list(raw_export)

        if self._chats_key in raw_export:
            return self._parse_chats_object(raw_export)

        # Try to interpret as single chat
        return self._parse_single_chat(raw_export)

    def _parse_list(self, items: list[Any]) -> list[ChatIngest]:
        """Parse list format (could be chats or flat messages).

        The shape is decided from the first item only: if it is an object
        containing the messages key, the list is read as chat objects;
        otherwise it is read as a flat list of messages.
        """
        if not items:
            return []

        # Check if first item looks like a chat or a message
        first = items[0]
        if isinstance(first, dict) and self._messages_key in first:
            # List of chat objects; empty and non-object entries are skipped.
            return [
                self._parse_chat(chat) for chat in items if chat and isinstance(chat, dict)
            ]

        # Assume flat list of messages - group by chat_id
        return self._parse_flat_messages(items)

    def _parse_chats_object(self, data: dict[str, Any]) -> list[ChatIngest]:
        """Parse object with chats array.

        A null or non-list value under the chats key yields no chats; empty
        and non-object entries of the array are skipped.
        """
        chats = data.get(self._chats_key)
        if not isinstance(chats, list):
            return []
        return [self._parse_chat(chat) for chat in chats if chat and isinstance(chat, dict)]

    def _parse_single_chat(self, data: dict[str, Any]) -> list[ChatIngest]:
        """Parse as single chat object; returns [] when it has no messages."""
        chat = self._parse_chat(data)
        return [chat] if chat.has_messages else []

    def _parse_chat(self, chat: dict[str, Any]) -> ChatIngest:
        """Parse a single chat object.

        Args:
            chat: Chat object holding the configured id/title/timestamp/messages keys.

        Returns:
            A ChatIngest whose messages are sorted by timestamp. Messages
            without a usable timestamp inherit the chat's timestamp.
        """
        chat_id = str(chat.get(self._chat_id_key, ""))
        title = chat.get(self._title_key)
        timestamp = self._parse_timestamp(chat.get(self._timestamp_key, 0))

        # A null or non-list messages value is treated as "no messages".
        raw_messages = chat.get(self._messages_key)
        if not isinstance(raw_messages, list):
            raw_messages = []

        messages = [
            msg
            for msg in (self._parse_message(m, chat_id, timestamp) for m in raw_messages)
            if msg is not None
        ]

        messages.sort(key=lambda m: m.timestamp)

        return ChatIngest(
            source="generic_json",
            imported_chat_timestamp=timestamp,
            title=title,
            messages=messages,
            conversation_id=chat_id,
        )

    def _parse_flat_messages(self, messages: list[Any]) -> list[ChatIngest]:
        """Parse flat list of messages into chats grouped by chat_id.

        The chat ID is read from ``"chat_id"``, then from the configured chat
        ID key, and defaults to ``"default"``. Each resulting chat uses its
        earliest message timestamp as the chat timestamp and has no title.
        """
        from collections import defaultdict

        # Group messages by chat_id
        grouped: dict[str, list[IngestMessage]] = defaultdict(list)

        for raw_msg in messages:
            if not isinstance(raw_msg, dict):
                continue

            chat_id = str(raw_msg.get("chat_id", raw_msg.get(self._chat_id_key, "default")))
            msg = self._parse_message(raw_msg, chat_id, 0)
            if msg:
                grouped[chat_id].append(msg)

        # Create ChatIngest for each group
        chats: list[ChatIngest] = []
        for chat_id, msgs in grouped.items():
            msgs.sort(key=lambda m: m.timestamp)
            # Groups are only created on append, so msgs is never empty here;
            # the guard is kept purely as a defensive default.
            min_timestamp = msgs[0].timestamp if msgs else 0

            chats.append(
                ChatIngest(
                    source="generic_json",
                    imported_chat_timestamp=min_timestamp,
                    title=None,
                    messages=msgs,
                    conversation_id=chat_id,
                )
            )

        return chats

    def _parse_message(
        self,
        raw_msg: dict[str, Any],
        chat_id: str,
        default_timestamp: int,
    ) -> IngestMessage | None:
        """Parse a single message.

        Args:
            raw_msg: One message object.
            chat_id: Chat ID stored on the message.
            default_timestamp: Timestamp used when the message has none.

        Returns:
            An IngestMessage, or None when the entry is not a JSON object, the
            role is not recognised, or the content is empty.
        """
        if not isinstance(raw_msg, dict):
            return None

        # Get role
        raw_role = raw_msg.get(self._role_key, "")
        role = self._normalize_role(raw_role)
        if not role:
            return None

        # Get content
        if self._content_extractor:
            content = self._content_extractor(raw_msg)
        else:
            content = raw_msg.get(self._content_key, "")
            if isinstance(content, list):
                content = "\n".join(str(c) for c in content)

        if not content:
            return None

        # Get timestamp; 0 means "unknown", so fall back to the default.
        timestamp = self._parse_timestamp(raw_msg.get(self._timestamp_key))
        if timestamp == 0:
            timestamp = default_timestamp

        return IngestMessage(
            role=role,  # type: ignore[arg-type]
            content=str(content),
            timestamp=timestamp,
            chat_id=chat_id,
        )

    def _normalize_role(self, role: str) -> str | None:
        """Normalize role to standard values.

        Args:
            role: Raw role value; converted to a lowercase string for matching.

        Returns:
            The custom mapping's value when the role is a key of it, otherwise
            ``"user"``, ``"assistant"`` or ``"system"`` for the built-in
            aliases, or None when the role is not recognised.
        """
        role_lower = str(role).lower()

        # Check custom mapping first (its keys are stored lowercased).
        if role_lower in self._role_mapping:
            return self._role_mapping[role_lower]

        # Standard mappings
        if role_lower in ("user", "human"):
            return "user"
        if role_lower in ("assistant", "ai", "bot"):
            return "assistant"
        if role_lower == "system":
            return "system"

        return None

    def _parse_timestamp(self, value: Any) -> int:
        """Parse timestamp to epoch seconds.

        Args:
            value: An int, float, numeric string or ISO 8601 string.

        Returns:
            Epoch seconds as an int, or 0 when the value is missing or cannot
            be interpreted. ISO strings without a UTC offset are read as UTC.
        """
        # bool is a subclass of int, but True/False are not timestamps.
        if value is None or isinstance(value, bool):
            return 0
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            try:
                return int(value)
            except (ValueError, OverflowError):
                # NaN and +/-infinity cannot be converted to int.
                return 0
        if isinstance(value, str):
            from datetime import datetime, timezone

            text = value.strip()
            try:
                # Try numeric string; "inf" raises OverflowError, "nan" ValueError.
                return int(float(text))
            except (ValueError, OverflowError):
                pass
            try:
                # Try ISO format
                parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
                if parsed.tzinfo is None:
                    # Naive values would otherwise be converted using the
                    # machine's local timezone, giving host-dependent results.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return int(parsed.timestamp())
            except (ValueError, OverflowError):
                pass
        return 0