bot-knows 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bot_knows/__init__.py +70 -0
- bot_knows/config.py +115 -0
- bot_knows/domain/__init__.py +5 -0
- bot_knows/domain/chat.py +62 -0
- bot_knows/domain/message.py +64 -0
- bot_knows/domain/relation.py +56 -0
- bot_knows/domain/topic.py +132 -0
- bot_knows/domain/topic_evidence.py +55 -0
- bot_knows/importers/__init__.py +12 -0
- bot_knows/importers/base.py +116 -0
- bot_knows/importers/chatgpt.py +154 -0
- bot_knows/importers/claude.py +172 -0
- bot_knows/importers/generic_json.py +272 -0
- bot_knows/importers/registry.py +125 -0
- bot_knows/infra/__init__.py +5 -0
- bot_knows/infra/llm/__init__.py +6 -0
- bot_knows/infra/llm/anthropic_provider.py +172 -0
- bot_knows/infra/llm/openai_provider.py +195 -0
- bot_knows/infra/mongo/__init__.py +5 -0
- bot_knows/infra/mongo/client.py +145 -0
- bot_knows/infra/mongo/repositories.py +348 -0
- bot_knows/infra/neo4j/__init__.py +5 -0
- bot_knows/infra/neo4j/client.py +152 -0
- bot_knows/infra/neo4j/graph_repository.py +329 -0
- bot_knows/infra/redis/__init__.py +6 -0
- bot_knows/infra/redis/cache.py +198 -0
- bot_knows/infra/redis/client.py +193 -0
- bot_knows/interfaces/__init__.py +18 -0
- bot_knows/interfaces/embedding.py +55 -0
- bot_knows/interfaces/graph.py +194 -0
- bot_knows/interfaces/llm.py +70 -0
- bot_knows/interfaces/recall.py +92 -0
- bot_knows/interfaces/storage.py +225 -0
- bot_knows/logging.py +101 -0
- bot_knows/models/__init__.py +22 -0
- bot_knows/models/chat.py +55 -0
- bot_knows/models/ingest.py +70 -0
- bot_knows/models/message.py +49 -0
- bot_knows/models/recall.py +58 -0
- bot_knows/models/topic.py +100 -0
- bot_knows/orchestrator.py +398 -0
- bot_knows/py.typed +0 -0
- bot_knows/services/__init__.py +24 -0
- bot_knows/services/chat_processing.py +182 -0
- bot_knows/services/dedup_service.py +161 -0
- bot_knows/services/graph_service.py +217 -0
- bot_knows/services/message_builder.py +135 -0
- bot_knows/services/recall_service.py +296 -0
- bot_knows/services/tasks.py +128 -0
- bot_knows/services/topic_extraction.py +199 -0
- bot_knows/utils/__init__.py +22 -0
- bot_knows/utils/hashing.py +126 -0
- bot_knows-0.1.0.dist-info/METADATA +294 -0
- bot_knows-0.1.0.dist-info/RECORD +56 -0
- bot_knows-0.1.0.dist-info/WHEEL +4 -0
- bot_knows-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""ChatGPT import adapter for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module provides an adapter for parsing ChatGPT export files.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Any, override
|
|
7
|
+
|
|
8
|
+
from bot_knows.importers.base import ChatImportAdapter
|
|
9
|
+
from bot_knows.importers.registry import ImportAdapterRegistry
|
|
10
|
+
from bot_knows.models.ingest import ChatIngest, IngestMessage
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@ImportAdapterRegistry.register
class ChatGPTAdapter(ChatImportAdapter):
    """Adapter for ChatGPT export format (conversations.json).

    Parses the JSON export from ChatGPT's "Export data" feature.
    The export contains a list of conversations, each with a mapping
    of message nodes.

    Malformed entries (non-dict conversations or nodes, ``null`` where a
    container is expected) are skipped instead of raising, so one bad
    record does not abort the whole import.

    Export format example:
        [
            {
                "id": "conversation-id",
                "title": "Chat Title",
                "create_time": 1704067200,
                "mapping": {
                    "node-id": {
                        "message": {
                            "author": {"role": "user"},
                            "content": {"parts": ["Hello"]},
                            "create_time": 1704067200
                        }
                    }
                }
            }
        ]
    """

    @property
    @override
    def source_name(self) -> str:
        """Identifier of this adapter's source ("chatgpt")."""
        return "chatgpt"

    @override
    def parse(self, raw_export: dict[str, Any]) -> list[ChatIngest]:
        """Parse ChatGPT export format.

        Args:
            raw_export: Raw JSON data (list of conversations or dict with
                'conversations' key). Any other root type yields no chats.

        Returns:
            List of ChatIngest objects; conversations that produce no
            messages are omitted.
        """
        # Handle both formats: direct list or dict with 'conversations' key
        if isinstance(raw_export, list):
            conversations = raw_export
        elif isinstance(raw_export, dict):
            # "conversations": null must behave like a missing key.
            conversations = raw_export.get("conversations") or []
        else:
            conversations = []

        chats: list[ChatIngest] = []
        for conv in conversations:
            chat_ingest = self._parse_conversation(conv)
            if chat_ingest and chat_ingest.has_messages:
                chats.append(chat_ingest)

        return chats

    def _parse_conversation(self, conv: dict[str, Any]) -> ChatIngest | None:
        """Parse a single conversation.

        Returns None when ``conv`` is not a JSON object.
        """
        if not isinstance(conv, dict):
            return None

        conv_id = conv.get("id", "")
        title = conv.get("title")
        create_time = self._parse_timestamp(conv.get("create_time", 0))

        # "mapping" may be missing or null; only a dict has nodes to visit.
        mapping = conv.get("mapping")
        nodes = mapping.values() if isinstance(mapping, dict) else ()

        messages: list[IngestMessage] = []
        for node in nodes:
            msg = self._parse_message_node(node, conv_id, create_time)
            if msg:
                messages.append(msg)

        # Sort messages by timestamp (stable: ties keep mapping order)
        messages.sort(key=lambda m: m.timestamp)

        return ChatIngest(
            source="chatgpt",
            imported_chat_timestamp=create_time,
            title=title,
            messages=messages,
            provider="chatgpt",
            conversation_id=conv_id,
        )

    def _parse_message_node(
        self,
        node: dict[str, Any],
        conv_id: str,
        default_timestamp: int,
    ) -> IngestMessage | None:
        """Parse a message node from the mapping.

        Args:
            node: One value of the conversation's "mapping" object.
            conv_id: Conversation id stored on the message as ``chat_id``.
            default_timestamp: Used when the message has no usable
                "create_time".

        Returns:
            The parsed message, or None when the node has no message, the
            author role is not user/assistant/system, or there is no text.
        """
        if not isinstance(node, dict):
            return None

        msg = node.get("message")
        if not msg or not isinstance(msg, dict):
            return None

        # Get role ("author" can be null, so do not chain .get blindly)
        author = msg.get("author")
        role = author.get("role", "") if isinstance(author, dict) else ""
        if role not in ("user", "assistant", "system"):
            return None

        # Get content
        content = self._extract_content(msg)
        if not content:
            return None

        # Get timestamp
        timestamp = self._parse_timestamp(msg.get("create_time"))
        if timestamp == 0:
            timestamp = default_timestamp

        return IngestMessage(
            role=role,  # type: ignore[arg-type]
            content=content,
            timestamp=timestamp,
            chat_id=conv_id,
        )

    def _extract_content(self, msg: dict[str, Any]) -> str:
        """Extract text content from message.

        Accepts either a plain string or an object with a "parts" list.
        Non-empty string and numeric parts are joined with single spaces;
        other part types (e.g. nested objects) are ignored.
        """
        content_obj = msg.get("content", {})

        # Handle different content formats
        if isinstance(content_obj, str):
            return content_obj

        if isinstance(content_obj, dict):
            parts = content_obj.get("parts") or []
            if isinstance(parts, str):
                # A bare string would otherwise be iterated per character.
                parts = [parts]
            elif not isinstance(parts, (list, tuple)):
                parts = []
            # Filter and join text parts
            text_parts = [str(p) for p in parts if p and isinstance(p, (str, int, float))]
            return " ".join(text_parts).strip()

        return ""

    def _parse_timestamp(self, value: Any) -> int:
        """Parse timestamp to epoch seconds.

        Returns 0 for anything that is not a finite int/float, which the
        callers treat as "no timestamp".
        """
        # bool is a subclass of int, but True/False are not timestamps.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return 0
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN and +/-Infinity cannot be converted to int.
            return 0
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Claude import adapter for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module provides an adapter for parsing Claude export files.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Any, override
|
|
7
|
+
|
|
8
|
+
from bot_knows.importers.base import ChatImportAdapter
|
|
9
|
+
from bot_knows.importers.registry import ImportAdapterRegistry
|
|
10
|
+
from bot_knows.models.ingest import ChatIngest, IngestMessage
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@ImportAdapterRegistry.register
class ClaudeAdapter(ChatImportAdapter):
    """Adapter for Claude export format.

    Parses the JSON export from Claude's export feature.
    The exact format may vary based on Claude version, so several
    alternative key names are accepted and malformed entries are skipped
    rather than raising.

    Expected format:
        {
            "conversations": [
                {
                    "uuid": "conversation-id",
                    "name": "Chat Title",
                    "created_at": "2024-01-01T00:00:00Z",
                    "chat_messages": [
                        {
                            "uuid": "message-id",
                            "sender": "human",
                            "text": "Hello",
                            "created_at": "2024-01-01T00:00:00Z"
                        }
                    ]
                }
            ]
        }
    """

    @property
    @override
    def source_name(self) -> str:
        """Identifier of this adapter's source ("claude")."""
        return "claude"

    @override
    def parse(self, raw_export: dict[str, Any]) -> list[ChatIngest]:
        """Parse Claude export format.

        Args:
            raw_export: Raw JSON data (list of conversations or dict with
                'conversations' key). Any other root type yields no chats.

        Returns:
            List of ChatIngest objects; conversations that produce no
            messages are omitted.
        """
        # Handle different possible formats
        if isinstance(raw_export, list):
            conversations = raw_export
        elif isinstance(raw_export, dict):
            # "conversations": null must behave like a missing key.
            conversations = raw_export.get("conversations") or []
        else:
            conversations = []

        chats: list[ChatIngest] = []
        for conv in conversations:
            chat_ingest = self._parse_conversation(conv)
            if chat_ingest and chat_ingest.has_messages:
                chats.append(chat_ingest)

        return chats

    def _parse_conversation(self, conv: dict[str, Any]) -> ChatIngest | None:
        """Parse a single conversation.

        Returns None when ``conv`` is not a JSON object.
        """
        if not isinstance(conv, dict):
            return None

        conv_id = conv.get("uuid", conv.get("id", ""))
        title = conv.get("name", conv.get("title"))

        # Parse timestamp
        created_at = conv.get("created_at", conv.get("create_time", 0))
        create_time = self._parse_timestamp(created_at)

        # Get messages (a null or non-list value means "no messages")
        raw_messages = conv.get("chat_messages", conv.get("messages", []))
        if not isinstance(raw_messages, (list, tuple)):
            raw_messages = []

        messages: list[IngestMessage] = []
        for raw_msg in raw_messages:
            msg = self._parse_message(raw_msg, conv_id, create_time)
            if msg:
                messages.append(msg)

        # Sort messages by timestamp (stable: ties keep export order)
        messages.sort(key=lambda m: m.timestamp)

        return ChatIngest(
            source="claude",
            imported_chat_timestamp=create_time,
            title=title,
            messages=messages,
            provider="claude",
            conversation_id=conv_id,
        )

    def _parse_message(
        self,
        raw_msg: dict[str, Any],
        conv_id: str,
        default_timestamp: int,
    ) -> IngestMessage | None:
        """Parse a single message.

        Args:
            raw_msg: One entry of the conversation's message list.
            conv_id: Conversation id stored on the message as ``chat_id``.
            default_timestamp: Used when the message has no usable
                timestamp of its own.

        Returns:
            The parsed message, or None when the entry is not an object,
            the sender is unknown, or there is no text.
        """
        if not isinstance(raw_msg, dict):
            return None

        # Get role (Claude uses 'sender' with 'human'/'assistant')
        sender = raw_msg.get("sender", raw_msg.get("role", ""))
        role = self._normalize_role(sender)
        if not role:
            return None

        # Get content. Fall back to "content" when "text" is present but
        # empty; otherwise messages whose text lives only in content blocks
        # would be dropped.
        content = raw_msg.get("text") or raw_msg.get("content", "")
        if isinstance(content, list):
            # Handle content blocks format
            content = self._extract_text_from_blocks(content)
        if not content:
            return None

        # Get timestamp
        created_at = raw_msg.get("created_at", raw_msg.get("timestamp"))
        timestamp = self._parse_timestamp(created_at)
        if timestamp == 0:
            timestamp = default_timestamp

        return IngestMessage(
            role=role,  # type: ignore[arg-type]
            content=content,
            timestamp=timestamp,
            chat_id=conv_id,
        )

    def _normalize_role(self, sender: str) -> str | None:
        """Normalize sender/role to standard roles.

        Returns "user", "assistant" or "system", or None for an unknown
        or non-string sender (e.g. JSON null).
        """
        if not isinstance(sender, str):
            return None

        sender_lower = sender.lower()
        if sender_lower in ("human", "user"):
            return "user"
        if sender_lower in ("assistant", "ai", "claude"):
            return "assistant"
        if sender_lower == "system":
            return "system"
        return None

    def _extract_text_from_blocks(self, blocks: list[Any]) -> str:
        """Extract text from content blocks format.

        Plain strings and ``{"type": "text", "text": ...}`` blocks are
        joined with newlines; every other block type is ignored.
        """
        texts: list[str] = []
        for block in blocks:
            if isinstance(block, str):
                texts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                text = block.get("text", "")
                # "text": null (or any non-string) would break str.join.
                if isinstance(text, str):
                    texts.append(text)
        return "\n".join(texts).strip()

    def _parse_timestamp(self, value: Any) -> int:
        """Parse timestamp to epoch seconds.

        Accepts ints, finite floats and ISO 8601 strings (a trailing "Z"
        is read as UTC). ISO strings without a UTC offset are interpreted
        as UTC so the result does not depend on the importing machine's
        local time zone. Returns 0 when the value cannot be parsed, which
        the callers treat as "no timestamp".
        """
        # bool is a subclass of int, but True/False are not timestamps.
        if value is None or isinstance(value, bool):
            return 0
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError):
                # NaN and +/-Infinity cannot be converted to int.
                return 0
        if isinstance(value, str):
            # Try to parse ISO format
            from datetime import datetime, timezone

            try:
                dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
            except ValueError:
                return 0
            if dt.utcoffset() is None:
                # Naive datetime: timestamp() would assume local time.
                dt = dt.replace(tzinfo=timezone.utc)
            return int(dt.timestamp())
        return 0
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""Generic JSON import adapter for bot_knows.
|
|
2
|
+
|
|
3
|
+
This module provides a flexible adapter for custom JSON formats.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from typing import Any, override
|
|
8
|
+
|
|
9
|
+
from bot_knows.importers.base import ChatImportAdapter
|
|
10
|
+
from bot_knows.importers.registry import ImportAdapterRegistry
|
|
11
|
+
from bot_knows.models.ingest import ChatIngest, IngestMessage
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@ImportAdapterRegistry.register
class GenericJSONAdapter(ChatImportAdapter):
    """Flexible adapter for custom JSON formats.

    This adapter can handle various JSON structures by allowing
    customization of field mappings. Malformed entries (non-object chats
    or messages, ``null`` where a list is expected) are skipped instead
    of raising.

    Expected default format:
        {
            "chats": [
                {
                    "id": "chat-id",
                    "title": "Chat Title",
                    "timestamp": 1704067200,
                    "messages": [
                        {
                            "role": "user",
                            "content": "Hello",
                            "timestamp": 1704067200
                        }
                    ]
                }
            ]
        }

    Or simpler flat format:
        [
            {
                "role": "user",
                "content": "Hello",
                "timestamp": 1704067200,
                "chat_id": "chat-1"
            }
        ]
    """

    def __init__(
        self,
        chats_key: str = "chats",
        messages_key: str = "messages",
        role_key: str = "role",
        content_key: str = "content",
        timestamp_key: str = "timestamp",
        chat_id_key: str = "id",
        title_key: str = "title",
        role_mapping: dict[str, str] | None = None,
        content_extractor: Callable[[Any], str] | None = None,
    ) -> None:
        """Initialize adapter with field mappings.

        Args:
            chats_key: Key for chats array in root object
            messages_key: Key for messages array in chat object
            role_key: Key for role in message object
            content_key: Key for content in message object
            timestamp_key: Key for timestamp in message/chat object
            chat_id_key: Key for chat ID in chat object
            title_key: Key for title in chat object
            role_mapping: Custom role mapping (e.g., {"human": "user"}).
                Keys are matched case-insensitively.
            content_extractor: Custom function to extract content from message
        """
        self._chats_key = chats_key
        self._messages_key = messages_key
        self._role_key = role_key
        self._content_key = content_key
        self._timestamp_key = timestamp_key
        self._chat_id_key = chat_id_key
        self._title_key = title_key
        # Roles are lowercased before lookup, so the keys must be too;
        # otherwise a key such as "Human" could never match.
        self._role_mapping = {
            str(key).lower(): target for key, target in (role_mapping or {}).items()
        }
        self._content_extractor = content_extractor

    @property
    @override
    def source_name(self) -> str:
        """Identifier of this adapter's source ("generic_json")."""
        return "generic_json"

    @override
    def parse(self, raw_export: dict[str, Any]) -> list[ChatIngest]:
        """Parse generic JSON format.

        Supports:
        1. Object with 'chats' array containing chat objects with 'messages'
        2. Direct array of chat objects
        3. Flat array of messages with chat_id field

        Args:
            raw_export: Raw JSON data. A root that is neither a list nor
                an object yields no chats.

        Returns:
            List of ChatIngest objects
        """
        # Try different formats
        if isinstance(raw_export, list):
            return self._parse_list(raw_export)

        if not isinstance(raw_export, dict):
            return []

        if self._chats_key in raw_export:
            return self._parse_chats_object(raw_export)

        # Try to interpret as single chat
        return self._parse_single_chat(raw_export)

    def _parse_list(self, items: list[Any]) -> list[ChatIngest]:
        """Parse list format (could be chats or flat messages).

        The first element decides the interpretation: if it is an object
        containing the messages key, the list is treated as chat objects;
        otherwise as a flat list of messages.
        """
        if not items:
            return []

        # Check if first item looks like a chat or a message
        first = items[0]
        if isinstance(first, dict) and self._messages_key in first:
            # List of chat objects (empty and non-object entries skipped)
            return [self._parse_chat(chat) for chat in items if chat and isinstance(chat, dict)]

        # Assume flat list of messages - group by chat_id
        return self._parse_flat_messages(items)

    def _parse_chats_object(self, data: dict[str, Any]) -> list[ChatIngest]:
        """Parse object with chats array."""
        chats = data.get(self._chats_key) or []
        if not isinstance(chats, (list, tuple)):
            return []
        return [self._parse_chat(chat) for chat in chats if chat and isinstance(chat, dict)]

    def _parse_single_chat(self, data: dict[str, Any]) -> list[ChatIngest]:
        """Parse as single chat object; returns [] if it has no messages."""
        chat = self._parse_chat(data)
        return [chat] if chat.has_messages else []

    def _parse_chat(self, chat: dict[str, Any]) -> ChatIngest:
        """Parse a single chat object.

        Messages that cannot be parsed are dropped; the remaining ones are
        sorted by timestamp, with the chat's own timestamp as the default
        for messages that have none.
        """
        chat_id = str(chat.get(self._chat_id_key, ""))
        title = chat.get(self._title_key)
        timestamp = self._parse_timestamp(chat.get(self._timestamp_key, 0))

        # A null or non-list value means "no messages".
        raw_messages = chat.get(self._messages_key) or []
        if not isinstance(raw_messages, (list, tuple)):
            raw_messages = []

        messages = [
            msg
            for msg in (self._parse_message(m, chat_id, timestamp) for m in raw_messages)
            if msg is not None
        ]

        messages.sort(key=lambda m: m.timestamp)

        return ChatIngest(
            source="generic_json",
            imported_chat_timestamp=timestamp,
            title=title,
            messages=messages,
            conversation_id=chat_id,
        )

    def _parse_flat_messages(self, messages: list[Any]) -> list[ChatIngest]:
        """Parse flat list of messages into chats grouped by chat_id.

        The chat id is read from "chat_id", then from the configured chat
        id key, and falls back to "default". Each resulting chat takes the
        timestamp of its earliest message.
        """
        from collections import defaultdict

        # Group messages by chat_id
        grouped: dict[str, list[IngestMessage]] = defaultdict(list)

        for raw_msg in messages:
            if not isinstance(raw_msg, dict):
                continue

            chat_id = str(raw_msg.get("chat_id", raw_msg.get(self._chat_id_key, "default")))
            msg = self._parse_message(raw_msg, chat_id, 0)
            if msg:
                grouped[chat_id].append(msg)

        # Create ChatIngest for each group
        chats: list[ChatIngest] = []
        for chat_id, msgs in grouped.items():
            msgs.sort(key=lambda m: m.timestamp)
            min_timestamp = msgs[0].timestamp if msgs else 0

            chats.append(
                ChatIngest(
                    source="generic_json",
                    imported_chat_timestamp=min_timestamp,
                    title=None,
                    messages=msgs,
                    conversation_id=chat_id,
                )
            )

        return chats

    def _parse_message(
        self,
        raw_msg: dict[str, Any],
        chat_id: str,
        default_timestamp: int,
    ) -> IngestMessage | None:
        """Parse a single message.

        Args:
            raw_msg: Message object from the export.
            chat_id: Chat id stored on the message.
            default_timestamp: Used when the message has no usable
                timestamp of its own.

        Returns:
            The parsed message, or None when the entry is not an object,
            its role is unknown, or it has no content.
        """
        if not isinstance(raw_msg, dict):
            return None

        # Get role
        raw_role = raw_msg.get(self._role_key, "")
        role = self._normalize_role(raw_role)
        if not role:
            return None

        # Get content
        if self._content_extractor:
            content = self._content_extractor(raw_msg)
        else:
            content = raw_msg.get(self._content_key, "")
            if isinstance(content, list):
                content = "\n".join(str(c) for c in content)

        if not content:
            return None

        # Get timestamp
        timestamp = self._parse_timestamp(raw_msg.get(self._timestamp_key))
        if timestamp == 0:
            timestamp = default_timestamp

        return IngestMessage(
            role=role,  # type: ignore[arg-type]
            content=str(content),
            timestamp=timestamp,
            chat_id=chat_id,
        )

    def _normalize_role(self, role: str) -> str | None:
        """Normalize role to standard values.

        The custom role mapping takes precedence over the built-in
        aliases. Returns None for an unrecognized role.
        """
        role_lower = str(role).lower()

        # Check custom mapping first
        if role_lower in self._role_mapping:
            return self._role_mapping[role_lower]

        # Standard mappings
        if role_lower in ("user", "human"):
            return "user"
        if role_lower in ("assistant", "ai", "bot"):
            return "assistant"
        if role_lower == "system":
            return "system"

        return None

    def _parse_timestamp(self, value: Any) -> int:
        """Parse timestamp to epoch seconds.

        Accepts ints, finite floats, numeric strings and ISO 8601 strings
        (a trailing "Z" is read as UTC). ISO strings without a UTC offset
        are interpreted as UTC so the result does not depend on the
        importing machine's local time zone. Returns 0 when the value
        cannot be parsed, which the callers treat as "no timestamp".
        """
        # bool is a subclass of int, but True/False are not timestamps.
        if value is None or isinstance(value, bool):
            return 0
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError):
                # NaN and +/-Infinity cannot be converted to int.
                return 0
        if isinstance(value, str):
            try:
                # Try numeric string ("inf"/"1e999" raise OverflowError)
                return int(float(value))
            except (ValueError, OverflowError):
                pass
            try:
                # Try ISO format
                from datetime import datetime, timezone

                dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
            except ValueError:
                return 0
            if dt.utcoffset() is None:
                # Naive datetime: timestamp() would assume local time.
                dt = dt.replace(tzinfo=timezone.utc)
            return int(dt.timestamp())
        return 0
|