footprinter-cli 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- footprinter/__init__.py +8 -0
- footprinter/access.py +431 -0
- footprinter/api/__init__.py +1 -0
- footprinter/api/db.py +61 -0
- footprinter/api/entities.py +250 -0
- footprinter/api/search.py +47 -0
- footprinter/api/semantic.py +33 -0
- footprinter/api/server.py +66 -0
- footprinter/api/status.py +15 -0
- footprinter/bundled/__init__.py +0 -0
- footprinter/bundled/config.example.yaml +161 -0
- footprinter/bundled/patterns/context_patterns.yaml +18 -0
- footprinter/bundled/patterns/extensions.yaml +283 -0
- footprinter/bundled/patterns/filename_patterns.yaml +61 -0
- footprinter/bundled/patterns/mime_mappings.yaml +68 -0
- footprinter/bundled/patterns/salesforce_rules.yaml +84 -0
- footprinter/bundled/patterns/security_patterns.yaml +27 -0
- footprinter/bundled/samples/hidden-client-file-sample.txt +2 -0
- footprinter/bundled/samples/opaque-project-file-sample.txt +2 -0
- footprinter/bundled/samples/visible-file-sample.txt +2 -0
- footprinter/cli/__init__.py +135 -0
- footprinter/cli/__main__.py +6 -0
- footprinter/cli/_common.py +327 -0
- footprinter/cli/_policy_helpers.py +646 -0
- footprinter/cli/_prompt.py +220 -0
- footprinter/cli/_sample_seed.py +204 -0
- footprinter/cli/api_cmd.py +32 -0
- footprinter/cli/connect.py +591 -0
- footprinter/cli/data.py +879 -0
- footprinter/cli/delete.py +128 -0
- footprinter/cli/ingest.py +543 -0
- footprinter/cli/mcp_cmd.py +750 -0
- footprinter/cli/mcp_setup.py +306 -0
- footprinter/cli/search.py +393 -0
- footprinter/cli/search_cmd.py +69 -0
- footprinter/cli/setup.py +2001 -0
- footprinter/cli/status.py +747 -0
- footprinter/cli/status_cmd.py +104 -0
- footprinter/cli/upsert.py +794 -0
- footprinter/cli/vectorize_cmd.py +215 -0
- footprinter/cli/view.py +322 -0
- footprinter/connectors/__init__.py +171 -0
- footprinter/connectors/config_utils.py +141 -0
- footprinter/db/__init__.py +37 -0
- footprinter/db/browser.py +198 -0
- footprinter/db/chats.py +602 -0
- footprinter/db/clients.py +307 -0
- footprinter/db/emails.py +279 -0
- footprinter/db/files.py +724 -0
- footprinter/db/folders.py +659 -0
- footprinter/db/messages.py +192 -0
- footprinter/db/policies.py +151 -0
- footprinter/db/projects.py +673 -0
- footprinter/db/search.py +573 -0
- footprinter/db/sql_utils.py +168 -0
- footprinter/db/status.py +320 -0
- footprinter/db/uploads.py +70 -0
- footprinter/ingest/__init__.py +0 -0
- footprinter/ingest/adapters/__init__.py +33 -0
- footprinter/ingest/adapters/browser.py +54 -0
- footprinter/ingest/adapters/chat.py +57 -0
- footprinter/ingest/adapters/ingest.py +146 -0
- footprinter/ingest/adapters/local_files.py +68 -0
- footprinter/ingest/adapters/local_folders.py +52 -0
- footprinter/ingest/adapters/protocol.py +174 -0
- footprinter/ingest/browser_indexer.py +216 -0
- footprinter/ingest/chat_dedup.py +156 -0
- footprinter/ingest/chat_indexer.py +487 -0
- footprinter/ingest/chat_parsers/__init__.py +8 -0
- footprinter/ingest/chat_parsers/chatgpt_parser.py +229 -0
- footprinter/ingest/chat_parsers/claude_parser.py +161 -0
- footprinter/ingest/cli.py +827 -0
- footprinter/ingest/content_extractors.py +117 -0
- footprinter/ingest/database.py +36 -0
- footprinter/ingest/db/__init__.py +1 -0
- footprinter/ingest/db/connector_schema.py +47 -0
- footprinter/ingest/db/migration.py +315 -0
- footprinter/ingest/db/schema.py +1043 -0
- footprinter/ingest/db/security.py +6 -0
- footprinter/ingest/file_indexer.py +223 -0
- footprinter/ingest/file_scanner.py +277 -0
- footprinter/ingest/folder_indexer.py +226 -0
- footprinter/ingest/full_content_extractor.py +321 -0
- footprinter/ingest/orchestrator.py +112 -0
- footprinter/ingest/pipe_runner.py +200 -0
- footprinter/ingest/processing.py +165 -0
- footprinter/ingest/registry.py +186 -0
- footprinter/ingest/run_record.py +91 -0
- footprinter/ingest/status.py +346 -0
- footprinter/mcp/__init__.py +0 -0
- footprinter/mcp/__main__.py +5 -0
- footprinter/mcp/db.py +67 -0
- footprinter/mcp/errors.py +105 -0
- footprinter/mcp/extraction.py +226 -0
- footprinter/mcp/server.py +39 -0
- footprinter/mcp/tools/__init__.py +0 -0
- footprinter/mcp/tools/navigation.py +70 -0
- footprinter/mcp/tools/read.py +75 -0
- footprinter/mcp/tools/search.py +158 -0
- footprinter/mcp/tools/semantic.py +79 -0
- footprinter/mcp/tools/status.py +19 -0
- footprinter/paths.py +117 -0
- footprinter/permissions.py +1152 -0
- footprinter/semantic/__init__.py +13 -0
- footprinter/semantic/chunking.py +52 -0
- footprinter/semantic/embeddings.py +23 -0
- footprinter/semantic/hybrid_search.py +273 -0
- footprinter/semantic/vector_store.py +471 -0
- footprinter/services/__init__.py +49 -0
- footprinter/services/access_service.py +342 -0
- footprinter/services/chat_service.py +85 -0
- footprinter/services/client_service.py +267 -0
- footprinter/services/content_service.py +181 -0
- footprinter/services/email_service.py +89 -0
- footprinter/services/file_service.py +83 -0
- footprinter/services/folder_service.py +122 -0
- footprinter/services/includes.py +19 -0
- footprinter/services/ingest_service.py +231 -0
- footprinter/services/project_service.py +262 -0
- footprinter/services/roles.py +25 -0
- footprinter/services/search_service.py +177 -0
- footprinter/services/semantic_service.py +360 -0
- footprinter/services/status_service.py +18 -0
- footprinter/services/visit_service.py +65 -0
- footprinter/source_registry.py +194 -0
- footprinter/utils/__init__.py +7 -0
- footprinter/utils/hash_utils.py +59 -0
- footprinter/utils/logging_config.py +68 -0
- footprinter/utils/mime.py +30 -0
- footprinter/utils/text.py +6 -0
- footprinter/utils/time.py +11 -0
- footprinter/visibility.py +1264 -0
- footprinter_cli-1.0.0rc1.dist-info/LICENSE +21 -0
- footprinter_cli-1.0.0rc1.dist-info/METADATA +223 -0
- footprinter_cli-1.0.0rc1.dist-info/RECORD +138 -0
- footprinter_cli-1.0.0rc1.dist-info/WHEEL +5 -0
- footprinter_cli-1.0.0rc1.dist-info/entry_points.txt +2 -0
- footprinter_cli-1.0.0rc1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chat indexer for importing and querying AI chat exports.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python -m footprinter.ingest.chat_indexer upload ~/Downloads/claude-export.zip
|
|
6
|
+
python -m footprinter.ingest.chat_indexer stats
|
|
7
|
+
python -m footprinter.ingest.chat_indexer history
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import sys
|
|
14
|
+
import tempfile
|
|
15
|
+
import zipfile
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Dict, Tuple
|
|
18
|
+
|
|
19
|
+
from footprinter.db import chats as chats_db
|
|
20
|
+
from footprinter.db import uploads as uploads_db
|
|
21
|
+
from footprinter.semantic.vector_store import _chat_vectorization_enabled
|
|
22
|
+
|
|
23
|
+
from ..utils.hash_utils import compute_sha256
|
|
24
|
+
from ..utils.time import utc_now_iso
|
|
25
|
+
from .chat_parsers import ChatGPTParser, ClaudeParser
|
|
26
|
+
from .database import Database
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# Safety caps enforced by ChatIndexer._validate_zip before an archive is extracted.
# Total declared uncompressed bytes allowed across all entries (1 GB).
MAX_DECOMPRESSED_SIZE = 1024**3
# Maximum number of entries an archive may list.
MAX_ZIP_ENTRIES = 10 * 1000
# Highest accepted ratio of total uncompressed to total compressed bytes (100:1).
MAX_COMPRESSION_RATIO = 100
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ChatIndexer:
    """Manager for importing and querying chat history.

    Imports Claude / ChatGPT exports (a ``.zip`` archive or an already
    extracted directory) into the ``chats`` and ``messages`` tables reachable
    through ``db.conn``, optionally mirrors the content into the semantic
    vector store, and reports aggregate statistics.
    """

    def __init__(self, db: "Database"):
        """Bind the indexer to a database wrapper.

        Args:
            db: Object exposing the open connection as ``db.conn``.
        """
        self.db = db
        # Tri-state cache: None = not attempted yet, False = attempt failed
        # (do not retry), anything else = the loaded store instance.
        self._vector_store = None  # lazy

    def _get_vector_store(self):
        """Return the shared vector store, or ``None`` when it cannot be loaded.

        The import is deferred to first use and a failed attempt is cached,
        so the load is tried at most once per indexer as long as it fails.
        """
        if self._vector_store is None:
            try:
                from footprinter.semantic.vector_store import VectorStore

                self._vector_store = VectorStore.get_instance()
            except Exception as e:  # ImportError is covered by Exception
                # Best-effort feature: record why it is off instead of failing the import.
                logger.debug(f"Vector store unavailable, chat vectorization disabled: {e}")
                self._vector_store = False  # sentinel: don't retry
        return self._vector_store if self._vector_store is not False else None

    def _vectorize_message(self, msg_id, chat_id, msg, conv_data):
        """Best-effort: push one message into the vector store.

        Does nothing when chat vectorization is disabled, when the message
        row's metadata has ``vectorize`` set to 0, when no vector store is
        available, or when the message has no content. On success the row's
        ``vectorized_at`` column is stamped. Any failure is logged at debug
        level and swallowed so that an import is never aborted by indexing.

        Args:
            msg_id: Row id of the message in ``messages``.
            chat_id: Row id of the owning chat.
            msg: Parsed message dict (``content``, ``role``, ``created_at``).
            conv_data: Parsed chat dict (``source``, ``title``).
        """
        if not _chat_vectorization_enabled():
            return
        # Check per-record vectorize flag (a missing flag counts as enabled).
        row = self.db.conn.execute(
            "SELECT COALESCE(json_extract(metadata, '$.vectorize'), 1) as vec FROM messages WHERE id = ?",
            (msg_id,),
        ).fetchone()
        if row and row["vec"] == 0:
            return
        store = self._get_vector_store()
        if not store or not msg.get("content"):
            return
        try:
            metadata = {
                "source": conv_data.get("source", "unknown"),
                "role": msg.get("role", "unknown"),
                "chat_title": (conv_data.get("title") or "(untitled)")[:200],
                "created_at": msg.get("created_at", ""),
                # NOTE(review): the position is always 0 here; confirm whether the
                # message's real index within the chat should be passed in.
                "message_position": 0,
            }
            store.upsert_chat_message(
                message_id=msg_id,
                chat_id=chat_id,
                content=msg["content"],
                metadata=metadata,
            )
            self.db.conn.execute(
                "UPDATE messages SET vectorized_at = CURRENT_TIMESTAMP WHERE id = ?",
                (msg_id,),
            )
        except Exception as e:
            logger.debug(f"Chat message vectorization skipped for msg {msg_id}: {e}")

    def _vectorize_chat_info(self, chat_id, conv_data):
        """Best-effort: index chat-level info (title, summary, ...) in the vector store.

        Skipped when chat vectorization is disabled, when the chat row's
        metadata has ``vectorize`` set to 0, or when no vector store is
        available. On success ``chats.metadata_vectorized_at`` is stamped.
        Failures are logged at debug level and swallowed.

        Args:
            chat_id: Row id of the chat in ``chats``.
            conv_data: Parsed chat dict.
        """
        if not _chat_vectorization_enabled():
            return
        # Check per-record vectorize flag (a missing flag counts as enabled).
        row = self.db.conn.execute(
            "SELECT COALESCE(json_extract(metadata, '$.vectorize'), 1) as vec FROM chats WHERE id = ?",
            (chat_id,),
        ).fetchone()
        if row and row["vec"] == 0:
            return
        store = self._get_vector_store()
        if not store:
            return
        try:
            store.index_chat_info(
                chat_id=chat_id,
                title=conv_data.get("title"),
                summary=conv_data.get("summary"),
                source=conv_data.get("source", "unknown"),
                created_at=conv_data.get("created_at", ""),
                message_count=conv_data.get("message_count", 0),
            )
            self.db.conn.execute(
                "UPDATE chats SET metadata_vectorized_at = CURRENT_TIMESTAMP WHERE id = ?",
                (chat_id,),
            )
        except Exception as e:
            logger.debug(f"Chat info vectorization skipped for {chat_id}: {e}")

    def _validate_zip(
        self,
        zf: zipfile.ZipFile,
        extract_dir: Path,
        *,
        max_entries=None,
        max_decompressed_size=None,
        max_compression_ratio=None,
    ) -> None:
        """Validate zip contents for path traversal, size, and compression ratio.

        Sizes are taken from the archive's entry headers; nothing is
        decompressed here.

        Args:
            zf: Open archive to inspect.
            extract_dir: Directory the archive is about to be extracted into
                (it does not have to exist yet).
            max_entries: Entry-count cap; defaults to ``MAX_ZIP_ENTRIES``.
            max_decompressed_size: Cap on the summed uncompressed size in
                bytes; defaults to ``MAX_DECOMPRESSED_SIZE``.
            max_compression_ratio: Cap on total uncompressed / total
                compressed bytes; defaults to ``MAX_COMPRESSION_RATIO``.

        Raises:
            ValueError: If the archive has too many entries, contains an
                absolute path or a path that resolves outside
                ``extract_dir``, or exceeds the size or ratio limits.
        """
        # Module constants are only looked up when no override is given.
        entry_limit = MAX_ZIP_ENTRIES if max_entries is None else max_entries
        size_limit = MAX_DECOMPRESSED_SIZE if max_decompressed_size is None else max_decompressed_size
        ratio_limit = MAX_COMPRESSION_RATIO if max_compression_ratio is None else max_compression_ratio

        entries = zf.infolist()

        if len(entries) > entry_limit:
            raise ValueError(f"Zip contains {len(entries)} entries, exceeds limit of {entry_limit}")

        extract_root = extract_dir.resolve()
        total_decompressed = 0
        total_compressed = 0

        for info in entries:
            # Reject absolute paths
            if info.filename.startswith("/"):
                raise ValueError(f"Zip contains absolute path: {info.filename}")

            # Reject path traversal. Compare path components, not string
            # prefixes: "<root>_evil/x" starts with "<root>" as text but is
            # a sibling of the extraction directory, not inside it.
            target = (extract_dir / info.filename).resolve()
            if target != extract_root and extract_root not in target.parents:
                raise ValueError(f"Zip contains path traversal: {info.filename}")

            total_decompressed += info.file_size
            total_compressed += info.compress_size

        if total_decompressed > size_limit:
            raise ValueError(
                f"Zip decompressed size {total_decompressed} bytes exceeds limit of {size_limit} bytes"
            )

        # An archive with zero compressed bytes has no meaningful ratio.
        if total_compressed > 0:
            ratio = total_decompressed / total_compressed
            if ratio > ratio_limit:
                raise ValueError(f"Zip compression ratio {ratio:.1f}:1 exceeds limit of {ratio_limit}:1")

    def upload(self, file_path: Path) -> Dict:
        """
        Upload and import a chat export (zip or directory).

        Accepts a .zip file or extracted directory. Zip files are hashed
        to prevent duplicate imports. Chats and messages are
        deduplicated on import.

        Args:
            file_path: Path to .zip file or extracted directory

        Returns:
            Dict with upload statistics. For a zip that was already uploaded:
            ``status="duplicate"``, ``upload_id`` and ``previous_upload``.
            Otherwise the import counters (``chats_added``, ``chats_updated``,
            ``messages_imported``, ``errors``) plus ``status="completed"`` and
            ``upload_id`` (``None`` for directory imports, which are not
            recorded in the uploads table).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the zip cannot be hashed, fails validation, or the
                export format is not recognised.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        is_zip = file_path.suffix.lower() == ".zip"

        if not is_zip:
            # Directory import (alternative to single-file): no hash, no upload record.
            source, file_path = self._detect_source(file_path)
            result = self._import_with_dedup(file_path, source)
            # Keep the result shape consistent with the zip path so callers
            # can read these keys unconditionally.
            result.setdefault("upload_id", None)
            result.setdefault("status", "completed")
            return result

        file_hash = compute_sha256(str(file_path))
        if not file_hash:
            raise ValueError(f"Could not compute hash for {file_path}")

        existing = uploads_db.get_upload_by_hash(self.db.conn, file_hash)
        if existing:
            logger.info(f"File already uploaded on {existing['uploaded_at']}")
            return {
                "status": "duplicate",
                "upload_id": existing["id"],
                "previous_upload": existing,
            }

        file_size = file_path.stat().st_size

        # Everything that reads the extracted files must stay inside this
        # block: the temporary directory is removed when it exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            extract_dir = Path(tmpdir) / "extract"
            with zipfile.ZipFile(file_path, "r") as zf:
                self._validate_zip(zf, extract_dir)
                zf.extractall(extract_dir)

            source, extract_dir = self._detect_source(extract_dir)

            upload_id = uploads_db.create_upload(
                self.db.conn,
                {
                    "filename": file_path.name,
                    "file_hash": file_hash,
                    "file_size": file_size,
                    "type": "chat",
                    "source": source,
                    "status": "processing",
                },
            )

            try:
                result = self._import_with_dedup(extract_dir, source)
                uploads_db.update_upload(
                    self.db.conn,
                    upload_id,
                    status="completed",
                    completed_at=utc_now_iso(),
                    items_added=result["chats_added"],
                    items_updated=result["chats_updated"],
                    items_total=result["chats_added"] + result["chats_updated"],
                )
                result["upload_id"] = upload_id
                result["status"] = "completed"
                return result
            except Exception as e:
                # Record the failure on the upload row, then let the caller see it.
                uploads_db.update_upload(
                    self.db.conn,
                    upload_id,
                    status="failed",
                    error_message=str(e),
                    completed_at=utc_now_iso(),
                )
                raise

    def _detect_source(self, extract_dir: Path) -> Tuple[str, Path]:
        """Detect chat export source from directory contents.

        Looks for ``conversations.json`` directly in ``extract_dir`` and, if
        absent, in its immediate subdirectories (exports wrapped in a single
        folder). The format is decided from the first element of the file.

        Returns:
            Tuple of (source, resolved_dir) where resolved_dir contains conversations.json
            and source is ``"claude"`` or ``"chatgpt"``.

        Raises:
            ValueError: If ``conversations.json`` is missing, is not a
                non-empty JSON list, or its first element matches no known
                export format.
        """
        conv_file = extract_dir / "conversations.json"
        if not conv_file.exists():
            # Search one level deep for subdirectory-wrapped exports; sorted
            # so the choice is deterministic if several candidates exist.
            wrapped = next(
                (
                    child
                    for child in sorted(extract_dir.iterdir())
                    if child.is_dir() and (child / "conversations.json").exists()
                ),
                None,
            )
            if wrapped is None:
                raise ValueError("conversations.json not found in export")
            extract_dir = wrapped
            conv_file = wrapped / "conversations.json"

        with open(conv_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list) or len(data) == 0:
            raise ValueError("conversations.json is empty or invalid")

        first = data[0]
        # Only a JSON object can carry the marker keys; for a string, `in`
        # would do a substring match, and for a number it would raise TypeError.
        if isinstance(first, dict):
            if "uuid" in first and "chat_messages" in first:
                return ("claude", extract_dir)
            if "mapping" in first:
                return ("chatgpt", extract_dir)

        raise ValueError("Unknown export format")

    def _import_with_dedup(self, export_dir: Path, source: str) -> Dict:
        """Import chats with message deduplication.

        A chat whose ``external_id`` is already stored has its old messages
        (and vectors, when a vector store is available) deleted and is then
        re-imported. Each chat is committed on its own; a failure in one chat
        is logged and counted, and the loop moves on to the next chat.

        Args:
            export_dir: Directory containing ``conversations.json``.
            source: ``"claude"`` or ``"chatgpt"``.

        Returns:
            Dict with ``chats_added``, ``chats_updated``,
            ``messages_imported`` and ``errors`` counters.

        Raises:
            ValueError: If ``source`` is not a known export format.
        """
        if source == "claude":
            parser = ClaudeParser(export_dir)
        elif source == "chatgpt":
            conv_file = export_dir / "conversations.json"
            parser = ChatGPTParser(conv_file)
        else:
            raise ValueError(f"Unknown source: {source}")

        stats = parser.get_stats()
        logger.info(f"{source.capitalize()} export contains:")
        logger.info(f" {stats['total_chats']} chats")
        logger.info(f" {stats['chats_with_messages']} with messages")
        logger.info(f" {stats['total_messages']} total messages")
        logger.info(f" Date range: {stats['earliest_chat']} to {stats['latest_chat']}")

        chats_added = 0
        chats_updated = 0
        messages_imported = 0
        errors = 0

        for conv_data in parser.parse_chats():
            try:
                # Check if chat already exists
                existing_id = chats_db.get_chat_id_by_uuid(self.db.conn, conv_data["external_id"])

                if existing_id:
                    # Delete old messages and vectors before re-import
                    chats_db.delete_chat_messages(self.db.conn, existing_id)
                    store = self._get_vector_store()
                    if store:
                        try:
                            store.delete_chat(existing_id)
                        except Exception as e:
                            # Vector cleanup is best-effort; keep importing but leave a trace.
                            logger.debug(f"Could not delete vectors for chat {existing_id}: {e}")

                # Insert/replace chat
                chats_db.insert_chat(
                    self.db.conn,
                    {
                        "external_id": conv_data["external_id"],
                        "account": conv_data["source"],  # Map source → account
                        "title": conv_data["title"],
                        "summary": conv_data["summary"],
                        "created_at": conv_data["created_at"],
                        "updated_at": conv_data["updated_at"],
                        "message_count": conv_data["message_count"],
                        "metadata": conv_data.get("metadata", {}),
                    },
                )

                # Count only after successful insert
                if existing_id:
                    chats_updated += 1
                else:
                    chats_added += 1

                # Look the row id up again: a newly inserted chat had none before.
                internal_id = chats_db.get_chat_id_by_uuid(self.db.conn, conv_data["external_id"])

                for msg in conv_data["messages"]:
                    msg_id = chats_db.insert_message(
                        self.db.conn,
                        {
                            "chat_id": internal_id,
                            "message_id": msg["message_id"],
                            "role": msg["role"],
                            "content": msg["content"],
                            "created_at": msg["created_at"],
                            "metadata": msg.get("metadata", {}),
                        },
                    )
                    self._vectorize_message(msg_id, internal_id, msg, conv_data)
                    messages_imported += 1

                self._vectorize_chat_info(internal_id, conv_data)

                # Commit per-chat: all messages + vectorization for this chat
                self.db.conn.commit()

                total = chats_added + chats_updated
                if total % 100 == 0:
                    logger.info(f"Imported {total} chats...")

            except Exception as e:
                # NOTE(review): nothing is rolled back here, so rows written for
                # this chat before the failure may be committed together with
                # the next chat; confirm that this is intended.
                logger.error(f"Error importing chat {conv_data.get('external_id')}: {e}")
                errors += 1

        return {
            "chats_added": chats_added,
            "chats_updated": chats_updated,
            "messages_imported": messages_imported,
            "errors": errors,
        }

    def get_stats(self) -> Dict:
        """Get chat history statistics.

        Returns:
            Dict with ``total_chats``, ``total_messages``, ``by_account``
            (account -> chat count), ``earliest_chat`` / ``latest_chat``
            (min / max of ``chats.created_at``, ``None`` when there are no
            chats) and ``top_chats`` (up to 10 dicts with ``title``,
            ``message_count`` and ``created_at``, largest first).
        """
        # Rows are read by column name, so the connection must use a
        # name-addressable row type.
        cursor = self.db.conn.cursor()

        # Chat stats
        cursor.execute("SELECT COUNT(*) as count FROM chats")
        chat_count = cursor.fetchone()["count"]

        cursor.execute("SELECT COUNT(*) as count FROM messages")
        msg_count = cursor.fetchone()["count"]

        # By account
        cursor.execute("""
            SELECT account, COUNT(*) as count
            FROM chats
            GROUP BY account
        """)
        by_account = {row["account"]: row["count"] for row in cursor.fetchall()}

        # Date range
        cursor.execute("""
            SELECT MIN(created_at) as earliest, MAX(created_at) as latest
            FROM chats
        """)
        dates = cursor.fetchone()

        # Top chats by message count
        cursor.execute("""
            SELECT title, message_count, created_at
            FROM chats
            ORDER BY message_count DESC
            LIMIT 10
        """)
        top_chats = [dict(row) for row in cursor.fetchall()]

        return {
            "total_chats": chat_count,
            "total_messages": msg_count,
            "by_account": by_account,
            "earliest_chat": dates["earliest"],
            "latest_chat": dates["latest"],
            "top_chats": top_chats,
        }
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def main():
    """CLI entry point for importing and managing AI chat history.

    Subcommands:
        upload <file_path>: import a chat export (zip or directory).
        stats: show totals, per-account counts, date range and largest chats.
        history [--limit N]: show the most recent chat uploads.

    Prints the help text and exits with status 1 when no subcommand is
    given. The database is always closed on the way out.
    """
    parser = argparse.ArgumentParser(description="Import and manage AI chat history")
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # upload command
    upload_parser = subparsers.add_parser("upload", help="Upload and import chat export (zip or directory)")
    upload_parser.add_argument("file_path", type=Path, help="Path to chat export zip file or directory")

    # stats command
    subparsers.add_parser("stats", help="Show chat history statistics")

    # history command
    history_parser = subparsers.add_parser("history", help="Show recent upload history")
    history_parser.add_argument("--limit", type=int, default=10, help="Number of uploads to show (default: 10)")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    # Initialize database and manager (path lookup is imported only once a
    # command was actually requested).
    from footprinter.paths import get_db_path

    db = Database(str(get_db_path()))
    manager = ChatIndexer(db)

    # NOTE(review): all output below goes through `logger.info`; confirm that a
    # handler at INFO level is configured elsewhere, otherwise Python's
    # last-resort handler only emits WARNING and above.
    try:
        if args.command == "upload":
            logger.info(f"Uploading chat export from {args.file_path}")
            result = manager.upload(args.file_path)

            if result.get("status") == "duplicate":
                prev = result["previous_upload"]
                logger.warning("This file was already uploaded.")
                logger.info(f" Uploaded: {prev['uploaded_at']}")
                logger.info(f" Items: {prev['items_added']} added, {prev['items_updated']} updated")
            else:
                logger.info("=" * 60)
                logger.info("Upload complete!")
                # Directory imports are not recorded as uploads, so there may
                # be no upload id to show.
                upload_id = result.get("upload_id")
                if upload_id is not None:
                    logger.info(f" Upload ID: {upload_id}")
                logger.info(f" Chats: {result['chats_added']} added, {result['chats_updated']} updated")
                logger.info(f" Messages: {result['messages_imported']}")
                if result["errors"]:
                    logger.warning(f" Errors: {result['errors']}")

        elif args.command == "stats":
            stats = manager.get_stats()
            logger.info("=" * 60)
            logger.info("CHAT HISTORY STATISTICS")
            logger.info("=" * 60)
            logger.info(f"Total chats: {stats['total_chats']}")
            logger.info(f"Total messages: {stats['total_messages']}")
            logger.info("By account:")
            for account, count in stats.get("by_account", {}).items():
                logger.info(f" {account}: {count}")
            logger.info(f"Date range: {stats['earliest_chat']} to {stats['latest_chat']}")
            logger.info("Top chats by message count:")
            for conv in stats.get("top_chats", []):
                # Titles may be missing; show a placeholder and cap the width.
                title = conv["title"] or "(untitled)"
                if len(title) > 50:
                    title = title[:50] + "..."
                logger.info(f" {conv['message_count']:4d} msgs: {title}")

        elif args.command == "history":
            uploads = uploads_db.get_recent_uploads(db.conn, upload_type="chat", limit=args.limit)
            logger.info("=" * 60)
            logger.info("CHAT UPLOAD HISTORY")
            logger.info("=" * 60)
            if not uploads:
                logger.info("No uploads found.")
            for upload in uploads:
                logger.info(f"{upload['uploaded_at']} - {upload['filename']}")
                logger.info(f" Source: {upload['source']} Status: {upload['status']}")
                logger.info(f" Added: {upload['items_added']} Updated: {upload['items_updated']}")
                if upload.get("error_message"):
                    logger.info(f" Error: {upload['error_message']}")

    finally:
        db.close()


if __name__ == "__main__":
    main()
|