realtimex-deeptutor 0.5.0.post2__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {realtimex_deeptutor-0.5.0.post2.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/METADATA +1 -1
- {realtimex_deeptutor-0.5.0.post2.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/RECORD +17 -8
- {realtimex_deeptutor-0.5.0.post2.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/top_level.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/audit_prompts.py +179 -0
- scripts/check_install.py +460 -0
- scripts/generate_roster.py +327 -0
- scripts/install_all.py +653 -0
- scripts/migrate_kb.py +655 -0
- scripts/start.py +807 -0
- scripts/start_web.py +632 -0
- scripts/sync_prompts_from_en.py +147 -0
- src/cli/start.py +58 -66
- src/services/config/unified_config.py +2 -2
- {realtimex_deeptutor-0.5.0.post2.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/WHEEL +0 -0
- {realtimex_deeptutor-0.5.0.post2.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/entry_points.txt +0 -0
- {realtimex_deeptutor-0.5.0.post2.dist-info → realtimex_deeptutor-0.5.0.post3.dist-info}/licenses/LICENSE +0 -0
scripts/migrate_kb.py
ADDED
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Knowledge Base Migration Script
|
|
5
|
+
===============================
|
|
6
|
+
|
|
7
|
+
Migrate existing knowledge bases into DeepTutor's knowledge base system.
|
|
8
|
+
|
|
9
|
+
Features:
|
|
10
|
+
- Auto-detect RAG provider type (LlamaIndex or LightRAG/RAGAnything)
|
|
11
|
+
- Validate required index files
|
|
12
|
+
- Copy/migrate KB to target directory
|
|
13
|
+
- Register in kb_config.json
|
|
14
|
+
- Optionally extract numbered items (if content_list exists)
|
|
15
|
+
- Run test query to verify migration
|
|
16
|
+
|
|
17
|
+
Usage:
|
|
18
|
+
python scripts/migrate_kb.py /path/to/kb --name my_kb --test --extract-items
|
|
19
|
+
python scripts/migrate_kb.py /path/to/kb --validate-only
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import asyncio
|
|
24
|
+
from datetime import datetime
|
|
25
|
+
import json
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
import shutil
|
|
28
|
+
import sys
|
|
29
|
+
|
|
30
|
+
# Add project root to path so `src.*` imports resolve when run as a script
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))


# =============================================================================
# Constants: Required files for each RAG provider
# =============================================================================

# Core persistence files a LlamaIndex storage directory must contain
# for the index to be loadable.
LLAMAINDEX_REQUIRED_FILES = [
    "docstore.json",
    "index_store.json",
    "default__vector_store.json",
]

# Core key-value stores a LightRAG/RAGAnything working directory must contain.
LIGHTRAG_REQUIRED_FILES = [
    "kv_store_text_chunks.json",
    "kv_store_full_entities.json",
    "kv_store_full_relations.json",
]

# Optional but recommended for better performance
# (vector DBs and the entity-relation graph; absence only degrades quality/speed).
LIGHTRAG_OPTIONAL_FILES = [
    "vdb_chunks.json",
    "vdb_entities.json",
    "vdb_relationships.json",
    "graph_chunk_entity_relation.graphml",
]

# Default target directory
DEFAULT_KB_BASE_DIR = PROJECT_ROOT / "data" / "knowledge_bases"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# =============================================================================
|
|
64
|
+
# Validation Functions
|
|
65
|
+
# =============================================================================
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def detect_provider(kb_path: Path) -> str | None:
    """
    Identify which RAG backend produced the given knowledge base.

    Detection looks at the two known storage layouts and prefers a
    directory that contains a complete set of required index files.

    Args:
        kb_path: Path to the knowledge base directory

    Returns:
        Provider name: "llamaindex", "lightrag", or None if not detected
    """
    llama_store = kb_path / "llamaindex_storage"
    light_store = kb_path / "rag_storage"

    # is_dir() is False for missing paths, so it covers the exists() check too.
    llama_present = llama_store.exists() and llama_store.is_dir()
    light_present = light_store.exists() and light_store.is_dir()

    llama_complete = llama_present and all(
        (llama_store / name).exists() for name in LLAMAINDEX_REQUIRED_FILES
    )
    light_complete = light_present and all(
        (light_store / name).exists() for name in LIGHTRAG_REQUIRED_FILES
    )

    # Preference ladder: a complete LightRAG store wins (more feature-rich,
    # including the case where both stores are complete), then a complete
    # LlamaIndex store, then whichever directory merely exists (incomplete).
    if light_complete:
        return "lightrag"
    if llama_complete:
        return "llamaindex"
    if llama_present:
        return "llamaindex"
    if light_present:
        return "lightrag"
    return None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def validate_llamaindex_files(storage_dir: Path) -> tuple[bool, list[str], list[str]]:
    """
    Check a LlamaIndex storage directory for its required files.

    Args:
        storage_dir: Path to llamaindex_storage directory

    Returns:
        Tuple of (is_valid, missing_files, found_files)
    """
    # Partition the required file list by presence on disk.
    present = [name for name in LLAMAINDEX_REQUIRED_FILES if (storage_dir / name).exists()]
    absent = [name for name in LLAMAINDEX_REQUIRED_FILES if not (storage_dir / name).exists()]

    return not absent, absent, present
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def validate_lightrag_files(storage_dir: Path) -> tuple[bool, list[str], list[str]]:
    """
    Check a LightRAG storage directory for required (and optional) files.

    Args:
        storage_dir: Path to rag_storage directory

    Returns:
        Tuple of (is_valid, missing_files, found_files); found_files also
        includes any optional files that happen to be present.
    """
    required_found = [
        name for name in LIGHTRAG_REQUIRED_FILES if (storage_dir / name).exists()
    ]
    required_missing = [
        name for name in LIGHTRAG_REQUIRED_FILES if not (storage_dir / name).exists()
    ]

    # Optional files never affect validity, but we report them when present.
    extras = [name for name in LIGHTRAG_OPTIONAL_FILES if (storage_dir / name).exists()]

    return not required_missing, required_missing, required_found + extras
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def validate_kb(kb_path: Path) -> dict:
    """
    Build a validation report for a knowledge base directory.

    Args:
        kb_path: Path to the knowledge base directory

    Returns:
        Validation result dictionary
    """
    report = {
        "path": str(kb_path),
        "exists": kb_path.exists(),
        "is_valid": False,
        "provider": None,
        "missing_files": [],
        "found_files": [],
        "has_content_list": False,
        "has_raw_docs": False,
        "has_images": False,
        "has_metadata": False,
        "has_numbered_items": False,
    }

    # Nothing to inspect when the directory itself is absent.
    if not report["exists"]:
        return report

    provider = detect_provider(kb_path)
    report["provider"] = provider

    if provider is None:
        report["error"] = (
            "No valid RAG storage found (neither llamaindex_storage/ nor rag_storage/)"
        )
        return report

    # Run the provider-specific file check.
    if provider == "llamaindex":
        ok, absent, present = validate_llamaindex_files(kb_path / "llamaindex_storage")
    else:  # lightrag
        ok, absent, present = validate_lightrag_files(kb_path / "rag_storage")

    report["is_valid"] = ok
    report["missing_files"] = absent
    report["found_files"] = present

    # Optional artifacts: these enrich the KB but are not required to migrate.
    content_dir = kb_path / "content_list"
    report["has_content_list"] = content_dir.exists() and any(content_dir.glob("*.json"))

    raw_dir = kb_path / "raw"
    report["has_raw_docs"] = raw_dir.exists() and any(raw_dir.iterdir())

    images_dir = kb_path / "images"
    report["has_images"] = images_dir.exists() and any(images_dir.iterdir())

    report["has_metadata"] = (kb_path / "metadata.json").exists()
    report["has_numbered_items"] = (kb_path / "numbered_items.json").exists()

    return report
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# =============================================================================
|
|
232
|
+
# Migration Functions
|
|
233
|
+
# =============================================================================
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def copy_kb_directory(source_path: Path, target_path: Path, verbose: bool = True) -> bool:
    """
    Duplicate a knowledge base tree into a new location.

    Args:
        source_path: Source KB directory
        target_path: Target KB directory
        verbose: Print progress messages

    Returns:
        True if the copy was performed, False if the target already exists.
    """

    def say(message: str) -> None:
        # Honor the verbose flag for all progress output.
        if verbose:
            print(message)

    # Refuse to clobber an existing target; the caller decides what to do.
    if target_path.exists():
        say(f" ⚠️ Target directory already exists: {target_path}")
        return False

    say(f" Copying {source_path} -> {target_path}")
    shutil.copytree(source_path, target_path)
    say(" ✓ Copied successfully")
    return True
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def register_kb(
|
|
265
|
+
kb_name: str, kb_base_dir: Path, description: str = "", provider: str | None = None
|
|
266
|
+
) -> bool:
|
|
267
|
+
"""
|
|
268
|
+
Register knowledge base in kb_config.json.
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
kb_name: Knowledge base name
|
|
272
|
+
kb_base_dir: Base directory containing kb_config.json
|
|
273
|
+
description: Optional description
|
|
274
|
+
provider: RAG provider name
|
|
275
|
+
|
|
276
|
+
Returns:
|
|
277
|
+
True if successful
|
|
278
|
+
"""
|
|
279
|
+
config_file = kb_base_dir / "kb_config.json"
|
|
280
|
+
|
|
281
|
+
# Load existing config
|
|
282
|
+
if config_file.exists():
|
|
283
|
+
with open(config_file, encoding="utf-8") as f:
|
|
284
|
+
config = json.load(f)
|
|
285
|
+
else:
|
|
286
|
+
config = {"knowledge_bases": {}}
|
|
287
|
+
|
|
288
|
+
if "knowledge_bases" not in config:
|
|
289
|
+
config["knowledge_bases"] = {}
|
|
290
|
+
|
|
291
|
+
# Check if already registered
|
|
292
|
+
if kb_name in config["knowledge_bases"]:
|
|
293
|
+
print(f" ⚠️ KB '{kb_name}' is already registered in kb_config.json")
|
|
294
|
+
return True
|
|
295
|
+
|
|
296
|
+
# Add to config
|
|
297
|
+
config["knowledge_bases"][kb_name] = {
|
|
298
|
+
"path": kb_name,
|
|
299
|
+
"description": description or f"Knowledge base: {kb_name}",
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
# Save config
|
|
303
|
+
with open(config_file, "w", encoding="utf-8") as f:
|
|
304
|
+
json.dump(config, f, indent=2, ensure_ascii=False)
|
|
305
|
+
|
|
306
|
+
print(f" ✓ Registered '{kb_name}' in kb_config.json")
|
|
307
|
+
|
|
308
|
+
# Also create/update metadata.json if needed
|
|
309
|
+
kb_dir = kb_base_dir / kb_name
|
|
310
|
+
metadata_file = kb_dir / "metadata.json"
|
|
311
|
+
|
|
312
|
+
if not metadata_file.exists():
|
|
313
|
+
metadata = {
|
|
314
|
+
"name": kb_name,
|
|
315
|
+
"created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
316
|
+
"description": description or f"Knowledge base: {kb_name}",
|
|
317
|
+
"version": "1.0",
|
|
318
|
+
"rag_provider": provider,
|
|
319
|
+
"migrated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
320
|
+
}
|
|
321
|
+
with open(metadata_file, "w", encoding="utf-8") as f:
|
|
322
|
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
|
323
|
+
print(" ✓ Created metadata.json")
|
|
324
|
+
else:
|
|
325
|
+
# Update existing metadata with migration info
|
|
326
|
+
with open(metadata_file, encoding="utf-8") as f:
|
|
327
|
+
metadata = json.load(f)
|
|
328
|
+
|
|
329
|
+
if provider and not metadata.get("rag_provider"):
|
|
330
|
+
metadata["rag_provider"] = provider
|
|
331
|
+
metadata["migrated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
332
|
+
|
|
333
|
+
with open(metadata_file, "w", encoding="utf-8") as f:
|
|
334
|
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
|
335
|
+
print(" ✓ Updated metadata.json")
|
|
336
|
+
|
|
337
|
+
return True
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
async def extract_numbered_items(kb_name: str, kb_base_dir: Path) -> bool:
    """
    Run LLM-based extraction of numbered items over a KB's content_list files.

    Args:
        kb_name: Knowledge base name
        kb_base_dir: Base directory for knowledge bases

    Returns:
        True if successful
    """
    # Import lazily: these pull in the LLM stack, which may be unavailable.
    try:
        from src.knowledge.extract_numbered_items import (
            extract_numbered_items_with_llm_async,
        )
        from src.services.llm import get_llm_client
    except ImportError as e:
        print(f" ⚠️ Could not import extraction module: {e}")
        return False

    kb_dir = kb_base_dir / kb_name
    source_dir = kb_dir / "content_list"

    if not source_dir.exists():
        print(" ⚠️ No content_list directory found")
        return False

    candidates = list(source_dir.glob("*.json"))
    if not candidates:
        print(" ⚠️ No JSON files found in content_list/")
        return False

    print(f" Loading {len(candidates)} content list files...")

    # Merge every per-document content list into one flat item list;
    # unreadable files are skipped with a warning rather than aborting.
    combined: list = []
    for candidate in candidates:
        try:
            with open(candidate, encoding="utf-8") as f:
                payload = json.load(f)
            if isinstance(payload, list):
                combined.extend(payload)
        except Exception as e:
            print(f" ⚠️ Error loading {candidate.name}: {e}")

    if not combined:
        print(" ⚠️ No content items found")
        return False

    print(f" Extracting numbered items from {len(combined)} content items...")

    try:
        client = get_llm_client()
        extracted = await extract_numbered_items_with_llm_async(
            combined,
            api_key=client.config.api_key,
            base_url=client.config.base_url,
        )

        if not extracted:
            print(" ⚠️ No numbered items extracted")
            return False

        destination = kb_dir / "numbered_items.json"
        with open(destination, "w", encoding="utf-8") as f:
            json.dump(extracted, f, ensure_ascii=False, indent=2)
        print(f" ✓ Extracted {len(extracted)} numbered items")
        return True

    except Exception as e:
        print(f" ✗ Extraction failed: {e}")
        return False
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
async def test_kb_search(kb_name: str, query: str = "What is this knowledge base about?") -> bool:
|
|
416
|
+
"""
|
|
417
|
+
Test knowledge base with a simple search query.
|
|
418
|
+
|
|
419
|
+
Args:
|
|
420
|
+
kb_name: Knowledge base name
|
|
421
|
+
query: Test query
|
|
422
|
+
|
|
423
|
+
Returns:
|
|
424
|
+
True if search succeeded
|
|
425
|
+
"""
|
|
426
|
+
try:
|
|
427
|
+
from src.tools.rag_tool import rag_search
|
|
428
|
+
except ImportError as e:
|
|
429
|
+
print(f" ⚠️ Could not import rag_tool: {e}")
|
|
430
|
+
return False
|
|
431
|
+
|
|
432
|
+
print(f" Running test query: '{query[:50]}...'")
|
|
433
|
+
|
|
434
|
+
try:
|
|
435
|
+
result = await rag_search(
|
|
436
|
+
query=query,
|
|
437
|
+
kb_name=kb_name,
|
|
438
|
+
mode="naive", # Use simplest mode for testing
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
if result and result.get("answer"):
|
|
442
|
+
answer_preview = result["answer"][:200]
|
|
443
|
+
print(" ✓ Search successful!")
|
|
444
|
+
print(f" Provider: {result.get('provider', 'unknown')}")
|
|
445
|
+
print(f" Answer preview: {answer_preview}...")
|
|
446
|
+
return True
|
|
447
|
+
else:
|
|
448
|
+
print(" ⚠️ Search returned empty result")
|
|
449
|
+
return False
|
|
450
|
+
|
|
451
|
+
except Exception as e:
|
|
452
|
+
print(f" ✗ Search failed: {e}")
|
|
453
|
+
return False
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
async def migrate_kb(
    source_path: str,
    target_base_dir: str | None = None,
    kb_name: str | None = None,
    run_test: bool = False,
    extract_items: bool = False,
    validate_only: bool = False,
    force: bool = False,
) -> bool:
    """
    Migrate a knowledge base to DeepTutor.

    Pipeline: validate source -> copy to target -> register in
    kb_config.json -> optionally extract numbered items -> optionally
    run a smoke-test query.

    Args:
        source_path: Path to source knowledge base
        target_base_dir: Target base directory (default: data/knowledge_bases)
        kb_name: Name for the migrated KB (default: source directory name)
        run_test: Run a test query after migration
        extract_items: Extract numbered items if content_list exists
        validate_only: Only validate, don't migrate
        force: Overwrite existing KB if exists

    Returns:
        True if successful
    """
    source = Path(source_path).resolve()
    target_base = Path(target_base_dir) if target_base_dir else DEFAULT_KB_BASE_DIR
    target_base = target_base.resolve()

    # Determine KB name
    if kb_name is None:
        kb_name = source.name

    print("=" * 60)
    print("Knowledge Base Migration")
    print("=" * 60)
    print(f"Source: {source}")
    print(f"Target: {target_base / kb_name}")
    print()

    # Step 1: Validate source KB
    print("Step 1: Validating source knowledge base...")
    validation = validate_kb(source)

    if not validation["exists"]:
        print(f" ✗ Source directory does not exist: {source}")
        return False

    if not validation["is_valid"]:
        print(" ✗ Validation failed!")
        if validation.get("error"):
            print(f" Error: {validation['error']}")
        if validation["missing_files"]:
            print(f" Missing files: {', '.join(validation['missing_files'])}")
        return False

    print(" ✓ Validation passed")
    print(f" Provider: {validation['provider']}")
    print(f" Found files: {', '.join(validation['found_files'][:5])}...")
    print(f" Has content_list: {'Yes' if validation['has_content_list'] else 'No'}")
    print(f" Has raw docs: {'Yes' if validation['has_raw_docs'] else 'No'}")
    print(f" Has images: {'Yes' if validation['has_images'] else 'No'}")
    print()

    if validate_only:
        print("Validation-only mode. Exiting.")
        return True

    # Step 2: Copy to target
    print("Step 2: Copying knowledge base...")
    target_path = target_base / kb_name
    # `source` is already resolved; compare against the resolved target once.
    same_location = source == target_path.resolve()

    if target_path.exists():
        if same_location:
            # In-place migration: nothing to copy.
            # BUGFIX: this check must run BEFORE the --force removal below;
            # previously `--force` on an in-place migration rmtree'd the
            # target, which IS the source, destroying the knowledge base.
            print(" Source and target are the same. Skipping copy.")
        elif force:
            print(" Removing existing directory (--force)...")
            shutil.rmtree(target_path)
        else:
            print(f" ✗ Target directory already exists: {target_path}")
            print(" Use --force to overwrite or --name to specify a different name")
            return False

    if not same_location:
        success = copy_kb_directory(source, target_path)
        if not success:
            return False
    print()

    # Step 3: Register in kb_config.json
    print("Step 3: Registering knowledge base...")
    register_kb(
        kb_name=kb_name,
        kb_base_dir=target_base,
        description=f"Migrated from: {source}",
        provider=validation["provider"],
    )
    print()

    # Step 4: Extract numbered items (optional)
    if extract_items and validation["has_content_list"]:
        print("Step 4: Extracting numbered items...")
        await extract_numbered_items(kb_name, target_base)
        print()
    elif extract_items:
        print("Step 4: Skipping numbered items extraction (no content_list)")
        print()

    # Step 5: Test search (optional)
    if run_test:
        print("Step 5: Testing knowledge base...")
        await test_kb_search(kb_name)
        print()

    print("=" * 60)
    print("✓ Migration complete!")
    print(f" Knowledge base '{kb_name}' is now available in DeepTutor.")
    print("=" * 60)

    return True
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
# =============================================================================
|
|
578
|
+
# CLI Entry Point
|
|
579
|
+
# =============================================================================
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def main():
    """Parse CLI arguments and run the async knowledge base migration."""
    parser = argparse.ArgumentParser(
        description="Migrate knowledge bases into DeepTutor",
        # RawDescriptionHelpFormatter keeps the epilog's line breaks intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Migrate a knowledge base
  python scripts/migrate_kb.py /path/to/my_kb

  # Migrate with custom name
  python scripts/migrate_kb.py /path/to/kb --name my_textbook

  # Migrate and run test query
  python scripts/migrate_kb.py /path/to/kb --test

  # Migrate and extract numbered items
  python scripts/migrate_kb.py /path/to/kb --extract-items

  # Validate only (don't migrate)
  python scripts/migrate_kb.py /path/to/kb --validate-only

  # Force overwrite existing KB
  python scripts/migrate_kb.py /path/to/kb --force
""",
    )

    # Positional argument: the KB directory to migrate.
    parser.add_argument("source", help="Path to source knowledge base directory")

    parser.add_argument(
        "--name", help="Name for the migrated knowledge base (default: source directory name)"
    )

    parser.add_argument(
        "--target-dir", help=f"Target base directory (default: {DEFAULT_KB_BASE_DIR})"
    )

    parser.add_argument("--test", action="store_true", help="Run a test query after migration")

    parser.add_argument(
        "--extract-items",
        action="store_true",
        help="Extract numbered items from content_list (requires LLM API)",
    )

    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Only validate the knowledge base, don't migrate",
    )

    parser.add_argument(
        "--force", action="store_true", help="Overwrite existing knowledge base if exists"
    )

    args = parser.parse_args()

    # Run migration (migrate_kb is a coroutine, so drive it with asyncio.run)
    success = asyncio.run(
        migrate_kb(
            source_path=args.source,
            target_base_dir=args.target_dir,
            kb_name=args.name,
            run_test=args.test,
            extract_items=args.extract_items,
            validate_only=args.validate_only,
            force=args.force,
        )
    )

    # Exit code 0 on success, 1 on failure (usable from shell scripts / CI).
    sys.exit(0 if success else 1)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()
|