realtimex-deeptutor 0.5.0.post2__py3-none-any.whl → 0.5.0.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scripts/migrate_kb.py ADDED
@@ -0,0 +1,655 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Knowledge Base Migration Script
5
+ ===============================
6
+
7
+ Migrate existing knowledge bases into DeepTutor's knowledge base system.
8
+
9
+ Features:
10
+ - Auto-detect RAG provider type (LlamaIndex or LightRAG/RAGAnything)
11
+ - Validate required index files
12
+ - Copy/migrate KB to target directory
13
+ - Register in kb_config.json
14
+ - Optionally extract numbered items (if content_list exists)
15
+ - Run test query to verify migration
16
+
17
+ Usage:
18
+ python scripts/migrate_kb.py /path/to/kb --name my_kb --test --extract-items
19
+ python scripts/migrate_kb.py /path/to/kb --validate-only
20
+ """
21
+
22
+ import argparse
23
+ import asyncio
24
+ from datetime import datetime
25
+ import json
26
+ from pathlib import Path
27
+ import shutil
28
+ import sys
29
+
30
# Put the project root (the parent of this script's directory) at the front of
# the import path so the lazily imported `src.*` modules can be resolved.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path[:0] = [str(PROJECT_ROOT)]
33
+
34
+
35
+ # =============================================================================
36
+ # Constants: Required files for each RAG provider
37
+ # =============================================================================
38
+
39
# Files that must all exist inside llamaindex_storage/ for a LlamaIndex KB
# to pass validation.
LLAMAINDEX_REQUIRED_FILES = ["docstore.json", "index_store.json", "default__vector_store.json"]

# Files that must all exist inside rag_storage/ for a LightRAG KB to pass
# validation.
LIGHTRAG_REQUIRED_FILES = [
    "kv_store_text_chunks.json",
    "kv_store_full_entities.json",
    "kv_store_full_relations.json",
]

# Not required for validation, but recommended for better performance;
# reported among the found files when present.
LIGHTRAG_OPTIONAL_FILES = [
    "vdb_chunks.json",
    "vdb_entities.json",
    "vdb_relationships.json",
    "graph_chunk_entity_relation.graphml",
]

# Where migrated knowledge bases are placed unless a target directory is given.
DEFAULT_KB_BASE_DIR = PROJECT_ROOT.joinpath("data", "knowledge_bases")
61
+
62
+
63
+ # =============================================================================
64
+ # Validation Functions
65
+ # =============================================================================
66
+
67
+
68
def detect_provider(kb_path: Path) -> str | None:
    """
    Work out which RAG provider a knowledge base directory belongs to.

    A provider whose storage directory holds every required index file is
    preferred over one whose directory is merely present. When both are
    complete, LightRAG is chosen; when both are present but incomplete,
    LlamaIndex is chosen.

    Args:
        kb_path: Path to the knowledge base directory

    Returns:
        "llamaindex", "lightrag", or None when neither storage directory exists
    """
    llama_dir = kb_path / "llamaindex_storage"
    light_dir = kb_path / "rag_storage"

    llama_present = llama_dir.is_dir()
    light_present = light_dir.is_dir()

    def _is_complete(storage_dir, required_names):
        # Complete means every required index file exists in the directory.
        return all((storage_dir / name).exists() for name in required_names)

    # Only look for index files when the storage directory itself is there.
    llama_complete = llama_present and _is_complete(llama_dir, LLAMAINDEX_REQUIRED_FILES)
    light_complete = light_present and _is_complete(light_dir, LIGHTRAG_REQUIRED_FILES)

    # Complete storage first; LightRAG wins when both are complete.
    if light_complete:
        return "lightrag"
    if llama_complete:
        return "llamaindex"

    # Only incomplete storage is left; report it so validation can list
    # exactly which files are missing.
    if llama_present:
        return "llamaindex"
    if light_present:
        return "lightrag"
    return None
112
+
113
+
114
def validate_llamaindex_files(storage_dir: Path) -> tuple[bool, list[str], list[str]]:
    """
    Check a LlamaIndex storage directory for its required index files.

    Args:
        storage_dir: Path to the llamaindex_storage directory

    Returns:
        Tuple of (is_valid, missing_files, found_files); is_valid is True only
        when no required file is missing
    """
    # Probe each required file once, keeping the order of the constant.
    status = [(name, (storage_dir / name).exists()) for name in LLAMAINDEX_REQUIRED_FILES]

    absent = [name for name, present in status if not present]
    located = [name for name, present in status if present]

    return not absent, absent, located
135
+
136
+
137
def validate_lightrag_files(storage_dir: Path) -> tuple[bool, list[str], list[str]]:
    """
    Check a LightRAG storage directory for its index files.

    Args:
        storage_dir: Path to the rag_storage directory

    Returns:
        Tuple of (is_valid, missing_files, found_files). Validity depends only
        on the required files; found_files lists the required files that exist
        followed by the optional files that exist.
    """
    # Probe each required file once, keeping the order of the constant.
    status = [(name, (storage_dir / name).exists()) for name in LIGHTRAG_REQUIRED_FILES]

    absent = [name for name, present in status if not present]
    located = [name for name, present in status if present]

    # Optional files never affect validity; they are only reported as found.
    extras = [name for name in LIGHTRAG_OPTIONAL_FILES if (storage_dir / name).exists()]

    return not absent, absent, located + extras
164
+
165
+
166
def _dir_has_entries(directory: Path) -> bool:
    """Return True only when *directory* is a directory with at least one entry."""
    # is_dir() (rather than exists()) keeps iterdir() from raising
    # NotADirectoryError when a regular file happens to carry the name.
    return directory.is_dir() and any(directory.iterdir())


def validate_kb(kb_path: Path) -> dict:
    """
    Validate a knowledge base directory.

    Detects the RAG provider, checks that provider's required index files and
    records which optional parts of the knowledge base are present.

    Args:
        kb_path: Path to the knowledge base directory

    Returns:
        Validation result dictionary with the keys "path", "exists",
        "is_valid", "provider", "missing_files", "found_files",
        "has_content_list", "has_raw_docs", "has_images", "has_metadata" and
        "has_numbered_items". An "error" key is added when no provider storage
        directory is found.
    """
    result = {
        "path": str(kb_path),
        "exists": kb_path.exists(),
        "is_valid": False,
        "provider": None,
        "missing_files": [],
        "found_files": [],
        "has_content_list": False,
        "has_raw_docs": False,
        "has_images": False,
        "has_metadata": False,
        "has_numbered_items": False,
    }

    if not result["exists"]:
        return result

    # Detect provider
    provider = detect_provider(kb_path)
    result["provider"] = provider

    if provider is None:
        result["error"] = (
            "No valid RAG storage found (neither llamaindex_storage/ nor rag_storage/)"
        )
        return result

    # Validate based on provider
    if provider == "llamaindex":
        storage_dir = kb_path / "llamaindex_storage"
        is_valid, missing, found = validate_llamaindex_files(storage_dir)
    else:  # lightrag
        storage_dir = kb_path / "rag_storage"
        is_valid, missing, found = validate_lightrag_files(storage_dir)

    result["is_valid"] = is_valid
    result["missing_files"] = missing
    result["found_files"] = found

    # Check optional directories. A stray regular file with one of these names
    # counts as "not present" instead of aborting the whole validation.
    content_list_dir = kb_path / "content_list"
    result["has_content_list"] = content_list_dir.is_dir() and any(
        content_list_dir.glob("*.json")
    )
    result["has_raw_docs"] = _dir_has_entries(kb_path / "raw")
    result["has_images"] = _dir_has_entries(kb_path / "images")

    result["has_metadata"] = (kb_path / "metadata.json").exists()
    result["has_numbered_items"] = (kb_path / "numbered_items.json").exists()

    return result
229
+
230
+
231
+ # =============================================================================
232
+ # Migration Functions
233
+ # =============================================================================
234
+
235
+
236
def copy_kb_directory(source_path: Path, target_path: Path, verbose: bool = True) -> bool:
    """
    Copy knowledge base directory to target location.

    An existing target is never overwritten or merged into.

    Args:
        source_path: Source KB directory
        target_path: Target KB directory
        verbose: Print progress messages

    Returns:
        True if the directory was copied, False if the target already exists

    Raises:
        shutil.Error: If individual files could not be copied.
        OSError: For other filesystem failures (e.g. missing source).
    """
    if target_path.exists():
        if verbose:
            print(f" ⚠️ Target directory already exists: {target_path}")
        return False

    if verbose:
        print(f" Copying {source_path} -> {target_path}")

    try:
        shutil.copytree(source_path, target_path)
    except FileExistsError:
        # exists() follows symlinks, so a dangling symlink (or a target that
        # appeared after the check above) only shows up here. Report it the
        # same way as the pre-check instead of crashing.
        if verbose:
            print(f" ⚠️ Target directory already exists: {target_path}")
        return False

    if verbose:
        print(" ✓ Copied successfully")

    return True
262
+
263
+
264
+ def register_kb(
265
+ kb_name: str, kb_base_dir: Path, description: str = "", provider: str | None = None
266
+ ) -> bool:
267
+ """
268
+ Register knowledge base in kb_config.json.
269
+
270
+ Args:
271
+ kb_name: Knowledge base name
272
+ kb_base_dir: Base directory containing kb_config.json
273
+ description: Optional description
274
+ provider: RAG provider name
275
+
276
+ Returns:
277
+ True if successful
278
+ """
279
+ config_file = kb_base_dir / "kb_config.json"
280
+
281
+ # Load existing config
282
+ if config_file.exists():
283
+ with open(config_file, encoding="utf-8") as f:
284
+ config = json.load(f)
285
+ else:
286
+ config = {"knowledge_bases": {}}
287
+
288
+ if "knowledge_bases" not in config:
289
+ config["knowledge_bases"] = {}
290
+
291
+ # Check if already registered
292
+ if kb_name in config["knowledge_bases"]:
293
+ print(f" ⚠️ KB '{kb_name}' is already registered in kb_config.json")
294
+ return True
295
+
296
+ # Add to config
297
+ config["knowledge_bases"][kb_name] = {
298
+ "path": kb_name,
299
+ "description": description or f"Knowledge base: {kb_name}",
300
+ }
301
+
302
+ # Save config
303
+ with open(config_file, "w", encoding="utf-8") as f:
304
+ json.dump(config, f, indent=2, ensure_ascii=False)
305
+
306
+ print(f" ✓ Registered '{kb_name}' in kb_config.json")
307
+
308
+ # Also create/update metadata.json if needed
309
+ kb_dir = kb_base_dir / kb_name
310
+ metadata_file = kb_dir / "metadata.json"
311
+
312
+ if not metadata_file.exists():
313
+ metadata = {
314
+ "name": kb_name,
315
+ "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
316
+ "description": description or f"Knowledge base: {kb_name}",
317
+ "version": "1.0",
318
+ "rag_provider": provider,
319
+ "migrated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
320
+ }
321
+ with open(metadata_file, "w", encoding="utf-8") as f:
322
+ json.dump(metadata, f, indent=2, ensure_ascii=False)
323
+ print(" ✓ Created metadata.json")
324
+ else:
325
+ # Update existing metadata with migration info
326
+ with open(metadata_file, encoding="utf-8") as f:
327
+ metadata = json.load(f)
328
+
329
+ if provider and not metadata.get("rag_provider"):
330
+ metadata["rag_provider"] = provider
331
+ metadata["migrated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
332
+
333
+ with open(metadata_file, "w", encoding="utf-8") as f:
334
+ json.dump(metadata, f, indent=2, ensure_ascii=False)
335
+ print(" ✓ Updated metadata.json")
336
+
337
+ return True
338
+
339
+
340
async def extract_numbered_items(kb_name: str, kb_base_dir: Path) -> bool:
    """
    Build numbered_items.json for a knowledge base from its content_list files.

    Every list-shaped JSON file under content_list/ is merged and handed to
    the project's LLM-based extractor; a non-empty result is written to
    numbered_items.json in the KB directory. Problems are reported on stdout
    rather than raised.

    Args:
        kb_name: Knowledge base name
        kb_base_dir: Base directory for knowledge bases

    Returns:
        True if items were extracted and saved, False otherwise
    """
    # Imported lazily so the rest of the script works without these modules.
    try:
        from src.knowledge.extract_numbered_items import (
            extract_numbered_items_with_llm_async,
        )
        from src.services.llm import get_llm_client
    except ImportError as import_error:
        print(f" ⚠️ Could not import extraction module: {import_error}")
        return False

    kb_root = kb_base_dir / kb_name
    source_dir = kb_root / "content_list"

    if not source_dir.exists():
        print(" ⚠️ No content_list directory found")
        return False

    content_files = list(source_dir.glob("*.json"))
    if not content_files:
        print(" ⚠️ No JSON files found in content_list/")
        return False

    print(f" Loading {len(content_files)} content list files...")

    merged_items = []
    for content_file in content_files:
        try:
            with open(content_file, encoding="utf-8") as handle:
                payload = json.load(handle)
        except Exception as load_error:
            # One unreadable file should not abort the whole extraction.
            print(f" ⚠️ Error loading {content_file.name}: {load_error}")
            continue
        # Anything that is not a list is ignored.
        if isinstance(payload, list):
            merged_items.extend(payload)

    if not merged_items:
        print(" ⚠️ No content items found")
        return False

    print(f" Extracting numbered items from {len(merged_items)} content items...")

    try:
        client = get_llm_client()
        extracted = await extract_numbered_items_with_llm_async(
            merged_items,
            api_key=client.config.api_key,
            base_url=client.config.base_url,
        )

        if not extracted:
            print(" ⚠️ No numbered items extracted")
            return False

        with open(kb_root / "numbered_items.json", "w", encoding="utf-8") as out:
            json.dump(extracted, out, ensure_ascii=False, indent=2)
        print(f" ✓ Extracted {len(extracted)} numbered items")
        return True

    except Exception as extraction_error:
        print(f" ✗ Extraction failed: {extraction_error}")
        return False
413
+
414
+
415
async def test_kb_search(kb_name: str, query: str = "What is this knowledge base about?") -> bool:
    """
    Smoke-test a knowledge base by running one search against it.

    Failures are reported on stdout rather than raised.

    Args:
        kb_name: Knowledge base name
        query: Test query

    Returns:
        True if the search returned a non-empty answer, False otherwise
    """
    # Imported lazily so the rest of the script works without this module.
    try:
        from src.tools.rag_tool import rag_search
    except ImportError as import_error:
        print(f" ⚠️ Could not import rag_tool: {import_error}")
        return False

    print(f" Running test query: '{query[:50]}...'")

    try:
        # "naive" is the simplest mode, which is all a smoke test needs.
        response = await rag_search(query=query, kb_name=kb_name, mode="naive")

        if not (response and response.get("answer")):
            print(" ⚠️ Search returned empty result")
            return False

        preview = response["answer"][:200]
        print(" ✓ Search successful!")
        print(f" Provider: {response.get('provider', 'unknown')}")
        print(f" Answer preview: {preview}...")
        return True

    except Exception as search_error:
        print(f" ✗ Search failed: {search_error}")
        return False
454
+
455
+
456
async def migrate_kb(
    source_path: str,
    target_base_dir: str | None = None,
    kb_name: str | None = None,
    run_test: bool = False,
    extract_items: bool = False,
    validate_only: bool = False,
    force: bool = False,
) -> bool:
    """
    Migrate a knowledge base to DeepTutor.

    Runs up to five steps: validate the source, copy it below the target base
    directory, register it in kb_config.json, optionally extract numbered
    items and optionally run a test query. The outcomes of the two optional
    steps are printed but do not affect the return value.

    Args:
        source_path: Path to source knowledge base
        target_base_dir: Target base directory (default: data/knowledge_bases)
        kb_name: Name for the migrated KB (default: source directory name)
        run_test: Run a test query after migration
        extract_items: Extract numbered items if content_list exists
        validate_only: Only validate, don't migrate
        force: Overwrite existing KB if exists. Never removes the source: a
            source that is the target is left in place, and a source located
            inside the target is rejected.

    Returns:
        True if successful
    """
    source = Path(source_path).resolve()
    target_base = Path(target_base_dir) if target_base_dir else DEFAULT_KB_BASE_DIR
    target_base = target_base.resolve()

    # Determine KB name
    if kb_name is None:
        kb_name = source.name

    print("=" * 60)
    print("Knowledge Base Migration")
    print("=" * 60)
    print(f"Source: {source}")
    print(f"Target: {target_base / kb_name}")
    print()

    # Step 1: Validate source KB
    print("Step 1: Validating source knowledge base...")
    validation = validate_kb(source)

    if not validation["exists"]:
        print(f" ✗ Source directory does not exist: {source}")
        return False

    if not validation["is_valid"]:
        print(" ✗ Validation failed!")
        if validation.get("error"):
            print(f" Error: {validation['error']}")
        if validation["missing_files"]:
            print(f" Missing files: {', '.join(validation['missing_files'])}")
        return False

    print(" ✓ Validation passed")
    print(f" Provider: {validation['provider']}")
    print(f" Found files: {', '.join(validation['found_files'][:5])}...")
    print(f" Has content_list: {'Yes' if validation['has_content_list'] else 'No'}")
    print(f" Has raw docs: {'Yes' if validation['has_raw_docs'] else 'No'}")
    print(f" Has images: {'Yes' if validation['has_images'] else 'No'}")
    print()

    if validate_only:
        print("Validation-only mode. Exiting.")
        return True

    # Step 2: Copy to target
    print("Step 2: Copying knowledge base...")
    target_path = target_base / kb_name
    resolved_target = target_path.resolve()

    if source == resolved_target:
        # Checked before --force on purpose: removing the target here would
        # delete the very knowledge base being migrated.
        print(" Source and target are the same. Skipping copy.")
    else:
        if target_path.exists():
            if not force:
                print(f" ✗ Target directory already exists: {target_path}")
                print(" Use --force to overwrite or --name to specify a different name")
                return False
            if resolved_target in source.parents:
                # Removing the target would take the source down with it.
                print(f" ✗ Source directory is inside the target directory: {target_path}")
                print(" Refusing to overwrite the target because that would delete the source")
                return False
            print(" Removing existing directory (--force)...")
            shutil.rmtree(target_path)

        if not copy_kb_directory(source, target_path):
            return False
    print()

    # Step 3: Register in kb_config.json
    print("Step 3: Registering knowledge base...")
    register_kb(
        kb_name=kb_name,
        kb_base_dir=target_base,
        description=f"Migrated from: {source}",
        provider=validation["provider"],
    )
    print()

    # Step 4: Extract numbered items (optional)
    if extract_items and validation["has_content_list"]:
        print("Step 4: Extracting numbered items...")
        await extract_numbered_items(kb_name, target_base)
        print()
    elif extract_items:
        print("Step 4: Skipping numbered items extraction (no content_list)")
        print()

    # Step 5: Test search (optional)
    if run_test:
        print("Step 5: Testing knowledge base...")
        await test_kb_search(kb_name)
        print()

    print("=" * 60)
    print("✓ Migration complete!")
    print(f" Knowledge base '{kb_name}' is now available in DeepTutor.")
    print("=" * 60)

    return True
575
+
576
+
577
+ # =============================================================================
578
+ # CLI Entry Point
579
+ # =============================================================================
580
+
581
+
582
def main():
    """Parse the command line, run the migration and exit with 0 on success or 1 on failure."""
    usage_examples = """
Examples:
# Migrate a knowledge base
python scripts/migrate_kb.py /path/to/my_kb

# Migrate with custom name
python scripts/migrate_kb.py /path/to/kb --name my_textbook

# Migrate and run test query
python scripts/migrate_kb.py /path/to/kb --test

# Migrate and extract numbered items
python scripts/migrate_kb.py /path/to/kb --extract-items

# Validate only (don't migrate)
python scripts/migrate_kb.py /path/to/kb --validate-only

# Force overwrite existing KB
python scripts/migrate_kb.py /path/to/kb --force
"""

    parser = argparse.ArgumentParser(
        description="Migrate knowledge bases into DeepTutor",
        # Raw formatter keeps the line breaks of the examples as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    parser.add_argument("source", help="Path to source knowledge base directory")
    parser.add_argument(
        "--name", help="Name for the migrated knowledge base (default: source directory name)"
    )
    parser.add_argument(
        "--target-dir", help=f"Target base directory (default: {DEFAULT_KB_BASE_DIR})"
    )

    # On/off switches, registered in the order they appear in --help.
    switches = (
        ("--test", "Run a test query after migration"),
        ("--extract-items", "Extract numbered items from content_list (requires LLM API)"),
        ("--validate-only", "Only validate the knowledge base, don't migrate"),
        ("--force", "Overwrite existing knowledge base if exists"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    options = parser.parse_args()

    # Run migration
    migration = migrate_kb(
        source_path=options.source,
        target_base_dir=options.target_dir,
        kb_name=options.name,
        run_test=options.test,
        extract_items=options.extract_items,
        validate_only=options.validate_only,
        force=options.force,
    )
    succeeded = asyncio.run(migration)

    sys.exit(0 if succeeded else 1)


if __name__ == "__main__":
    main()