chunksilo 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


chunksilo/index.py ADDED
@@ -0,0 +1,1420 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """
4
+ Indexing pipeline for building a RAG index from PDF, DOCX, DOC, Markdown, and TXT documents.
5
+ Supports incremental indexing using a local SQLite database to track file states.
6
+ """
7
+ import argparse
8
+ import hashlib
9
+ import itertools
10
+ import json
11
+ import logging
12
+ import os
13
+ import sqlite3
14
+ import sys
15
+ import threading
16
+ import time
17
+ from datetime import datetime
18
+ from abc import ABC, abstractmethod
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+ from typing import List, Dict, Optional, Iterator, Set, Any
22
+
23
+ from docx import Document
24
+
25
+ from llama_index.core import (
26
+ VectorStoreIndex,
27
+ StorageContext,
28
+ Settings,
29
+ SimpleDirectoryReader,
30
+ Document as LlamaIndexDocument,
31
+ load_index_from_storage,
32
+ )
33
+ from llama_index.core.node_parser import SentenceSplitter
34
+ from llama_index.embeddings.fastembed import FastEmbedEmbedding
35
+
36
+ # Load configuration from config.yaml
37
+ from . import cfgload
38
+ from .cfgload import load_config
39
+ _config = load_config()
40
+
41
+ # Configuration from config.yaml
42
+ STORAGE_DIR = Path(_config["storage"]["storage_dir"])
43
+ STATE_DB_PATH = STORAGE_DIR / "ingestion_state.db"
44
+
45
+ # Stage 1 (embedding/vector search) configuration
46
+ RETRIEVAL_EMBED_MODEL_NAME = _config["retrieval"]["embed_model_name"]
47
+
48
+ # Stage 2 (FlashRank reranking, CPU-only, ONNX-based) configuration
49
+ RETRIEVAL_RERANK_MODEL_NAME = _config["retrieval"]["rerank_model_name"]
50
+
51
+ # Shared cache directory for embedding and reranking models
52
+ RETRIEVAL_MODEL_CACHE_DIR = Path(_config["storage"]["model_cache_dir"])
53
+
54
+ # BM25 index directory for file name matching
55
+ BM25_INDEX_DIR = STORAGE_DIR / "bm25_index"
56
+
57
+ # Heading store for document headings (stored separately to avoid metadata size issues)
58
+ HEADING_STORE_PATH = STORAGE_DIR / "heading_store.json"
59
+
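+ # Example config.yaml sketch (illustrative only): the key names below match what
+ # this module reads, but the concrete values, including the model names, are
+ # assumptions and should be replaced with your own.
+ #
+ # storage:
+ #   storage_dir: "./storage"
+ #   model_cache_dir: "./model_cache"
+ # retrieval:
+ #   embed_model_name: "BAAI/bge-small-en-v1.5"    # any FastEmbed-supported model
+ #   rerank_model_name: "ms-marco-MiniLM-L-12-v2"  # any FlashRank-supported model
+ #   offline: false
+ # indexing:
+ #   directories:
+ #     - "./data"
+ #   chunk_size: 1600
+ #   chunk_overlap: 200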
60
+ # Metadata exclusion configuration
61
+ # These keys are excluded from the embedding text to save tokens and avoid length errors
62
+ EXCLUDED_EMBED_METADATA_KEYS = [
63
+ "line_offsets", # Large integer array, primary cause of length errors
64
+ "document_headings", # Heading hierarchy array with positions, excluded like line_offsets
65
+ "heading_path", # Pre-computed heading hierarchy, stored separately to save chunk space
66
+ "file_path", # redundant with file_name/source, strict path less useful for semantic similarity
67
+ "source", # often same as file_path
68
+ "creation_date", # temporal, not semantic
69
+ "last_modified_date",# temporal, not semantic
70
+ "doc_ids", # internal tracking
71
+ "hash", # internal tracking
72
+ ]
73
+
74
+ # These keys are excluded from the LLM context to save context window
75
+ EXCLUDED_LLM_METADATA_KEYS = [
76
+ "line_offsets", # LLM needs text content, not integer map
77
+ "hash", # internal tracking
78
+ "doc_ids", # internal tracking
79
+ "file_path", # usually redundant if file_name is present
80
+ "source", # usually redundant
81
+ ]
82
+
83
+ # Set up logging
84
+ logging.basicConfig(
85
+ level=logging.INFO,
86
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
87
+ )
88
+ logger = logging.getLogger(__name__)
89
+
90
+
91
+ # Default file type patterns
92
+ DEFAULT_INCLUDE_PATTERNS = ["**/*.pdf", "**/*.md", "**/*.txt", "**/*.docx", "**/*.doc"]
93
+
94
+
95
+ @dataclass
96
+ class DirectoryConfig:
97
+ """Configuration for a single source directory."""
98
+ path: Path
99
+ enabled: bool = True
100
+ include: List[str] = field(default_factory=lambda: DEFAULT_INCLUDE_PATTERNS.copy())
101
+ exclude: List[str] = field(default_factory=list)
102
+ recursive: bool = True
103
+
104
+
105
+ @dataclass
106
+ class IndexConfig:
107
+ """Complete indexing configuration."""
108
+ directories: List[DirectoryConfig]
109
+ chunk_size: int = 1600
110
+ chunk_overlap: int = 200
111
+
112
+
113
+ def load_index_config() -> IndexConfig:
114
+ """Load indexing configuration from config.yaml.
115
+
116
+ Raises:
117
+ ValueError: If config is invalid
118
+ """
119
+ indexing_config = _config.get("indexing", {})
120
+
121
+ if not indexing_config.get("directories"):
122
+ raise ValueError(
123
+ "Config must have at least one directory in 'indexing.directories'.\n"
124
+ "Please update config.yaml with your directory configuration.\n"
125
+ "Example:\n"
126
+ "indexing:\n"
127
+ " directories:\n"
128
+ ' - "./data"\n'
129
+ " chunk_size: 1600\n"
130
+ " chunk_overlap: 200\n"
131
+ )
132
+
133
+ logger.info("Loading indexing config from config.yaml")
134
+ return _parse_index_config(indexing_config)
135
+
136
+
137
+ def _parse_index_config(config_data: dict) -> IndexConfig:
138
+ """Parse raw config dict into IndexConfig."""
139
+ # Get defaults section
140
+ defaults = config_data.get("defaults", {})
141
+ default_include = defaults.get("include", DEFAULT_INCLUDE_PATTERNS.copy())
142
+ default_exclude = defaults.get("exclude", [])
143
+ default_recursive = defaults.get("recursive", True)
144
+
145
+ # Parse directories
146
+ directories: List[DirectoryConfig] = []
147
+ raw_dirs = config_data.get("directories", [])
148
+
149
+ if not raw_dirs:
150
+ raise ValueError("Config must have at least one directory in 'directories' list")
151
+
152
+ for entry in raw_dirs:
153
+ if isinstance(entry, str):
154
+ # Simple path string - use defaults
155
+ dir_config = DirectoryConfig(
156
+ path=Path(entry),
157
+ include=default_include.copy(),
158
+ exclude=default_exclude.copy(),
159
+ recursive=default_recursive,
160
+ )
161
+ elif isinstance(entry, dict):
162
+ # Full directory config object
163
+ path_str = entry.get("path")
164
+ if not path_str:
165
+ raise ValueError(f"Directory config missing 'path': {entry}")
166
+
167
+ dir_config = DirectoryConfig(
168
+ path=Path(path_str),
169
+ enabled=entry.get("enabled", True),
170
+ include=entry.get("include", default_include.copy()),
171
+ exclude=entry.get("exclude", default_exclude.copy()),
172
+ recursive=entry.get("recursive", default_recursive),
173
+ )
174
+ else:
175
+ raise ValueError(f"Invalid directory entry: {entry}")
176
+
177
+ directories.append(dir_config)
178
+
179
+ return IndexConfig(
180
+ directories=directories,
181
+ chunk_size=config_data.get("chunk_size", 1600),
182
+ chunk_overlap=config_data.get("chunk_overlap", 200),
183
+ )
184
+
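+ # Illustrative sketch of the two directory entry forms accepted above; the paths
+ # and patterns are hypothetical:
+ #
+ # indexing:
+ #   defaults:
+ #     include: ["**/*.md", "**/*.pdf"]
+ #     exclude: ["**/node_modules/**"]
+ #     recursive: true
+ #   directories:
+ #     - "./docs"              # plain string: inherits the defaults above
+ #     - path: "./archive"     # mapping form: per-directory overrides
+ #       enabled: true
+ #       include: ["**/*.docx"]
+ #       recursive: false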
185
+
186
+ class HeadingStore:
187
+ """Stores document headings separately from chunk metadata.
188
+
189
+ This avoids the LlamaIndex SentenceSplitter metadata size validation issue,
190
+ which checks metadata length before applying exclusions. By storing headings
191
+ in a separate file, we keep chunk metadata small while preserving heading
192
+ data for retrieval.
193
+ """
194
+
195
+ def __init__(self, store_path: Path):
196
+ self.store_path = store_path
197
+ self._data: Dict[str, List[dict]] = {}
198
+ self._load()
199
+
200
+ def _load(self):
201
+ """Load heading data from disk."""
202
+ if self.store_path.exists():
203
+ try:
204
+ with open(self.store_path, "r", encoding="utf-8") as f:
205
+ self._data = json.load(f)
206
+ except Exception as e:
207
+ logger.warning(f"Failed to load heading store: {e}")
208
+ self._data = {}
209
+
210
+ def _save(self):
211
+ """Save heading data to disk."""
212
+ self.store_path.parent.mkdir(parents=True, exist_ok=True)
213
+ with open(self.store_path, "w", encoding="utf-8") as f:
214
+ json.dump(self._data, f)
215
+
216
+ def set_headings(self, file_path: str, headings: List[dict]):
217
+ """Store headings for a file."""
218
+ self._data[file_path] = headings
219
+ self._save()
220
+
221
+ def get_headings(self, file_path: str) -> List[dict]:
222
+ """Get headings for a file."""
223
+ return self._data.get(file_path, [])
224
+
225
+ def remove_headings(self, file_path: str):
226
+ """Remove headings for a file."""
227
+ if file_path in self._data:
228
+ del self._data[file_path]
229
+ self._save()
230
+
231
+
232
+ # Module-level heading store instance (lazy initialized)
233
+ _heading_store: Optional["HeadingStore"] = None
234
+
235
+
236
+ def get_heading_store() -> HeadingStore:
237
+ """Get the singleton HeadingStore instance."""
238
+ global _heading_store
239
+ if _heading_store is None:
240
+ _heading_store = HeadingStore(HEADING_STORE_PATH)
241
+ return _heading_store
242
+
243
+
244
+ @dataclass
245
+ class FileInfo:
246
+ """Metadata about a file in the data source."""
247
+ path: str
248
+ hash: str
249
+ last_modified: float
250
+ source_dir: str = "" # Tracks which configured directory this file came from
251
+
252
+
253
+ class IngestionState:
254
+ """Manages the state of ingested files using a SQLite database."""
255
+
256
+ def __init__(self, db_path: Path):
257
+ self.db_path = db_path
258
+ self._init_db()
259
+
260
+ def _init_db(self):
261
+ """Initialize the SQLite database schema with migration support."""
262
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
263
+ with sqlite3.connect(self.db_path) as conn:
264
+ # Check if table exists
265
+ cursor = conn.execute(
266
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='files'"
267
+ )
268
+ table_exists = cursor.fetchone() is not None
269
+
270
+ if not table_exists:
271
+ # Create new table with source_dir column
272
+ conn.execute(
273
+ """
274
+ CREATE TABLE files (
275
+ path TEXT PRIMARY KEY,
276
+ hash TEXT NOT NULL,
277
+ last_modified REAL NOT NULL,
278
+ doc_ids TEXT NOT NULL,
279
+ source_dir TEXT DEFAULT ''
280
+ )
281
+ """
282
+ )
283
+ else:
284
+ # Migration: add source_dir column if missing
285
+ cursor = conn.execute("PRAGMA table_info(files)")
286
+ columns = {row[1] for row in cursor}
287
+ if "source_dir" not in columns:
288
+ conn.execute("ALTER TABLE files ADD COLUMN source_dir TEXT DEFAULT ''")
289
+ logger.info("Migrated files table: added source_dir column")
290
+
291
+ def get_all_files(self) -> Dict[str, dict]:
292
+ """Retrieve all tracked files and their metadata."""
293
+ with sqlite3.connect(self.db_path) as conn:
294
+ cursor = conn.execute(
295
+ "SELECT path, hash, last_modified, doc_ids, source_dir FROM files"
296
+ )
297
+ return {
298
+ row[0]: {
299
+ "hash": row[1],
300
+ "last_modified": row[2],
301
+ "doc_ids": row[3].split(",") if row[3] else [],
302
+ "source_dir": row[4] if row[4] else "",
303
+ }
304
+ for row in cursor
305
+ }
306
+
307
+ def update_file_state(self, file_info: FileInfo, doc_ids: List[str]):
308
+ """Update or insert the state for a file."""
309
+ with sqlite3.connect(self.db_path) as conn:
310
+ conn.execute(
311
+ """
312
+ INSERT INTO files (path, hash, last_modified, doc_ids, source_dir)
313
+ VALUES (?, ?, ?, ?, ?)
314
+ ON CONFLICT(path) DO UPDATE SET
315
+ hash=excluded.hash,
316
+ last_modified=excluded.last_modified,
317
+ doc_ids=excluded.doc_ids,
318
+ source_dir=excluded.source_dir
319
+ """,
320
+ (
321
+ file_info.path,
322
+ file_info.hash,
323
+ file_info.last_modified,
324
+ ",".join(doc_ids),
325
+ file_info.source_dir,
326
+ ),
327
+ )
328
+
329
+ def remove_file_state(self, path: str):
330
+ """Remove a file from the state tracking."""
331
+ with sqlite3.connect(self.db_path) as conn:
332
+ conn.execute("DELETE FROM files WHERE path = ?", (path,))
333
+
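+ # Minimal usage sketch for IngestionState; the path, hash, and IDs are hypothetical:
+ #
+ #   state = IngestionState(Path("./storage/ingestion_state.db"))
+ #   info = FileInfo(path="/data/guide.md", hash="abc123", last_modified=1700000000.0)
+ #   state.update_file_state(info, doc_ids=["doc-1", "doc-2"])
+ #   state.get_all_files()["/data/guide.md"]["doc_ids"]   # -> ["doc-1", "doc-2"]
+ #   state.remove_file_state("/data/guide.md")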
334
+
335
+ class DataSource(ABC):
336
+ """Abstract base class for data sources."""
337
+
338
+ @abstractmethod
339
+ def iter_files(self) -> Iterator[FileInfo]:
340
+ """Yield FileInfo for each file in the source."""
341
+ pass
342
+
343
+ @abstractmethod
344
+ def load_file(self, file_info: FileInfo) -> List[LlamaIndexDocument]:
345
+ """Load and return documents for a given file."""
346
+ pass
347
+
348
+
349
+ def _compute_line_offsets(text: str) -> List[int]:
350
+ """Compute character offset positions for each line start.
351
+
352
+ Returns a list where line_offsets[i] is the character position where line i+1 starts.
353
+ Line 1 starts at position 0 (implicit).
354
+ """
355
+ offsets = [0] # Line 1 starts at position 0
356
+ for i, char in enumerate(text):
357
+ if char == '\n':
358
+ offsets.append(i + 1) # Next line starts after the newline
359
+ return offsets
360
+
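+ # Illustrative example of the offsets computed above:
+ #   _compute_line_offsets("ab\ncd\n") -> [0, 3, 6]
+ #   (line 1 starts at 0, line 2 at 3; the trailing 6 follows the final newline)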
361
+
362
+ def _extract_markdown_headings(text: str) -> List[dict]:
363
+ """Extract heading hierarchy from Markdown text using ATX-style syntax.
364
+
365
+ Parses # Heading syntax and returns list of dicts with text, position, level.
366
+ Handles ATX-style headings (# Heading) but not Setext (underlined).
367
+
368
+ Returns:
369
+ List of dicts with keys: text (str), position (int), level (int)
370
+ """
371
+ import re
372
+
373
+ headings = []
374
+ # Match ATX-style headings: line start, 1-6 #s, space, text
375
+ pattern = re.compile(r'^(#{1,6})\s+(.+?)$', re.MULTILINE)
376
+
377
+ # Find all code block ranges to skip headings inside them
378
+ code_blocks = []
379
+ for match in re.finditer(r'```.*?```', text, flags=re.DOTALL):
380
+ code_blocks.append((match.start(), match.end()))
381
+
382
+ def is_in_code_block(pos):
383
+ """Check if position is inside a code block."""
384
+ return any(start <= pos < end for start, end in code_blocks)
385
+
386
+ for match in pattern.finditer(text):
387
+ # Skip headings inside code blocks
388
+ if is_in_code_block(match.start()):
389
+ continue
390
+
391
+ level = len(match.group(1))
392
+ heading_text = match.group(2).strip()
393
+ position = match.start()
394
+
395
+ if heading_text:
396
+ headings.append({
397
+ "text": heading_text,
398
+ "position": position,
399
+ "level": level
400
+ })
401
+
402
+ return headings
403
+
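+ # Illustrative example (no fenced code blocks involved):
+ #   _extract_markdown_headings("# Title\n\n## Setup\n")
+ #   -> [{"text": "Title", "position": 0, "level": 1},
+ #       {"text": "Setup", "position": 9, "level": 2}]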
404
+
405
+ def _extract_pdf_headings_from_outline(pdf_path: Path) -> List[dict]:
406
+ """Extract headings from PDF outline/bookmarks (TOC).
407
+
408
+ Returns list of dicts with text, position (estimated), level.
409
+ Position is approximate based on cumulative page character counts.
410
+ Falls back to empty list if PDF has no outline or extraction fails.
411
+
412
+ Returns:
413
+ List of dicts with keys: text (str), position (int), level (int)
414
+ """
415
+ try:
416
+ from pypdf import PdfReader
417
+ except ImportError:
418
+ logger.warning("pypdf not available, skipping PDF heading extraction")
419
+ return []
420
+
421
+ try:
422
+ reader = PdfReader(pdf_path)
423
+ outline = reader.outline
424
+
425
+ if not outline:
426
+ return []
427
+
428
+ def flatten_outline(items, level=1):
429
+ """Flatten nested outline into list of (title, page_num, level)."""
430
+ results = []
431
+ for item in items:
432
+ if isinstance(item, list):
433
+ results.extend(flatten_outline(item, level + 1))
434
+ else:
435
+ page_num = reader.get_destination_page_number(item)
436
+ results.append((item.title, page_num, level))
437
+ return results
438
+
439
+ flat = flatten_outline(outline)
440
+ headings = []
441
+ for title, page_num, level in flat:
442
+ # Estimate position by accumulating text from previous pages
443
+ position = 0
444
+ for page_idx in range(page_num):
445
+ if page_idx < len(reader.pages):
446
+ position += len(reader.pages[page_idx].extract_text() or "")
447
+
448
+ headings.append({
449
+ "text": title.strip(),
450
+ "position": position,
451
+ "level": level
452
+ })
453
+
454
+ return headings
455
+
456
+ except Exception as e:
457
+ logger.warning(f"Failed to extract PDF outline from {pdf_path}: {e}")
458
+ return []
459
+
460
+
461
+ class LocalFileSystemSource(DataSource):
462
+ """Implementation of DataSource for the local file system with filtering."""
463
+
464
+ def __init__(self, config: DirectoryConfig):
465
+ self.config = config
466
+ self.base_dir = config.path
467
+
468
+ def is_available(self) -> bool:
469
+ """Check if the directory is available and accessible."""
470
+ try:
471
+ if not self.base_dir.exists():
472
+ return False
473
+ if not self.base_dir.is_dir():
474
+ return False
475
+ # Try to list directory to verify access (important for network mounts)
476
+ next(self.base_dir.iterdir(), None)
477
+ return True
478
+ except (OSError, PermissionError):
479
+ return False
480
+
481
+ def _matches_patterns(self, file_path: Path) -> bool:
482
+ """Check if file matches include patterns and doesn't match exclude patterns.
483
+
484
+ Uses Path.match(); note that Path.match() treats ** like a non-recursive *, so include patterns are effectively matched against the trailing path components.
485
+ For directory exclusion patterns like **/*venv*/**, checks each path component.
486
+ """
487
+ import fnmatch
488
+
489
+ try:
490
+ rel_path = file_path.relative_to(self.base_dir)
491
+ except ValueError:
492
+ rel_path = Path(file_path.name)
493
+
494
+ # Check exclude patterns first
495
+ for pattern in self.config.exclude:
496
+ # Handle directory exclusion patterns (e.g., **/*venv*/**, **/node_modules/**)
497
+ # by checking if any directory component matches
498
+ if pattern.startswith('**/') and pattern.endswith('/**'):
499
+ # Extract the directory pattern (e.g., *venv* or node_modules)
500
+ dir_pattern = pattern[3:-3] # Remove **/ prefix and /** suffix
501
+ for part in rel_path.parts[:-1]: # Check all directory components (not filename)
502
+ if fnmatch.fnmatch(part, dir_pattern):
503
+ return False
504
+ else:
505
+ # Standard pattern matching
506
+ if rel_path.match(pattern) or file_path.name == pattern:
507
+ return False
508
+
509
+ # Check include patterns
510
+ if not self.config.include:
511
+ return True
512
+
513
+ for pattern in self.config.include:
514
+ # Path.match() supports ** for recursive directory matching
515
+ if rel_path.match(pattern) or file_path.match(pattern):
516
+ return True
517
+
518
+ return False
519
+
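+ # Illustrative behaviour of _matches_patterns with hypothetical settings
+ # include=["**/*.md"] and exclude=["**/*venv*/**"]:
+ #   docs/guide.md   -> included (matches **/*.md)
+ #   .venv/notes.md  -> excluded (the ".venv" directory component matches *venv*)
+ #   docs/guide.pdf  -> skipped (no include pattern matches)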
520
+ def iter_files(self) -> Iterator[FileInfo]:
521
+ """Yield FileInfo for each matching file in the source."""
522
+ if self.config.recursive:
523
+ walker = os.walk(self.base_dir)
524
+ else:
525
+ # Non-recursive: only top-level files
526
+ try:
527
+ top_files = [
528
+ f for f in self.base_dir.iterdir() if f.is_file()
529
+ ]
530
+ walker = [(str(self.base_dir), [], [f.name for f in top_files])]
531
+ except OSError as e:
532
+ logger.warning(f"Could not list directory {self.base_dir}: {e}")
533
+ return
534
+
535
+ for root, _, files in walker:
536
+ for file in files:
537
+ file_path = Path(root) / file
538
+
539
+ # Check patterns
540
+ if not self._matches_patterns(file_path):
541
+ continue
542
+
543
+ try:
544
+ yield self._create_file_info(file_path)
545
+ except OSError as e:
546
+ logger.warning(f"Could not access file {file_path}: {e}")
547
+ continue
548
+
549
+ def _create_file_info(self, file_path: Path) -> FileInfo:
550
+ """Create FileInfo with source directory context."""
551
+ hasher = hashlib.md5()
552
+ with open(file_path, "rb") as f:
553
+ buf = f.read(65536)
554
+ while len(buf) > 0:
555
+ hasher.update(buf)
556
+ buf = f.read(65536)
557
+
558
+ return FileInfo(
559
+ path=str(file_path.absolute()),
560
+ hash=hasher.hexdigest(),
561
+ last_modified=file_path.stat().st_mtime,
562
+ source_dir=str(self.base_dir.absolute()),
563
+ )
564
+
565
+ def load_file(self, file_info: FileInfo) -> List[LlamaIndexDocument]:
566
+ file_path = Path(file_info.path)
567
+ if file_path.suffix.lower() == ".docx":
568
+ return split_docx_into_heading_documents(file_path)
569
+ elif file_path.suffix.lower() == ".doc":
570
+ # Convert .doc to .docx using LibreOffice, then process
571
+ docx_path = _convert_doc_to_docx(file_path)
572
+ if docx_path is None:
573
+ logger.warning(f"Skipping {file_path}: could not convert .doc to .docx")
574
+ return []
575
+ try:
576
+ docs = split_docx_into_heading_documents(docx_path)
577
+ # Update metadata to point to original .doc file
578
+ for doc in docs:
579
+ doc.metadata["file_path"] = str(file_path)
580
+ doc.metadata["file_name"] = file_path.name
581
+ if "source" in doc.metadata:
582
+ doc.metadata["source"] = str(file_path)
583
+ return docs
584
+ finally:
585
+ # Clean up temp file
586
+ if docx_path.exists():
587
+ docx_path.unlink()
588
+ else:
589
+ reader = SimpleDirectoryReader(
590
+ input_files=[str(file_path)],
591
+ )
592
+ docs = reader.load_data()
593
+ # Ensure dates are visible to LLM (remove from exclusion list)
594
+ for doc in docs:
595
+ if hasattr(doc, 'excluded_llm_metadata_keys') and doc.excluded_llm_metadata_keys:
596
+ doc.excluded_llm_metadata_keys = [
597
+ k for k in doc.excluded_llm_metadata_keys
598
+ if k not in ('creation_date', 'last_modified_date')
599
+ ]
600
+
601
+ # Add line offsets for text-based files (markdown, txt) to enable line number lookup
602
+ if file_path.suffix.lower() in {".md", ".txt"}:
603
+ for doc in docs:
604
+ text = doc.get_content()
605
+ line_offsets = _compute_line_offsets(text)
606
+ doc.metadata["line_offsets"] = line_offsets
607
+
608
+ # Extract headings for Markdown and store separately
609
+ # (not in metadata to avoid SentenceSplitter size validation)
610
+ if file_path.suffix.lower() == ".md":
611
+ headings = _extract_markdown_headings(text)
612
+ get_heading_store().set_headings(str(file_path), headings)
613
+
614
+ # Extract headings for PDF files and store separately
615
+ if file_path.suffix.lower() == ".pdf":
616
+ headings = _extract_pdf_headings_from_outline(file_path)
617
+ get_heading_store().set_headings(str(file_path), headings)
618
+
619
+ # Apply metadata exclusions
620
+ for doc in docs:
621
+ doc.excluded_embed_metadata_keys = EXCLUDED_EMBED_METADATA_KEYS
622
+ doc.excluded_llm_metadata_keys = EXCLUDED_LLM_METADATA_KEYS
623
+
624
+ return docs
625
+
626
+
627
+ class MultiDirectoryDataSource(DataSource):
628
+ """Aggregates multiple LocalFileSystemSource instances."""
629
+
630
+ def __init__(self, config: IndexConfig):
631
+ self.config = config
632
+ self.sources: List[LocalFileSystemSource] = []
633
+ self.unavailable_dirs: List[DirectoryConfig] = []
634
+
635
+ for dir_config in config.directories:
636
+ if not dir_config.enabled:
637
+ logger.info(f"Skipping disabled directory: {dir_config.path}")
638
+ continue
639
+
640
+ source = LocalFileSystemSource(dir_config)
641
+
642
+ if source.is_available():
643
+ self.sources.append(source)
644
+ logger.info(f"Added directory source: {dir_config.path}")
645
+ else:
646
+ self.unavailable_dirs.append(dir_config)
647
+ logger.warning(f"Directory unavailable, skipping: {dir_config.path}")
648
+
649
+ def iter_files(self) -> Iterator[FileInfo]:
650
+ """Iterate over files from all available sources."""
651
+ seen_paths: Set[str] = set()
652
+
653
+ for source in self.sources:
654
+ for file_info in source.iter_files():
655
+ # Deduplicate in case of overlapping mounts
656
+ if file_info.path not in seen_paths:
657
+ seen_paths.add(file_info.path)
658
+ yield file_info
659
+
660
+ def load_file(self, file_info: FileInfo) -> List[LlamaIndexDocument]:
661
+ """Load file using the appropriate source based on source_dir."""
662
+ # Find the source that owns this file
663
+ for source in self.sources:
664
+ if file_info.source_dir == str(source.base_dir.absolute()):
665
+ return source.load_file(file_info)
666
+
667
+ # Fallback: use first source (shouldn't happen normally)
668
+ if self.sources:
669
+ return self.sources[0].load_file(file_info)
670
+
671
+ raise ValueError(f"No source available for file: {file_info.path}")
672
+
673
+ def get_summary(self) -> Dict[str, Any]:
674
+ """Return summary of configured directories."""
675
+ return {
676
+ "available": [str(s.base_dir) for s in self.sources],
677
+ "unavailable": [str(d.path) for d in self.unavailable_dirs],
678
+ "total_sources": len(self.sources),
679
+ }
680
+
681
+
682
+ class SimpleProgressBar:
683
+ """Lightweight progress bar using only the standard library."""
684
+
685
+ def __init__(self, total: int, desc: str, unit: str = "item", width: int = 30):
686
+ self.total = max(total, 0)
687
+ self.desc = desc
688
+ self.unit = unit
689
+ self.width = width
690
+ self.current = 0
691
+ if self.total > 0:
692
+ self._render()
693
+
694
+ def update(self, step: int = 1) -> None:
695
+ if self.total <= 0:
696
+ return
697
+ self.current = min(self.total, self.current + step)
698
+ self._render()
699
+ if self.current >= self.total:
700
+ sys.stdout.write("\n")
701
+ sys.stdout.flush()
702
+
703
+ def _render(self) -> None:
704
+ progress = self.current / self.total if self.total else 0
705
+ filled = int(self.width * progress)
706
+ bar = "#" * filled + "-" * (self.width - filled)
707
+ sys.stdout.write(
708
+ f"\r{self.desc} [{bar}] {progress * 100:5.1f}% ({self.current}/{self.total} {self.unit}s)"
709
+ )
710
+ sys.stdout.flush()
711
+
712
+
713
+ class Spinner:
714
+ """Simple console spinner to indicate long-running steps."""
715
+
716
+ def __init__(self, desc: str, interval: float = 0.1):
717
+ self.desc = desc
718
+ self.interval = interval
719
+ self._stop_event = threading.Event()
720
+ self._thread: threading.Thread | None = None
721
+ self._line = desc
722
+
723
+ def __enter__(self):
724
+ self._thread = threading.Thread(target=self._spin, daemon=True)
725
+ self._thread.start()
726
+ return self
727
+
728
+ def __exit__(self, exc_type, exc_val, exc_tb):
729
+ self._stop_event.set()
730
+ if self._thread:
731
+ self._thread.join()
732
+ # Clear spinner line
733
+ sys.stdout.write("\r" + " " * len(self._line) + "\r")
734
+ sys.stdout.flush()
735
+
736
+ def _spin(self) -> None:
737
+ for char in itertools.cycle("|/-\\"):
738
+ if self._stop_event.is_set():
739
+ break
740
+ self._line = f"{self.desc} {char}"
741
+ sys.stdout.write("\r" + self._line)
742
+ sys.stdout.flush()
743
+ time.sleep(self.interval)
744
+
745
+
746
+ def _embedding_cache_path(model_name: str, cache_dir: Path) -> Path:
747
+ """Return the expected cache directory for a FastEmbed model."""
748
+ return cache_dir / f"models--{model_name.replace('/', '--')}"
749
+
750
+
751
+ def _verify_model_cache_exists(cache_dir: Path) -> bool:
752
+ """
753
+ Verify that the cached model directory exists and contains the expected model files.
754
+ """
755
+ from fastembed import TextEmbedding
756
+
757
+ try:
758
+ models = TextEmbedding.list_supported_models()
759
+ model_info = [m for m in models if m.get("model") == RETRIEVAL_EMBED_MODEL_NAME]
760
+ if not model_info:
761
+ return False
762
+
763
+ model_info = model_info[0]
764
+ hf_source = model_info.get("sources", {}).get("hf")
765
+ if not hf_source:
766
+ return False
767
+
768
+ expected_dir = cache_dir / f"models--{hf_source.replace('/', '--')}"
769
+ if not expected_dir.exists():
770
+ return False
771
+
772
+ snapshots_dir = expected_dir / "snapshots"
773
+ if not snapshots_dir.exists():
774
+ return False
775
+
776
+ model_file = model_info.get("model_file", "model_optimized.onnx")
777
+ for snapshot in snapshots_dir.iterdir():
778
+ if snapshot.is_dir():
779
+ model_path = snapshot / model_file
780
+ if model_path.exists() or model_path.is_symlink():
781
+ return True
782
+
783
+ return False
784
+ except Exception:
785
+ return False
786
+
787
+
788
+ def _get_cached_model_path(cache_dir: Path, model_name: str) -> Path | None:
789
+ """Get the cached model directory path."""
790
+ try:
791
+ from huggingface_hub import snapshot_download
792
+ from fastembed import TextEmbedding
793
+ models = TextEmbedding.list_supported_models()
794
+ model_info = [m for m in models if m.get("model") == model_name]
795
+ if model_info:
796
+ hf_source = model_info[0].get("sources", {}).get("hf")
797
+ if hf_source:
798
+ cache_dir_abs = cache_dir.resolve()
799
+ model_dir = snapshot_download(
800
+ repo_id=hf_source,
801
+ local_files_only=True,
802
+ cache_dir=str(cache_dir_abs)
803
+ )
804
+ return Path(model_dir).resolve()
805
+ except (ImportError, Exception):
806
+ pass
807
+ return None
808
+
809
+
810
+ def _create_fastembed_embedding(cache_dir: Path, offline: bool = False):
811
+ """Create a FastEmbedEmbedding instance."""
812
+ if offline:
813
+ cached_model_path = _get_cached_model_path(cache_dir, RETRIEVAL_EMBED_MODEL_NAME)
814
+ if cached_model_path:
815
+ logger.info(
816
+ f"Using cached model path to bypass download: {cached_model_path}"
817
+ )
818
+ return FastEmbedEmbedding(
819
+ model_name=RETRIEVAL_EMBED_MODEL_NAME,
820
+ cache_dir=str(cache_dir),
821
+ specific_model_path=str(cached_model_path)
822
+ )
823
+ else:
824
+ logger.warning(
825
+ "Could not find cached model path, falling back to normal initialization"
826
+ )
827
+
828
+ return FastEmbedEmbedding(
829
+ model_name=RETRIEVAL_EMBED_MODEL_NAME, cache_dir=str(cache_dir)
830
+ )
831
+
832
+
833
+ def ensure_embedding_model_cached(cache_dir: Path, offline: bool = False) -> None:
834
+ """Ensure the embedding model is available in the local cache."""
835
+ if offline:
836
+ logger.info("Verifying embedding model cache...")
837
+ if _verify_model_cache_exists(cache_dir):
838
+ logger.info("Embedding model found in cache")
839
+ else:
840
+ logger.error(
841
+ "Offline mode enabled, but embedding model cache not found in %s",
842
+ cache_dir,
843
+ )
844
+ raise FileNotFoundError(
845
+ f"Embedding model '{RETRIEVAL_EMBED_MODEL_NAME}' not found in cache directory '{cache_dir}'. "
846
+ )
847
+
848
+ try:
849
+ logger.info("Initializing embedding model from cache...")
850
+ cache_dir_abs = cache_dir.resolve()
851
+ if offline:
852
+ os.environ["HF_HUB_CACHE"] = str(cache_dir_abs)
853
+
854
+ _create_fastembed_embedding(cache_dir, offline=offline)
855
+ logger.info("Embedding model initialized successfully")
856
+ return
857
+ except Exception as e:
858
+ # Report a cache miss when offline; otherwise surface the download/initialization failure
859
+ if offline:
860
+ raise FileNotFoundError(f"Failed to load model offline: {e}") from e
861
+ else:
862
+ raise RuntimeError(f"Failed to download/initialize model: {e}") from e
863
+
864
+
865
+ def ensure_rerank_model_cached(cache_dir: Path, offline: bool = False) -> Path:
866
+ """Ensure the reranking model is cached locally."""
867
+ try:
868
+ from flashrank import Ranker
869
+ except ImportError as exc:
870
+ raise ImportError(
871
+ "flashrank is required for reranking."
872
+ ) from exc
873
+
874
+ cache_dir_abs = cache_dir.resolve()
875
+ logger.info("Ensuring rerank model is available in cache...")
876
+
877
+ # Map cross-encoder model names to FlashRank equivalents if needed
878
+ model_name = RETRIEVAL_RERANK_MODEL_NAME
879
+ # Note: FlashRank doesn't have L-6 models, so we map to L-12 equivalents
880
+ model_mapping = {
881
+ "cross-encoder/ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2", # L-6 not available, use L-12
882
+ "ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2", # Direct mapping for L-6
883
+ }
884
+ if model_name in model_mapping:
885
+ model_name = model_mapping[model_name]
886
+ elif model_name.startswith("cross-encoder/"):
887
+ # Extract model name after cross-encoder/ prefix and try to map
888
+ base_name = model_name.replace("cross-encoder/", "")
889
+ # If it's an L-6 model, map to L-12
890
+ if "L-6" in base_name:
891
+ model_name = base_name.replace("L-6", "L-12")
892
+ else:
893
+ model_name = base_name
894
+
895
+ try:
896
+ reranker = Ranker(model_name=model_name, cache_dir=str(cache_dir_abs))
897
+ logger.info(f"FlashRank model '{model_name}' initialized successfully")
898
+ return cache_dir_abs
899
+ except Exception as exc:
900
+ if offline:
901
+ raise FileNotFoundError(
902
+ f"Rerank model '{model_name}' not found in cache."
903
+ ) from exc
904
+ raise
905
+
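+ # Illustrative name mappings performed by ensure_rerank_model_cached above
+ # (the configured names are hypothetical):
+ #   "cross-encoder/ms-marco-MiniLM-L-6-v2"   -> "ms-marco-MiniLM-L-12-v2"
+ #   "cross-encoder/ms-marco-TinyBERT-L-2-v2" -> "ms-marco-TinyBERT-L-2-v2"
+ #   "ms-marco-MiniLM-L-12-v2"                -> unchanged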
906
+
907
+ def _parse_heading_level(style_name: str | None) -> int:
908
+ """Best-effort extraction of a numeric heading level from a DOCX style name."""
909
+ if not style_name:
910
+ return 1
911
+ try:
912
+ if "Heading" in style_name:
913
+ level_str = style_name.replace("Heading", "").strip()
914
+ if level_str:
915
+ return int(level_str)
916
+ except (ValueError, AttributeError):
917
+ pass
918
+ return 1
919
+
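+ # Illustrative examples:
+ #   _parse_heading_level("Heading 2")      -> 2
+ #   _parse_heading_level("Heading 3 Char") -> 1 (non-numeric remainder falls back)
+ #   _parse_heading_level("Title")          -> 1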
920
+
921
+ def _get_doc_temp_dir() -> Path:
922
+ """Get the temporary directory for .doc conversion, creating it if needed."""
923
+ storage_dir = Path(cfgload.get("storage.storage_dir", "./storage"))
924
+ temp_dir = storage_dir / "doc_temp"
925
+ temp_dir.mkdir(parents=True, exist_ok=True)
926
+ return temp_dir
927
+
928
+
929
+ def _convert_doc_to_docx(doc_path: Path) -> Path | None:
930
+ """Convert a .doc file to .docx using LibreOffice.
931
+
932
+ Returns path to temporary .docx file, or None if conversion fails.
933
+ Caller is responsible for cleaning up the temp file.
934
+ """
935
+ import subprocess
936
+ import shutil
937
+
938
+ # Find LibreOffice executable
939
+ soffice_paths = [
940
+ "/Applications/LibreOffice.app/Contents/MacOS/soffice", # macOS
941
+ "/usr/bin/soffice", # Linux
942
+ "/usr/bin/libreoffice", # Linux alternative
943
+ "soffice", # Windows (in PATH)
944
+ ]
945
+
946
+ soffice = None
947
+ for path in soffice_paths:
948
+ if shutil.which(path):
949
+ soffice = path
950
+ break
951
+
952
+ if not soffice:
953
+ logger.warning(f"LibreOffice not found. Cannot convert {doc_path}")
954
+ return None
955
+
956
+ # Use storage directory for temp files (more reliable space than /tmp)
957
+ temp_dir = _get_doc_temp_dir()
958
+
959
+ try:
960
+ result = subprocess.run(
961
+ [soffice, "--headless", "--convert-to", "docx",
962
+ "--outdir", str(temp_dir), str(doc_path)],
963
+ capture_output=True,
+ text=True,  # decode output so stderr logs as readable text
964
+ timeout=60,
965
+ )
966
+ if result.returncode != 0:
967
+ logger.warning(f"LibreOffice conversion failed for {doc_path}: {result.stderr}")
968
+ return None
969
+
970
+ # Find the converted file
971
+ docx_name = doc_path.stem + ".docx"
972
+ docx_path = temp_dir / docx_name
973
+ if docx_path.exists():
974
+ return docx_path
975
+
976
+ logger.warning(f"Converted file not found: {docx_path}")
977
+ except subprocess.TimeoutExpired:
978
+ logger.warning(f"LibreOffice conversion timed out for {doc_path}")
979
+ except Exception as e:
980
+ logger.warning(f"Error converting {doc_path}: {e}")
981
+
982
+ return None
983
+
984
+
985
+ def split_docx_into_heading_documents(docx_path: Path) -> List[LlamaIndexDocument]:
986
+ """Split DOCX into documents by heading."""
987
+ docs: List[LlamaIndexDocument] = []
988
+ try:
989
+ doc = Document(docx_path)
990
+ except Exception as e:
991
+ logger.warning(f"Failed to open DOCX {docx_path}: {e}")
992
+ return docs
993
+
994
+ # Extract file dates from filesystem
995
+ stat = docx_path.stat()
996
+ creation_date = datetime.fromtimestamp(stat.st_ctime).strftime("%Y-%m-%d")
997
+ last_modified_date = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d")
998
+
999
+ # Try to extract dates from DOCX core properties (more accurate than filesystem)
1000
+ try:
1001
+ core_props = doc.core_properties
1002
+ if core_props.created:
1003
+ creation_date = core_props.created.strftime("%Y-%m-%d")
1004
+ if core_props.modified:
1005
+ last_modified_date = core_props.modified.strftime("%Y-%m-%d")
1006
+ except Exception:
1007
+ pass # Fall back to filesystem dates
1008
+
1009
+ # First pass: Extract all headings with positions for hierarchy metadata
1010
+ all_headings = []
1011
+ char_position = 0
1012
+ for para in doc.paragraphs:
1013
+ style_name = getattr(para.style, "name", "") or ""
1014
+ is_heading = (
1015
+ style_name.startswith("Heading")
1016
+ or style_name.startswith("heading")
1017
+ or "Heading" in style_name
1018
+ )
1019
+
1020
+ if is_heading and para.text.strip():
1021
+ heading_level = _parse_heading_level(style_name)
1022
+ all_headings.append({
1023
+ "text": para.text.strip(),
1024
+ "position": char_position,
1025
+ "level": heading_level
1026
+ })
1027
+
1028
+ char_position += len(para.text) + 1 # +1 for newline
1029
+
1030
+ # Store headings separately to avoid metadata size issues during chunking
1031
+ get_heading_store().set_headings(str(docx_path), all_headings)
1032
+
1033
+ # Second pass: split the document into heading-delimited sections
1034
+ current_heading: str | None = None
1035
+ current_level: int | None = None
1036
+ current_body: list[str] = []
1037
+
1038
+ def flush_current():
1039
+ if not current_heading:
1040
+ return
1041
+ text = "\n".join(line for line in current_body if line is not None).strip()
1042
+ if not text:
1043
+ return
1044
+
1045
+ # Build hierarchical heading_path by finding parent headings based on level
1046
+ heading_path = []
1047
+ if all_headings:
1048
+ # Find the index of the current heading in all_headings
1049
+ current_idx = None
1050
+ for idx, h in enumerate(all_headings):
1051
+ if h["text"] == current_heading and h["level"] == current_level:
1052
+ current_idx = idx
1053
+ break
1054
+
1055
+ if current_idx is not None:
1056
+ # Build path by including all parent headings (those with lower level numbers)
1057
+ # Walk backwards from current heading and include headings with level < current_level
1058
+ path_headings = [all_headings[current_idx]] # Start with current
1059
+ for idx in range(current_idx - 1, -1, -1):
1060
+ h = all_headings[idx]
1061
+ if h["level"] < path_headings[0]["level"]:
1062
+ path_headings.insert(0, h)
1063
+ heading_path = [h["text"] for h in path_headings]
1064
+
1065
+ metadata = {
1066
+ "file_path": str(docx_path),
1067
+ "file_name": docx_path.name,
1068
+ "source": str(docx_path),
1069
+ "heading": current_heading,
1070
+ "heading_level": current_level,
1071
+ "creation_date": creation_date,
1072
+ "last_modified_date": last_modified_date,
1073
+ "heading_path": heading_path, # Pre-computed hierarchical path
1074
+ }
1075
+ docs.append(LlamaIndexDocument(
1076
+ text=text,
1077
+ metadata=metadata,
1078
+ excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS,
1079
+ excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS,
1080
+ ))
1081
+
1082
+ for para in doc.paragraphs:
1083
+ style_name = getattr(para.style, "name", "") or ""
1084
+ is_heading = (
1085
+ style_name.startswith("Heading")
1086
+ or style_name.startswith("heading")
1087
+ or "Heading" in style_name
1088
+ )
1089
+
1090
+ if is_heading and para.text.strip():
1091
+ flush_current()
1092
+ current_heading = para.text.strip()
1093
+ current_level = _parse_heading_level(style_name)
1094
+ current_body = []
1095
+ else:
1096
+ if current_heading is not None:
1097
+ current_body.append(para.text)
1098
+
1099
+ flush_current()
1100
+
1101
+ if not docs:
1102
+ try:
1103
+ full_text = "\n".join(p.text for p in doc.paragraphs).strip()
1104
+ except Exception:
1105
+ full_text = ""
1106
+
1107
+ if full_text:
1108
+ metadata = {
1109
+ "file_path": str(docx_path),
1110
+ "file_name": docx_path.name,
1111
+ "source": str(docx_path),
1112
+ "heading": None,
1113
+ "heading_level": None,
1114
+ "creation_date": creation_date,
1115
+ "last_modified_date": last_modified_date,
1116
+ }
1117
+ docs.append(LlamaIndexDocument(
1118
+ text=full_text,
1119
+ metadata=metadata,
1120
+ excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS,
1121
+ excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS,
1122
+ ))
1123
+
1124
+ logger.info(
1125
+ f"Split DOCX {docx_path} into {len(docs)} heading-based document(s)"
1126
+ )
1127
+ return docs
1128
+
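+ # Illustrative heading_path result for a hypothetical DOCX containing
+ # Heading 1 "Install", Heading 2 "Linux", Heading 2 "macOS": the section under
+ # "macOS" is emitted with
+ #   heading       = "macOS"
+ #   heading_level = 2
+ #   heading_path  = ["Install", "macOS"]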
1129
+
1130
+ def tokenize_filename(filename: str) -> List[str]:
1131
+ """
1132
+ Tokenize a filename for BM25 indexing.
1133
+
1134
+ Splits on delimiters (underscore, hyphen, dot, space) and camelCase.
1135
+
1136
+ Examples:
1137
+ 'cpp_styleguide.md' -> ['cpp', 'styleguide', 'md']
1138
+ 'API-Reference-v2.pdf' -> ['api', 'reference', 'v2', 'pdf']
1139
+ 'CamelCaseDoc.docx' -> ['camel', 'case', 'doc', 'docx']
1140
+ """
1141
+ import re
1142
+
1143
+ name_parts = filename.rsplit('.', 1)
1144
+ base_name = name_parts[0]
1145
+ extension = name_parts[1] if len(name_parts) > 1 else ""
1146
+
1147
+ # Split on explicit delimiters
1148
+ parts = re.split(r'[_\-\.\s]+', base_name)
1149
+
1150
+ # Split camelCase within each part
1151
+ tokens = []
1152
+ for part in parts:
1153
+ camel_split = re.sub(r'([a-z])([A-Z])', r'\1 \2', part).split()
1154
+ tokens.extend(t.lower() for t in camel_split if t)
1155
+
1156
+ # Add extension as a token
1157
+ if extension:
1158
+ tokens.append(extension.lower())
1159
+
1160
+ return tokens
1161
+
1162
+
1163
+ def build_bm25_index(index, storage_dir: Path) -> None:
1164
+ """
1165
+ Build a BM25 index over file names from the docstore.
1166
+
1167
+ This enables keyword matching for queries like 'cpp styleguide' to find
1168
+ files named 'cpp_styleguide.md'.
1169
+ """
1170
+ from llama_index.retrievers.bm25 import BM25Retriever
1171
+ from llama_index.core.schema import TextNode
1172
+
1173
+ logger.info("Building BM25 index for file name matching...")
1174
+
1175
+ # Create filename nodes - one per unique file
1176
+ filename_nodes = []
1177
+ seen_files: Set[str] = set()
1178
+
1179
+ for doc_id, node in index.docstore.docs.items():
1180
+ metadata = node.metadata or {}
1181
+ file_name = metadata.get("file_name", "")
1182
+ file_path = metadata.get("file_path", "")
1183
+
1184
+ if not file_name or file_path in seen_files:
1185
+ continue
1186
+ seen_files.add(file_path)
1187
+
1188
+ tokens = tokenize_filename(file_name)
1189
+ filename_nodes.append(TextNode(
1190
+ text=" ".join(tokens),
1191
+ metadata={"file_name": file_name, "file_path": file_path},
1192
+ id_=f"bm25_{file_path}"
1193
+ ))
1194
+
1195
+ if not filename_nodes:
1196
+ logger.warning("No documents found for BM25 indexing")
1197
+ return
1198
+
1199
+ logger.info(f"Creating BM25 index with {len(filename_nodes)} file name entries")
1200
+
1201
+ bm25_retriever = BM25Retriever.from_defaults(
1202
+ nodes=filename_nodes,
1203
+ similarity_top_k=10,
1204
+ )
1205
+
1206
+ bm25_dir = storage_dir / "bm25_index"
1207
+ bm25_dir.mkdir(parents=True, exist_ok=True)
1208
+ bm25_retriever.persist(str(bm25_dir))
1209
+
1210
+ logger.info(f"BM25 index persisted to {bm25_dir}")
1211
+
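+ # A minimal sketch of how the persisted BM25 index might be loaded at query time,
+ # assuming the same llama-index BM25 package; this is not used by this module:
+ #
+ #   from llama_index.retrievers.bm25 import BM25Retriever
+ #   retriever = BM25Retriever.from_persist_dir(str(storage_dir / "bm25_index"))
+ #   hits = retriever.retrieve("cpp styleguide")
+ #   names = [h.node.metadata["file_name"] for h in hits]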
1212
+
1213
+ def configure_offline_mode(offline: bool, cache_dir: Path) -> None:
1214
+ """Configure environment variables for offline mode."""
1215
+ if offline:
1216
+ os.environ["HF_HUB_OFFLINE"] = "1"
1217
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
1218
+ os.environ["HF_DATASETS_OFFLINE"] = "1"
1219
+ cache_dir_abs = cache_dir.resolve()
1220
+ os.environ["HF_HOME"] = str(cache_dir_abs)
1221
+ os.environ["HF_HUB_CACHE"] = str(cache_dir_abs)
1222
+ os.environ["HF_DATASETS_CACHE"] = str(cache_dir_abs)
1223
+ logger.info("Offline mode enabled.")
1224
+ else:
1225
+ # Clear offline mode environment variables to allow downloads
1226
+ for var in ["HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE"]:
1227
+ os.environ.pop(var, None)
1228
+
1229
+ # Update huggingface_hub's cached constant (it caches at import time)
1230
+ try:
1231
+ from huggingface_hub import constants
1232
+ constants.HF_HUB_OFFLINE = offline
1233
+ except ImportError:
1234
+ pass
1235
+
1236
+
1237
+ def build_index(
1238
+ download_only: bool = False,
1239
+ config_path: Path | None = None,
1240
+ model_cache_dir: Path | None = None,
1241
+ ) -> None:
1242
+ """Build and persist the vector index incrementally."""
1243
+ global _config, STORAGE_DIR, STATE_DB_PATH, RETRIEVAL_MODEL_CACHE_DIR, BM25_INDEX_DIR, HEADING_STORE_PATH
1244
+ global RETRIEVAL_EMBED_MODEL_NAME, RETRIEVAL_RERANK_MODEL_NAME
1245
+
1246
+ if config_path:
1247
+ cfg = load_config(config_path)
1248
+ _config = cfg
1249
+ STORAGE_DIR = Path(cfg["storage"]["storage_dir"])
1250
+ STATE_DB_PATH = STORAGE_DIR / "ingestion_state.db"
1251
+ RETRIEVAL_MODEL_CACHE_DIR = Path(cfg["storage"]["model_cache_dir"])
1252
+ BM25_INDEX_DIR = STORAGE_DIR / "bm25_index"
1253
+ HEADING_STORE_PATH = STORAGE_DIR / "heading_store.json"
1254
+ RETRIEVAL_EMBED_MODEL_NAME = cfg["retrieval"]["embed_model_name"]
1255
+ RETRIEVAL_RERANK_MODEL_NAME = cfg["retrieval"]["rerank_model_name"]
1256
+
1257
+ # Override model cache dir if specified via CLI
1258
+ if model_cache_dir:
1259
+ RETRIEVAL_MODEL_CACHE_DIR = model_cache_dir
1260
+
1261
+ # Read offline setting from config; force online when downloading models
1262
+ offline = False if download_only else _config["retrieval"].get("offline", False)
1263
+ cache_dir = RETRIEVAL_MODEL_CACHE_DIR
1264
+ configure_offline_mode(offline, cache_dir)
1265
+
1266
+ # Load configuration
1267
+ index_config = load_index_config()
1268
+ logger.info(f"Indexing configured with {len(index_config.directories)} directories")
1269
+
1270
+ ensure_embedding_model_cached(cache_dir, offline=offline)
1271
+ try:
1272
+ ensure_rerank_model_cached(cache_dir, offline=offline)
1273
+ except FileNotFoundError:
1274
+ if download_only or offline:
1275
+ raise
1276
+ logger.warning("Rerank model could not be cached yet; continuing without it.")
1277
+
1278
+ if download_only:
1279
+ logger.info("Models downloaded; skipping index build.")
1280
+ return
1281
+
1282
+ # Initialize State and Multi-Directory Data Source
1283
+ ingestion_state = IngestionState(STATE_DB_PATH)
1284
+ data_source = MultiDirectoryDataSource(index_config)
1285
+
1286
+ # Log directory summary
1287
+ summary = data_source.get_summary()
1288
+ logger.info(f"Active directories: {summary['available']}")
1289
+ if summary['unavailable']:
1290
+ logger.warning(f"Unavailable directories (skipped): {summary['unavailable']}")
1291
+
1292
+ if not data_source.sources:
1293
+ logger.error("No available directories to index. Check your config.yaml indexing.directories.")
1294
+ return
1295
+
1296
+ # Initialize Embedding Model
1297
+ logger.info(f"Initializing embedding model: {RETRIEVAL_EMBED_MODEL_NAME}")
1298
+ with Spinner("Initializing embedding model"):
1299
+ embed_model = _create_fastembed_embedding(RETRIEVAL_MODEL_CACHE_DIR, offline=offline)
1300
+ Settings.embed_model = embed_model
1301
+
1302
+ # Configure Text Splitter using config values
1303
+ text_splitter = SentenceSplitter(
1304
+ chunk_size=index_config.chunk_size,
1305
+ chunk_overlap=index_config.chunk_overlap,
1306
+ separator=" ",
1307
+ )
1308
+ Settings.text_splitter = text_splitter
1309
+
1310
+ # Load existing index or create new
1311
+ if (STORAGE_DIR / "docstore.json").exists():
1312
+ logger.info("Loading existing index context...")
1313
+ storage_context = StorageContext.from_defaults(persist_dir=str(STORAGE_DIR))
1314
+ index = load_index_from_storage(storage_context, embed_model=embed_model)
1315
+ else:
1316
+ logger.info("Creating new index context...")
1317
+ storage_context = StorageContext.from_defaults()
1318
+ index = VectorStoreIndex([], storage_context=storage_context, embed_model=embed_model)
1319
+
1320
+ # Change Detection
1321
+ tracked_files = ingestion_state.get_all_files()
1322
+ found_files: Set[str] = set()
1323
+ files_to_process: List[FileInfo] = []
1324
+
1325
+ logger.info("Scanning for changes...")
1326
+ for file_info in data_source.iter_files():
1327
+ found_files.add(file_info.path)
1328
+ existing_state = tracked_files.get(file_info.path)
1329
+
1330
+ if existing_state:
1331
+ # Check if modified
1332
+ if existing_state["hash"] != file_info.hash:
1333
+ logger.info(f"Modified file detected: {file_info.path}")
1334
+ files_to_process.append(file_info)
1335
+ else:
1336
+ # New file
1337
+ logger.info(f"New file detected: {file_info.path}")
1338
+ files_to_process.append(file_info)
1339
+
1340
+ # Identify Deleted Files
1341
+ deleted_files = set(tracked_files.keys()) - found_files
1342
+ for deleted_path in deleted_files:
1343
+ logger.info(f"Deleted file detected: {deleted_path}")
1344
+ doc_ids = tracked_files[deleted_path]["doc_ids"]
1345
+ for doc_id in doc_ids:
1346
+ try:
1347
+ index.delete_ref_doc(doc_id, delete_from_docstore=True)
1348
+ except Exception as e:
1349
+ logger.warning(f"Failed to delete doc {doc_id} from index: {e}")
1350
+ # Clean up heading data for deleted file
1351
+ get_heading_store().remove_headings(deleted_path)
1352
+ ingestion_state.remove_file_state(deleted_path)
1353
+
1354
+ if not files_to_process and not deleted_files:
1355
+ logger.info("No changes detected. Index is up to date.")
1356
+ return
1357
+
1358
+ # Process New/Modified Files
1359
+ if files_to_process:
1360
+ progress = SimpleProgressBar(len(files_to_process), desc="Processing files", unit="file")
1361
+ for file_info in files_to_process:
1362
+ # Remove old versions if they exist
1363
+ existing_state = tracked_files.get(file_info.path)
1364
+ if existing_state:
1365
+ for doc_id in existing_state["doc_ids"]:
1366
+ try:
1367
+ index.delete_ref_doc(doc_id, delete_from_docstore=True)
1368
+ except KeyError:
1369
+ pass # Document might already be gone
1370
+
1371
+ # Load and Index New Version
1372
+ docs = data_source.load_file(file_info)
1373
+ doc_ids = []
1374
+ for doc in docs:
1375
+ index.insert(doc)
1376
+ doc_ids.append(doc.doc_id)
1377
+
1378
+ # Update State
1379
+ ingestion_state.update_file_state(file_info, doc_ids)
1380
+ progress.update()
1381
+
1382
+ # Persist Index
1383
+ STORAGE_DIR.mkdir(parents=True, exist_ok=True)
1384
+ logger.info(f"Persisting index to {STORAGE_DIR}")
1385
+ index.storage_context.persist(persist_dir=str(STORAGE_DIR))
1386
+
1387
+ # Build BM25 index for file name matching
1388
+ build_bm25_index(index, STORAGE_DIR)
1389
+
1390
+ logger.info("Indexing complete.")
1391
+
1392
+
1393
+ if __name__ == "__main__":
1394
+ parser = argparse.ArgumentParser(description="Build the document index")
1395
+ parser.add_argument(
1396
+ "--download-models",
1397
+ action="store_true",
1398
+ help="Download the retrieval models and exit",
1399
+ )
1400
+ parser.add_argument(
1401
+ "--config",
1402
+ type=str,
1403
+ help="Path to config.yaml (overrides auto-discovery)",
1404
+ )
1405
+ parser.add_argument(
1406
+ "--model-cache-dir",
1407
+ type=str,
1408
+ help="Directory to download/cache models (overrides config)",
1409
+ )
1410
+ args = parser.parse_args()
1411
+
1412
+ try:
1413
+ build_index(
1414
+ download_only=args.download_models,
1415
+ config_path=Path(args.config) if args.config else None,
1416
+ model_cache_dir=Path(args.model_cache_dir) if args.model_cache_dir else None,
1417
+ )
1418
+ except Exception as e:
1419
+ logger.error(f"Indexing failed: {e}", exc_info=True)
1420
+ raise
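+
+ # Illustrative invocations (the module path is an assumption based on the package layout):
+ #   python -m chunksilo.index                          # incremental index build
+ #   python -m chunksilo.index --download-models        # fetch models, then exit
+ #   python -m chunksilo.index --config ./config.yaml --model-cache-dir ./model_cache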