chunksilo-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chunksilo might be problematic.
- chunksilo/__init__.py +4 -0
- chunksilo/__main__.py +3 -0
- chunksilo/cfgload.py +163 -0
- chunksilo/cli.py +124 -0
- chunksilo/confluence_html_formatter.py +96 -0
- chunksilo/index.py +1420 -0
- chunksilo/search.py +784 -0
- chunksilo/server.py +110 -0
- chunksilo-2.0.0.dist-info/METADATA +366 -0
- chunksilo-2.0.0.dist-info/RECORD +15 -0
- chunksilo-2.0.0.dist-info/WHEEL +5 -0
- chunksilo-2.0.0.dist-info/entry_points.txt +3 -0
- chunksilo-2.0.0.dist-info/licenses/LICENSE +191 -0
- chunksilo-2.0.0.dist-info/licenses/NOTICE +33 -0
- chunksilo-2.0.0.dist-info/top_level.txt +1 -0
chunksilo/index.py
ADDED
@@ -0,0 +1,1420 @@

#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Indexing pipeline for building a RAG index from PDF, DOCX, DOC, Markdown, and TXT documents.
Supports incremental indexing using a local SQLite database to track file states.
"""
import argparse
import hashlib
import itertools
import json
import logging
import os
import sqlite3
import sys
import threading
import time
from datetime import datetime
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Optional, Iterator, Set, Any

from docx import Document

from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Settings,
    SimpleDirectoryReader,
    Document as LlamaIndexDocument,
    load_index_from_storage,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.fastembed import FastEmbedEmbedding

# Load configuration from config.yaml
from . import cfgload
from .cfgload import load_config
_config = load_config()

# Configuration from config.yaml
STORAGE_DIR = Path(_config["storage"]["storage_dir"])
STATE_DB_PATH = STORAGE_DIR / "ingestion_state.db"

# Stage 1 (embedding/vector search) configuration
RETRIEVAL_EMBED_MODEL_NAME = _config["retrieval"]["embed_model_name"]

# Stage 2 (FlashRank reranking, CPU-only, ONNX-based) configuration
RETRIEVAL_RERANK_MODEL_NAME = _config["retrieval"]["rerank_model_name"]

# Shared cache directory for embedding and reranking models
RETRIEVAL_MODEL_CACHE_DIR = Path(_config["storage"]["model_cache_dir"])

# BM25 index directory for file name matching
BM25_INDEX_DIR = STORAGE_DIR / "bm25_index"

# Heading store for document headings (stored separately to avoid metadata size issues)
HEADING_STORE_PATH = STORAGE_DIR / "heading_store.json"

# Metadata exclusion configuration
# These keys are excluded from the embedding text to save tokens and avoid length errors
EXCLUDED_EMBED_METADATA_KEYS = [
    "line_offsets",  # Large integer array, primary cause of length errors
    "document_headings",  # Heading hierarchy array with positions, excluded like line_offsets
    "heading_path",  # Pre-computed heading hierarchy, stored separately to save chunk space
    "file_path",  # redundant with file_name/source, strict path less useful for semantic similarity
    "source",  # often same as file_path
    "creation_date",  # temporal, not semantic
    "last_modified_date",  # temporal, not semantic
    "doc_ids",  # internal tracking
    "hash",  # internal tracking
]

# These keys are excluded from the LLM context to save context window
EXCLUDED_LLM_METADATA_KEYS = [
    "line_offsets",  # LLM needs text content, not integer map
    "hash",  # internal tracking
    "doc_ids",  # internal tracking
    "file_path",  # usually redundant if file_name is present
    "source",  # usually redundant
]

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


# Default file type patterns
DEFAULT_INCLUDE_PATTERNS = ["**/*.pdf", "**/*.md", "**/*.txt", "**/*.docx", "**/*.doc"]


@dataclass
class DirectoryConfig:
    """Configuration for a single source directory."""
    path: Path
    enabled: bool = True
    include: List[str] = field(default_factory=lambda: DEFAULT_INCLUDE_PATTERNS.copy())
    exclude: List[str] = field(default_factory=list)
    recursive: bool = True


@dataclass
class IndexConfig:
    """Complete indexing configuration."""
    directories: List[DirectoryConfig]
    chunk_size: int = 1600
    chunk_overlap: int = 200


def load_index_config() -> IndexConfig:
    """Load indexing configuration from config.yaml.

    Raises:
        ValueError: If config is invalid
    """
    indexing_config = _config.get("indexing", {})

    if not indexing_config.get("directories"):
        raise ValueError(
            "Config must have at least one directory in 'indexing.directories'.\n"
            "Please update config.yaml with your directory configuration.\n"
            "Example:\n"
            "indexing:\n"
            "  directories:\n"
            '    - "./data"\n'
            "  chunk_size: 1600\n"
            "  chunk_overlap: 200\n"
        )

    logger.info("Loading indexing config from config.yaml")
    return _parse_index_config(indexing_config)


def _parse_index_config(config_data: dict) -> IndexConfig:
    """Parse raw config dict into IndexConfig."""
    # Get defaults section
    defaults = config_data.get("defaults", {})
    default_include = defaults.get("include", DEFAULT_INCLUDE_PATTERNS.copy())
    default_exclude = defaults.get("exclude", [])
    default_recursive = defaults.get("recursive", True)

    # Parse directories
    directories: List[DirectoryConfig] = []
    raw_dirs = config_data.get("directories", [])

    if not raw_dirs:
        raise ValueError("Config must have at least one directory in 'directories' list")

    for entry in raw_dirs:
        if isinstance(entry, str):
            # Simple path string - use defaults
            dir_config = DirectoryConfig(
                path=Path(entry),
                include=default_include.copy(),
                exclude=default_exclude.copy(),
                recursive=default_recursive,
            )
        elif isinstance(entry, dict):
            # Full directory config object
            path_str = entry.get("path")
            if not path_str:
                raise ValueError(f"Directory config missing 'path': {entry}")

            dir_config = DirectoryConfig(
                path=Path(path_str),
                enabled=entry.get("enabled", True),
                include=entry.get("include", default_include.copy()),
                exclude=entry.get("exclude", default_exclude.copy()),
                recursive=entry.get("recursive", default_recursive),
            )
        else:
            raise ValueError(f"Invalid directory entry: {entry}")

        directories.append(dir_config)

    return IndexConfig(
        directories=directories,
        chunk_size=config_data.get("chunk_size", 1600),
        chunk_overlap=config_data.get("chunk_overlap", 200),
    )


class HeadingStore:
    """Stores document headings separately from chunk metadata.

    This avoids the LlamaIndex SentenceSplitter metadata size validation issue,
    which checks metadata length before applying exclusions. By storing headings
    in a separate file, we keep chunk metadata small while preserving heading
    data for retrieval.
    """

    def __init__(self, store_path: Path):
        self.store_path = store_path
        self._data: Dict[str, List[dict]] = {}
        self._load()

    def _load(self):
        """Load heading data from disk."""
        if self.store_path.exists():
            try:
                with open(self.store_path, "r", encoding="utf-8") as f:
                    self._data = json.load(f)
            except Exception as e:
                logger.warning(f"Failed to load heading store: {e}")
                self._data = {}

    def _save(self):
        """Save heading data to disk."""
        self.store_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.store_path, "w", encoding="utf-8") as f:
            json.dump(self._data, f)

    def set_headings(self, file_path: str, headings: List[dict]):
        """Store headings for a file."""
        self._data[file_path] = headings
        self._save()

    def get_headings(self, file_path: str) -> List[dict]:
        """Get headings for a file."""
        return self._data.get(file_path, [])

    def remove_headings(self, file_path: str):
        """Remove headings for a file."""
        if file_path in self._data:
            del self._data[file_path]
            self._save()


# Module-level heading store instance (lazy initialized)
_heading_store: Optional["HeadingStore"] = None


def get_heading_store() -> HeadingStore:
    """Get the singleton HeadingStore instance."""
    global _heading_store
    if _heading_store is None:
        _heading_store = HeadingStore(HEADING_STORE_PATH)
    return _heading_store


@dataclass
class FileInfo:
    """Metadata about a file in the data source."""
    path: str
    hash: str
    last_modified: float
    source_dir: str = ""  # Tracks which configured directory this file came from


class IngestionState:
    """Manages the state of ingested files using a SQLite database."""

    def __init__(self, db_path: Path):
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Initialize the SQLite database schema with migration support."""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        with sqlite3.connect(self.db_path) as conn:
            # Check if table exists
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='files'"
            )
            table_exists = cursor.fetchone() is not None

            if not table_exists:
                # Create new table with source_dir column
                conn.execute(
                    """
                    CREATE TABLE files (
                        path TEXT PRIMARY KEY,
                        hash TEXT NOT NULL,
                        last_modified REAL NOT NULL,
                        doc_ids TEXT NOT NULL,
                        source_dir TEXT DEFAULT ''
                    )
                    """
                )
            else:
                # Migration: add source_dir column if missing
                cursor = conn.execute("PRAGMA table_info(files)")
                columns = {row[1] for row in cursor}
                if "source_dir" not in columns:
                    conn.execute("ALTER TABLE files ADD COLUMN source_dir TEXT DEFAULT ''")
                    logger.info("Migrated files table: added source_dir column")

    def get_all_files(self) -> Dict[str, dict]:
        """Retrieve all tracked files and their metadata."""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "SELECT path, hash, last_modified, doc_ids, source_dir FROM files"
            )
            return {
                row[0]: {
                    "hash": row[1],
                    "last_modified": row[2],
                    "doc_ids": row[3].split(",") if row[3] else [],
                    "source_dir": row[4] if row[4] else "",
                }
                for row in cursor
            }

    def update_file_state(self, file_info: FileInfo, doc_ids: List[str]):
        """Update or insert the state for a file."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                """
                INSERT INTO files (path, hash, last_modified, doc_ids, source_dir)
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(path) DO UPDATE SET
                    hash=excluded.hash,
                    last_modified=excluded.last_modified,
                    doc_ids=excluded.doc_ids,
                    source_dir=excluded.source_dir
                """,
                (
                    file_info.path,
                    file_info.hash,
                    file_info.last_modified,
                    ",".join(doc_ids),
                    file_info.source_dir,
                ),
            )

    def remove_file_state(self, path: str):
        """Remove a file from the state tracking."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("DELETE FROM files WHERE path = ?", (path,))


class DataSource(ABC):
    """Abstract base class for data sources."""

    @abstractmethod
    def iter_files(self) -> Iterator[FileInfo]:
        """Yield FileInfo for each file in the source."""
        pass

    @abstractmethod
    def load_file(self, file_info: FileInfo) -> List[LlamaIndexDocument]:
        """Load and return documents for a given file."""
        pass


def _compute_line_offsets(text: str) -> List[int]:
    """Compute character offset positions for each line start.

    Returns a list where line_offsets[i] is the character position where line i+1 starts.
    Line 1 starts at position 0 (implicit).
    """
    offsets = [0]  # Line 1 starts at position 0
    for i, char in enumerate(text):
        if char == '\n':
            offsets.append(i + 1)  # Next line starts after the newline
    return offsets


def _extract_markdown_headings(text: str) -> List[dict]:
    """Extract heading hierarchy from Markdown text using ATX-style syntax.

    Parses # Heading syntax and returns list of dicts with text, position, level.
    Handles ATX-style headings (# Heading) but not Setext (underlined).

    Returns:
        List of dicts with keys: text (str), position (int), level (int)
    """
    import re

    headings = []
    # Match ATX-style headings: line start, 1-6 #s, space, text
    pattern = re.compile(r'^(#{1,6})\s+(.+?)$', re.MULTILINE)

    # Find all code block ranges to skip headings inside them
    code_blocks = []
    for match in re.finditer(r'```.*?```', text, flags=re.DOTALL):
        code_blocks.append((match.start(), match.end()))

    def is_in_code_block(pos):
        """Check if position is inside a code block."""
        return any(start <= pos < end for start, end in code_blocks)

    for match in pattern.finditer(text):
        # Skip headings inside code blocks
        if is_in_code_block(match.start()):
            continue

        level = len(match.group(1))
        heading_text = match.group(2).strip()
        position = match.start()

        if heading_text:
            headings.append({
                "text": heading_text,
                "position": position,
                "level": level
            })

    return headings


def _extract_pdf_headings_from_outline(pdf_path: Path) -> List[dict]:
    """Extract headings from PDF outline/bookmarks (TOC).

    Returns list of dicts with text, position (estimated), level.
    Position is approximate based on cumulative page character counts.
    Falls back to empty list if PDF has no outline or extraction fails.

    Returns:
        List of dicts with keys: text (str), position (int), level (int)
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        logger.warning("pypdf not available, skipping PDF heading extraction")
        return []

    try:
        reader = PdfReader(pdf_path)
        outline = reader.outline

        if not outline:
            return []

        def flatten_outline(items, level=1):
            """Flatten nested outline into list of (title, page_num, level)."""
            results = []
            for item in items:
                if isinstance(item, list):
                    results.extend(flatten_outline(item, level + 1))
                else:
                    page_num = reader.get_destination_page_number(item)
                    results.append((item.title, page_num, level))
            return results

        flat = flatten_outline(outline)
        headings = []
        for title, page_num, level in flat:
            # Estimate position by accumulating text from previous pages
            position = 0
            for page_idx in range(page_num):
                if page_idx < len(reader.pages):
                    position += len(reader.pages[page_idx].extract_text() or "")

            headings.append({
                "text": title.strip(),
                "position": position,
                "level": level
            })

        return headings

    except Exception as e:
        logger.warning(f"Failed to extract PDF outline from {pdf_path}: {e}")
        return []


class LocalFileSystemSource(DataSource):
    """Implementation of DataSource for the local file system with filtering."""

    def __init__(self, config: DirectoryConfig):
        self.config = config
        self.base_dir = config.path

    def is_available(self) -> bool:
        """Check if the directory is available and accessible."""
        try:
            if not self.base_dir.exists():
                return False
            if not self.base_dir.is_dir():
                return False
            # Try to list directory to verify access (important for network mounts)
            next(self.base_dir.iterdir(), None)
            return True
        except (OSError, PermissionError):
            return False

    def _matches_patterns(self, file_path: Path) -> bool:
        """Check if file matches include patterns and doesn't match exclude patterns.

        Uses Path.match() which supports ** glob patterns for directory matching.
        For directory exclusion patterns like **/*venv*/**, checks each path component.
        """
        import fnmatch

        try:
            rel_path = file_path.relative_to(self.base_dir)
        except ValueError:
            rel_path = Path(file_path.name)

        # Check exclude patterns first
        for pattern in self.config.exclude:
            # Handle directory exclusion patterns (e.g., **/*venv*/**, **/node_modules/**)
            # by checking if any directory component matches
            if pattern.startswith('**/') and pattern.endswith('/**'):
                # Extract the directory pattern (e.g., *venv* or node_modules)
                dir_pattern = pattern[3:-3]  # Remove **/ prefix and /** suffix
                for part in rel_path.parts[:-1]:  # Check all directory components (not filename)
                    if fnmatch.fnmatch(part, dir_pattern):
                        return False
            else:
                # Standard pattern matching
                if rel_path.match(pattern) or file_path.name == pattern:
                    return False

        # Check include patterns
        if not self.config.include:
            return True

        for pattern in self.config.include:
            # Path.match() supports ** for recursive directory matching
            if rel_path.match(pattern) or file_path.match(pattern):
                return True

        return False

    def iter_files(self) -> Iterator[FileInfo]:
        """Yield FileInfo for each matching file in the source."""
        if self.config.recursive:
            walker = os.walk(self.base_dir)
        else:
            # Non-recursive: only top-level files
            try:
                top_files = [
                    f for f in self.base_dir.iterdir() if f.is_file()
                ]
                walker = [(str(self.base_dir), [], [f.name for f in top_files])]
            except OSError as e:
                logger.warning(f"Could not list directory {self.base_dir}: {e}")
                return

        for root, _, files in walker:
            for file in files:
                file_path = Path(root) / file

                # Check patterns
                if not self._matches_patterns(file_path):
                    continue

                try:
                    yield self._create_file_info(file_path)
                except (OSError, IOError) as e:
                    logger.warning(f"Could not access file {file_path}: {e}")
                    continue

    def _create_file_info(self, file_path: Path) -> FileInfo:
        """Create FileInfo with source directory context."""
        hasher = hashlib.md5()
        with open(file_path, "rb") as f:
            buf = f.read(65536)
            while len(buf) > 0:
                hasher.update(buf)
                buf = f.read(65536)

        return FileInfo(
            path=str(file_path.absolute()),
            hash=hasher.hexdigest(),
            last_modified=file_path.stat().st_mtime,
            source_dir=str(self.base_dir.absolute()),
        )

    def load_file(self, file_info: FileInfo) -> List[LlamaIndexDocument]:
        file_path = Path(file_info.path)
        if file_path.suffix.lower() == ".docx":
            return split_docx_into_heading_documents(file_path)
        elif file_path.suffix.lower() == ".doc":
            # Convert .doc to .docx using LibreOffice, then process
            docx_path = _convert_doc_to_docx(file_path)
            if docx_path is None:
                logger.warning(f"Skipping {file_path}: could not convert .doc to .docx")
                return []
            try:
                docs = split_docx_into_heading_documents(docx_path)
                # Update metadata to point to original .doc file
                for doc in docs:
                    doc.metadata["file_path"] = str(file_path)
                    doc.metadata["file_name"] = file_path.name
                    if "source" in doc.metadata:
                        doc.metadata["source"] = str(file_path)
                return docs
            finally:
                # Clean up temp file
                if docx_path.exists():
                    docx_path.unlink()
        else:
            reader = SimpleDirectoryReader(
                input_files=[str(file_path)],
            )
            docs = reader.load_data()
            # Ensure dates are visible to LLM (remove from exclusion list)
            for doc in docs:
                if hasattr(doc, 'excluded_llm_metadata_keys') and doc.excluded_llm_metadata_keys:
                    doc.excluded_llm_metadata_keys = [
                        k for k in doc.excluded_llm_metadata_keys
                        if k not in ('creation_date', 'last_modified_date')
                    ]

            # Add line offsets for text-based files (markdown, txt) to enable line number lookup
            if file_path.suffix.lower() in {".md", ".txt"}:
                for doc in docs:
                    text = doc.get_content()
                    line_offsets = _compute_line_offsets(text)
                    doc.metadata["line_offsets"] = line_offsets

                    # Extract headings for Markdown and store separately
                    # (not in metadata to avoid SentenceSplitter size validation)
                    if file_path.suffix.lower() == ".md":
                        headings = _extract_markdown_headings(text)
                        get_heading_store().set_headings(str(file_path), headings)

            # Extract headings for PDF files and store separately
            if file_path.suffix.lower() == ".pdf":
                headings = _extract_pdf_headings_from_outline(file_path)
                get_heading_store().set_headings(str(file_path), headings)

            # Apply metadata exclusions
            for doc in docs:
                doc.excluded_embed_metadata_keys = EXCLUDED_EMBED_METADATA_KEYS
                doc.excluded_llm_metadata_keys = EXCLUDED_LLM_METADATA_KEYS

            return docs


class MultiDirectoryDataSource(DataSource):
    """Aggregates multiple LocalFileSystemSource instances."""

    def __init__(self, config: IndexConfig):
        self.config = config
        self.sources: List[LocalFileSystemSource] = []
        self.unavailable_dirs: List[DirectoryConfig] = []

        for dir_config in config.directories:
            if not dir_config.enabled:
                logger.info(f"Skipping disabled directory: {dir_config.path}")
                continue

            source = LocalFileSystemSource(dir_config)

            if source.is_available():
                self.sources.append(source)
                logger.info(f"Added directory source: {dir_config.path}")
            else:
                self.unavailable_dirs.append(dir_config)
                logger.warning(f"Directory unavailable, skipping: {dir_config.path}")

    def iter_files(self) -> Iterator[FileInfo]:
        """Iterate over files from all available sources."""
        seen_paths: Set[str] = set()

        for source in self.sources:
            for file_info in source.iter_files():
                # Deduplicate in case of overlapping mounts
                if file_info.path not in seen_paths:
                    seen_paths.add(file_info.path)
                    yield file_info

    def load_file(self, file_info: FileInfo) -> List[LlamaIndexDocument]:
        """Load file using the appropriate source based on source_dir."""
        # Find the source that owns this file
        for source in self.sources:
            if file_info.source_dir == str(source.base_dir.absolute()):
                return source.load_file(file_info)

        # Fallback: use first source (shouldn't happen normally)
        if self.sources:
            return self.sources[0].load_file(file_info)

        raise ValueError(f"No source available for file: {file_info.path}")

    def get_summary(self) -> Dict[str, Any]:
        """Return summary of configured directories."""
        return {
            "available": [str(s.base_dir) for s in self.sources],
            "unavailable": [str(d.path) for d in self.unavailable_dirs],
            "total_sources": len(self.sources),
        }


class SimpleProgressBar:
    """Lightweight progress bar using only the standard library."""

    def __init__(self, total: int, desc: str, unit: str = "item", width: int = 30):
        self.total = max(total, 0)
        self.desc = desc
        self.unit = unit
        self.width = width
        self.current = 0
        if self.total > 0:
            self._render()

    def update(self, step: int = 1) -> None:
        if self.total <= 0:
            return
        self.current = min(self.total, self.current + step)
        self._render()
        if self.current >= self.total:
            sys.stdout.write("\n")
            sys.stdout.flush()

    def _render(self) -> None:
        progress = self.current / self.total if self.total else 0
        filled = int(self.width * progress)
        bar = "#" * filled + "-" * (self.width - filled)
        sys.stdout.write(
            f"\r{self.desc} [{bar}] {progress * 100:5.1f}% ({self.current}/{self.total} {self.unit}s)"
        )
        sys.stdout.flush()


class Spinner:
    """Simple console spinner to indicate long-running steps."""

    def __init__(self, desc: str, interval: float = 0.1):
        self.desc = desc
        self.interval = interval
        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None
        self._line = desc

    def __enter__(self):
        self._thread = threading.Thread(target=self._spin, daemon=True)
        self._thread.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._stop_event.set()
        if self._thread:
            self._thread.join()
        # Clear spinner line
        sys.stdout.write("\r" + " " * len(self._line) + "\r")
        sys.stdout.flush()

    def _spin(self) -> None:
        for char in itertools.cycle("|/-\\"):
            if self._stop_event.is_set():
                break
            self._line = f"{self.desc} {char}"
            sys.stdout.write("\r" + self._line)
            sys.stdout.flush()
            time.sleep(self.interval)


def _embedding_cache_path(model_name: str, cache_dir: Path) -> Path:
    """Return the expected cache directory for a FastEmbed model."""
    return cache_dir / f"models--{model_name.replace('/', '--')}"


def _verify_model_cache_exists(cache_dir: Path) -> bool:
    """
    Verify that the cached model directory exists and contains the expected model files.
    """
    from fastembed import TextEmbedding

    try:
        models = TextEmbedding.list_supported_models()
        model_info = [m for m in models if m.get("model") == RETRIEVAL_EMBED_MODEL_NAME]
        if not model_info:
            return False

        model_info = model_info[0]
        hf_source = model_info.get("sources", {}).get("hf")
        if not hf_source:
            return False

        expected_dir = cache_dir / f"models--{hf_source.replace('/', '--')}"
        if not expected_dir.exists():
            return False

        snapshots_dir = expected_dir / "snapshots"
        if not snapshots_dir.exists():
            return False

        model_file = model_info.get("model_file", "model_optimized.onnx")
        for snapshot in snapshots_dir.iterdir():
            if snapshot.is_dir():
                model_path = snapshot / model_file
                if model_path.exists() or model_path.is_symlink():
                    return True

        return False
    except Exception:
        return False


def _get_cached_model_path(cache_dir: Path, model_name: str) -> Path | None:
    """Get the cached model directory path."""
    try:
        from huggingface_hub import snapshot_download
        from fastembed import TextEmbedding
        models = TextEmbedding.list_supported_models()
        model_info = [m for m in models if m.get("model") == model_name]
        if model_info:
            hf_source = model_info[0].get("sources", {}).get("hf")
            if hf_source:
                cache_dir_abs = cache_dir.resolve()
                model_dir = snapshot_download(
                    repo_id=hf_source,
                    local_files_only=True,
                    cache_dir=str(cache_dir_abs)
                )
                return Path(model_dir).resolve()
    except (ImportError, Exception):
        pass
    return None


def _create_fastembed_embedding(cache_dir: Path, offline: bool = False):
    """Create a FastEmbedEmbedding instance."""
    if offline:
        cached_model_path = _get_cached_model_path(cache_dir, RETRIEVAL_EMBED_MODEL_NAME)
        if cached_model_path:
            logger.info(
                f"Using cached model path to bypass download: {cached_model_path}"
            )
            return FastEmbedEmbedding(
                model_name=RETRIEVAL_EMBED_MODEL_NAME,
                cache_dir=str(cache_dir),
                specific_model_path=str(cached_model_path)
            )
        else:
            logger.warning(
                "Could not find cached model path, falling back to normal initialization"
            )

    return FastEmbedEmbedding(
        model_name=RETRIEVAL_EMBED_MODEL_NAME, cache_dir=str(cache_dir)
    )


def ensure_embedding_model_cached(cache_dir: Path, offline: bool = False) -> None:
    """Ensure the embedding model is available in the local cache."""
    if offline:
        logger.info("Verifying embedding model cache...")
        if _verify_model_cache_exists(cache_dir):
            logger.info("Embedding model found in cache")
        else:
            logger.error(
                "Offline mode enabled, but embedding model cache not found in %s",
                cache_dir,
            )
            raise FileNotFoundError(
                f"Embedding model '{RETRIEVAL_EMBED_MODEL_NAME}' not found in cache directory '{cache_dir}'. "
            )

    try:
        logger.info("Initializing embedding model from cache...")
        cache_dir_abs = cache_dir.resolve()
        if offline:
            os.environ["HF_HUB_CACHE"] = str(cache_dir_abs)

        _create_fastembed_embedding(cache_dir, offline=offline)
        logger.info("Embedding model initialized successfully")
        return
    except (ValueError, Exception) as e:
        # Simplified error handling for brevity, similar logic as original
        if offline:
            raise FileNotFoundError(f"Failed to load model offline: {e}") from e
        else:
            raise RuntimeError(f"Failed to download/initialize model: {e}") from e


def ensure_rerank_model_cached(cache_dir: Path, offline: bool = False) -> Path:
    """Ensure the reranking model is cached locally."""
    try:
        from flashrank import Ranker
    except ImportError as exc:
        raise ImportError(
            "flashrank is required for reranking."
        ) from exc

    cache_dir_abs = cache_dir.resolve()
    logger.info("Ensuring rerank model is available in cache...")

    # Map cross-encoder model names to FlashRank equivalents if needed
    model_name = RETRIEVAL_RERANK_MODEL_NAME
    # Note: FlashRank doesn't have L-6 models, so we map to L-12 equivalents
    model_mapping = {
        "cross-encoder/ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2",  # L-6 not available, use L-12
        "ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2",  # Direct mapping for L-6
    }
    if model_name in model_mapping:
        model_name = model_mapping[model_name]
    elif model_name.startswith("cross-encoder/"):
        # Extract model name after cross-encoder/ prefix and try to map
        base_name = model_name.replace("cross-encoder/", "")
        # If it's an L-6 model, map to L-12
        if "L-6" in base_name:
            model_name = base_name.replace("L-6", "L-12")
        else:
            model_name = base_name

    try:
        reranker = Ranker(model_name=model_name, cache_dir=str(cache_dir_abs))
        logger.info(f"FlashRank model '{model_name}' initialized successfully")
        return cache_dir_abs
    except Exception as exc:
        if offline:
            raise FileNotFoundError(
                f"Rerank model '{model_name}' not found in cache."
            ) from exc
        raise


def _parse_heading_level(style_name: str | None) -> int:
    """Best-effort extraction of a numeric heading level from a DOCX style name."""
    if not style_name:
        return 1
    try:
        if "Heading" in style_name:
            level_str = style_name.replace("Heading", "").strip()
            if level_str:
                return int(level_str)
    except (ValueError, AttributeError):
        pass
    return 1


def _get_doc_temp_dir() -> Path:
    """Get the temporary directory for .doc conversion, creating it if needed."""
    storage_dir = Path(cfgload.get("storage.storage_dir", "./storage"))
    temp_dir = storage_dir / "doc_temp"
    temp_dir.mkdir(parents=True, exist_ok=True)
    return temp_dir


def _convert_doc_to_docx(doc_path: Path) -> Path | None:
    """Convert a .doc file to .docx using LibreOffice.

    Returns path to temporary .docx file, or None if conversion fails.
    Caller is responsible for cleaning up the temp file.
    """
    import subprocess
    import shutil

    # Find LibreOffice executable
    soffice_paths = [
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",  # macOS
        "/usr/bin/soffice",  # Linux
        "/usr/bin/libreoffice",  # Linux alternative
        "soffice",  # Windows (in PATH)
    ]

    soffice = None
    for path in soffice_paths:
        if shutil.which(path):
            soffice = path
            break

    if not soffice:
        logger.warning(f"LibreOffice not found. Cannot convert {doc_path}")
        return None

    # Use storage directory for temp files (more reliable space than /tmp)
    temp_dir = _get_doc_temp_dir()

    try:
        result = subprocess.run(
            [soffice, "--headless", "--convert-to", "docx",
             "--outdir", str(temp_dir), str(doc_path)],
            capture_output=True,
            timeout=60,
        )
        if result.returncode != 0:
            logger.warning(f"LibreOffice conversion failed for {doc_path}: {result.stderr}")
            return None

        # Find the converted file
        docx_name = doc_path.stem + ".docx"
        docx_path = temp_dir / docx_name
        if docx_path.exists():
            return docx_path

        logger.warning(f"Converted file not found: {docx_path}")
    except subprocess.TimeoutExpired:
        logger.warning(f"LibreOffice conversion timed out for {doc_path}")
    except Exception as e:
        logger.warning(f"Error converting {doc_path}: {e}")

    return None


def split_docx_into_heading_documents(docx_path: Path) -> List[LlamaIndexDocument]:
    """Split DOCX into documents by heading."""
    docs: List[LlamaIndexDocument] = []
    try:
        doc = Document(docx_path)
    except Exception as e:
        logger.warning(f"Failed to open DOCX {docx_path}: {e}")
        return docs

    # Extract file dates from filesystem
    stat = docx_path.stat()
    creation_date = datetime.fromtimestamp(stat.st_ctime).strftime("%Y-%m-%d")
    last_modified_date = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d")

    # Try to extract dates from DOCX core properties (more accurate than filesystem)
    try:
        core_props = doc.core_properties
        if core_props.created:
            creation_date = core_props.created.strftime("%Y-%m-%d")
        if core_props.modified:
            last_modified_date = core_props.modified.strftime("%Y-%m-%d")
    except Exception:
        pass  # Fall back to filesystem dates

    # First pass: Extract all headings with positions for hierarchy metadata
    all_headings = []
    char_position = 0
    for para in doc.paragraphs:
        style_name = getattr(para.style, "name", "") or ""
        is_heading = (
            style_name.startswith("Heading")
            or style_name.startswith("heading")
            or "Heading" in style_name
        )

        if is_heading and para.text.strip():
            heading_level = _parse_heading_level(style_name)
            all_headings.append({
                "text": para.text.strip(),
                "position": char_position,
                "level": heading_level
            })

        char_position += len(para.text) + 1  # +1 for newline

    # Store headings separately to avoid metadata size issues during chunking
    get_heading_store().set_headings(str(docx_path), all_headings)

    # Second pass: Split by heading (existing logic)
    current_heading: str | None = None
    current_level: int | None = None
    current_body: list[str] = []

    def flush_current():
        if not current_heading:
            return
        text = "\n".join(line for line in current_body if line is not None).strip()
        if not text:
            return

        # Build hierarchical heading_path by finding parent headings based on level
        heading_path = []
        if all_headings:
            # Find the index of the current heading in all_headings
            current_idx = None
            for idx, h in enumerate(all_headings):
                if h["text"] == current_heading and h["level"] == current_level:
                    current_idx = idx
                    break

            if current_idx is not None:
                # Build path by including all parent headings (those with lower level numbers)
                # Walk backwards from current heading and include headings with level < current_level
                path_headings = [all_headings[current_idx]]  # Start with current
                for idx in range(current_idx - 1, -1, -1):
                    h = all_headings[idx]
                    if h["level"] < path_headings[0]["level"]:
                        path_headings.insert(0, h)
                heading_path = [h["text"] for h in path_headings]

        metadata = {
            "file_path": str(docx_path),
            "file_name": docx_path.name,
            "source": str(docx_path),
            "heading": current_heading,
            "heading_level": current_level,
            "creation_date": creation_date,
            "last_modified_date": last_modified_date,
            "heading_path": heading_path,  # Pre-computed hierarchical path
        }
        docs.append(LlamaIndexDocument(
            text=text,
            metadata=metadata,
            excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS,
            excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS,
        ))

    for para in doc.paragraphs:
        style_name = getattr(para.style, "name", "") or ""
        is_heading = (
            style_name.startswith("Heading")
            or style_name.startswith("heading")
            or "Heading" in style_name
        )

        if is_heading and para.text.strip():
            flush_current()
            current_heading = para.text.strip()
            current_level = _parse_heading_level(style_name)
            current_body = []
        else:
            if current_heading is not None:
                current_body.append(para.text)

    flush_current()

    if not docs:
        try:
            full_text = "\n".join(p.text for p in doc.paragraphs).strip()
        except Exception:
            full_text = ""

        if full_text:
            metadata = {
                "file_path": str(docx_path),
                "file_name": docx_path.name,
                "source": str(docx_path),
                "heading": None,
                "heading_level": None,
                "creation_date": creation_date,
                "last_modified_date": last_modified_date,
            }
            docs.append(LlamaIndexDocument(
                text=full_text,
                metadata=metadata,
                excluded_embed_metadata_keys=EXCLUDED_EMBED_METADATA_KEYS,
                excluded_llm_metadata_keys=EXCLUDED_LLM_METADATA_KEYS,
            ))

    logger.info(
        f"Split DOCX {docx_path} into {len(docs)} heading-based document(s)"
    )
    return docs


def tokenize_filename(filename: str) -> List[str]:
    """
    Tokenize a filename for BM25 indexing.

    Splits on delimiters (underscore, hyphen, dot, space) and camelCase.

    Examples:
        'cpp_styleguide.md' -> ['cpp', 'styleguide', 'md']
        'API-Reference-v2.pdf' -> ['api', 'reference', 'v2', 'pdf']
        'CamelCaseDoc.docx' -> ['camel', 'case', 'doc', 'docx']
    """
    import re

    name_parts = filename.rsplit('.', 1)
    base_name = name_parts[0]
    extension = name_parts[1] if len(name_parts) > 1 else ""

    # Split on explicit delimiters
    parts = re.split(r'[_\-\.\s]+', base_name)

    # Split camelCase within each part
    tokens = []
    for part in parts:
        camel_split = re.sub(r'([a-z])([A-Z])', r'\1 \2', part).split()
        tokens.extend(t.lower() for t in camel_split if t)

    # Add extension as a token
    if extension:
        tokens.append(extension.lower())

    return tokens


def build_bm25_index(index, storage_dir: Path) -> None:
    """
    Build a BM25 index over file names from the docstore.

    This enables keyword matching for queries like 'cpp styleguide' to find
    files named 'cpp_styleguide.md'.
    """
    from llama_index.retrievers.bm25 import BM25Retriever
    from llama_index.core.schema import TextNode

    logger.info("Building BM25 index for file name matching...")

    # Create filename nodes - one per unique file
    filename_nodes = []
    seen_files: Set[str] = set()

    for doc_id, node in index.docstore.docs.items():
        metadata = node.metadata or {}
        file_name = metadata.get("file_name", "")
        file_path = metadata.get("file_path", "")

        if not file_name or file_path in seen_files:
            continue
        seen_files.add(file_path)

        tokens = tokenize_filename(file_name)
        filename_nodes.append(TextNode(
            text=" ".join(tokens),
            metadata={"file_name": file_name, "file_path": file_path},
            id_=f"bm25_{file_path}"
        ))

    if not filename_nodes:
        logger.warning("No documents found for BM25 indexing")
        return

    logger.info(f"Creating BM25 index with {len(filename_nodes)} file name entries")

    bm25_retriever = BM25Retriever.from_defaults(
        nodes=filename_nodes,
        similarity_top_k=10,
    )

    bm25_dir = storage_dir / "bm25_index"
    bm25_dir.mkdir(parents=True, exist_ok=True)
    bm25_retriever.persist(str(bm25_dir))

    logger.info(f"BM25 index persisted to {bm25_dir}")


def configure_offline_mode(offline: bool, cache_dir: Path) -> None:
    """Configure environment variables for offline mode."""
    if offline:
        os.environ["HF_HUB_OFFLINE"] = "1"
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        os.environ["HF_DATASETS_OFFLINE"] = "1"
        cache_dir_abs = cache_dir.resolve()
        os.environ["HF_HOME"] = str(cache_dir_abs)
        os.environ["HF_HUB_CACHE"] = str(cache_dir_abs)
        os.environ["HF_DATASETS_CACHE"] = str(cache_dir_abs)
        logger.info("Offline mode enabled.")
    else:
        # Clear offline mode environment variables to allow downloads
        for var in ["HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE"]:
            os.environ.pop(var, None)

    # Update huggingface_hub's cached constant (it caches at import time)
    try:
        from huggingface_hub import constants
        constants.HF_HUB_OFFLINE = offline
    except ImportError:
        pass


def build_index(
    download_only: bool = False,
    config_path: Path | None = None,
    model_cache_dir: Path | None = None,
) -> None:
    """Build and persist the vector index incrementally."""
    global _config, STORAGE_DIR, STATE_DB_PATH, RETRIEVAL_MODEL_CACHE_DIR, BM25_INDEX_DIR, HEADING_STORE_PATH
    global RETRIEVAL_EMBED_MODEL_NAME, RETRIEVAL_RERANK_MODEL_NAME

    if config_path:
        cfg = load_config(config_path)
        _config = cfg
        STORAGE_DIR = Path(cfg["storage"]["storage_dir"])
        STATE_DB_PATH = STORAGE_DIR / "ingestion_state.db"
        RETRIEVAL_MODEL_CACHE_DIR = Path(cfg["storage"]["model_cache_dir"])
        BM25_INDEX_DIR = STORAGE_DIR / "bm25_index"
        HEADING_STORE_PATH = STORAGE_DIR / "heading_store.json"
        RETRIEVAL_EMBED_MODEL_NAME = cfg["retrieval"]["embed_model_name"]
        RETRIEVAL_RERANK_MODEL_NAME = cfg["retrieval"]["rerank_model_name"]

    # Override model cache dir if specified via CLI
    if model_cache_dir:
        RETRIEVAL_MODEL_CACHE_DIR = model_cache_dir

    # Read offline setting from config; force online when downloading models
    offline = False if download_only else _config["retrieval"].get("offline", False)
    cache_dir = RETRIEVAL_MODEL_CACHE_DIR
    configure_offline_mode(offline, cache_dir)

    # Load configuration
    index_config = load_index_config()
    logger.info(f"Indexing configured with {len(index_config.directories)} directories")

    ensure_embedding_model_cached(cache_dir, offline=offline)
    try:
        ensure_rerank_model_cached(cache_dir, offline=offline)
    except FileNotFoundError:
        if download_only or offline:
            raise
        logger.warning("Rerank model could not be cached yet; continuing without it.")

    if download_only:
        logger.info("Models downloaded; skipping index build.")
        return

    # Initialize State and Multi-Directory Data Source
    ingestion_state = IngestionState(STATE_DB_PATH)
    data_source = MultiDirectoryDataSource(index_config)

    # Log directory summary
    summary = data_source.get_summary()
    logger.info(f"Active directories: {summary['available']}")
    if summary['unavailable']:
        logger.warning(f"Unavailable directories (skipped): {summary['unavailable']}")

    if not data_source.sources:
        logger.error("No available directories to index. Check your config.yaml indexing.directories.")
        return

    # Initialize Embedding Model
    logger.info(f"Initializing embedding model: {RETRIEVAL_EMBED_MODEL_NAME}")
    with Spinner("Initializing embedding model"):
        embed_model = _create_fastembed_embedding(RETRIEVAL_MODEL_CACHE_DIR, offline=offline)
    Settings.embed_model = embed_model

    # Configure Text Splitter using config values
    text_splitter = SentenceSplitter(
        chunk_size=index_config.chunk_size,
        chunk_overlap=index_config.chunk_overlap,
        separator=" ",
    )
    Settings.text_splitter = text_splitter

    # Load existing index or create new
    if (STORAGE_DIR / "docstore.json").exists():
        logger.info("Loading existing index context...")
        storage_context = StorageContext.from_defaults(persist_dir=str(STORAGE_DIR))
        index = load_index_from_storage(storage_context, embed_model=embed_model)
    else:
        logger.info("Creating new index context...")
        storage_context = StorageContext.from_defaults()
        index = VectorStoreIndex([], storage_context=storage_context, embed_model=embed_model)

    # Change Detection
    tracked_files = ingestion_state.get_all_files()
    found_files: Set[str] = set()
    files_to_process: List[FileInfo] = []

    logger.info("Scanning for changes...")
    for file_info in data_source.iter_files():
        found_files.add(file_info.path)
        existing_state = tracked_files.get(file_info.path)

        if existing_state:
            # Check if modified
            if existing_state["hash"] != file_info.hash:
                logger.info(f"Modified file detected: {file_info.path}")
                files_to_process.append(file_info)
        else:
            # New file
            logger.info(f"New file detected: {file_info.path}")
            files_to_process.append(file_info)

    # Identify Deleted Files
    deleted_files = set(tracked_files.keys()) - found_files
    for deleted_path in deleted_files:
        logger.info(f"Deleted file detected: {deleted_path}")
        doc_ids = tracked_files[deleted_path]["doc_ids"]
        for doc_id in doc_ids:
            try:
                index.delete_ref_doc(doc_id, delete_from_docstore=True)
            except Exception as e:
                logger.warning(f"Failed to delete doc {doc_id} from index: {e}")
        # Clean up heading data for deleted file
        get_heading_store().remove_headings(deleted_path)
        ingestion_state.remove_file_state(deleted_path)

    if not files_to_process and not deleted_files:
        logger.info("No changes detected. Index is up to date.")
        return

    # Process New/Modified Files
    if files_to_process:
        progress = SimpleProgressBar(len(files_to_process), desc="Processing files", unit="file")
        for file_info in files_to_process:
            # Remove old versions if they exist
            existing_state = tracked_files.get(file_info.path)
            if existing_state:
                for doc_id in existing_state["doc_ids"]:
                    try:
                        index.delete_ref_doc(doc_id, delete_from_docstore=True)
                    except KeyError:
                        pass  # Document might already be gone

            # Load and Index New Version
            docs = data_source.load_file(file_info)
            doc_ids = []
            for doc in docs:
                index.insert(doc)
                doc_ids.append(doc.doc_id)

            # Update State
            ingestion_state.update_file_state(file_info, doc_ids)
            progress.update()

    # Persist Index
    STORAGE_DIR.mkdir(parents=True, exist_ok=True)
    logger.info(f"Persisting index to {STORAGE_DIR}")
    index.storage_context.persist(persist_dir=str(STORAGE_DIR))

    # Build BM25 index for file name matching
    build_bm25_index(index, STORAGE_DIR)

    logger.info("Indexing complete.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build the document index")
    parser.add_argument(
        "--download-models",
        action="store_true",
        help="Download the retrieval models and exit",
    )
    parser.add_argument(
        "--config",
        type=str,
        help="Path to config.yaml (overrides auto-discovery)",
    )
    parser.add_argument(
        "--model-cache-dir",
        type=str,
        help="Directory to download/cache models (overrides config)",
    )
    args = parser.parse_args()

    try:
        build_index(
            download_only=args.download_models,
            config_path=Path(args.config) if args.config else None,
            model_cache_dir=Path(args.model_cache_dir) if args.model_cache_dir else None,
        )
    except Exception as e:
        logger.error(f"Indexing failed: {e}", exc_info=True)
        raise