agent-brain-rag 1.1.0 (agent_brain_rag-1.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,476 @@
+ """Indexing service that orchestrates the document indexing pipeline."""
+
+ import asyncio
+ import logging
+ import os
+ import uuid
+ from collections.abc import Awaitable
+ from datetime import datetime, timezone
+ from typing import Any, Callable, Optional, Union
+
+ from llama_index.core.schema import TextNode
+
+ from doc_serve_server.indexing import (
+     BM25IndexManager,
+     ContextAwareChunker,
+     DocumentLoader,
+     EmbeddingGenerator,
+     get_bm25_manager,
+ )
+ from doc_serve_server.indexing.chunking import CodeChunk, CodeChunker, TextChunk
+ from doc_serve_server.models import IndexingState, IndexingStatusEnum, IndexRequest
+ from doc_serve_server.storage import VectorStoreManager, get_vector_store
+
+ logger = logging.getLogger(__name__)
+
+
+ # Type alias for progress callback
+ ProgressCallback = Callable[[int, int, str], Awaitable[None]]
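+ # A conforming callback receives (current, total, message); a hypothetical
+ # example: async def on_progress(current: int, total: int, message: str) -> None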
+
+
+ class IndexingService:
+     """
+     Orchestrates the document indexing pipeline.
+
+     Coordinates document loading, chunking, embedding generation,
+     and vector store storage with progress tracking.
+     """
+
+     def __init__(
+         self,
+         vector_store: Optional[VectorStoreManager] = None,
+         document_loader: Optional[DocumentLoader] = None,
+         chunker: Optional[ContextAwareChunker] = None,
+         embedding_generator: Optional[EmbeddingGenerator] = None,
+         bm25_manager: Optional[BM25IndexManager] = None,
+     ):
+         """
+         Initialize the indexing service.
+
+         Args:
+             vector_store: Vector store manager instance.
+             document_loader: Document loader instance.
+             chunker: Text chunker instance.
+             embedding_generator: Embedding generator instance.
+             bm25_manager: BM25 index manager instance.
+         """
+         self.vector_store = vector_store or get_vector_store()
+         self.document_loader = document_loader or DocumentLoader()
+         self.chunker = chunker or ContextAwareChunker()
+         self.embedding_generator = embedding_generator or EmbeddingGenerator()
+         self.bm25_manager = bm25_manager or get_bm25_manager()
+
+         # Internal state
+         self._state = IndexingState(
+             current_job_id="",
+             folder_path="",
+             started_at=None,
+             completed_at=None,
+             error=None,
+         )
+         self._lock = asyncio.Lock()
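+         # Serializes start_indexing() and reset() so concurrent callers cannot
+         # race on job state.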
+         self._indexed_folders: set[str] = set()
+         self._total_doc_chunks = 0
+         self._total_code_chunks = 0
+         self._supported_languages: set[str] = set()
+
+     @property
+     def state(self) -> IndexingState:
+         """Get the current indexing state."""
+         return self._state
+
+     @property
+     def is_indexing(self) -> bool:
+         """Check if indexing is currently in progress."""
+         return self._state.is_indexing
+
+     @property
+     def is_ready(self) -> bool:
+         """Check if the system is ready for queries."""
+         return (
+             self.vector_store.is_initialized
+             and not self.is_indexing
+             and self._state.status != IndexingStatusEnum.FAILED
+         )
+
+     async def start_indexing(
+         self,
+         request: IndexRequest,
+         progress_callback: Optional[ProgressCallback] = None,
+     ) -> str:
+         """
+         Start a new indexing job.
+
+         Args:
+             request: IndexRequest with folder path and configuration.
+             progress_callback: Optional callback for progress updates.
+
+         Returns:
+             Job ID for tracking the indexing operation.
+
+         Raises:
+             RuntimeError: If indexing is already in progress.
+         """
+         async with self._lock:
+             if self._state.is_indexing:
+                 raise RuntimeError("Indexing already in progress")
+
+             # Generate job ID and initialize state
+             job_id = f"job_{uuid.uuid4().hex[:12]}"
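+             # Produces short IDs of the form "job_3f9c2a1b7d4e" (12 hex chars).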
+             self._state = IndexingState(
+                 current_job_id=job_id,
+                 status=IndexingStatusEnum.INDEXING,
+                 is_indexing=True,
+                 folder_path=request.folder_path,
+                 started_at=datetime.now(timezone.utc),
+                 completed_at=None,
+                 error=None,
+             )
+
+             logger.info(f"Starting indexing job {job_id} for {request.folder_path}")
+
+             # Run indexing in background
+             asyncio.create_task(
+                 self._run_indexing_pipeline(request, job_id, progress_callback)
+             )
+
+             return job_id
+
+     async def _run_indexing_pipeline(
+         self,
+         request: IndexRequest,
+         job_id: str,
+         progress_callback: Optional[ProgressCallback] = None,
+     ) -> None:
+         """
+         Execute the full indexing pipeline.
+
+         Args:
+             request: Indexing request configuration.
+             job_id: Job identifier for tracking.
+             progress_callback: Optional progress callback.
+         """
+         try:
+             # Ensure vector store is initialized
+             await self.vector_store.initialize()
+
+             # Step 1: Load documents
+             if progress_callback:
+                 await progress_callback(0, 100, "Loading documents...")
+
+             # Normalize folder path to absolute path to avoid duplicates
+             abs_folder_path = os.path.abspath(request.folder_path)
+             logger.info(
+                 f"Normalizing indexing path: {request.folder_path} -> {abs_folder_path}"
+             )
+
+             documents = await self.document_loader.load_files(
+                 abs_folder_path,
+                 recursive=request.recursive,
+                 include_code=request.include_code,
+             )
+
+             self._state.total_documents = len(documents)
+             logger.info(f"Loaded {len(documents)} documents")
+
+             if not documents:
+                 logger.warning(f"No documents found in {request.folder_path}")
+                 self._state.status = IndexingStatusEnum.COMPLETED
+                 self._state.is_indexing = False
+                 self._state.completed_at = datetime.now(timezone.utc)
+                 return
+
+             # Step 2: Chunk documents and code files
+             if progress_callback:
+                 await progress_callback(20, 100, "Chunking documents...")
+
+             # Separate documents by type
+             doc_documents = [
+                 d for d in documents if d.metadata.get("source_type") == "doc"
+             ]
+             code_documents = [
+                 d for d in documents if d.metadata.get("source_type") == "code"
+             ]
+
+             logger.info(
+                 f"Processing {len(doc_documents)} documents and "
+                 f"{len(code_documents)} code files"
+             )
+
+             all_chunks: list[Union[TextChunk, CodeChunk]] = []
+             total_to_process = len(documents)
+
+             # Chunk documents
+             doc_chunker = None
+             if doc_documents:
+                 doc_chunker = ContextAwareChunker(
+                     chunk_size=request.chunk_size,
+                     chunk_overlap=request.chunk_overlap,
+                 )
+
+                 async def doc_chunk_progress(processed: int, total: int) -> None:
+                     self._state.processed_documents = processed
+                     if progress_callback:
+                         pct = 20 + int((processed / total_to_process) * 15)
+                         await progress_callback(
+                             pct, 100, f"Chunking docs: {processed}/{total}"
+                         )
+
+                 doc_chunks = await doc_chunker.chunk_documents(
+                     doc_documents, doc_chunk_progress
+                 )
+                 all_chunks.extend(doc_chunks)
+                 self._total_doc_chunks += len(doc_chunks)
+                 logger.info(f"Created {len(doc_chunks)} document chunks")
+
+             # Chunk code files
+             if code_documents:
+                 # Group code documents by language for efficient chunking
+                 code_by_language: dict[str, list[Any]] = {}
+                 for doc in code_documents:
+                     lang = doc.metadata.get("language", "unknown")
+                     if lang not in code_by_language:
+                         code_by_language[lang] = []
+                     code_by_language[lang].append(doc)
+
+                 # Track total code documents processed across all languages
+                 total_code_processed = 0
+
+                 for lang, lang_docs in code_by_language.items():
+                     if lang == "unknown":
+                         logger.warning(
+                             f"Skipping {len(lang_docs)} code files with unknown "
+                             "language"
+                         )
+                         continue
+
+                     try:
+                         code_chunker = CodeChunker(
+                             language=lang, generate_summaries=request.generate_summaries
+                         )
+
+                         # Create progress callback with fixed offset for this language
+                         def make_progress_callback(
+                             offset: int,
+                         ) -> Callable[[int, int], Awaitable[None]]:
+                             async def progress_callback_fn(
+                                 processed: int,
+                                 total: int,
+                             ) -> None:
+                                 # processed is relative to current language batch
+                                 # Convert to total documents processed across
+                                 # all languages
+                                 total_processed = offset + processed
+                                 self._state.processed_documents = total_processed
+                                 if progress_callback:
+                                     pct = 35 + int(
+                                         (total_processed / total_to_process) * 15
+                                     )
+                                     await progress_callback(
+                                         pct,
+                                         100,
+                                         f"Chunking code: {total_processed}/"
+                                         f"{total_to_process}",
+                                     )
+
+                             return progress_callback_fn
+
+                         # Calculate offset and create callback for this language batch
+                         # Progress callback created but not used in
+                         # current implementation
+                         # progress_offset = len(doc_documents) + total_code_processed
+                         # code_chunk_progress = make_progress_callback(progress_offset)
+
+                         for doc in lang_docs:
+                             code_chunks = await code_chunker.chunk_code_document(doc)
+                             all_chunks.extend(code_chunks)
+                             self._total_code_chunks += len(code_chunks)
+                             self._supported_languages.add(lang)
+
+                         # Update the total code documents processed
+                         total_code_processed += len(lang_docs)
+
+                         chunk_count = sum(
+                             1 for c in all_chunks if c.metadata.language == lang
+                         )
+                         logger.info(f"Created {chunk_count} {lang} chunks")
+
+                     except Exception as e:
+                         logger.error(f"Failed to chunk {lang} files: {e}")
+                         # Fallback: treat as documents
+                         if doc_chunker is not None:  # Reuse doc chunker if available
+                             fallback_chunks = await doc_chunker.chunk_documents(
+                                 lang_docs
+                             )
+                             all_chunks.extend(fallback_chunks)
+                             logger.info(
+                                 f"Fell back to document chunking for "
+                                 f"{len(fallback_chunks)} {lang} files"
+                             )
+                         else:
+                             # Create a temporary chunker for fallback
+                             fallback_chunker = ContextAwareChunker(
+                                 chunk_size=request.chunk_size,
+                                 chunk_overlap=request.chunk_overlap,
+                             )
+                             fallback_chunks = await fallback_chunker.chunk_documents(
+                                 lang_docs
+                             )
+                             all_chunks.extend(fallback_chunks)
+                             logger.info(
+                                 f"Fell back to document chunking for "
+                                 f"{len(fallback_chunks)} {lang} files"
+                             )
+
+             chunks = all_chunks
+             self._state.total_chunks = len(chunks)
+             logger.info(f"Created {len(chunks)} total chunks")
+
+             # Step 3: Generate embeddings
+             if progress_callback:
+                 await progress_callback(50, 100, "Generating embeddings...")
+
+             async def embedding_progress(processed: int, total: int) -> None:
+                 if progress_callback:
+                     pct = 50 + int((processed / total) * 40)
+                     await progress_callback(pct, 100, f"Embedding: {processed}/{total}")
+
+             # The chunks list contains both TextChunk and CodeChunk,
+             # but both are TextChunk subclasses
+             embeddings = await self.embedding_generator.embed_chunks(
+                 chunks,  # type: ignore
+                 embedding_progress,
+             )
+             logger.info(f"Generated {len(embeddings)} embeddings")
+
+             # Step 4: Store in vector database
+             if progress_callback:
+                 await progress_callback(90, 100, "Storing in vector database...")
+
+             # ChromaDB has a max batch size of 41666, so we need to batch our upserts
+             # Use a safe batch size of 40000 to leave some margin
+             chroma_batch_size = 40000
+
+             for batch_start in range(0, len(chunks), chroma_batch_size):
+                 batch_end = min(batch_start + chroma_batch_size, len(chunks))
+                 batch_chunks = chunks[batch_start:batch_end]
+                 batch_embeddings = embeddings[batch_start:batch_end]
+
+                 await self.vector_store.upsert_documents(
+                     ids=[chunk.chunk_id for chunk in batch_chunks],
+                     embeddings=batch_embeddings,
+                     documents=[chunk.text for chunk in batch_chunks],
+                     metadatas=[chunk.metadata.to_dict() for chunk in batch_chunks],
+                 )
+
+                 logger.info(
+                     f"Stored batch {batch_start // chroma_batch_size + 1} "
+                     f"({len(batch_chunks)} chunks) in vector database"
+                 )
+
+             # Step 5: Build BM25 index
+             if progress_callback:
+                 await progress_callback(95, 100, "Building BM25 index...")
+
+             nodes = [
+                 TextNode(
+                     text=chunk.text,
+                     id_=chunk.chunk_id,
+                     metadata=chunk.metadata.to_dict(),
+                 )
+                 for chunk in chunks
+             ]
+             self.bm25_manager.build_index(nodes)
+
+             # Mark as completed
+             self._state.status = IndexingStatusEnum.COMPLETED
+             self._state.completed_at = datetime.now(timezone.utc)
+             self._state.is_indexing = False
+             self._indexed_folders.add(abs_folder_path)
+
+             if progress_callback:
+                 await progress_callback(100, 100, "Indexing complete!")
+
+             logger.info(
+                 f"Indexing job {job_id} completed: "
+                 f"{len(documents)} docs, {len(chunks)} chunks"
+             )
+
+         except Exception as e:
+             logger.error(f"Indexing job {job_id} failed: {e}")
+             self._state.status = IndexingStatusEnum.FAILED
+             self._state.error = str(e)
+             self._state.is_indexing = False
+             raise
+
+         finally:
+             self._state.is_indexing = False
+
+     async def get_status(self) -> dict[str, Any]:
+         """
+         Get current indexing status.
+
+         Returns:
+             Dictionary with status information.
+         """
+         total_chunks = (
+             await self.vector_store.get_count()
+             if self.vector_store.is_initialized
+             else 0
+         )
+
+         # Use the instance variables we've been tracking during indexing
+         total_doc_chunks = self._total_doc_chunks
+         total_code_chunks = self._total_code_chunks
+         supported_languages = sorted(self._supported_languages)
+
+         return {
+             "status": self._state.status.value,
+             "is_indexing": self._state.is_indexing,
+             "current_job_id": self._state.current_job_id,
+             "folder_path": self._state.folder_path,
+             "total_documents": self._state.total_documents,
+             "processed_documents": self._state.processed_documents,
+             "total_chunks": total_chunks,
+             "total_doc_chunks": total_doc_chunks,
+             "total_code_chunks": total_code_chunks,
+             "supported_languages": supported_languages,
+             "progress_percent": self._state.progress_percent,
+             "started_at": (
+                 self._state.started_at.isoformat() if self._state.started_at else None
+             ),
+             "completed_at": (
+                 self._state.completed_at.isoformat()
+                 if self._state.completed_at
+                 else None
+             ),
+             "error": self._state.error,
+             "indexed_folders": sorted(self._indexed_folders),
+         }
+
+     async def reset(self) -> None:
+         """Reset the indexing service and vector store."""
+         async with self._lock:
+             await self.vector_store.reset()
+             self.bm25_manager.reset()
+             self._state = IndexingState(
+                 current_job_id="",
+                 folder_path="",
+                 started_at=None,
+                 completed_at=None,
+                 error=None,
+             )
+             self._indexed_folders.clear()
+             logger.info("Indexing service reset")
+
+
+ # Singleton instance
+ _indexing_service: Optional[IndexingService] = None
+
+
+ def get_indexing_service() -> IndexingService:
+     """Get the global indexing service instance."""
+     global _indexing_service
+     if _indexing_service is None:
+         _indexing_service = IndexingService()
+     return _indexing_service
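
For reference, a minimal sketch of how this service is consumed by a caller. The module's import path inside the wheel and the IndexRequest defaults are assumptions, not shown in this diff:

    # Hypothetical caller: start a job, watch progress, then read final status.
    import asyncio

    from doc_serve_server.models import IndexRequest
    from doc_serve_server.services.indexing import get_indexing_service  # assumed path

    async def main() -> None:
        service = get_indexing_service()

        # Matches the ProgressCallback alias: (current, total, message).
        async def on_progress(current: int, total: int, message: str) -> None:
            print(f"[{current:3d}/{total}] {message}")

        job_id = await service.start_indexing(
            IndexRequest(folder_path="./docs"),  # other fields assumed to default
            progress_callback=on_progress,
        )

        # start_indexing() returns immediately; the pipeline runs as a background task.
        while service.is_indexing:
            await asyncio.sleep(0.5)

        print(job_id, (await service.get_status())["status"])

    asyncio.run(main())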