agent-brain-rag 2.0.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,40 +1,52 @@
-"""Indexing endpoints for document processing."""
+"""Indexing endpoints for document processing with job queue support."""
 
 import os
 from pathlib import Path
 
-from fastapi import APIRouter, HTTPException, Request, status
+from fastapi import APIRouter, HTTPException, Query, Request, status
 
+from agent_brain_server.config import settings
 from agent_brain_server.models import IndexRequest, IndexResponse
 
 router = APIRouter()
 
+# Maximum queue length for backpressure
+MAX_QUEUE_LENGTH = settings.AGENT_BRAIN_MAX_QUEUE
+
 
 @router.post(
     "/",
     response_model=IndexResponse,
     status_code=status.HTTP_202_ACCEPTED,
     summary="Index Documents",
-    description="Start indexing documents from a folder.",
+    description="Enqueue a job to index documents from a folder.",
 )
 async def index_documents(
-    request_body: IndexRequest, request: Request
+    request_body: IndexRequest,
+    request: Request,
+    force: bool = Query(False, description="Bypass deduplication and force a new job"),
+    allow_external: bool = Query(
+        False, description="Allow paths outside the project directory"
+    ),
 ) -> IndexResponse:
-    """Start indexing documents from the specified folder.
+    """Enqueue an indexing job for documents from the specified folder.
 
-    This endpoint initiates a background indexing job and returns immediately.
-    Use the /health/status endpoint to monitor progress.
+    This endpoint accepts the request and returns immediately with a job ID.
+    The job is processed asynchronously by a background worker.
+    Use the /index/jobs/{job_id} endpoint to monitor progress.
 
     Args:
         request_body: IndexRequest with folder_path and optional configuration.
         request: FastAPI request for accessing app state.
+        force: If True, bypass deduplication and create a new job.
+        allow_external: If True, allow indexing paths outside the project.
 
     Returns:
         IndexResponse with job_id and status.
 
     Raises:
-        400: Invalid folder path
-        409: Indexing already in progress
+        400: Invalid folder path or path outside project (without allow_external)
+        429: Queue is full (backpressure)
     """
     # Validate folder path
     folder_path = Path(request_body.folder_path).expanduser().resolve()
@@ -57,17 +69,20 @@ async def index_documents(
             detail=f"Cannot read folder: {request_body.folder_path}",
         )
 
-    # Get indexing service from app state
-    indexing_service = request.app.state.indexing_service
+    # Get job service from app state
+    job_service = request.app.state.job_service
 
-    # Check if already indexing
-    if indexing_service.is_indexing:
+    # Backpressure check (pending + running to prevent overflow)
+    stats = await job_service.get_queue_stats()
+    active_jobs = stats.pending + stats.running
+    if active_jobs >= MAX_QUEUE_LENGTH:
         raise HTTPException(
-            status_code=status.HTTP_409_CONFLICT,
-            detail="Indexing already in progress. Please wait for completion.",
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail=f"Queue full ({stats.pending} pending, {stats.running} running). "
+            "Try again later.",
         )
 
-    # Start indexing
+    # Enqueue the job
     try:
         # Update request with resolved path
         resolved_request = IndexRequest(
@@ -82,17 +97,37 @@ async def index_documents(
             exclude_patterns=request_body.exclude_patterns,
             generate_summaries=request_body.generate_summaries,
         )
-        job_id = await indexing_service.start_indexing(resolved_request)
+
+        result = await job_service.enqueue_job(
+            request=resolved_request,
+            operation="index",
+            force=force,
+            allow_external=allow_external,
+        )
+    except ValueError as e:
+        # Path validation error (outside project)
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=str(e),
+        ) from e
     except Exception as e:
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Failed to start indexing: {str(e)}",
+            detail=f"Failed to enqueue indexing job: {str(e)}",
         ) from e
 
+    # Build response message
+    if result.dedupe_hit:
+        message = (
+            f"Duplicate detected - existing job {result.job_id} is {result.status}"
+        )
+    else:
+        message = f"Job queued for {request_body.folder_path}"
+
     return IndexResponse(
-        job_id=job_id,
-        status="started",
-        message=f"Indexing started for {request_body.folder_path}",
+        job_id=result.job_id,
+        status=result.status,
+        message=message,
     )
 
 
@@ -101,10 +136,17 @@ async def index_documents(
     response_model=IndexResponse,
     status_code=status.HTTP_202_ACCEPTED,
     summary="Add Documents",
-    description="Add documents from another folder to the existing index.",
+    description="Enqueue a job to add documents from another folder.",
 )
-async def add_documents(request_body: IndexRequest, request: Request) -> IndexResponse:
-    """Add documents from a new folder to the existing index.
+async def add_documents(
+    request_body: IndexRequest,
+    request: Request,
+    force: bool = Query(False, description="Bypass deduplication and force a new job"),
+    allow_external: bool = Query(
+        False, description="Allow paths outside the project directory"
+    ),
+) -> IndexResponse:
+    """Enqueue a job to add documents from a new folder to the existing index.
 
     This is similar to the index endpoint but adds to the existing
     vector store instead of replacing it.
@@ -112,6 +154,8 @@ async def add_documents(request_body: IndexRequest, request: Request) -> IndexRe
     Args:
         request_body: IndexRequest with folder_path and optional configuration.
         request: FastAPI request for accessing app state.
+        force: If True, bypass deduplication and create a new job.
+        allow_external: If True, allow indexing paths outside the project.
 
     Returns:
         IndexResponse with job_id and status.
@@ -131,12 +175,17 @@ async def add_documents(request_body: IndexRequest, request: Request) -> IndexRe
             detail=f"Path is not a directory: {request_body.folder_path}",
         )
 
-    indexing_service = request.app.state.indexing_service
+    # Get job service from app state
+    job_service = request.app.state.job_service
 
-    if indexing_service.is_indexing:
+    # Backpressure check (pending + running to prevent overflow)
+    stats = await job_service.get_queue_stats()
+    active_jobs = stats.pending + stats.running
+    if active_jobs >= MAX_QUEUE_LENGTH:
         raise HTTPException(
-            status_code=status.HTTP_409_CONFLICT,
-            detail="Indexing already in progress. Please wait for completion.",
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail=f"Queue full ({stats.pending} pending, {stats.running} running). "
+            "Try again later.",
         )
 
     try:
@@ -151,17 +200,36 @@ async def add_documents(request_body: IndexRequest, request: Request) -> IndexRe
             include_patterns=request_body.include_patterns,
             exclude_patterns=request_body.exclude_patterns,
         )
-        job_id = await indexing_service.start_indexing(resolved_request)
+
+        result = await job_service.enqueue_job(
+            request=resolved_request,
+            operation="add",
+            force=force,
+            allow_external=allow_external,
+        )
+    except ValueError as e:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=str(e),
+        ) from e
     except Exception as e:
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Failed to add documents: {str(e)}",
+            detail=f"Failed to enqueue add job: {str(e)}",
        ) from e
 
+    # Build response message
+    if result.dedupe_hit:
+        message = (
+            f"Duplicate detected - existing job {result.job_id} is {result.status}"
+        )
+    else:
+        message = f"Job queued to add documents from {request_body.folder_path}"
+
     return IndexResponse(
-        job_id=job_id,
-        status="started",
-        message=f"Adding documents from {request_body.folder_path}",
+        job_id=result.job_id,
+        status=result.status,
+        message=message,
     )
 
 
@@ -175,6 +243,7 @@ async def reset_index(request: Request) -> IndexResponse:
    """Reset the index by deleting all stored documents.
 
     Warning: This permanently removes all indexed content.
+    Cannot be performed while jobs are running.
 
     Args:
         request: FastAPI request for accessing app state.
@@ -183,14 +252,17 @@ async def reset_index(request: Request) -> IndexResponse:
         IndexResponse confirming the reset.
 
     Raises:
-        409: Indexing in progress
+        409: Jobs in progress
     """
+    job_service = request.app.state.job_service
     indexing_service = request.app.state.indexing_service
 
-    if indexing_service.is_indexing:
+    # Check if any jobs are running
+    stats = await job_service.get_queue_stats()
+    if stats.running > 0:
         raise HTTPException(
             status_code=status.HTTP_409_CONFLICT,
-            detail="Cannot reset while indexing is in progress.",
+            detail="Cannot reset while indexing jobs are in progress.",
         )
 
     try:
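
For callers, the practical change in the endpoints above is that both indexing routes now answer 202 with a job ID and signal overload with 429 instead of 409. A minimal client-side sketch, assuming the router is mounted at /index on localhost:8000 (the mount prefix is not shown in this diff):

    import httpx

    resp = httpx.post(
        "http://localhost:8000/index/",
        params={"force": False, "allow_external": False},  # new query parameters
        json={"folder_path": "./docs"},
    )
    if resp.status_code == 429:
        print("Queue full, retry later")  # backpressure replaces the old 409
    else:
        body = resp.json()
        print(body["job_id"], body["status"])  # poll /index/jobs/{job_id} for progress
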
@@ -0,0 +1,111 @@
+"""Job management endpoints for indexing job queue."""
+
+from typing import Any
+
+from fastapi import APIRouter, HTTPException, Query, Request, status
+
+from agent_brain_server.job_queue.job_service import JobQueueService
+from agent_brain_server.models.job import JobDetailResponse, JobListResponse
+
+router = APIRouter()
+
+
+@router.get(
+    "/",
+    response_model=JobListResponse,
+    summary="List Jobs",
+    description="List all indexing jobs with pagination.",
+)
+async def list_jobs(
+    request: Request,
+    limit: int = Query(
+        50, ge=1, le=100, description="Maximum number of jobs to return"
+    ),
+    offset: int = Query(0, ge=0, description="Number of jobs to skip"),
+) -> JobListResponse:
+    """List all jobs with pagination.
+
+    Returns a paginated list of jobs with summary information and queue statistics.
+
+    Args:
+        request: FastAPI request for accessing app state.
+        limit: Maximum number of jobs to return (1-100, default 50).
+        offset: Number of jobs to skip for pagination (default 0).
+
+    Returns:
+        JobListResponse with list of job summaries and queue statistics.
+    """
+    job_service: JobQueueService = request.app.state.job_service
+    return await job_service.list_jobs(limit=limit, offset=offset)
+
+
+@router.get(
+    "/{job_id}",
+    response_model=JobDetailResponse,
+    summary="Get Job Details",
+    description="Get detailed information about a specific job.",
+)
+async def get_job(job_id: str, request: Request) -> JobDetailResponse:
+    """Get details for a specific job.
+
+    Returns full job information including progress, timestamps, and results.
+
+    Args:
+        job_id: The unique job identifier.
+        request: FastAPI request for accessing app state.
+
+    Returns:
+        JobDetailResponse with full job details.
+
+    Raises:
+        404: Job not found.
+    """
+    job_service: JobQueueService = request.app.state.job_service
+    job = await job_service.get_job(job_id)
+    if not job:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"Job {job_id} not found",
+        )
+    return job
+
+
+@router.delete(
+    "/{job_id}",
+    summary="Cancel Job",
+    description="Cancel a pending or running job.",
+)
+async def cancel_job(job_id: str, request: Request) -> dict[str, Any]:
+    """Cancel a job.
+
+    Cancellation behavior depends on job status:
+    - PENDING jobs are cancelled immediately
+    - RUNNING jobs have cancel_requested flag set; worker will stop at next checkpoint
+    - Completed/Failed/Cancelled jobs return 409 Conflict
+
+    Args:
+        job_id: The unique job identifier.
+        request: FastAPI request for accessing app state.
+
+    Returns:
+        Dictionary with cancellation status and message.
+
+    Raises:
+        404: Job not found.
+        409: Job cannot be cancelled (already completed, failed, or cancelled).
+    """
+    job_service: JobQueueService = request.app.state.job_service
+
+    try:
+        result = await job_service.cancel_job(job_id)
+        return result
+    except KeyError as e:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=str(e),
+        ) from e
+    except ValueError as e:
+        raise HTTPException(
+            status_code=status.HTTP_409_CONFLICT,
+            detail=str(e),
+        ) from e
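
A hedged polling-and-cancellation sketch against these new job endpoints, again assuming a /index/jobs mount and illustrative status strings (the exact job-state values are not shown in this diff):

    import time

    import httpx

    BASE = "http://localhost:8000/index/jobs"  # assumed mount point

    job_id = "abc123"  # hypothetical ID returned by POST /index/
    while True:
        job = httpx.get(f"{BASE}/{job_id}").json()
        if job["status"] not in ("pending", "running"):  # assumed status values
            break
        time.sleep(2)

    # Pending or running jobs can be cancelled; finished jobs return 409.
    httpx.delete(f"{BASE}/{job_id}")
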
@@ -32,6 +32,10 @@ class EmbeddingConfig(BaseModel):
         default="text-embedding-3-large",
         description="Model name for embeddings",
     )
+    api_key: Optional[str] = Field(
+        default=None,
+        description="API key (alternative to api_key_env for local config files)",
+    )
     api_key_env: Optional[str] = Field(
         default="OPENAI_API_KEY",
         description="Environment variable name containing API key",
@@ -58,13 +62,21 @@ class EmbeddingConfig(BaseModel):
         return EmbeddingProviderType(v)
 
     def get_api_key(self) -> Optional[str]:
-        """Resolve API key from environment variable.
+        """Resolve API key from config or environment variable.
+
+        Resolution order:
+        1. api_key field in config (direct value)
+        2. Environment variable specified by api_key_env
 
         Returns:
             API key value or None if not found/not needed
         """
         if self.provider == EmbeddingProviderType.OLLAMA:
             return None  # Ollama doesn't need API key
+        # Check direct api_key first
+        if self.api_key:
+            return self.api_key
+        # Fall back to environment variable
         if self.api_key_env:
             return os.getenv(self.api_key_env)
         return None
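
The resolution order can be illustrated with a small sketch, assuming the remaining EmbeddingConfig fields keep their defaults and the default provider is one that requires a key:

    import os

    os.environ["OPENAI_API_KEY"] = "sk-from-env"

    cfg = EmbeddingConfig(api_key="sk-direct", api_key_env="OPENAI_API_KEY")
    assert cfg.get_api_key() == "sk-direct"    # direct value wins

    cfg = EmbeddingConfig(api_key_env="OPENAI_API_KEY")
    assert cfg.get_api_key() == "sk-from-env"  # falls back to the environment
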
@@ -93,6 +105,10 @@ class SummarizationConfig(BaseModel):
         default="claude-haiku-4-5-20251001",
         description="Model name for summarization",
     )
+    api_key: Optional[str] = Field(
+        default=None,
+        description="API key (alternative to api_key_env for local config files)",
+    )
     api_key_env: Optional[str] = Field(
         default="ANTHROPIC_API_KEY",
         description="Environment variable name containing API key",
@@ -119,13 +135,21 @@ class SummarizationConfig(BaseModel):
         return SummarizationProviderType(v)
 
     def get_api_key(self) -> Optional[str]:
-        """Resolve API key from environment variable.
+        """Resolve API key from config or environment variable.
+
+        Resolution order:
+        1. api_key field in config (direct value)
+        2. Environment variable specified by api_key_env
 
         Returns:
             API key value or None if not found/not needed
         """
         if self.provider == SummarizationProviderType.OLLAMA:
             return None  # Ollama doesn't need API key
+        # Check direct api_key first
+        if self.api_key:
+            return self.api_key
+        # Fall back to environment variable
         if self.api_key_env:
             return os.getenv(self.api_key_env)
         return None
@@ -162,40 +186,60 @@ def _find_config_file() -> Optional[Path]:
     """Find the configuration file in standard locations.
 
     Search order:
-    1. DOC_SERVE_CONFIG environment variable
-    2. Current directory config.yaml
-    3. State directory config.yaml (if DOC_SERVE_STATE_DIR set)
-    4. Project root config.yaml
+    1. AGENT_BRAIN_CONFIG environment variable
+    2. State directory config.yaml (if AGENT_BRAIN_STATE_DIR or DOC_SERVE_STATE_DIR set)
+    3. Current directory config.yaml
+    4. Walk up from CWD looking for .claude/agent-brain/config.yaml
+    5. User home ~/.agent-brain/config.yaml
+    6. XDG config ~/.config/agent-brain/config.yaml
 
     Returns:
         Path to config file or None if not found
     """
     # 1. Environment variable override
-    env_config = os.getenv("DOC_SERVE_CONFIG")
+    env_config = os.getenv("AGENT_BRAIN_CONFIG")
     if env_config:
         path = Path(env_config)
         if path.exists():
+            logger.debug(f"Found config via AGENT_BRAIN_CONFIG: {path}")
             return path
-        logger.warning(f"DOC_SERVE_CONFIG points to non-existent file: {env_config}")
-
-    # 2. Current directory
-    cwd_config = Path.cwd() / "config.yaml"
-    if cwd_config.exists():
-        return cwd_config
+        logger.warning(f"AGENT_BRAIN_CONFIG points to non-existent file: {env_config}")
 
-    # 3. State directory
-    state_dir = os.getenv("DOC_SERVE_STATE_DIR")
+    # 2. State directory (check both new and legacy env vars)
+    state_dir = os.getenv("AGENT_BRAIN_STATE_DIR") or os.getenv("DOC_SERVE_STATE_DIR")
     if state_dir:
         state_config = Path(state_dir) / "config.yaml"
         if state_config.exists():
+            logger.debug(f"Found config in state directory: {state_config}")
             return state_config
 
-    # 4. .claude/doc-serve directory (project root pattern)
-    claude_dir = Path.cwd() / ".claude" / "doc-serve"
-    if claude_dir.exists():
-        claude_config = claude_dir / "config.yaml"
+    # 3. Current directory
+    cwd_config = Path.cwd() / "config.yaml"
+    if cwd_config.exists():
+        logger.debug(f"Found config in current directory: {cwd_config}")
+        return cwd_config
+
+    # 4. Walk up from CWD looking for .claude/agent-brain/config.yaml
+    current = Path.cwd()
+    root = Path(current.anchor)
+    while current != root:
+        claude_config = current / ".claude" / "agent-brain" / "config.yaml"
         if claude_config.exists():
+            logger.debug(f"Found config walking up from CWD: {claude_config}")
             return claude_config
+        current = current.parent
+
+    # 5. User home directory ~/.agent-brain/config.yaml
+    home_config = Path.home() / ".agent-brain" / "config.yaml"
+    if home_config.exists():
+        logger.debug(f"Found config in home directory: {home_config}")
+        return home_config
+
+    # 6. XDG config directory ~/.config/agent-brain/config.yaml
+    xdg_config = Path.home() / ".config" / "agent-brain" / "config.yaml"
+    if xdg_config.exists():
+        logger.debug(f"Found config in XDG config directory: {xdg_config}")
+        return xdg_config
 
     return None
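
Because the lookup now walks up from the working directory and also checks home and XDG locations, the quickest way to make discovery deterministic is still the explicit override. A sketch with a hypothetical path:

    import os

    os.environ["AGENT_BRAIN_CONFIG"] = "/etc/agent-brain/config.yaml"  # hypothetical path

    config_path = _find_config_file()  # returns this path if the file exists,
                                       # otherwise logs a warning and keeps searching
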
 
@@ -31,7 +31,7 @@ class Settings(BaseSettings):
     # Chroma Configuration
     CHROMA_PERSIST_DIR: str = "./chroma_db"
     BM25_INDEX_PATH: str = "./bm25_index"
-    COLLECTION_NAME: str = "doc_serve_collection"
+    COLLECTION_NAME: str = "agent_brain_collection"
 
     # Chunking Configuration
     DEFAULT_CHUNK_SIZE: int = 512
@@ -48,8 +48,8 @@ class Settings(BaseSettings):
     EMBEDDING_BATCH_SIZE: int = 100
 
     # Multi-instance Configuration
-    DOC_SERVE_STATE_DIR: Optional[str] = None  # Override state directory
-    DOC_SERVE_MODE: str = "project"  # "project" or "shared"
+    AGENT_BRAIN_STATE_DIR: Optional[str] = None  # Override state directory
+    AGENT_BRAIN_MODE: str = "project"  # "project" or "shared"
 
     # GraphRAG Configuration (Feature 113)
     ENABLE_GRAPH_INDEX: bool = False  # Master switch for graph indexing
@@ -62,11 +62,17 @@ class Settings(BaseSettings):
     GRAPH_TRAVERSAL_DEPTH: int = 2  # Depth for graph traversal in queries
     GRAPH_RRF_K: int = 60  # Reciprocal Rank Fusion constant for multi-retrieval
 
+    # Job Queue Configuration (Feature 115)
+    AGENT_BRAIN_MAX_QUEUE: int = 100  # Max pending jobs in queue
+    AGENT_BRAIN_JOB_TIMEOUT: int = 7200  # Job timeout in seconds (2 hours)
+    AGENT_BRAIN_MAX_RETRIES: int = 3  # Max retries for failed jobs
+    AGENT_BRAIN_CHECKPOINT_INTERVAL: int = 50  # Progress checkpoint every N files
+
     model_config = SettingsConfigDict(
         env_file=[
             ".env",  # Current directory
             Path(__file__).parent.parent.parent / ".env",  # Project root
-            Path(__file__).parent.parent / ".env",  # doc-serve-server directory
+            Path(__file__).parent.parent / ".env",  # agent-brain-server directory
         ],
         env_file_encoding="utf-8",
         case_sensitive=True,
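
Since Settings is a case-sensitive pydantic BaseSettings class, the new job-queue knobs can be tuned through environment variables (or the listed .env files). A sketch, assuming the settings object is constructed after the variables are set:

    import os

    os.environ["AGENT_BRAIN_MAX_QUEUE"] = "10"     # cap pending + running jobs at 10
    os.environ["AGENT_BRAIN_JOB_TIMEOUT"] = "600"  # 10-minute job timeout

    settings = Settings()
    assert settings.AGENT_BRAIN_MAX_QUEUE == 10
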
@@ -89,10 +89,23 @@ class BM25IndexManager:
         if not self._retriever:
             raise RuntimeError("BM25 index not initialized")
 
-        # BM25Retriever similarity_top_k is usually set during initialization.
-        self._retriever.similarity_top_k = top_k
+        # Cap top_k to corpus size to avoid bm25s "k larger than available scores" error
+        corpus_size = len(self._retriever.corpus) if self._retriever.corpus else 0
+        if corpus_size > 0:
+            effective_top_k = min(top_k, corpus_size)
+        else:
+            effective_top_k = top_k
+
+        self._retriever.similarity_top_k = effective_top_k
         return self._retriever
 
+    @property
+    def corpus_size(self) -> int:
+        """Get the number of documents in the BM25 index."""
+        if not self._retriever or not self._retriever.corpus:
+            return 0
+        return len(self._retriever.corpus)
+
     async def search_with_filters(
         self,
         query: str,
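
The capping rule above is easy to state in isolation: never request more results than the corpus holds, and leave the request alone when the corpus size is unknown or zero.

    def cap_top_k(top_k: int, corpus_size: int) -> int:
        """Mirror of the capping logic above, for illustration."""
        return min(top_k, corpus_size) if corpus_size > 0 else top_k

    assert cap_top_k(10, 3) == 3      # small corpus: request shrinks
    assert cap_top_k(10, 500) == 10   # large corpus: request unchanged
    assert cap_top_k(10, 0) == 10     # empty corpus: left as-is
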
@@ -1,5 +1,6 @@
 """Document loading from various file formats using LlamaIndex."""
 
+import asyncio
 import logging
 import re
 from dataclasses import dataclass, field
@@ -272,9 +273,30 @@ class DocumentLoader:
 
     SUPPORTED_EXTENSIONS: set[str] = DOCUMENT_EXTENSIONS | CODE_EXTENSIONS
 
+    # Default directories to exclude from indexing
+    DEFAULT_EXCLUDE_PATTERNS: list[str] = [
+        "**/node_modules/**",
+        "**/__pycache__/**",
+        "**/.venv/**",
+        "**/venv/**",
+        "**/.git/**",
+        "**/dist/**",
+        "**/build/**",
+        "**/target/**",
+        "**/.next/**",
+        "**/.nuxt/**",
+        "**/coverage/**",
+        "**/.pytest_cache/**",
+        "**/.mypy_cache/**",
+        "**/.tox/**",
+        "**/egg-info/**",
+        "**/*.egg-info/**",
+    ]
+
     def __init__(
         self,
         supported_extensions: Optional[set[str]] = None,
+        exclude_patterns: Optional[list[str]] = None,
     ):
         """
         Initialize the document loader.
@@ -282,8 +304,15 @@ class DocumentLoader:
         Args:
             supported_extensions: Set of file extensions to load.
                 Defaults to SUPPORTED_EXTENSIONS.
+            exclude_patterns: List of glob patterns to exclude.
+                Defaults to DEFAULT_EXCLUDE_PATTERNS.
         """
         self.extensions = supported_extensions or self.SUPPORTED_EXTENSIONS
+        self.exclude_patterns = (
+            exclude_patterns
+            if exclude_patterns is not None
+            else self.DEFAULT_EXCLUDE_PATTERNS
+        )
 
     async def load_from_folder(
         self,
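
Note that only None falls back to DEFAULT_EXCLUDE_PATTERNS, so an empty list disables exclusion entirely. A usage sketch (the import path is assumed, not shown in this diff):

    from agent_brain_server.indexing.document_loader import DocumentLoader  # assumed path

    # Extend the defaults rather than replace them.
    loader = DocumentLoader(
        exclude_patterns=DocumentLoader.DEFAULT_EXCLUDE_PATTERNS + ["**/fixtures/**"],
    )

    # An empty list means "exclude nothing".
    unfiltered = DocumentLoader(exclude_patterns=[])
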
@@ -313,16 +342,24 @@ class DocumentLoader:
             raise ValueError(f"Path is not a directory: {folder_path}")
 
         logger.info(f"Loading documents from: {folder_path} (recursive={recursive})")
+        if self.exclude_patterns:
+            logger.info(
+                f"Excluding patterns: {self.exclude_patterns[:3]}... "
+                f"({len(self.exclude_patterns)} total)"
+            )
 
         # Use LlamaIndex's SimpleDirectoryReader
+        # Run in thread pool to avoid blocking the event loop
         try:
             reader = SimpleDirectoryReader(
                 input_dir=str(path),
                 recursive=recursive,
                 required_exts=list(self.extensions),
+                exclude=self.exclude_patterns,
                 filename_as_id=True,
             )
-            llama_documents: list[Document] = reader.load_data()
+            # reader.load_data() is blocking I/O - run in thread pool
+            llama_documents: list[Document] = await asyncio.to_thread(reader.load_data)
         except Exception as e:
             logger.error(f"Failed to load documents: {e}")
             raise
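
The same pattern applies to any blocking loader call: hand it to a worker thread so the event loop keeps serving requests while files are read.

    import asyncio

    def blocking_read(path: str) -> str:
        with open(path, encoding="utf-8") as fh:  # ordinary blocking file I/O
            return fh.read()

    async def read_async(path: str) -> str:
        # Equivalent shape to the reader.load_data() call above.
        return await asyncio.to_thread(blocking_read, path)
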
@@ -398,7 +435,8 @@ class DocumentLoader:
             input_files=[str(path)],
             filename_as_id=True,
         )
-        docs = reader.load_data()
+        # Run in thread pool to avoid blocking the event loop
+        docs = await asyncio.to_thread(reader.load_data)
 
         if not docs:
             raise ValueError(f"No content loaded from file: {file_path}")
@@ -456,8 +494,11 @@ class DocumentLoader:
         # Use only document extensions
         effective_extensions = self.DOCUMENT_EXTENSIONS
 
-        # Create a temporary loader with the effective extensions
-        temp_loader = DocumentLoader(supported_extensions=effective_extensions)
+        # Create a temporary loader with the effective extensions and exclude patterns
+        temp_loader = DocumentLoader(
+            supported_extensions=effective_extensions,
+            exclude_patterns=self.exclude_patterns,
+        )
 
         # Load files using the configured extensions
         loaded_docs = await temp_loader.load_from_folder(folder_path, recursive)