agent-brain-rag 1.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {agent_brain_rag-1.2.0.dist-info → agent_brain_rag-3.0.0.dist-info}/METADATA +55 -18
  2. agent_brain_rag-3.0.0.dist-info/RECORD +56 -0
  3. {agent_brain_rag-1.2.0.dist-info → agent_brain_rag-3.0.0.dist-info}/WHEEL +1 -1
  4. {agent_brain_rag-1.2.0.dist-info → agent_brain_rag-3.0.0.dist-info}/entry_points.txt +0 -1
  5. agent_brain_server/__init__.py +1 -1
  6. agent_brain_server/api/main.py +146 -45
  7. agent_brain_server/api/routers/__init__.py +2 -0
  8. agent_brain_server/api/routers/health.py +85 -21
  9. agent_brain_server/api/routers/index.py +108 -36
  10. agent_brain_server/api/routers/jobs.py +111 -0
  11. agent_brain_server/config/provider_config.py +352 -0
  12. agent_brain_server/config/settings.py +22 -5
  13. agent_brain_server/indexing/__init__.py +21 -0
  14. agent_brain_server/indexing/bm25_index.py +15 -2
  15. agent_brain_server/indexing/document_loader.py +45 -4
  16. agent_brain_server/indexing/embedding.py +86 -135
  17. agent_brain_server/indexing/graph_extractors.py +582 -0
  18. agent_brain_server/indexing/graph_index.py +536 -0
  19. agent_brain_server/job_queue/__init__.py +11 -0
  20. agent_brain_server/job_queue/job_service.py +317 -0
  21. agent_brain_server/job_queue/job_store.py +427 -0
  22. agent_brain_server/job_queue/job_worker.py +434 -0
  23. agent_brain_server/locking.py +101 -8
  24. agent_brain_server/models/__init__.py +28 -0
  25. agent_brain_server/models/graph.py +253 -0
  26. agent_brain_server/models/health.py +30 -3
  27. agent_brain_server/models/job.py +289 -0
  28. agent_brain_server/models/query.py +16 -3
  29. agent_brain_server/project_root.py +1 -1
  30. agent_brain_server/providers/__init__.py +64 -0
  31. agent_brain_server/providers/base.py +251 -0
  32. agent_brain_server/providers/embedding/__init__.py +23 -0
  33. agent_brain_server/providers/embedding/cohere.py +163 -0
  34. agent_brain_server/providers/embedding/ollama.py +150 -0
  35. agent_brain_server/providers/embedding/openai.py +118 -0
  36. agent_brain_server/providers/exceptions.py +95 -0
  37. agent_brain_server/providers/factory.py +157 -0
  38. agent_brain_server/providers/summarization/__init__.py +41 -0
  39. agent_brain_server/providers/summarization/anthropic.py +87 -0
  40. agent_brain_server/providers/summarization/gemini.py +96 -0
  41. agent_brain_server/providers/summarization/grok.py +95 -0
  42. agent_brain_server/providers/summarization/ollama.py +114 -0
  43. agent_brain_server/providers/summarization/openai.py +87 -0
  44. agent_brain_server/runtime.py +2 -2
  45. agent_brain_server/services/indexing_service.py +39 -0
  46. agent_brain_server/services/query_service.py +203 -0
  47. agent_brain_server/storage/__init__.py +18 -2
  48. agent_brain_server/storage/graph_store.py +519 -0
  49. agent_brain_server/storage/vector_store.py +35 -0
  50. agent_brain_server/storage_paths.py +5 -3
  51. agent_brain_rag-1.2.0.dist-info/RECORD +0 -31
agent_brain_server/indexing/graph_extractors.py
@@ -0,0 +1,582 @@
+"""Entity extraction for GraphRAG (Feature 113).
+
+Provides extractors for building the knowledge graph:
+- LLMEntityExtractor: Uses LLM to extract entity-relationship triplets
+- CodeMetadataExtractor: Extracts relationships from code AST metadata
+
+All extractors return GraphTriple objects for graph construction.
+"""
+
+import logging
+import re
+from typing import Any, Optional
+
+from agent_brain_server.config import settings
+from agent_brain_server.models.graph import GraphTriple
+
+logger = logging.getLogger(__name__)
+
+
+class LLMEntityExtractor:
+    """Wrapper for LLM-based entity extraction.
+
+    Uses Claude to extract entity-relationship triplets from text.
+    Implements graceful degradation when LLM is unavailable.
+
+    Attributes:
+        model: The LLM model to use for extraction.
+        max_triplets: Maximum triplets to extract per chunk.
+    """
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        max_triplets: Optional[int] = None,
+    ) -> None:
+        """Initialize LLM entity extractor.
+
+        Args:
+            model: LLM model to use (defaults to settings.GRAPH_EXTRACTION_MODEL).
+            max_triplets: Max triplets per chunk (defaults to settings value).
+        """
+        self.model = model or settings.GRAPH_EXTRACTION_MODEL
+        self.max_triplets = max_triplets or settings.GRAPH_MAX_TRIPLETS_PER_CHUNK
+        self._client: Optional[Any] = None
+
+    def _get_client(self) -> Optional[Any]:
+        """Get or create Anthropic client.
+
+        Returns:
+            Anthropic client or None if unavailable.
+        """
+        if self._client is not None:
+            return self._client
+
+        try:
+            import anthropic
+
+            api_key = settings.ANTHROPIC_API_KEY
+            if not api_key:
+                logger.debug("No Anthropic API key, LLM extraction disabled")
+                return None
+
+            self._client = anthropic.Anthropic(api_key=api_key)
+            return self._client
+        except ImportError:
+            logger.debug("Anthropic SDK not installed, LLM extraction disabled")
+            return None
+        except Exception as e:
+            logger.warning(f"Failed to create Anthropic client: {e}")
+            return None
+
+    def extract_triplets(
+        self,
+        text: str,
+        max_triplets: Optional[int] = None,
+        source_chunk_id: Optional[str] = None,
+    ) -> list[GraphTriple]:
+        """Extract entity-relationship triplets from text using LLM.
+
+        Args:
+            text: Text content to extract entities from.
+            max_triplets: Override for max triplets (uses instance default).
+            source_chunk_id: Optional source chunk ID for provenance.
+
+        Returns:
+            List of GraphTriple objects extracted from text.
+            Returns empty list on failure (graceful degradation).
+        """
+        if not settings.ENABLE_GRAPH_INDEX:
+            return []
+
+        if not settings.GRAPH_USE_LLM_EXTRACTION:
+            logger.debug("LLM extraction disabled in settings")
+            return []
+
+        client = self._get_client()
+        if client is None:
+            return []
+
+        max_count = max_triplets or self.max_triplets
+
+        # Truncate very long text to avoid token limits
+        max_chars = 4000
+        if len(text) > max_chars:
+            text = text[:max_chars] + "..."
+
+        prompt = self._build_extraction_prompt(text, max_count)
+
+        try:
+            response = client.messages.create(
+                model=self.model,
+                max_tokens=1024,
+                messages=[{"role": "user", "content": prompt}],
+            )
+
+            response_text = response.content[0].text
+            triplets = self._parse_triplets(response_text, source_chunk_id)
+
+            logger.debug(f"Extracted {len(triplets)} triplets from text chunk")
+            return triplets
+
+        except Exception as e:
+            logger.warning(f"LLM entity extraction failed: {e}")
+            return []
+
+    def _build_extraction_prompt(self, text: str, max_triplets: int) -> str:
+        """Build the extraction prompt for the LLM.
+
+        Args:
+            text: Text to extract from.
+            max_triplets: Maximum number of triplets to request.
+
+        Returns:
+            Formatted prompt string.
+        """
+        return f"""Extract key entity relationships from the following text.
+Return up to {max_triplets} triplets in the format:
+SUBJECT | SUBJECT_TYPE | PREDICATE | OBJECT | OBJECT_TYPE
+
+Rules:
+- SUBJECT and OBJECT are entity names (classes, functions, concepts, etc.)
+- SUBJECT_TYPE and OBJECT_TYPE are entity types (Class, Function, Module, Concept, etc.)
+- PREDICATE is the relationship (uses, calls, extends, implements, contains, etc.)
+- One triplet per line
+- Only output triplets, no explanations
+
+Text:
+{text}
+
+Triplets:"""
+
+    def _parse_triplets(
+        self,
+        response: str,
+        source_chunk_id: Optional[str] = None,
+    ) -> list[GraphTriple]:
+        """Parse triplets from LLM response.
+
+        Args:
+            response: Raw LLM response text.
+            source_chunk_id: Optional source chunk ID.
+
+        Returns:
+            List of parsed GraphTriple objects.
+        """
+        triplets: list[GraphTriple] = []
+
+        for line in response.strip().split("\n"):
+            line = line.strip()
+            if not line or "|" not in line:
+                continue
+
+            parts = [p.strip() for p in line.split("|")]
+            if len(parts) < 3:
+                continue
+
+            # Handle both 3-part and 5-part formats
+            if len(parts) == 3:
+                subject, predicate, obj = parts
+                subject_type = None
+                object_type = None
+            elif len(parts) >= 5:
+                subject, subject_type, predicate, obj, object_type = parts[:5]
+                # Clean up types
+                subject_type = subject_type if subject_type else None
+                object_type = object_type if object_type else None
+            else:
+                continue
+
+            # Validate and clean
+            if not subject or not predicate or not obj:
+                continue
+
+            try:
+                triplet = GraphTriple(
+                    subject=subject,
+                    subject_type=subject_type,
+                    predicate=predicate,
+                    object=obj,
+                    object_type=object_type,
+                    source_chunk_id=source_chunk_id,
+                )
+                triplets.append(triplet)
+            except Exception as e:
+                logger.debug(f"Failed to create triplet: {e}")
+                continue
+
+        return triplets
+
+
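The pipe-delimited protocol above keeps parsing trivial: _parse_triplets accepts both the 3-part (SUBJECT | PREDICATE | OBJECT) and 5-part forms and silently skips malformed lines, so a sloppy LLM response degrades to fewer triplets rather than an error. A minimal sketch of the round trip, assuming the package is installed and graph extraction is enabled in settings (the sample text and chunk ID are illustrative, not from the package):

    from agent_brain_server.indexing.graph_extractors import LLMEntityExtractor

    extractor = LLMEntityExtractor()  # model and max_triplets fall back to settings
    triplets = extractor.extract_triplets(
        "QueryService calls VectorStore.search to retrieve candidate chunks.",
        source_chunk_id="chunk-0001",  # hypothetical ID, stored for provenance
    )
    for t in triplets:
        # With a well-behaved model: QueryService --calls--> VectorStore.search
        print(f"{t.subject} --{t.predicate}--> {t.object}")

If ENABLE_GRAPH_INDEX is off, the anthropic SDK is missing, or no API key is configured, the same call simply returns an empty list.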
+class CodeMetadataExtractor:
+    """Extract relationships from code AST metadata.
+
+    Analyzes code chunk metadata to extract structural relationships
+    such as imports, containment, and function calls.
+
+    This extractor uses pre-computed AST metadata from the code chunking
+    pipeline, making it fast and deterministic.
+    """
+
+    # Common relationship predicates for code
+    PREDICATE_IMPORTS = "imports"
+    PREDICATE_CONTAINS = "contains"
+    PREDICATE_CALLS = "calls"
+    PREDICATE_EXTENDS = "extends"
+    PREDICATE_IMPLEMENTS = "implements"
+    PREDICATE_DEFINED_IN = "defined_in"
+
+    def __init__(self) -> None:
+        """Initialize code metadata extractor."""
+        pass
+
+    def extract_from_metadata(
+        self,
+        metadata: dict[str, Any],
+        source_chunk_id: Optional[str] = None,
+    ) -> list[GraphTriple]:
+        """Extract import and containment relationships from code metadata.
+
+        Looks for standard code metadata fields:
+        - 'imports': List of imported modules/symbols
+        - 'symbol_name': Name of the current code symbol
+        - 'symbol_type': Type of symbol (function, class, method)
+        - 'parent_symbol': Parent containing symbol
+        - 'file_path': Source file path
+
+        Args:
+            metadata: Code chunk metadata dictionary.
+            source_chunk_id: Optional source chunk ID for provenance.
+
+        Returns:
+            List of GraphTriple objects extracted from metadata.
+        """
+        if not settings.ENABLE_GRAPH_INDEX:
+            return []
+
+        if not settings.GRAPH_USE_CODE_METADATA:
+            return []
+
+        triplets: list[GraphTriple] = []
+
+        symbol_name = metadata.get("symbol_name")
+        symbol_type = metadata.get("symbol_type")
+        parent_symbol = metadata.get("parent_symbol")
+        file_path = metadata.get("file_path") or metadata.get("source")
+        imports = metadata.get("imports", [])
+        class_name = metadata.get("class_name")
+
+        # Extract module name from file path
+        module_name = self._extract_module_name(file_path) if file_path else None
+
+        # 1. Symbol -> imports -> ImportedModule
+        if isinstance(imports, list):
+            for imp in imports:
+                if isinstance(imp, str) and imp:
+                    triplet = GraphTriple(
+                        subject=symbol_name or module_name or "unknown",
+                        subject_type=symbol_type or "Module",
+                        predicate=self.PREDICATE_IMPORTS,
+                        object=imp,
+                        object_type="Module",
+                        source_chunk_id=source_chunk_id,
+                    )
+                    triplets.append(triplet)
+
+        # 2. Parent -> contains -> Symbol
+        if symbol_name and parent_symbol:
+            triplet = GraphTriple(
+                subject=parent_symbol,
+                subject_type="Class" if "." not in parent_symbol else "Module",
+                predicate=self.PREDICATE_CONTAINS,
+                object=symbol_name,
+                object_type=symbol_type or "Symbol",
+                source_chunk_id=source_chunk_id,
+            )
+            triplets.append(triplet)
+
+        # 3. Class -> contains -> Method (for methods)
+        if symbol_name and class_name and symbol_type in ("method", "function"):
+            if class_name != symbol_name:  # Avoid self-reference
+                triplet = GraphTriple(
+                    subject=class_name,
+                    subject_type="Class",
+                    predicate=self.PREDICATE_CONTAINS,
+                    object=symbol_name,
+                    object_type=symbol_type.capitalize(),
+                    source_chunk_id=source_chunk_id,
+                )
+                triplets.append(triplet)
+
+        # 4. Module -> contains -> TopLevelSymbol
+        if module_name and symbol_name and not parent_symbol and not class_name:
+            triplet = GraphTriple(
+                subject=module_name,
+                subject_type="Module",
+                predicate=self.PREDICATE_CONTAINS,
+                object=symbol_name,
+                object_type=symbol_type or "Symbol",
+                source_chunk_id=source_chunk_id,
+            )
+            triplets.append(triplet)
+
+        # 5. Symbol -> defined_in -> Module
+        if symbol_name and module_name:
+            triplet = GraphTriple(
+                subject=symbol_name,
+                subject_type=symbol_type or "Symbol",
+                predicate=self.PREDICATE_DEFINED_IN,
+                object=module_name,
+                object_type="Module",
+                source_chunk_id=source_chunk_id,
+            )
+            triplets.append(triplet)
+
+        logger.debug(
+            f"Extracted {len(triplets)} triplets from code metadata "
+            f"(symbol={symbol_name})"
+        )
+        return triplets
+
+    def _extract_module_name(self, file_path: str) -> Optional[str]:
+        """Extract module name from file path.
+
+        Args:
+            file_path: Path to source file.
+
+        Returns:
+            Module name derived from file path, or None.
+        """
+        if not file_path:
+            return None
+
+        # Remove common prefixes and extensions
+        path = file_path.replace("\\", "/")
+
+        # Get just the filename without extension
+        if "/" in path:
+            path = path.rsplit("/", 1)[-1]
+
+        # Remove extension
+        if "." in path:
+            path = path.rsplit(".", 1)[0]
+
+        # Clean up invalid characters
+        path = re.sub(r"[^a-zA-Z0-9_]", "_", path)
+
+        return path if path else None
+
+    def extract_from_text(
+        self,
+        text: str,
+        language: Optional[str] = None,
+        source_chunk_id: Optional[str] = None,
+    ) -> list[GraphTriple]:
+        """Extract relationships from code text using pattern matching.
+
+        This is a fallback when AST metadata is not available.
+        Uses regex patterns to identify imports and definitions.
+
+        Args:
+            text: Code text content.
+            language: Programming language (python, javascript, etc.).
+            source_chunk_id: Optional source chunk ID.
+
+        Returns:
+            List of GraphTriple objects.
+        """
+        if not settings.ENABLE_GRAPH_INDEX:
+            return []
+
+        triplets: list[GraphTriple] = []
+
+        if not language:
+            return triplets
+
+        language = language.lower()
+
+        # Extract Python imports
+        if language == "python":
+            triplets.extend(self._extract_python_imports(text, source_chunk_id))
+
+        # Extract JavaScript/TypeScript imports
+        elif language in ("javascript", "typescript", "tsx", "jsx"):
+            triplets.extend(self._extract_js_imports(text, source_chunk_id))
+
+        # Extract Java imports
+        elif language == "java":
+            triplets.extend(self._extract_java_imports(text, source_chunk_id))
+
+        # Extract Go imports
+        elif language == "go":
+            triplets.extend(self._extract_go_imports(text, source_chunk_id))
+
+        return triplets
+
+    def _extract_python_imports(
+        self,
+        text: str,
+        source_chunk_id: Optional[str],
+    ) -> list[GraphTriple]:
+        """Extract imports from Python code."""
+        triplets: list[GraphTriple] = []
+
+        # Match: import module
+        for match in re.finditer(r"^import\s+([\w.]+)", text, re.MULTILINE):
+            module = match.group(1)
+            triplets.append(
+                GraphTriple(
+                    subject="current_module",
+                    subject_type="Module",
+                    predicate=self.PREDICATE_IMPORTS,
+                    object=module,
+                    object_type="Module",
+                    source_chunk_id=source_chunk_id,
+                )
+            )
+
+        # Match: from module import ...
+        for match in re.finditer(r"^from\s+([\w.]+)\s+import", text, re.MULTILINE):
+            module = match.group(1)
+            triplets.append(
+                GraphTriple(
+                    subject="current_module",
+                    subject_type="Module",
+                    predicate=self.PREDICATE_IMPORTS,
+                    object=module,
+                    object_type="Module",
+                    source_chunk_id=source_chunk_id,
+                )
+            )
+
+        return triplets
+
+    def _extract_js_imports(
+        self,
+        text: str,
+        source_chunk_id: Optional[str],
+    ) -> list[GraphTriple]:
+        """Extract imports from JavaScript/TypeScript code."""
+        triplets: list[GraphTriple] = []
+
+        # Match: import ... from 'module'
+        for match in re.finditer(r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", text):
+            module = match.group(1)
+            triplets.append(
+                GraphTriple(
+                    subject="current_module",
+                    subject_type="Module",
+                    predicate=self.PREDICATE_IMPORTS,
+                    object=module,
+                    object_type="Module",
+                    source_chunk_id=source_chunk_id,
+                )
+            )
+
+        # Match: require('module')
+        for match in re.finditer(r"require\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", text):
+            module = match.group(1)
+            triplets.append(
+                GraphTriple(
+                    subject="current_module",
+                    subject_type="Module",
+                    predicate=self.PREDICATE_IMPORTS,
+                    object=module,
+                    object_type="Module",
+                    source_chunk_id=source_chunk_id,
+                )
+            )
+
+        return triplets
+
+    def _extract_java_imports(
+        self,
+        text: str,
+        source_chunk_id: Optional[str],
+    ) -> list[GraphTriple]:
+        """Extract imports from Java code."""
+        triplets: list[GraphTriple] = []
+
+        # Match: import package.Class;
+        for match in re.finditer(r"^import\s+([\w.]+);", text, re.MULTILINE):
+            module = match.group(1)
+            triplets.append(
+                GraphTriple(
+                    subject="current_module",
+                    subject_type="Module",
+                    predicate=self.PREDICATE_IMPORTS,
+                    object=module,
+                    object_type="Class",
+                    source_chunk_id=source_chunk_id,
+                )
+            )
+
+        return triplets
+
+    def _extract_go_imports(
+        self,
+        text: str,
+        source_chunk_id: Optional[str],
+    ) -> list[GraphTriple]:
+        """Extract imports from Go code."""
+        triplets: list[GraphTriple] = []
+
+        # Match: import "package"
+        for match in re.finditer(r'import\s+"([^"]+)"', text):
+            module = match.group(1)
+            triplets.append(
+                GraphTriple(
+                    subject="current_module",
+                    subject_type="Module",
+                    predicate=self.PREDICATE_IMPORTS,
+                    object=module,
+                    object_type="Package",
+                    source_chunk_id=source_chunk_id,
+                )
+            )
+
+        # Match imports in parentheses
+        import_block = re.search(r"import\s*\((.*?)\)", text, re.DOTALL)
+        if import_block:
+            for match in re.finditer(r'"([^"]+)"', import_block.group(1)):
+                module = match.group(1)
+                triplets.append(
+                    GraphTriple(
+                        subject="current_module",
+                        subject_type="Module",
+                        predicate=self.PREDICATE_IMPORTS,
+                        object=module,
+                        object_type="Package",
+                        source_chunk_id=source_chunk_id,
+                    )
+                )
+
+        return triplets
+
+
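The metadata path needs no LLM at all: given AST metadata from the chunking pipeline, the five numbered rules above deterministically emit imports, contains, and defined_in edges. A sketch of a single call, assuming ENABLE_GRAPH_INDEX and GRAPH_USE_CODE_METADATA are both enabled (the metadata dict is illustrative, not taken from the package):

    from agent_brain_server.indexing.graph_extractors import CodeMetadataExtractor

    extractor = CodeMetadataExtractor()
    triplets = extractor.extract_from_metadata(
        {
            "symbol_name": "extract_triplets",
            "symbol_type": "method",
            "class_name": "LLMEntityExtractor",
            "file_path": "agent_brain_server/indexing/graph_extractors.py",
            "imports": ["anthropic"],
        },
        source_chunk_id="chunk-0042",  # hypothetical
    )
    # Rules 1, 3, and 5 fire; rules 2 and 4 are skipped (no parent_symbol,
    # and the presence of class_name blocks the module-contains edge):
    #   extract_triplets --imports--> anthropic
    #   LLMEntityExtractor --contains--> extract_triplets
    #   extract_triplets --defined_in--> graph_extractors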
+# Module-level singleton instances
+_llm_extractor: Optional[LLMEntityExtractor] = None
+_code_extractor: Optional[CodeMetadataExtractor] = None
+
+
+def get_llm_extractor() -> LLMEntityExtractor:
+    """Get the global LLM entity extractor instance."""
+    global _llm_extractor
+    if _llm_extractor is None:
+        _llm_extractor = LLMEntityExtractor()
+    return _llm_extractor
+
+
+def get_code_extractor() -> CodeMetadataExtractor:
+    """Get the global code metadata extractor instance."""
+    global _code_extractor
+    if _code_extractor is None:
+        _code_extractor = CodeMetadataExtractor()
+    return _code_extractor
+
+
+def reset_extractors() -> None:
+    """Reset extractor singletons. Used for testing."""
+    global _llm_extractor, _code_extractor
+    _llm_extractor = None
+    _code_extractor = None
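The singleton accessors let the rest of the indexing pipeline share one extractor of each kind, with reset_extractors() available so tests start from a clean state. A sketch of how a caller might combine the two paths, falling back to regex-based extract_from_text when AST metadata is absent (the helper, its name, and the "language" metadata key are assumptions for illustration, not code from graph_index.py):

    from agent_brain_server.indexing.graph_extractors import (
        get_code_extractor,
        get_llm_extractor,
    )

    def triplets_for_chunk(text: str, metadata: dict, chunk_id: str) -> list:
        """Hypothetical helper: gather triplets from every available extractor."""
        code = get_code_extractor()
        triplets = code.extract_from_metadata(metadata, source_chunk_id=chunk_id)
        if not triplets:
            # Regex fallback when the chunker supplied no AST metadata
            triplets = code.extract_from_text(
                text, language=metadata.get("language"), source_chunk_id=chunk_id
            )
        # The LLM pass is additive and returns [] whenever it is disabled
        triplets += get_llm_extractor().extract_triplets(text, source_chunk_id=chunk_id)
        return triplets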