agent-brain-rag 1.2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agent_brain_rag-1.2.0.dist-info → agent_brain_rag-2.0.0.dist-info}/METADATA +54 -16
- agent_brain_rag-2.0.0.dist-info/RECORD +50 -0
- agent_brain_server/__init__.py +1 -1
- agent_brain_server/api/main.py +30 -2
- agent_brain_server/api/routers/health.py +1 -0
- agent_brain_server/config/provider_config.py +308 -0
- agent_brain_server/config/settings.py +12 -1
- agent_brain_server/indexing/__init__.py +21 -0
- agent_brain_server/indexing/embedding.py +86 -135
- agent_brain_server/indexing/graph_extractors.py +582 -0
- agent_brain_server/indexing/graph_index.py +536 -0
- agent_brain_server/models/__init__.py +9 -0
- agent_brain_server/models/graph.py +253 -0
- agent_brain_server/models/health.py +15 -3
- agent_brain_server/models/query.py +14 -1
- agent_brain_server/providers/__init__.py +64 -0
- agent_brain_server/providers/base.py +251 -0
- agent_brain_server/providers/embedding/__init__.py +23 -0
- agent_brain_server/providers/embedding/cohere.py +163 -0
- agent_brain_server/providers/embedding/ollama.py +150 -0
- agent_brain_server/providers/embedding/openai.py +118 -0
- agent_brain_server/providers/exceptions.py +95 -0
- agent_brain_server/providers/factory.py +157 -0
- agent_brain_server/providers/summarization/__init__.py +41 -0
- agent_brain_server/providers/summarization/anthropic.py +87 -0
- agent_brain_server/providers/summarization/gemini.py +96 -0
- agent_brain_server/providers/summarization/grok.py +95 -0
- agent_brain_server/providers/summarization/ollama.py +114 -0
- agent_brain_server/providers/summarization/openai.py +87 -0
- agent_brain_server/services/indexing_service.py +39 -0
- agent_brain_server/services/query_service.py +203 -0
- agent_brain_server/storage/__init__.py +18 -2
- agent_brain_server/storage/graph_store.py +519 -0
- agent_brain_server/storage/vector_store.py +35 -0
- agent_brain_server/storage_paths.py +2 -0
- agent_brain_rag-1.2.0.dist-info/RECORD +0 -31
- {agent_brain_rag-1.2.0.dist-info → agent_brain_rag-2.0.0.dist-info}/WHEEL +0 -0
- {agent_brain_rag-1.2.0.dist-info → agent_brain_rag-2.0.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,582 @@
|
|
|
1
|
+
"""Entity extraction for GraphRAG (Feature 113).
|
|
2
|
+
|
|
3
|
+
Provides extractors for building the knowledge graph:
|
|
4
|
+
- LLMEntityExtractor: Uses LLM to extract entity-relationship triplets
|
|
5
|
+
- CodeMetadataExtractor: Extracts relationships from code AST metadata
|
|
6
|
+
|
|
7
|
+
All extractors return GraphTriple objects for graph construction.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any, Optional
|
|
13
|
+
|
|
14
|
+
from agent_brain_server.config import settings
|
|
15
|
+
from agent_brain_server.models.graph import GraphTriple
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class LLMEntityExtractor:
    """Extract entity-relationship triplets from text via an LLM.

    Wraps the Anthropic SDK: prompts the model for pipe-delimited triplet
    lines and parses the reply into GraphTriple objects. Every failure mode
    (missing SDK, missing API key, API error, malformed reply line) degrades
    to an empty or partial result instead of raising.

    Attributes:
        model: LLM model name used for extraction.
        max_triplets: Default cap on triplets extracted per chunk.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        max_triplets: Optional[int] = None,
    ) -> None:
        """Set up the extractor.

        Args:
            model: LLM model to use; defaults to settings.GRAPH_EXTRACTION_MODEL.
            max_triplets: Per-chunk cap; defaults to
                settings.GRAPH_MAX_TRIPLETS_PER_CHUNK.
        """
        self.model = model or settings.GRAPH_EXTRACTION_MODEL
        self.max_triplets = max_triplets or settings.GRAPH_MAX_TRIPLETS_PER_CHUNK
        # Lazily-created Anthropic client; stays None until first successful build.
        self._client: Optional[Any] = None

    def _get_client(self) -> Optional[Any]:
        """Build (once) and return the Anthropic client.

        Returns:
            The cached client, or None when the SDK or API key is missing
            or client construction fails.
        """
        if self._client is None:
            try:
                import anthropic

                api_key = settings.ANTHROPIC_API_KEY
                if not api_key:
                    logger.debug("No Anthropic API key, LLM extraction disabled")
                    return None
                self._client = anthropic.Anthropic(api_key=api_key)
            except ImportError:
                logger.debug("Anthropic SDK not installed, LLM extraction disabled")
                return None
            except Exception as e:
                logger.warning(f"Failed to create Anthropic client: {e}")
                return None
        return self._client

    def extract_triplets(
        self,
        text: str,
        max_triplets: Optional[int] = None,
        source_chunk_id: Optional[str] = None,
    ) -> list[GraphTriple]:
        """Ask the LLM for triplets describing the given text.

        Args:
            text: Text content to extract entities from.
            max_triplets: Optional override of the instance cap.
            source_chunk_id: Optional chunk ID recorded on each triplet.

        Returns:
            Parsed GraphTriple objects; empty list whenever graph indexing
            or LLM extraction is disabled, or on any failure
            (graceful degradation).
        """
        if not settings.ENABLE_GRAPH_INDEX:
            return []

        if not settings.GRAPH_USE_LLM_EXTRACTION:
            logger.debug("LLM extraction disabled in settings")
            return []

        llm = self._get_client()
        if llm is None:
            return []

        limit = max_triplets or self.max_triplets

        # Keep the prompt within token limits by truncating long chunks.
        max_chars = 4000
        if len(text) > max_chars:
            text = text[:max_chars] + "..."

        prompt = self._build_extraction_prompt(text, limit)

        try:
            response = llm.messages.create(
                model=self.model,
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            parsed = self._parse_triplets(response.content[0].text, source_chunk_id)
            logger.debug(f"Extracted {len(parsed)} triplets from text chunk")
            return parsed
        except Exception as e:
            logger.warning(f"LLM entity extraction failed: {e}")
            return []

    def _build_extraction_prompt(self, text: str, max_triplets: int) -> str:
        """Render the extraction prompt.

        Args:
            text: Text to analyze.
            max_triplets: Cap communicated to the model.

        Returns:
            The full prompt string.
        """
        return f"""Extract key entity relationships from the following text.
Return up to {max_triplets} triplets in the format:
SUBJECT | SUBJECT_TYPE | PREDICATE | OBJECT | OBJECT_TYPE

Rules:
- SUBJECT and OBJECT are entity names (classes, functions, concepts, etc.)
- SUBJECT_TYPE and OBJECT_TYPE are entity types (Class, Function, Module, Concept, etc.)
- PREDICATE is the relationship (uses, calls, extends, implements, contains, etc.)
- One triplet per line
- Only output triplets, no explanations

Text:
{text}

Triplets:"""

    def _parse_triplets(
        self,
        response: str,
        source_chunk_id: Optional[str] = None,
    ) -> list[GraphTriple]:
        """Parse pipe-delimited triplet lines out of the LLM reply.

        Accepts both the 3-field (subject | predicate | object) and the
        5-field (with entity types) layouts; any other field count is skipped.

        Args:
            response: Raw LLM reply text.
            source_chunk_id: Optional chunk ID recorded on each triplet.

        Returns:
            The successfully parsed GraphTriple objects.
        """
        results: list[GraphTriple] = []

        for raw in response.strip().split("\n"):
            raw = raw.strip()
            if not raw or "|" not in raw:
                continue

            fields = [f.strip() for f in raw.split("|")]
            if len(fields) == 3:
                subject, predicate, obj = fields
                subj_type: Optional[str] = None
                obj_type: Optional[str] = None
            elif len(fields) >= 5:
                subject, st, predicate, obj, ot = fields[:5]
                # Empty type fields become None.
                subj_type = st or None
                obj_type = ot or None
            else:
                # Fewer than 3 or exactly 4 fields: ambiguous, skip the line.
                continue

            if not subject or not predicate or not obj:
                continue

            try:
                results.append(
                    GraphTriple(
                        subject=subject,
                        subject_type=subj_type,
                        predicate=predicate,
                        object=obj,
                        object_type=obj_type,
                        source_chunk_id=source_chunk_id,
                    )
                )
            except Exception as e:
                logger.debug(f"Failed to create triplet: {e}")
                continue

        return results
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class CodeMetadataExtractor:
    """Extract relationships from code AST metadata.

    Derives structural triplets (imports, containment, definition site)
    from pre-computed chunk metadata, and falls back to regex-based import
    detection on raw code text when no metadata is available. Fast and
    deterministic; no LLM involved.
    """

    # Common relationship predicates for code
    PREDICATE_IMPORTS = "imports"
    PREDICATE_CONTAINS = "contains"
    PREDICATE_CALLS = "calls"
    PREDICATE_EXTENDS = "extends"
    PREDICATE_IMPLEMENTS = "implements"
    PREDICATE_DEFINED_IN = "defined_in"

    def __init__(self) -> None:
        """Initialize code metadata extractor."""
        pass

    def extract_from_metadata(
        self,
        metadata: dict[str, Any],
        source_chunk_id: Optional[str] = None,
    ) -> list[GraphTriple]:
        """Extract import and containment relationships from code metadata.

        Recognized metadata fields: 'imports', 'symbol_name', 'symbol_type',
        'parent_symbol', 'class_name', and 'file_path' (or 'source').

        Args:
            metadata: Code chunk metadata dictionary.
            source_chunk_id: Optional source chunk ID for provenance.

        Returns:
            List of GraphTriple objects extracted from metadata; empty when
            graph indexing or metadata extraction is disabled.
        """
        if not settings.ENABLE_GRAPH_INDEX:
            return []

        if not settings.GRAPH_USE_CODE_METADATA:
            return []

        out: list[GraphTriple] = []

        sym = metadata.get("symbol_name")
        sym_kind = metadata.get("symbol_type")
        parent = metadata.get("parent_symbol")
        path = metadata.get("file_path") or metadata.get("source")
        imported = metadata.get("imports", [])
        cls = metadata.get("class_name")

        # Derive a module entity name from the source file path.
        module = self._extract_module_name(path) if path else None

        # 1. Symbol -> imports -> ImportedModule
        if isinstance(imported, list):
            for name in imported:
                if isinstance(name, str) and name:
                    out.append(
                        GraphTriple(
                            subject=sym or module or "unknown",
                            subject_type=sym_kind or "Module",
                            predicate=self.PREDICATE_IMPORTS,
                            object=name,
                            object_type="Module",
                            source_chunk_id=source_chunk_id,
                        )
                    )

        # 2. Parent -> contains -> Symbol
        if sym and parent:
            out.append(
                GraphTriple(
                    subject=parent,
                    # Dotted parent names are treated as modules, plain as classes.
                    subject_type="Module" if "." in parent else "Class",
                    predicate=self.PREDICATE_CONTAINS,
                    object=sym,
                    object_type=sym_kind or "Symbol",
                    source_chunk_id=source_chunk_id,
                )
            )

        # 3. Class -> contains -> Method (skip self-reference)
        if sym and cls and sym_kind in ("method", "function") and cls != sym:
            out.append(
                GraphTriple(
                    subject=cls,
                    subject_type="Class",
                    predicate=self.PREDICATE_CONTAINS,
                    object=sym,
                    object_type=sym_kind.capitalize(),
                    source_chunk_id=source_chunk_id,
                )
            )

        # 4. Module -> contains -> TopLevelSymbol (no parent or class)
        if module and sym and not parent and not cls:
            out.append(
                GraphTriple(
                    subject=module,
                    subject_type="Module",
                    predicate=self.PREDICATE_CONTAINS,
                    object=sym,
                    object_type=sym_kind or "Symbol",
                    source_chunk_id=source_chunk_id,
                )
            )

        # 5. Symbol -> defined_in -> Module
        if sym and module:
            out.append(
                GraphTriple(
                    subject=sym,
                    subject_type=sym_kind or "Symbol",
                    predicate=self.PREDICATE_DEFINED_IN,
                    object=module,
                    object_type="Module",
                    source_chunk_id=source_chunk_id,
                )
            )

        logger.debug(
            f"Extracted {len(out)} triplets from code metadata "
            f"(symbol={sym})"
        )
        return out

    def _extract_module_name(self, file_path: str) -> Optional[str]:
        """Derive a module entity name from a file path.

        Args:
            file_path: Path to source file.

        Returns:
            Sanitized module name, or None for an empty path.
        """
        if not file_path:
            return None

        # Normalize separators, then keep the basename without its extension.
        base = file_path.replace("\\", "/").rsplit("/", 1)[-1]
        if "." in base:
            base = base.rsplit(".", 1)[0]

        # Replace anything that is not a valid identifier character.
        base = re.sub(r"[^a-zA-Z0-9_]", "_", base)

        return base if base else None

    def _import_triple(
        self,
        module: str,
        object_type: str,
        source_chunk_id: Optional[str],
    ) -> GraphTriple:
        """Build a 'current_module imports <module>' triplet."""
        return GraphTriple(
            subject="current_module",
            subject_type="Module",
            predicate=self.PREDICATE_IMPORTS,
            object=module,
            object_type=object_type,
            source_chunk_id=source_chunk_id,
        )

    def extract_from_text(
        self,
        text: str,
        language: Optional[str] = None,
        source_chunk_id: Optional[str] = None,
    ) -> list[GraphTriple]:
        """Extract relationships from code text using pattern matching.

        Fallback for when AST metadata is not available; regex-based import
        detection per language.

        Args:
            text: Code text content.
            language: Programming language (python, javascript, etc.).
            source_chunk_id: Optional source chunk ID.

        Returns:
            List of GraphTriple objects; empty when graph indexing is
            disabled, the language is unknown, or no imports match.
        """
        if not settings.ENABLE_GRAPH_INDEX:
            return []

        if not language:
            return []

        # Dispatch on normalized language name.
        handlers = {
            "python": self._extract_python_imports,
            "javascript": self._extract_js_imports,
            "typescript": self._extract_js_imports,
            "tsx": self._extract_js_imports,
            "jsx": self._extract_js_imports,
            "java": self._extract_java_imports,
            "go": self._extract_go_imports,
        }
        handler = handlers.get(language.lower())
        if handler is None:
            return []
        return handler(text, source_chunk_id)

    def _extract_python_imports(
        self,
        text: str,
        source_chunk_id: Optional[str],
    ) -> list[GraphTriple]:
        """Extract imports from Python code."""
        # Both 'import module' and 'from module import ...' forms.
        modules = [
            m.group(1)
            for m in re.finditer(r"^import\s+([\w.]+)", text, re.MULTILINE)
        ]
        modules += [
            m.group(1)
            for m in re.finditer(r"^from\s+([\w.]+)\s+import", text, re.MULTILINE)
        ]
        return [self._import_triple(mod, "Module", source_chunk_id) for mod in modules]

    def _extract_js_imports(
        self,
        text: str,
        source_chunk_id: Optional[str],
    ) -> list[GraphTriple]:
        """Extract imports from JavaScript/TypeScript code."""
        # ES 'import ... from' and CommonJS 'require(...)' forms.
        modules = [
            m.group(1)
            for m in re.finditer(r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]", text)
        ]
        modules += [
            m.group(1)
            for m in re.finditer(r"require\s*\(\s*['\"]([^'\"]+)['\"]\s*\)", text)
        ]
        return [self._import_triple(mod, "Module", source_chunk_id) for mod in modules]

    def _extract_java_imports(
        self,
        text: str,
        source_chunk_id: Optional[str],
    ) -> list[GraphTriple]:
        """Extract imports from Java code."""
        # Match: import package.Class;
        modules = [
            m.group(1)
            for m in re.finditer(r"^import\s+([\w.]+);", text, re.MULTILINE)
        ]
        return [self._import_triple(mod, "Class", source_chunk_id) for mod in modules]

    def _extract_go_imports(
        self,
        text: str,
        source_chunk_id: Optional[str],
    ) -> list[GraphTriple]:
        """Extract imports from Go code."""
        # Single-line form: import "package"
        modules = [m.group(1) for m in re.finditer(r'import\s+"([^"]+)"', text)]

        # Grouped form: import ( "a" "b" ... )
        block = re.search(r"import\s*\((.*?)\)", text, re.DOTALL)
        if block:
            modules += [m.group(1) for m in re.finditer(r'"([^"]+)"', block.group(1))]

        return [self._import_triple(mod, "Package", source_chunk_id) for mod in modules]
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
# Module-level singleton instances, lazily created by the get_* accessors
# below and cleared by reset_extractors().
_llm_extractor: Optional[LLMEntityExtractor] = None
_code_extractor: Optional[CodeMetadataExtractor] = None
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def get_llm_extractor() -> LLMEntityExtractor:
    """Return the shared LLMEntityExtractor, creating it on first call."""
    global _llm_extractor
    extractor = _llm_extractor
    if extractor is None:
        extractor = LLMEntityExtractor()
        _llm_extractor = extractor
    return extractor
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
def get_code_extractor() -> CodeMetadataExtractor:
    """Return the shared CodeMetadataExtractor, creating it on first call."""
    global _code_extractor
    extractor = _code_extractor
    if extractor is None:
        extractor = CodeMetadataExtractor()
        _code_extractor = extractor
    return extractor
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def reset_extractors() -> None:
    """Drop both extractor singletons so the next access builds fresh ones (testing hook)."""
    global _llm_extractor, _code_extractor
    _llm_extractor = _code_extractor = None
|