codebase-retrieval-context-engine 2.0.1__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.2.dist-info}/METADATA +1 -4
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.2.dist-info}/RECORD +13 -13
- corbell/__init__.py +1 -1
- corbell/core/embeddings/model.py +5 -33
- corbell/core/graph/builder.py +55 -15
- corbell/core/graph/method_graph.py +48 -26
- corbell/core/graph/sqlite_store.py +25 -0
- corbell/core/indexing/builder.py +622 -608
- corbell/core/query/engine.py +7 -2
- corbell/core/workspace.py +5 -10
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.2.dist-info}/WHEEL +0 -0
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.2.dist-info}/entry_points.txt +0 -0
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codebase-retrieval-context-engine
|
|
3
|
-
Version: 2.0.1
|
|
3
|
+
Version: 2.0.2
|
|
4
4
|
Summary: Code retrieval engine — hybrid embedding + graph search for LLM context injection.
|
|
5
5
|
Project-URL: Homepage, https://github.com/nullmastermind/local-context-engine
|
|
6
6
|
Project-URL: Repository, https://github.com/nullmastermind/local-context-engine
|
|
@@ -41,9 +41,6 @@ Requires-Dist: anthropic[vertex]>=0.25; extra == 'gcp'
|
|
|
41
41
|
Requires-Dist: google-cloud-aiplatform>=1.38; extra == 'gcp'
|
|
42
42
|
Provides-Extra: google
|
|
43
43
|
Requires-Dist: google-genai>=2.7.0; extra == 'google'
|
|
44
|
-
Provides-Extra: local
|
|
45
|
-
Requires-Dist: sentence-transformers>=3.0; extra == 'local'
|
|
46
|
-
Requires-Dist: transformers<5.0.0; extra == 'local'
|
|
47
44
|
Provides-Extra: openai
|
|
48
45
|
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
49
46
|
Provides-Extra: treesitter
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
corbell/__init__.py,sha256=
|
|
1
|
+
corbell/__init__.py,sha256=DK8C29me67FSOnq2v_CAPc0COnXW4plMGTNHfZvmX5Y,124
|
|
2
2
|
corbell/cli/__init__.py,sha256=5-MP6JIWgp4nDLNIhqP6Gtx97GESaIYg3NGxtRGaMv0,28
|
|
3
3
|
corbell/cli/main.py,sha256=anYpXiyQD6_1wMS0Dtef6Rxtxd0NEFe7HHnerHxf3J4,1835
|
|
4
4
|
corbell/cli/commands/__init__.py,sha256=0mAOs3RWC7XMZnGRN677hjPCHHQKDq9ASjIr_GQM3js,37
|
|
@@ -8,39 +8,39 @@ corbell/core/__init__.py,sha256=VS9PnhHr4NXYlWs1TLCyllnVCNsiwVZ1Xj-AOBhZpAU,29
|
|
|
8
8
|
corbell/core/constants.py,sha256=HTGYpShlp9pP2_a4WngHtTujUQfHcypFAYoaczmkBdQ,1061
|
|
9
9
|
corbell/core/gitignore.py,sha256=VS7_s6NwZWQAwgLiaRzPHdBRIj86XdnPm_P_x_e0hvI,2266
|
|
10
10
|
corbell/core/llm_client.py,sha256=2MDwe6kr_EyY3DFv3fNO91WCig8ER021ogzdLGH3IN8,26219
|
|
11
|
-
corbell/core/workspace.py,sha256=
|
|
11
|
+
corbell/core/workspace.py,sha256=NsfByxnqTbPeflXLBqXAkqVaQCQ9Qs9maUmxp2Y6n1k,14024
|
|
12
12
|
corbell/core/embeddings/__init__.py,sha256=RCekvfNkFuMGEDLnls78i3znR84cTdnj4KJ_PeQrMNg,213
|
|
13
13
|
corbell/core/embeddings/base.py,sha256=udPW4XmcPhCpNQA6n8KqMcu2JXvVNv1JjdRJmFq5ZRA,2175
|
|
14
14
|
corbell/core/embeddings/extractor.py,sha256=hOolMX6JX3sVBf062h2zUQpr9SVt81S0hzhNCeJoV1I,7180
|
|
15
15
|
corbell/core/embeddings/factory.py,sha256=Lonjbk8Lsxykz-2ZEgFCWoH9zZ005Qm4dXVdA6P4qJY,1817
|
|
16
|
-
corbell/core/embeddings/model.py,sha256=
|
|
16
|
+
corbell/core/embeddings/model.py,sha256=sKFjUYJ8-COth1CXjgX9Bn_oPcf1OSbbq04oSywMDSo,14128
|
|
17
17
|
corbell/core/embeddings/search_cache.py,sha256=FHzO3mu4m4MJGy2jOFwb9GCEypcT11CcVrLts4Ib0ho,3351
|
|
18
18
|
corbell/core/embeddings/sqlite_store.py,sha256=8rv89WOMqMm-JhJO36-FdRiC68Ija3TwHkrmRrPr1os,10158
|
|
19
19
|
corbell/core/graph/__init__.py,sha256=VaxDKeXMgMEBBMC0dglwj68A_aNYRI5O8VM6oMC1GIM,29
|
|
20
|
-
corbell/core/graph/builder.py,sha256=
|
|
21
|
-
corbell/core/graph/method_graph.py,sha256=
|
|
20
|
+
corbell/core/graph/builder.py,sha256=_TjcKfOKObeJ3ScCMLZNHhtzmBYs1VtJEEp3UJLfoO0,32118
|
|
21
|
+
corbell/core/graph/method_graph.py,sha256=x6X91Dz3DzNAuzld2f7ORkODt3qC5L1Fzg1bdAcIhK4,50851
|
|
22
22
|
corbell/core/graph/schema.py,sha256=swy1VZZpL88LPEj6zihl5bglQLrGD-ohOYjFeNC31a0,5253
|
|
23
|
-
corbell/core/graph/sqlite_store.py,sha256=
|
|
23
|
+
corbell/core/graph/sqlite_store.py,sha256=B1ObNit7MXbQpst6dpuloTcFAmUim_MoP3PSCATf_4A,21116
|
|
24
24
|
corbell/core/graph/providers/__init__.py,sha256=__ZVe1uwIHSyFh_t-V4MyT5MsM5hooTOrxxkm9Txt7o,268
|
|
25
25
|
corbell/core/graph/providers/aws_patterns.py,sha256=w2iF5qQJcV7S6J64ZYb3IzGPdXjCc37YX5sNnHz8mXY,2818
|
|
26
26
|
corbell/core/graph/providers/azure_patterns.py,sha256=tJ9AQQXW2xYzJ36wNOxTHHhaivaCv3RYEMJUjw8WjeQ,3515
|
|
27
27
|
corbell/core/graph/providers/gcp_patterns.py,sha256=vIofjanvRWGhFftuGdzt9YgTIGZRJz7lLG0abUNjFdA,2789
|
|
28
28
|
corbell/core/indexing/__init__.py,sha256=VczeSHUfKR3YVowGCleFjo2pIpDHfl9kl-OkEl8szow,47
|
|
29
|
-
corbell/core/indexing/builder.py,sha256=
|
|
29
|
+
corbell/core/indexing/builder.py,sha256=mxWdHqgAx6akO8vb8-tlshD4zTlmbRuR-TOt-jETDLs,23303
|
|
30
30
|
corbell/core/indexing/lock.py,sha256=uUMelIrtrp6Ww9rTfbl2OvomByc-IJyiHIMnptfA4xI,4743
|
|
31
31
|
corbell/core/indexing/tracker.py,sha256=mbL1M-EeYf6KoIT5qoz7LCHwSHL6UlZNX7mjm4DczR0,8469
|
|
32
32
|
corbell/core/mcp/__init__.py,sha256=DDzfuVbX_GBTM5Nqy34JVgDUMeFd2_5ZcVMVuvjOddU,32
|
|
33
33
|
corbell/core/mcp/server.py,sha256=nTiPQ9yyenL7uhgLCsGwEm7yyoqk1tUPTsZYFAAmPBU,7270
|
|
34
34
|
corbell/core/query/__init__.py,sha256=OCyVRZOyh_eLGhOxR_JYyH6zp8O7qy_-rC3fqGHm7Bc,56
|
|
35
35
|
corbell/core/query/diagnostics.py,sha256=ObQyZWmMVRXEHFYGXBP2-EMBmM8SYr0H6cCi95uFnIk,1406
|
|
36
|
-
corbell/core/query/engine.py,sha256=
|
|
36
|
+
corbell/core/query/engine.py,sha256=cEueZdZQcg_o5HaPaayE4hCGiCvyIxvv0OWnXWD2DzU,11855
|
|
37
37
|
corbell/core/query/enhancer.py,sha256=w5mvm1B8qQZpL6RVhMuhq_rls77hakGSNUyanfkyNEU,3934
|
|
38
38
|
corbell/core/query/formatter.py,sha256=xMr8HE-oxBSEKb514aixY7aoUWGeYoK1w5wnaIlCYEc,2813
|
|
39
39
|
corbell/core/query/graph_expander.py,sha256=Y-yKnr6db-OM2Gh8ukYgVIcUZa6-wfWA-GhdvOwf_yA,9184
|
|
40
40
|
corbell/core/query/merger.py,sha256=fs6PL7X7EweXnSnDRnpzmpaU8JjwJpL0akzm4hSwLJk,6168
|
|
41
41
|
corbell/core/query/reranker.py,sha256=HYckYiUVZ80mbLGHhK4IHxNI7uUqNaztwXLbYgdnoWU,4298
|
|
42
|
-
codebase_retrieval_context_engine-2.0.1.dist-info/METADATA,sha256=
|
|
43
|
-
codebase_retrieval_context_engine-2.0.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
44
|
-
codebase_retrieval_context_engine-2.0.1.dist-info/entry_points.txt,sha256=vFB4a4Qb7Ty182usK8deJXiis0UYnGIUDusw0V3Jya8,115
|
|
45
|
-
codebase_retrieval_context_engine-2.0.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
46
|
-
codebase_retrieval_context_engine-2.0.1.dist-info/RECORD,,
|
|
42
|
+
codebase_retrieval_context_engine-2.0.2.dist-info/METADATA,sha256=20ALXtYeqFP5ZR_j0hsKhZpI1YAxRvLFm2CzM8BRHgQ,17304
|
|
43
|
+
codebase_retrieval_context_engine-2.0.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
44
|
+
codebase_retrieval_context_engine-2.0.2.dist-info/entry_points.txt,sha256=vFB4a4Qb7Ty182usK8deJXiis0UYnGIUDusw0V3Jya8,115
|
|
45
|
+
codebase_retrieval_context_engine-2.0.2.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
46
|
+
codebase_retrieval_context_engine-2.0.2.dist-info/RECORD,,
|
corbell/__init__.py
CHANGED
corbell/core/embeddings/model.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Embedding model interface +
|
|
1
|
+
"""Embedding model interface + cloud provider implementations (Google, Voyage)."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
@@ -33,32 +33,6 @@ class EmbeddingModel(ABC):
|
|
|
33
33
|
...
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
class SentenceTransformerModel(EmbeddingModel):
|
|
37
|
-
"""Wraps ``sentence-transformers`` with lazy loading.
|
|
38
|
-
|
|
39
|
-
Uses ``all-MiniLM-L6-v2`` by default (384-dim, fast, no API key).
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
|
|
43
|
-
self.model_name = model_name
|
|
44
|
-
self._model = None # lazy-loaded
|
|
45
|
-
|
|
46
|
-
def _get_model(self):
|
|
47
|
-
if self._model is None:
|
|
48
|
-
from sentence_transformers import SentenceTransformer
|
|
49
|
-
self._model = SentenceTransformer(f"sentence-transformers/{self.model_name}")
|
|
50
|
-
return self._model
|
|
51
|
-
|
|
52
|
-
def encode(self, texts: List[str]) -> List[List[float]]:
|
|
53
|
-
model = self._get_model()
|
|
54
|
-
vecs = model.encode(texts, show_progress_bar=False)
|
|
55
|
-
return [v.tolist() for v in vecs]
|
|
56
|
-
|
|
57
|
-
@property
|
|
58
|
-
def dimension(self) -> int:
|
|
59
|
-
return self._get_model().get_sentence_embedding_dimension()
|
|
60
|
-
|
|
61
|
-
|
|
62
36
|
def _is_voyage_rate_limit_error(e: Exception) -> bool:
|
|
63
37
|
"""Return True when a Voyage API error is a 429 rate limit."""
|
|
64
38
|
status = getattr(e, "status_code", None)
|
|
@@ -130,9 +104,8 @@ class GoogleEmbeddingModel(EmbeddingModel):
|
|
|
130
104
|
if not self._api_keys:
|
|
131
105
|
raise ValueError(
|
|
132
106
|
"GOOGLE_API_KEY is not set. "
|
|
133
|
-
"Set it in your environment or
|
|
134
|
-
" export GOOGLE_API_KEY=AIza
|
|
135
|
-
"Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
|
|
107
|
+
"Set it in your environment or .env file:\n"
|
|
108
|
+
" export GOOGLE_API_KEY=AIza..."
|
|
136
109
|
)
|
|
137
110
|
self._key_index: int = random.randrange(len(self._api_keys))
|
|
138
111
|
# kept for backwards-compat with tests that read _api_key directly
|
|
@@ -302,9 +275,8 @@ class VoyageEmbeddingModel(EmbeddingModel):
|
|
|
302
275
|
if not self._api_keys:
|
|
303
276
|
raise ValueError(
|
|
304
277
|
"VOYAGE_API_KEY is not set. "
|
|
305
|
-
"Set it in your environment or
|
|
306
|
-
" export VOYAGE_API_KEY=pa
|
|
307
|
-
"Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
|
|
278
|
+
"Set it in your environment or .env file:\n"
|
|
279
|
+
" export VOYAGE_API_KEY=pa-..."
|
|
308
280
|
)
|
|
309
281
|
self._key_index: int = random.randrange(len(self._api_keys))
|
|
310
282
|
# kept for backwards-compat with tests that read _api_key directly
|
corbell/core/graph/builder.py
CHANGED
|
@@ -244,12 +244,16 @@ class ServiceGraphBuilder:
|
|
|
244
244
|
graph_store: Instance of :class:`~corbell.core.graph.schema.GraphStore`.
|
|
245
245
|
"""
|
|
246
246
|
self.store = graph_store
|
|
247
|
+
self._content_cache: Dict[Path, str] = {}
|
|
248
|
+
self._pending_nodes: list = []
|
|
249
|
+
self._pending_edges: list = []
|
|
247
250
|
|
|
248
251
|
def build_from_workspace(
|
|
249
252
|
self,
|
|
250
253
|
services: List[Dict[str, Any]],
|
|
251
254
|
clear_existing: bool = True,
|
|
252
255
|
method_level: bool = False,
|
|
256
|
+
file_list: Optional[List[Path]] = None,
|
|
253
257
|
) -> Dict[str, Any]:
|
|
254
258
|
"""Scan all service repos and populate the graph.
|
|
255
259
|
|
|
@@ -258,6 +262,9 @@ class ServiceGraphBuilder:
|
|
|
258
262
|
``language``, ``tags``.
|
|
259
263
|
clear_existing: Clear the store before building.
|
|
260
264
|
method_level: If True, also build method-call edges.
|
|
265
|
+
file_list: Optional pre-filtered list of Path objects covering all
|
|
266
|
+
repos. When provided, used instead of calling ``_iter_files``
|
|
267
|
+
(skips rglob). Falls back to rglob when None.
|
|
261
268
|
|
|
262
269
|
Returns:
|
|
263
270
|
Summary dict with counts of services, datastores, queues, methods.
|
|
@@ -265,6 +272,9 @@ class ServiceGraphBuilder:
|
|
|
265
272
|
if clear_existing:
|
|
266
273
|
self.store.clear()
|
|
267
274
|
|
|
275
|
+
self._pending_nodes = []
|
|
276
|
+
self._pending_edges = []
|
|
277
|
+
|
|
268
278
|
discovered: List[Dict] = []
|
|
269
279
|
|
|
270
280
|
for svc in services:
|
|
@@ -277,8 +287,17 @@ class ServiceGraphBuilder:
|
|
|
277
287
|
continue
|
|
278
288
|
|
|
279
289
|
# Gather all relevant files first so we can sniff the service type
|
|
280
|
-
|
|
281
|
-
|
|
290
|
+
if file_list is not None:
|
|
291
|
+
# Use pre-filtered list — filter to files under this repo_path
|
|
292
|
+
repo_path_str = str(repo_path)
|
|
293
|
+
files = [
|
|
294
|
+
fp for fp in file_list
|
|
295
|
+
if str(fp).startswith(repo_path_str)
|
|
296
|
+
]
|
|
297
|
+
else:
|
|
298
|
+
gitignore_spec = load_gitignore(repo_path)
|
|
299
|
+
files = list(self._iter_files(repo_path, language, gitignore_spec))
|
|
300
|
+
|
|
282
301
|
service_type = self._detect_service_type(files, language)
|
|
283
302
|
|
|
284
303
|
node = ServiceNode(
|
|
@@ -289,7 +308,7 @@ class ServiceGraphBuilder:
|
|
|
289
308
|
tags=tags,
|
|
290
309
|
service_type=service_type,
|
|
291
310
|
)
|
|
292
|
-
self.
|
|
311
|
+
self._pending_nodes.append(node)
|
|
293
312
|
discovered.append(
|
|
294
313
|
{
|
|
295
314
|
"id": svc_id,
|
|
@@ -300,6 +319,16 @@ class ServiceGraphBuilder:
|
|
|
300
319
|
)
|
|
301
320
|
|
|
302
321
|
# Phase 2: deps, HTTP calls
|
|
322
|
+
# Populate content cache for all discovered files (read once, used across phases)
|
|
323
|
+
self._content_cache = {}
|
|
324
|
+
for svc in discovered:
|
|
325
|
+
for fp in svc["files"]:
|
|
326
|
+
if fp not in self._content_cache:
|
|
327
|
+
try:
|
|
328
|
+
self._content_cache[fp] = fp.read_text(encoding="utf-8", errors="ignore")
|
|
329
|
+
except Exception:
|
|
330
|
+
self._content_cache[fp] = ""
|
|
331
|
+
|
|
303
332
|
datastore_ids: set = set()
|
|
304
333
|
queue_ids: set = set()
|
|
305
334
|
|
|
@@ -313,6 +342,15 @@ class ServiceGraphBuilder:
|
|
|
313
342
|
self._detect_http_calls(svc, discovered)
|
|
314
343
|
self._detect_library_deps(svc, all_service_ids)
|
|
315
344
|
|
|
345
|
+
# Free cached content
|
|
346
|
+
self._content_cache = {}
|
|
347
|
+
|
|
348
|
+
# Flush all accumulated nodes and edges in two bulk writes
|
|
349
|
+
self.store.upsert_nodes_batch(self._pending_nodes)
|
|
350
|
+
self.store.upsert_edges_batch(self._pending_edges)
|
|
351
|
+
self._pending_nodes = []
|
|
352
|
+
self._pending_edges = []
|
|
353
|
+
|
|
316
354
|
# Phase 4: method-level graph
|
|
317
355
|
service_diagnostics: Dict[str, Any] = {}
|
|
318
356
|
if method_level:
|
|
@@ -405,6 +443,8 @@ class ServiceGraphBuilder:
|
|
|
405
443
|
return False
|
|
406
444
|
|
|
407
445
|
def _read(self, fp: Path) -> str:
|
|
446
|
+
if fp in self._content_cache:
|
|
447
|
+
return self._content_cache[fp]
|
|
408
448
|
try:
|
|
409
449
|
return fp.read_text(encoding="utf-8", errors="ignore")
|
|
410
450
|
except Exception:
|
|
@@ -486,10 +526,10 @@ class ServiceGraphBuilder:
|
|
|
486
526
|
ds_id = f"datastore:{db_type}:{db_name}"
|
|
487
527
|
if ds_id not in datastore_ids:
|
|
488
528
|
datastore_ids.add(ds_id)
|
|
489
|
-
self.
|
|
490
|
-
|
|
529
|
+
self._pending_nodes.append(DataStoreNode(id=ds_id, kind=db_type, name=db_name))
|
|
530
|
+
|
|
491
531
|
direction = self._classify_io_direction(content, idx)
|
|
492
|
-
self.
|
|
532
|
+
self._pending_edges.append(
|
|
493
533
|
DependencyEdge(
|
|
494
534
|
source_id=svc_id,
|
|
495
535
|
target_id=ds_id,
|
|
@@ -516,11 +556,11 @@ class ServiceGraphBuilder:
|
|
|
516
556
|
q_id = f"queue:{q_type}:{q_name}"
|
|
517
557
|
if q_id not in queue_ids:
|
|
518
558
|
queue_ids.add(q_id)
|
|
519
|
-
self.
|
|
520
|
-
|
|
559
|
+
self._pending_nodes.append(QueueNode(id=q_id, kind=q_type, name=q_name))
|
|
560
|
+
|
|
521
561
|
direction = self._classify_io_direction(content, idx)
|
|
522
562
|
edge_kind = "queue_publish" if direction == "write" else "queue_consume"
|
|
523
|
-
self.
|
|
563
|
+
self._pending_edges.append(
|
|
524
564
|
DependencyEdge(
|
|
525
565
|
source_id=svc_id,
|
|
526
566
|
target_id=q_id,
|
|
@@ -552,7 +592,7 @@ class ServiceGraphBuilder:
|
|
|
552
592
|
svc_slug = other_id.replace("-", "").replace("_", "").lower()
|
|
553
593
|
url_clean = url_host.replace("-", "").replace("_", "").lower()
|
|
554
594
|
if svc_slug in url_clean:
|
|
555
|
-
self.
|
|
595
|
+
self._pending_edges.append(
|
|
556
596
|
DependencyEdge(
|
|
557
597
|
source_id=svc_id,
|
|
558
598
|
target_id=other_id,
|
|
@@ -584,7 +624,7 @@ class ServiceGraphBuilder:
|
|
|
584
624
|
break
|
|
585
625
|
|
|
586
626
|
if mapped_svc_id:
|
|
587
|
-
self.
|
|
627
|
+
self._pending_edges.append(
|
|
588
628
|
DependencyEdge(
|
|
589
629
|
source_id=svc_id,
|
|
590
630
|
target_id=mapped_svc_id,
|
|
@@ -597,7 +637,7 @@ class ServiceGraphBuilder:
|
|
|
597
637
|
)
|
|
598
638
|
)
|
|
599
639
|
else:
|
|
600
|
-
self.
|
|
640
|
+
self._pending_edges.append(
|
|
601
641
|
DependencyEdge(
|
|
602
642
|
source_id=svc_id,
|
|
603
643
|
target_id="external:env_url",
|
|
@@ -630,7 +670,7 @@ class ServiceGraphBuilder:
|
|
|
630
670
|
break
|
|
631
671
|
|
|
632
672
|
if mapped_svc_id:
|
|
633
|
-
self.
|
|
673
|
+
self._pending_edges.append(
|
|
634
674
|
DependencyEdge(
|
|
635
675
|
source_id=svc_id,
|
|
636
676
|
target_id=mapped_svc_id,
|
|
@@ -665,7 +705,7 @@ class ServiceGraphBuilder:
|
|
|
665
705
|
content = self._read(fp)
|
|
666
706
|
for exact_name, target_id in exact_to_id.items():
|
|
667
707
|
if f'"{exact_name}"' in content or f"'{exact_name}'" in content or f" {exact_name}==" in content:
|
|
668
|
-
self.
|
|
708
|
+
self._pending_edges.append(
|
|
669
709
|
DependencyEdge(
|
|
670
710
|
source_id=svc_id,
|
|
671
711
|
target_id=target_id,
|
|
@@ -685,7 +725,7 @@ class ServiceGraphBuilder:
|
|
|
685
725
|
import_pattern_js = rf"(?:import|require).*{exact_name}"
|
|
686
726
|
|
|
687
727
|
if re.search(import_pattern_py, content) or re.search(import_pattern_js, content):
|
|
688
|
-
self.
|
|
728
|
+
self._pending_edges.append(
|
|
689
729
|
DependencyEdge(
|
|
690
730
|
source_id=svc_id,
|
|
691
731
|
target_id=target_id,
|
|
@@ -297,7 +297,9 @@ class MethodGraphBuilder:
|
|
|
297
297
|
def __init__(self, graph_store: GraphStore):
|
|
298
298
|
self.store = graph_store
|
|
299
299
|
|
|
300
|
-
def build_for_service(
|
|
300
|
+
def build_for_service(
|
|
301
|
+
self, service_id: str, repo_path: Path, file_list: Optional[List[Path]] = None
|
|
302
|
+
) -> Dict[str, Any]:
|
|
301
303
|
"""Scan *repo_path* and populate method nodes + call edges.
|
|
302
304
|
|
|
303
305
|
Uses tree-sitter for all supported languages when the grammar packages
|
|
@@ -307,6 +309,10 @@ class MethodGraphBuilder:
|
|
|
307
309
|
Args:
|
|
308
310
|
service_id: Identifier for the owning service.
|
|
309
311
|
repo_path: Root directory of the repository to scan.
|
|
312
|
+
file_list: Optional pre-filtered list of Path objects to scan.
|
|
313
|
+
When provided, skips rglob/gitignore/skip_dirs filtering —
|
|
314
|
+
the caller is responsible for pre-filtering. Falls back to
|
|
315
|
+
the original rglob behavior when None.
|
|
310
316
|
|
|
311
317
|
Returns:
|
|
312
318
|
Summary dict with ``methods``, ``calls``, ``files_scanned``, ``ts_available``.
|
|
@@ -315,28 +321,41 @@ class MethodGraphBuilder:
|
|
|
315
321
|
all_calls: List[Dict] = []
|
|
316
322
|
files_scanned = 0
|
|
317
323
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
324
|
+
if file_list is not None:
|
|
325
|
+
# Pre-filtered list from caller — skip all filtering
|
|
326
|
+
for fp in file_list:
|
|
327
|
+
lang = _EXT_LANG.get(fp.suffix)
|
|
328
|
+
if not lang:
|
|
329
|
+
continue
|
|
330
|
+
files_scanned += 1
|
|
331
|
+
result = self._analyze_file(fp, service_id, lang)
|
|
332
|
+
for m in result["methods"]:
|
|
333
|
+
all_methods[m["id"]] = m
|
|
334
|
+
all_calls.extend(result["calls"])
|
|
335
|
+
else:
|
|
336
|
+
gitignore_spec = load_gitignore(Path(repo_path))
|
|
337
|
+
|
|
338
|
+
for fp in Path(repo_path).rglob("*"):
|
|
339
|
+
if not fp.is_file():
|
|
340
|
+
continue
|
|
341
|
+
# Only skip if the immediate parent directory name is in SKIP_DIRS
|
|
342
|
+
# (avoids false-positives from matching path segments like 'corbel')
|
|
343
|
+
rel = fp.relative_to(repo_path)
|
|
344
|
+
if any(part in _SKIP_DIRS for part in rel.parts):
|
|
345
|
+
continue
|
|
346
|
+
if gitignore_spec.match_file(str(rel).replace("\\", "/")):
|
|
347
|
+
continue
|
|
348
|
+
lang = _EXT_LANG.get(fp.suffix)
|
|
349
|
+
if not lang:
|
|
350
|
+
continue
|
|
351
|
+
files_scanned += 1
|
|
352
|
+
result = self._analyze_file(fp, service_id, lang)
|
|
353
|
+
for m in result["methods"]:
|
|
354
|
+
all_methods[m["id"]] = m
|
|
355
|
+
all_calls.extend(result["calls"])
|
|
356
|
+
|
|
357
|
+
# Persist method nodes (batched)
|
|
358
|
+
method_nodes = []
|
|
340
359
|
for method_id, info in all_methods.items():
|
|
341
360
|
node = MethodNode(
|
|
342
361
|
id=method_id,
|
|
@@ -351,12 +370,14 @@ class MethodGraphBuilder:
|
|
|
351
370
|
service_id=service_id,
|
|
352
371
|
typed_signature=info.get("typed_signature"),
|
|
353
372
|
)
|
|
354
|
-
|
|
373
|
+
method_nodes.append(node)
|
|
374
|
+
self.store.upsert_nodes_batch(method_nodes)
|
|
355
375
|
|
|
356
|
-
# Build and persist call graph edges
|
|
376
|
+
# Build and persist call graph edges (batched)
|
|
357
377
|
call_graph = self._build_call_graph(all_methods, all_calls)
|
|
378
|
+
edge_objects = []
|
|
358
379
|
for caller_id, callee_id, meta in call_graph:
|
|
359
|
-
|
|
380
|
+
edge_objects.append(
|
|
360
381
|
DependencyEdge(
|
|
361
382
|
source_id=caller_id,
|
|
362
383
|
target_id=callee_id,
|
|
@@ -364,6 +385,7 @@ class MethodGraphBuilder:
|
|
|
364
385
|
metadata=meta,
|
|
365
386
|
)
|
|
366
387
|
)
|
|
388
|
+
self.store.upsert_edges_batch(edge_objects)
|
|
367
389
|
|
|
368
390
|
return {
|
|
369
391
|
"methods": len(all_methods),
|
|
@@ -134,6 +134,31 @@ class SQLiteGraphStore(GraphStore):
|
|
|
134
134
|
)
|
|
135
135
|
conn.commit()
|
|
136
136
|
|
|
137
|
+
def upsert_nodes_batch(self, nodes: list) -> None:
|
|
138
|
+
"""Batch-upsert multiple nodes in a single transaction."""
|
|
139
|
+
if not nodes:
|
|
140
|
+
return
|
|
141
|
+
with self._conn() as conn:
|
|
142
|
+
conn.executemany(
|
|
143
|
+
"INSERT OR REPLACE INTO graph_nodes (id, node_type, data) VALUES (?, ?, ?)",
|
|
144
|
+
[(node.id, _node_type_str(node), json.dumps(_node_to_dict(node))) for node in nodes],
|
|
145
|
+
)
|
|
146
|
+
conn.commit()
|
|
147
|
+
|
|
148
|
+
def upsert_edges_batch(self, edges: list) -> None:
|
|
149
|
+
"""Batch-upsert multiple edges in a single transaction."""
|
|
150
|
+
if not edges:
|
|
151
|
+
return
|
|
152
|
+
with self._conn() as conn:
|
|
153
|
+
conn.executemany(
|
|
154
|
+
"""INSERT INTO graph_edges (source_id, target_id, kind, metadata)
|
|
155
|
+
VALUES (?, ?, ?, ?)
|
|
156
|
+
ON CONFLICT(source_id, target_id, kind)
|
|
157
|
+
DO UPDATE SET metadata = excluded.metadata""",
|
|
158
|
+
[(e.source_id, e.target_id, e.kind, json.dumps(e.metadata)) for e in edges],
|
|
159
|
+
)
|
|
160
|
+
conn.commit()
|
|
161
|
+
|
|
137
162
|
def _load_node(self, row) -> ServiceNode | DataStoreNode | QueueNode | MethodNode:
|
|
138
163
|
return _dict_to_node(row["node_type"], json.loads(row["data"]))
|
|
139
164
|
|