codebase-retrieval-context-engine 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codebase-retrieval-context-engine
3
- Version: 2.0.0
3
+ Version: 2.0.2
4
4
  Summary: Code retrieval engine — hybrid embedding + graph search for LLM context injection.
5
5
  Project-URL: Homepage, https://github.com/nullmastermind/local-context-engine
6
6
  Project-URL: Repository, https://github.com/nullmastermind/local-context-engine
@@ -17,6 +17,7 @@ Classifier: Topic :: Software Development :: Libraries
17
17
  Requires-Python: >=3.11
18
18
  Requires-Dist: mcp>=1.1.2
19
19
  Requires-Dist: numpy>=2.0
20
+ Requires-Dist: pathspec>=0.11
20
21
  Requires-Dist: pydantic>=2.0
21
22
  Requires-Dist: python-dotenv>=1.0
22
23
  Requires-Dist: rich>=13.0
@@ -40,9 +41,6 @@ Requires-Dist: anthropic[vertex]>=0.25; extra == 'gcp'
40
41
  Requires-Dist: google-cloud-aiplatform>=1.38; extra == 'gcp'
41
42
  Provides-Extra: google
42
43
  Requires-Dist: google-genai>=2.7.0; extra == 'google'
43
- Provides-Extra: local
44
- Requires-Dist: sentence-transformers>=3.0; extra == 'local'
45
- Requires-Dist: transformers<5.0.0; extra == 'local'
46
44
  Provides-Extra: openai
47
45
  Requires-Dist: openai>=1.0; extra == 'openai'
48
46
  Provides-Extra: treesitter
@@ -1,4 +1,4 @@
1
- corbell/__init__.py,sha256=ECnnvynYrlq_niQjMhnC171R8q2axnwhEGqVRoWu50E,124
1
+ corbell/__init__.py,sha256=DK8C29me67FSOnq2v_CAPc0COnXW4plMGTNHfZvmX5Y,124
2
2
  corbell/cli/__init__.py,sha256=5-MP6JIWgp4nDLNIhqP6Gtx97GESaIYg3NGxtRGaMv0,28
3
3
  corbell/cli/main.py,sha256=anYpXiyQD6_1wMS0Dtef6Rxtxd0NEFe7HHnerHxf3J4,1835
4
4
  corbell/cli/commands/__init__.py,sha256=0mAOs3RWC7XMZnGRN677hjPCHHQKDq9ASjIr_GQM3js,37
@@ -8,39 +8,39 @@ corbell/core/__init__.py,sha256=VS9PnhHr4NXYlWs1TLCyllnVCNsiwVZ1Xj-AOBhZpAU,29
8
8
  corbell/core/constants.py,sha256=HTGYpShlp9pP2_a4WngHtTujUQfHcypFAYoaczmkBdQ,1061
9
9
  corbell/core/gitignore.py,sha256=VS7_s6NwZWQAwgLiaRzPHdBRIj86XdnPm_P_x_e0hvI,2266
10
10
  corbell/core/llm_client.py,sha256=2MDwe6kr_EyY3DFv3fNO91WCig8ER021ogzdLGH3IN8,26219
11
- corbell/core/workspace.py,sha256=v9H56MxhI2iHPr3P9fU5DnIiniGehozWScfcd6TG2gQ,14189
11
+ corbell/core/workspace.py,sha256=NsfByxnqTbPeflXLBqXAkqVaQCQ9Qs9maUmxp2Y6n1k,14024
12
12
  corbell/core/embeddings/__init__.py,sha256=RCekvfNkFuMGEDLnls78i3znR84cTdnj4KJ_PeQrMNg,213
13
13
  corbell/core/embeddings/base.py,sha256=udPW4XmcPhCpNQA6n8KqMcu2JXvVNv1JjdRJmFq5ZRA,2175
14
14
  corbell/core/embeddings/extractor.py,sha256=hOolMX6JX3sVBf062h2zUQpr9SVt81S0hzhNCeJoV1I,7180
15
15
  corbell/core/embeddings/factory.py,sha256=Lonjbk8Lsxykz-2ZEgFCWoH9zZ005Qm4dXVdA6P4qJY,1817
16
- corbell/core/embeddings/model.py,sha256=5guf3qpqOaC7fSp-OZlQjBUQt7jOf_3eE3w6X6yNY2I,15224
16
+ corbell/core/embeddings/model.py,sha256=sKFjUYJ8-COth1CXjgX9Bn_oPcf1OSbbq04oSywMDSo,14128
17
17
  corbell/core/embeddings/search_cache.py,sha256=FHzO3mu4m4MJGy2jOFwb9GCEypcT11CcVrLts4Ib0ho,3351
18
18
  corbell/core/embeddings/sqlite_store.py,sha256=8rv89WOMqMm-JhJO36-FdRiC68Ija3TwHkrmRrPr1os,10158
19
19
  corbell/core/graph/__init__.py,sha256=VaxDKeXMgMEBBMC0dglwj68A_aNYRI5O8VM6oMC1GIM,29
20
- corbell/core/graph/builder.py,sha256=8BtUVvh_3Sq_XQiYf3yZifRygTrc-_hE770luqwZ3e0,30423
21
- corbell/core/graph/method_graph.py,sha256=XJpojhBVXon2RNMIaOqE3NgRjUdP0IAnRQuKYGhO5js,49753
20
+ corbell/core/graph/builder.py,sha256=_TjcKfOKObeJ3ScCMLZNHhtzmBYs1VtJEEp3UJLfoO0,32118
21
+ corbell/core/graph/method_graph.py,sha256=x6X91Dz3DzNAuzld2f7ORkODt3qC5L1Fzg1bdAcIhK4,50851
22
22
  corbell/core/graph/schema.py,sha256=swy1VZZpL88LPEj6zihl5bglQLrGD-ohOYjFeNC31a0,5253
23
- corbell/core/graph/sqlite_store.py,sha256=D9que8zHV68x_tiFY-q09Ipk-dXR9LWM3hFPQLGNDto,20019
23
+ corbell/core/graph/sqlite_store.py,sha256=B1ObNit7MXbQpst6dpuloTcFAmUim_MoP3PSCATf_4A,21116
24
24
  corbell/core/graph/providers/__init__.py,sha256=__ZVe1uwIHSyFh_t-V4MyT5MsM5hooTOrxxkm9Txt7o,268
25
25
  corbell/core/graph/providers/aws_patterns.py,sha256=w2iF5qQJcV7S6J64ZYb3IzGPdXjCc37YX5sNnHz8mXY,2818
26
26
  corbell/core/graph/providers/azure_patterns.py,sha256=tJ9AQQXW2xYzJ36wNOxTHHhaivaCv3RYEMJUjw8WjeQ,3515
27
27
  corbell/core/graph/providers/gcp_patterns.py,sha256=vIofjanvRWGhFftuGdzt9YgTIGZRJz7lLG0abUNjFdA,2789
28
28
  corbell/core/indexing/__init__.py,sha256=VczeSHUfKR3YVowGCleFjo2pIpDHfl9kl-OkEl8szow,47
29
- corbell/core/indexing/builder.py,sha256=l1Hj9Lrze2zuBvwtauDgmjAWWVvKbKXWKMgtv8uA-TQ,21967
29
+ corbell/core/indexing/builder.py,sha256=mxWdHqgAx6akO8vb8-tlshD4zTlmbRuR-TOt-jETDLs,23303
30
30
  corbell/core/indexing/lock.py,sha256=uUMelIrtrp6Ww9rTfbl2OvomByc-IJyiHIMnptfA4xI,4743
31
31
  corbell/core/indexing/tracker.py,sha256=mbL1M-EeYf6KoIT5qoz7LCHwSHL6UlZNX7mjm4DczR0,8469
32
32
  corbell/core/mcp/__init__.py,sha256=DDzfuVbX_GBTM5Nqy34JVgDUMeFd2_5ZcVMVuvjOddU,32
33
33
  corbell/core/mcp/server.py,sha256=nTiPQ9yyenL7uhgLCsGwEm7yyoqk1tUPTsZYFAAmPBU,7270
34
34
  corbell/core/query/__init__.py,sha256=OCyVRZOyh_eLGhOxR_JYyH6zp8O7qy_-rC3fqGHm7Bc,56
35
35
  corbell/core/query/diagnostics.py,sha256=ObQyZWmMVRXEHFYGXBP2-EMBmM8SYr0H6cCi95uFnIk,1406
36
- corbell/core/query/engine.py,sha256=4PRW3R0Wws7BPAqDD4hrWjpqOq8mx9dCrtABWA-Wmus,11635
36
+ corbell/core/query/engine.py,sha256=cEueZdZQcg_o5HaPaayE4hCGiCvyIxvv0OWnXWD2DzU,11855
37
37
  corbell/core/query/enhancer.py,sha256=w5mvm1B8qQZpL6RVhMuhq_rls77hakGSNUyanfkyNEU,3934
38
38
  corbell/core/query/formatter.py,sha256=xMr8HE-oxBSEKb514aixY7aoUWGeYoK1w5wnaIlCYEc,2813
39
39
  corbell/core/query/graph_expander.py,sha256=Y-yKnr6db-OM2Gh8ukYgVIcUZa6-wfWA-GhdvOwf_yA,9184
40
40
  corbell/core/query/merger.py,sha256=fs6PL7X7EweXnSnDRnpzmpaU8JjwJpL0akzm4hSwLJk,6168
41
41
  corbell/core/query/reranker.py,sha256=HYckYiUVZ80mbLGHhK4IHxNI7uUqNaztwXLbYgdnoWU,4298
42
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA,sha256=SVKzXFoaizz5EyBgOZbmwdtsKI72ZcKgjzCyes_8mJY,17408
43
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
44
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt,sha256=vFB4a4Qb7Ty182usK8deJXiis0UYnGIUDusw0V3Jya8,115
45
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
46
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD,,
42
+ codebase_retrieval_context_engine-2.0.2.dist-info/METADATA,sha256=20ALXtYeqFP5ZR_j0hsKhZpI1YAxRvLFm2CzM8BRHgQ,17304
43
+ codebase_retrieval_context_engine-2.0.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
44
+ codebase_retrieval_context_engine-2.0.2.dist-info/entry_points.txt,sha256=vFB4a4Qb7Ty182usK8deJXiis0UYnGIUDusw0V3Jya8,115
45
+ codebase_retrieval_context_engine-2.0.2.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
46
+ codebase_retrieval_context_engine-2.0.2.dist-info/RECORD,,
corbell/__init__.py CHANGED
@@ -2,5 +2,5 @@
2
2
  Corbell — Code retrieval engine for LLM context injection.
3
3
  """
4
4
 
5
- __version__ = "2.0.0"
5
+ __version__ = "2.0.2"
6
6
  __all__ = ["__version__"]
@@ -1,4 +1,4 @@
1
- """Embedding model interface + SentenceTransformers implementation."""
1
+ """Embedding model interface + cloud provider implementations (Google, Voyage)."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
@@ -33,32 +33,6 @@ class EmbeddingModel(ABC):
33
33
  ...
34
34
 
35
35
 
36
- class SentenceTransformerModel(EmbeddingModel):
37
- """Wraps ``sentence-transformers`` with lazy loading.
38
-
39
- Uses ``all-MiniLM-L6-v2`` by default (384-dim, fast, no API key).
40
- """
41
-
42
- def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
43
- self.model_name = model_name
44
- self._model = None # lazy-loaded
45
-
46
- def _get_model(self):
47
- if self._model is None:
48
- from sentence_transformers import SentenceTransformer
49
- self._model = SentenceTransformer(f"sentence-transformers/{self.model_name}")
50
- return self._model
51
-
52
- def encode(self, texts: List[str]) -> List[List[float]]:
53
- model = self._get_model()
54
- vecs = model.encode(texts, show_progress_bar=False)
55
- return [v.tolist() for v in vecs]
56
-
57
- @property
58
- def dimension(self) -> int:
59
- return self._get_model().get_sentence_embedding_dimension()
60
-
61
-
62
36
  def _is_voyage_rate_limit_error(e: Exception) -> bool:
63
37
  """Return True when a Voyage API error is a 429 rate limit."""
64
38
  status = getattr(e, "status_code", None)
@@ -130,9 +104,8 @@ class GoogleEmbeddingModel(EmbeddingModel):
130
104
  if not self._api_keys:
131
105
  raise ValueError(
132
106
  "GOOGLE_API_KEY is not set. "
133
- "Set it in your environment or workspace.yaml:\n"
134
- " export GOOGLE_API_KEY=AIza...\n"
135
- "Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
107
+ "Set it in your environment or .env file:\n"
108
+ " export GOOGLE_API_KEY=AIza..."
136
109
  )
137
110
  self._key_index: int = random.randrange(len(self._api_keys))
138
111
  # kept for backwards-compat with tests that read _api_key directly
@@ -302,9 +275,8 @@ class VoyageEmbeddingModel(EmbeddingModel):
302
275
  if not self._api_keys:
303
276
  raise ValueError(
304
277
  "VOYAGE_API_KEY is not set. "
305
- "Set it in your environment or workspace.yaml:\n"
306
- " export VOYAGE_API_KEY=pa-...\n"
307
- "Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
278
+ "Set it in your environment or .env file:\n"
279
+ " export VOYAGE_API_KEY=pa-..."
308
280
  )
309
281
  self._key_index: int = random.randrange(len(self._api_keys))
310
282
  # kept for backwards-compat with tests that read _api_key directly
@@ -244,12 +244,16 @@ class ServiceGraphBuilder:
244
244
  graph_store: Instance of :class:`~corbell.core.graph.schema.GraphStore`.
245
245
  """
246
246
  self.store = graph_store
247
+ self._content_cache: Dict[Path, str] = {}
248
+ self._pending_nodes: list = []
249
+ self._pending_edges: list = []
247
250
 
248
251
  def build_from_workspace(
249
252
  self,
250
253
  services: List[Dict[str, Any]],
251
254
  clear_existing: bool = True,
252
255
  method_level: bool = False,
256
+ file_list: Optional[List[Path]] = None,
253
257
  ) -> Dict[str, Any]:
254
258
  """Scan all service repos and populate the graph.
255
259
 
@@ -258,6 +262,9 @@ class ServiceGraphBuilder:
258
262
  ``language``, ``tags``.
259
263
  clear_existing: Clear the store before building.
260
264
  method_level: If True, also build method-call edges.
265
+ file_list: Optional pre-filtered list of Path objects covering all
266
+ repos. When provided, used instead of calling ``_iter_files``
267
+ (skips rglob). Falls back to rglob when None.
261
268
 
262
269
  Returns:
263
270
  Summary dict with counts of services, datastores, queues, methods.
@@ -265,6 +272,9 @@ class ServiceGraphBuilder:
265
272
  if clear_existing:
266
273
  self.store.clear()
267
274
 
275
+ self._pending_nodes = []
276
+ self._pending_edges = []
277
+
268
278
  discovered: List[Dict] = []
269
279
 
270
280
  for svc in services:
@@ -277,8 +287,17 @@ class ServiceGraphBuilder:
277
287
  continue
278
288
 
279
289
  # Gather all relevant files first so we can sniff the service type
280
- gitignore_spec = load_gitignore(repo_path)
281
- files = list(self._iter_files(repo_path, language, gitignore_spec))
290
+ if file_list is not None:
291
+ # Use pre-filtered list filter to files under this repo_path
292
+ repo_path_str = str(repo_path)
293
+ files = [
294
+ fp for fp in file_list
295
+ if str(fp).startswith(repo_path_str)
296
+ ]
297
+ else:
298
+ gitignore_spec = load_gitignore(repo_path)
299
+ files = list(self._iter_files(repo_path, language, gitignore_spec))
300
+
282
301
  service_type = self._detect_service_type(files, language)
283
302
 
284
303
  node = ServiceNode(
@@ -289,7 +308,7 @@ class ServiceGraphBuilder:
289
308
  tags=tags,
290
309
  service_type=service_type,
291
310
  )
292
- self.store.upsert_node(node)
311
+ self._pending_nodes.append(node)
293
312
  discovered.append(
294
313
  {
295
314
  "id": svc_id,
@@ -300,6 +319,16 @@ class ServiceGraphBuilder:
300
319
  )
301
320
 
302
321
  # Phase 2: deps, HTTP calls
322
+ # Populate content cache for all discovered files (read once, used across phases)
323
+ self._content_cache = {}
324
+ for svc in discovered:
325
+ for fp in svc["files"]:
326
+ if fp not in self._content_cache:
327
+ try:
328
+ self._content_cache[fp] = fp.read_text(encoding="utf-8", errors="ignore")
329
+ except Exception:
330
+ self._content_cache[fp] = ""
331
+
303
332
  datastore_ids: set = set()
304
333
  queue_ids: set = set()
305
334
 
@@ -313,6 +342,15 @@ class ServiceGraphBuilder:
313
342
  self._detect_http_calls(svc, discovered)
314
343
  self._detect_library_deps(svc, all_service_ids)
315
344
 
345
+ # Free cached content
346
+ self._content_cache = {}
347
+
348
+ # Flush all accumulated nodes and edges in two bulk writes
349
+ self.store.upsert_nodes_batch(self._pending_nodes)
350
+ self.store.upsert_edges_batch(self._pending_edges)
351
+ self._pending_nodes = []
352
+ self._pending_edges = []
353
+
316
354
  # Phase 4: method-level graph
317
355
  service_diagnostics: Dict[str, Any] = {}
318
356
  if method_level:
@@ -405,6 +443,8 @@ class ServiceGraphBuilder:
405
443
  return False
406
444
 
407
445
  def _read(self, fp: Path) -> str:
446
+ if fp in self._content_cache:
447
+ return self._content_cache[fp]
408
448
  try:
409
449
  return fp.read_text(encoding="utf-8", errors="ignore")
410
450
  except Exception:
@@ -486,10 +526,10 @@ class ServiceGraphBuilder:
486
526
  ds_id = f"datastore:{db_type}:{db_name}"
487
527
  if ds_id not in datastore_ids:
488
528
  datastore_ids.add(ds_id)
489
- self.store.upsert_node(DataStoreNode(id=ds_id, kind=db_type, name=db_name))
490
-
529
+ self._pending_nodes.append(DataStoreNode(id=ds_id, kind=db_type, name=db_name))
530
+
491
531
  direction = self._classify_io_direction(content, idx)
492
- self.store.upsert_edge(
532
+ self._pending_edges.append(
493
533
  DependencyEdge(
494
534
  source_id=svc_id,
495
535
  target_id=ds_id,
@@ -516,11 +556,11 @@ class ServiceGraphBuilder:
516
556
  q_id = f"queue:{q_type}:{q_name}"
517
557
  if q_id not in queue_ids:
518
558
  queue_ids.add(q_id)
519
- self.store.upsert_node(QueueNode(id=q_id, kind=q_type, name=q_name))
520
-
559
+ self._pending_nodes.append(QueueNode(id=q_id, kind=q_type, name=q_name))
560
+
521
561
  direction = self._classify_io_direction(content, idx)
522
562
  edge_kind = "queue_publish" if direction == "write" else "queue_consume"
523
- self.store.upsert_edge(
563
+ self._pending_edges.append(
524
564
  DependencyEdge(
525
565
  source_id=svc_id,
526
566
  target_id=q_id,
@@ -552,7 +592,7 @@ class ServiceGraphBuilder:
552
592
  svc_slug = other_id.replace("-", "").replace("_", "").lower()
553
593
  url_clean = url_host.replace("-", "").replace("_", "").lower()
554
594
  if svc_slug in url_clean:
555
- self.store.upsert_edge(
595
+ self._pending_edges.append(
556
596
  DependencyEdge(
557
597
  source_id=svc_id,
558
598
  target_id=other_id,
@@ -584,7 +624,7 @@ class ServiceGraphBuilder:
584
624
  break
585
625
 
586
626
  if mapped_svc_id:
587
- self.store.upsert_edge(
627
+ self._pending_edges.append(
588
628
  DependencyEdge(
589
629
  source_id=svc_id,
590
630
  target_id=mapped_svc_id,
@@ -597,7 +637,7 @@ class ServiceGraphBuilder:
597
637
  )
598
638
  )
599
639
  else:
600
- self.store.upsert_edge(
640
+ self._pending_edges.append(
601
641
  DependencyEdge(
602
642
  source_id=svc_id,
603
643
  target_id="external:env_url",
@@ -630,7 +670,7 @@ class ServiceGraphBuilder:
630
670
  break
631
671
 
632
672
  if mapped_svc_id:
633
- self.store.upsert_edge(
673
+ self._pending_edges.append(
634
674
  DependencyEdge(
635
675
  source_id=svc_id,
636
676
  target_id=mapped_svc_id,
@@ -665,7 +705,7 @@ class ServiceGraphBuilder:
665
705
  content = self._read(fp)
666
706
  for exact_name, target_id in exact_to_id.items():
667
707
  if f'"{exact_name}"' in content or f"'{exact_name}'" in content or f" {exact_name}==" in content:
668
- self.store.upsert_edge(
708
+ self._pending_edges.append(
669
709
  DependencyEdge(
670
710
  source_id=svc_id,
671
711
  target_id=target_id,
@@ -685,7 +725,7 @@ class ServiceGraphBuilder:
685
725
  import_pattern_js = rf"(?:import|require).*{exact_name}"
686
726
 
687
727
  if re.search(import_pattern_py, content) or re.search(import_pattern_js, content):
688
- self.store.upsert_edge(
728
+ self._pending_edges.append(
689
729
  DependencyEdge(
690
730
  source_id=svc_id,
691
731
  target_id=target_id,
@@ -297,7 +297,9 @@ class MethodGraphBuilder:
297
297
  def __init__(self, graph_store: GraphStore):
298
298
  self.store = graph_store
299
299
 
300
- def build_for_service(self, service_id: str, repo_path: Path) -> Dict[str, Any]:
300
+ def build_for_service(
301
+ self, service_id: str, repo_path: Path, file_list: Optional[List[Path]] = None
302
+ ) -> Dict[str, Any]:
301
303
  """Scan *repo_path* and populate method nodes + call edges.
302
304
 
303
305
  Uses tree-sitter for all supported languages when the grammar packages
@@ -307,6 +309,10 @@ class MethodGraphBuilder:
307
309
  Args:
308
310
  service_id: Identifier for the owning service.
309
311
  repo_path: Root directory of the repository to scan.
312
+ file_list: Optional pre-filtered list of Path objects to scan.
313
+ When provided, skips rglob/gitignore/skip_dirs filtering —
314
+ the caller is responsible for pre-filtering. Falls back to
315
+ the original rglob behavior when None.
310
316
 
311
317
  Returns:
312
318
  Summary dict with ``methods``, ``calls``, ``files_scanned``, ``ts_available``.
@@ -315,28 +321,41 @@ class MethodGraphBuilder:
315
321
  all_calls: List[Dict] = []
316
322
  files_scanned = 0
317
323
 
318
- gitignore_spec = load_gitignore(Path(repo_path))
319
-
320
- for fp in Path(repo_path).rglob("*"):
321
- if not fp.is_file():
322
- continue
323
- # Skip if any component of the repo-relative path is in SKIP_DIRS
324
- # (avoids false-positives from matching path segments like 'corbel')
325
- rel = fp.relative_to(repo_path)
326
- if any(part in _SKIP_DIRS for part in rel.parts):
327
- continue
328
- if gitignore_spec.match_file(str(rel).replace("\\", "/")):
329
- continue
330
- lang = _EXT_LANG.get(fp.suffix)
331
- if not lang:
332
- continue
333
- files_scanned += 1
334
- result = self._analyze_file(fp, service_id, lang)
335
- for m in result["methods"]:
336
- all_methods[m["id"]] = m
337
- all_calls.extend(result["calls"])
338
-
339
- # Persist method nodes
324
+ if file_list is not None:
325
+ # Pre-filtered list from caller — skip all filtering
326
+ for fp in file_list:
327
+ lang = _EXT_LANG.get(fp.suffix)
328
+ if not lang:
329
+ continue
330
+ files_scanned += 1
331
+ result = self._analyze_file(fp, service_id, lang)
332
+ for m in result["methods"]:
333
+ all_methods[m["id"]] = m
334
+ all_calls.extend(result["calls"])
335
+ else:
336
+ gitignore_spec = load_gitignore(Path(repo_path))
337
+
338
+ for fp in Path(repo_path).rglob("*"):
339
+ if not fp.is_file():
340
+ continue
341
+ # Skip if any component of the repo-relative path is in SKIP_DIRS
342
+ # (avoids false-positives from matching path segments like 'corbel')
343
+ rel = fp.relative_to(repo_path)
344
+ if any(part in _SKIP_DIRS for part in rel.parts):
345
+ continue
346
+ if gitignore_spec.match_file(str(rel).replace("\\", "/")):
347
+ continue
348
+ lang = _EXT_LANG.get(fp.suffix)
349
+ if not lang:
350
+ continue
351
+ files_scanned += 1
352
+ result = self._analyze_file(fp, service_id, lang)
353
+ for m in result["methods"]:
354
+ all_methods[m["id"]] = m
355
+ all_calls.extend(result["calls"])
356
+
357
+ # Persist method nodes (batched)
358
+ method_nodes = []
340
359
  for method_id, info in all_methods.items():
341
360
  node = MethodNode(
342
361
  id=method_id,
@@ -351,12 +370,14 @@ class MethodGraphBuilder:
351
370
  service_id=service_id,
352
371
  typed_signature=info.get("typed_signature"),
353
372
  )
354
- self.store.upsert_node(node)
373
+ method_nodes.append(node)
374
+ self.store.upsert_nodes_batch(method_nodes)
355
375
 
356
- # Build and persist call graph edges
376
+ # Build and persist call graph edges (batched)
357
377
  call_graph = self._build_call_graph(all_methods, all_calls)
378
+ edge_objects = []
358
379
  for caller_id, callee_id, meta in call_graph:
359
- self.store.upsert_edge(
380
+ edge_objects.append(
360
381
  DependencyEdge(
361
382
  source_id=caller_id,
362
383
  target_id=callee_id,
@@ -364,6 +385,7 @@ class MethodGraphBuilder:
364
385
  metadata=meta,
365
386
  )
366
387
  )
388
+ self.store.upsert_edges_batch(edge_objects)
367
389
 
368
390
  return {
369
391
  "methods": len(all_methods),
@@ -134,6 +134,31 @@ class SQLiteGraphStore(GraphStore):
134
134
  )
135
135
  conn.commit()
136
136
 
137
+ def upsert_nodes_batch(self, nodes: list) -> None:
138
+ """Batch-upsert multiple nodes in a single transaction."""
139
+ if not nodes:
140
+ return
141
+ with self._conn() as conn:
142
+ conn.executemany(
143
+ "INSERT OR REPLACE INTO graph_nodes (id, node_type, data) VALUES (?, ?, ?)",
144
+ [(node.id, _node_type_str(node), json.dumps(_node_to_dict(node))) for node in nodes],
145
+ )
146
+ conn.commit()
147
+
148
+ def upsert_edges_batch(self, edges: list) -> None:
149
+ """Batch-upsert multiple edges in a single transaction."""
150
+ if not edges:
151
+ return
152
+ with self._conn() as conn:
153
+ conn.executemany(
154
+ """INSERT INTO graph_edges (source_id, target_id, kind, metadata)
155
+ VALUES (?, ?, ?, ?)
156
+ ON CONFLICT(source_id, target_id, kind)
157
+ DO UPDATE SET metadata = excluded.metadata""",
158
+ [(e.source_id, e.target_id, e.kind, json.dumps(e.metadata)) for e in edges],
159
+ )
160
+ conn.commit()
161
+
137
162
  def _load_node(self, row) -> ServiceNode | DataStoreNode | QueueNode | MethodNode:
138
163
  return _dict_to_node(row["node_type"], json.loads(row["data"]))
139
164