java-codebase-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
build_ast_graph.py ADDED
@@ -0,0 +1,3081 @@
1
+ #!/usr/bin/env python3
2
+ """Four-pass AST-derived Knowledge Base builder (Kuzu).
3
+
4
+ Walks a Java source tree with `tree_sitter_java`, writes a deterministic graph of:
5
+ Symbol nodes: package, file, class, interface, enum, record, annotation, method, constructor
6
+ Route nodes: declaration-site routes (Spring MVC/WebFlux, Feign, Kafka, …)
7
+ Rel tables: EXTENDS, IMPLEMENTS, INJECTS, DECLARES, OVERRIDES, CALLS, EXPOSES
8
+
9
+ Pass 1 builds every node and in-memory resolution indexes.
10
+ Pass 2 resolves each extends/implements/injection target using Java's lookup order
11
+ (same file → explicit import → same package → wildcard import → java.lang → phantom).
12
+ Pass 3 resolves static call sites into confidence-scored CALLS edges and DECLARES.
13
+ Pass 4 emits Route rows plus Symbol→Route EXPOSES edges from literal annotation metadata.
14
+
15
+ Usage:
16
+ build_ast_graph.py --source-root <repo> [--kuzu-path <path>] [--verbose]
17
+
18
+ Default Kuzu database path resolution order:
19
+ --kuzu-path CLI arg (path passed to kuzu.Database(...))
20
+ JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.kuzu (if set and local)
21
+ ./.java-codebase-rag/code_graph.kuzu under cwd
22
+
23
+ The Kuzu DB is dropped and rebuilt on every run (Phase 1 is a full rebuild).
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import hashlib
29
+ import json
30
+ import logging
31
+ import os
32
+ import re
33
+ import sys
34
+ import threading
35
+ import time
36
+ from collections import defaultdict
37
+ from dataclasses import asdict, dataclass, field, replace
38
+ from pathlib import Path
39
+
40
+ import kuzu
41
+
42
+ from ast_java import (
43
+ ONTOLOGY_VERSION,
44
+ CallSite,
45
+ JavaFileAst,
46
+ MethodDecl,
47
+ OutgoingCallDecl,
48
+ TypeDecl,
49
+ injection_annotation_names,
50
+ lombok_required_args_annotations,
51
+ parse_java,
52
+ )
53
+ from graph_enrich import (
54
+ _load_config_cross_service_resolution,
55
+ collect_annotation_meta_chain,
56
+ load_brownfield_overrides,
57
+ microservice_for_path,
58
+ module_for_path,
59
+ phantom_id,
60
+ resolve_async_producer_for_method,
61
+ resolve_http_client_for_method,
62
+ resolve_role_and_capabilities,
63
+ resolve_routes_for_method,
64
+ symbol_id,
65
+ )
66
+ from path_filtering import LayeredIgnore, iter_java_source_files
67
+ from java_ontology import VALID_CLIENT_KINDS, VALID_HTTP_CALL_MATCHES, VALID_PRODUCER_KINDS
68
+
69
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Held by `_verbose_stderr_line` while it prints, so that lines written from
# different threads (e.g. the heartbeat thread) are emitted one at a time.
_VERBOSE_STDERR_LOCK = threading.Lock()

# Banner lines written to stderr when a pass begins in verbose mode.
# NOTE(review): the pass-2 banner mentions DECLARES, while the pass-2 summary
# line reports EXTENDS / IMPLEMENTS / INJECTS — confirm which wording is intended.
_PASS1_START = "[pass1] starting · parsing Java files under source root"
_PASS2_START = "[pass2] starting · emitting EXTENDS / IMPLEMENTS / DECLARES rows"
_PASS3_START = "[pass3] starting · call resolution (outgoing calls per site)"
_PASS4_START = "[pass4] starting · route and EXPOSES extraction"
_PASS5_START = "[pass5] starting · imperative HTTP_CALLS / ASYNC_CALLS edges"
_PASS6_START = "[pass6] starting · cross-service call-edge matching"
_WRITE_START = "[write] starting · writing Kuzu graph to disk"
80
+
81
+
82
def _verbose_stderr_line(content: str) -> None:
    """Print ``content`` as one flushed line on stderr while holding the stderr lock."""
    _VERBOSE_STDERR_LOCK.acquire()
    try:
        print(content, file=sys.stderr, flush=True)
    finally:
        _VERBOSE_STDERR_LOCK.release()
85
+
86
+
87
+ class _VerbosePassHeartbeats:
88
+ """Emit ``[tag] running … Ns elapsed`` every 5s on stderr while in scope (verbose only)."""
89
+
90
+ def __init__(self, tag: str, *, verbose: bool) -> None:
91
+ self._tag = tag
92
+ self._verbose = verbose
93
+ self._thr: threading.Thread | None = None
94
+ self._stop: threading.Event | None = None
95
+
96
+ def __enter__(self) -> None:
97
+ if not self._verbose:
98
+ return None
99
+ self._stop = threading.Event()
100
+ stop = self._stop
101
+ tag = self._tag
102
+
103
+ def worker() -> None:
104
+ t0 = time.monotonic()
105
+ while not stop.wait(timeout=5.0):
106
+ elapsed = int(time.monotonic() - t0)
107
+ _verbose_stderr_line(f"{tag} running … {elapsed}s elapsed")
108
+
109
+ self._thr = threading.Thread(target=worker, name=f"hb-{tag}", daemon=True)
110
+ self._thr.start()
111
+ return None
112
+
113
+ def __exit__(self, exc_type, exc, tb) -> bool:
114
+ if self._thr is not None and self._stop is not None:
115
+ self._stop.set()
116
+ self._thr.join(timeout=2.0)
117
+ return False
118
+
119
+
120
+ _JAVA_LANG_SIMPLE = frozenset({
121
+ "Object", "String", "Integer", "Long", "Short", "Byte", "Boolean", "Double",
122
+ "Float", "Character", "Number", "Void", "Class", "Enum", "Record",
123
+ "Throwable", "Exception", "RuntimeException", "Error", "Thread", "Runnable",
124
+ "Iterable", "Comparable", "CharSequence", "StringBuilder", "StringBuffer",
125
+ "Math", "System", "AutoCloseable", "Cloneable",
126
+ })
127
+
128
+
129
+ # ---------- dataclasses ----------
130
+
131
+
132
@dataclass
class TypeIndexEntry:
    """Pass-1 record for a type declaration + any methods/constructors inside it."""
    decl: TypeDecl
    # Source path of the declaring file (relative to the source root when possible).
    file_path: str
    module: str
    microservice: str
    # Package of the enclosing top-level type; "" when none could be derived.
    package: str
    # FQN of the directly enclosing type, or None for a top-level type.
    outer_fqn: str | None
    # Graph node id produced by `symbol_id(...)` for this type.
    node_id: str
142
+
143
+
144
@dataclass
class MemberEntry:
    """Pass-1 record for one method or constructor declared inside an indexed type."""
    kind: str  # method | constructor
    decl: MethodDecl
    # Node id and FQN of the declaring type.
    parent_id: str
    parent_fqn: str
    file_path: str
    module: str
    microservice: str
    # Graph node id produced by `symbol_id(...)` for this member.
    node_id: str
154
+
155
+
156
@dataclass
class EdgeRow:
    """One type-to-type edge row (used for EXTENDS and IMPLEMENTS; base of `InjectsRow`)."""
    src_id: str
    dst_id: str
    dst_name: str
    dst_fqn: str
    # False when the destination is a phantom (a type not found in the index).
    resolved: bool
163
+
164
+
165
@dataclass
class InjectsRow(EdgeRow):
    """INJECTS edge row: an `EdgeRow` plus how the dependency is injected."""
    # Values written by `_emit_injects`: field | lombok_required_args | constructor | setter.
    mechanism: str = ""
    # Name of the annotation that triggered the edge; "" when there is none.
    annotation: str = ""
    # Name of the field or parameter receiving the dependency.
    field_or_param: str = ""
170
+
171
+
172
@dataclass
class CallsRow:
    """One CALLS edge row between a caller node and a callee node."""
    src_id: str
    dst_id: str
    # Location and arity of the call site — presumably taken from the parsed call site; verify.
    call_site_line: int = 0
    call_site_byte: int = 0
    arg_count: int = 0
    confidence: float = 0.0
    strategy: str = "phantom"
    source: str = "static"
    resolved: bool = True
    callee_declaring_role: str = "OTHER"
184
+
185
+
186
@dataclass
class UnresolvedCallSiteRow:
    """Row describing a call site collected in `GraphTables.unresolved_call_site_rows`."""
    id: str
    caller_id: str
    call_site_line: int
    call_site_byte: int
    arg_count: int
    callee_simple: str
    receiver_expr: str
    # Presumably a short tag explaining why resolution failed — TODO confirm against the producer.
    reason: str
196
+
197
+
198
@dataclass
class DeclaresRow:
    """Plain (source id, destination id) edge row; used for DECLARES and OVERRIDES rows."""
    src_id: str
    dst_id: str
202
+
203
+
204
@dataclass
class CallResolutionStats:
    """Counters describing call resolution; all start at zero."""
    total: int = 0
    # Per-strategy counts; a defaultdict so unseen strategies start at 0.
    by_strategy: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    phantom_chained: int = 0
    phantom_other: int = 0
    callee_unresolved: int = 0
    skipped_cross_service: int = 0
212
+
213
+
214
@dataclass
class RouteRow:
    """One Route row (see the module docstring: declaration-site routes)."""
    id: str
    kind: str
    framework: str
    method: str
    path: str
    path_template: str
    path_regex: str
    topic: str
    broker: str
    feign_name: str
    feign_url: str
    microservice: str
    module: str
    filename: str
    start_line: int
    end_line: int
    resolved: bool
    # B2a brownfield composition (PR-A3); not persisted on Kuzu `Route` nodes.
    source_layer: str = "builtin"
235
+
236
+
237
@dataclass
class ExposesRow:
    """One EXPOSES edge row linking a Symbol node to a Route node."""
    symbol_id: str
    route_id: str
    confidence: float
    strategy: str
243
+
244
+
245
@dataclass
class RouteExtractionStats:
    """Counters and percentages describing route extraction."""
    routes_skipped_unresolved: int = 0
    # Per-framework and per-kind counts; defaultdicts so unseen keys start at 0.
    by_framework: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    by_kind: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    routes_resolved_pct: float = 100.0
    # Percentage of emitted `Route` rows whose `source_layer` is not `builtin`.
    # Brownfield layers: `layer_b_ann`, `layer_a_meta`, `layer_c_source`, `layer_b_fqn`.
    routes_from_brownfield_pct: float = 0.0
    # Plain dict (not a defaultdict), unlike the two counters above.
    routes_by_layer: dict[str, int] = field(default_factory=dict)
    exposes_suppressed_feign: int = 0
256
+
257
+
258
@dataclass
class HttpCallRow:
    """Row collected in `GraphTables.http_call_rows`, linking a client id to a route id."""
    client_id: str
    route_id: str
    confidence: float
    strategy: str
    method_call: str
    raw_uri: str
    # Presumably one of `VALID_HTTP_CALL_MATCHES` — TODO confirm.
    match: str
267
+
268
+
269
@dataclass
class AsyncCallRow:
    """Row collected in `GraphTables.async_call_rows`, linking a producer id to a route id."""
    producer_id: str
    route_id: str
    confidence: float
    strategy: str
    direction: str
    raw_topic: str
    match: str
278
+
279
+
280
@dataclass
class ClientRow:
    """Row collected in `GraphTables.client_rows`."""
    id: str
    # Presumably one of `VALID_CLIENT_KINDS` — TODO confirm.
    client_kind: str
    target_service: str
    path: str
    path_template: str
    path_regex: str
    method: str
    member_fqn: str
    member_id: str
    microservice: str
    module: str
    filename: str
    start_line: int
    end_line: int
    resolved: bool
    source_layer: str
298
+
299
+
300
@dataclass
class DeclaresClientRow:
    """Edge row linking a symbol id to a client id, with confidence and strategy."""
    symbol_id: str
    client_id: str
    confidence: float
    strategy: str
306
+
307
+
308
@dataclass
class ProducerRow:
    """Row collected in `GraphTables.producer_rows`."""
    id: str
    # Presumably one of `VALID_PRODUCER_KINDS` — TODO confirm.
    producer_kind: str
    topic: str
    broker: str
    direction: str
    member_fqn: str
    member_id: str
    microservice: str
    module: str
    filename: str
    start_line: int
    end_line: int
    resolved: bool
    source_layer: str
324
+
325
+
326
@dataclass
class DeclaresProducerRow:
    """Edge row linking a symbol id to a producer id, with confidence and strategy."""
    symbol_id: str
    producer_id: str
    confidence: float
    strategy: str
332
+
333
+
334
@dataclass
class ClientExtractionStats:
    """Counters describing client extraction; all start at zero."""
    clients_total: int = 0
    declares_client_total: int = 0
    # Per-kind counts; a defaultdict so unseen kinds start at 0.
    clients_by_kind: dict[str, int] = field(default_factory=lambda: defaultdict(int))
339
+
340
+
341
@dataclass
class ProducerExtractionStats:
    """Counters describing producer extraction; all start at zero."""
    producers_total: int = 0
    declares_producer_total: int = 0
    # Per-kind counts; a defaultdict so unseen kinds start at 0.
    producers_by_kind: dict[str, int] = field(default_factory=lambda: defaultdict(int))
346
+
347
+
348
@dataclass
class CallEdgeStats:
    """Counters and percentages for HTTP / async call edges.

    Every ``dict`` field is a ``defaultdict(int)``, so unseen keys start at 0.
    """
    http_calls_total: int = 0
    async_calls_total: int = 0
    http_calls_by_client_kind: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    async_calls_by_client_kind: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    http_calls_by_strategy: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    async_calls_by_strategy: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    http_calls_skipped_unresolved: int = 0
    async_calls_skipped_unresolved: int = 0
    http_clients_from_brownfield_pct: float = 0.0
    async_producers_from_brownfield_pct: float = 0.0
    http_calls_match_breakdown: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    async_calls_match_breakdown: dict[str, int] = field(default_factory=lambda: defaultdict(int))
    cross_service_calls_total: int = 0
363
+
364
+
365
@dataclass
class GraphTables:
    """In-memory state shared by all passes: lookup indexes, edge/row lists, and counters."""
    # --- indexes filled by pass 1 (`_register_type` / `pass1_parse`) ---
    types: dict[str, TypeIndexEntry] = field(default_factory=dict)  # fqn -> entry
    by_simple_name: dict[str, list[TypeIndexEntry]] = field(default_factory=dict)
    by_package: dict[str, list[TypeIndexEntry]] = field(default_factory=dict)
    files: dict[str, str] = field(default_factory=dict)  # path -> node id
    packages: dict[str, str] = field(default_factory=dict)  # pkg -> node id
    members: list[MemberEntry] = field(default_factory=list)
    # Placeholder rows for type references that could not be resolved (`_phantom_target`).
    phantoms: dict[str, dict] = field(default_factory=dict)  # id -> row
    # --- edge / node rows accumulated by the passes ---
    extends_rows: list[EdgeRow] = field(default_factory=list)
    implements_rows: list[EdgeRow] = field(default_factory=list)
    injects_rows: list[InjectsRow] = field(default_factory=list)
    calls_rows: list[CallsRow] = field(default_factory=list)
    unresolved_call_site_rows: list[UnresolvedCallSiteRow] = field(default_factory=list)
    declares_rows: list[DeclaresRow] = field(default_factory=list)
    routes_rows: list[RouteRow] = field(default_factory=list)
    exposes_rows: list[ExposesRow] = field(default_factory=list)
    http_call_rows: list[HttpCallRow] = field(default_factory=list)
    async_call_rows: list[AsyncCallRow] = field(default_factory=list)
    client_rows: list[ClientRow] = field(default_factory=list)
    declares_client_rows: list[DeclaresClientRow] = field(default_factory=list)
    producer_rows: list[ProducerRow] = field(default_factory=list)
    declares_producer_rows: list[DeclaresProducerRow] = field(default_factory=list)
    overrides_rows: list[DeclaresRow] = field(default_factory=list)
    # --- statistics ---
    route_stats: RouteExtractionStats = field(default_factory=RouteExtractionStats)
    call_edge_stats: CallEdgeStats = field(default_factory=CallEdgeStats)
    client_stats: ClientExtractionStats = field(default_factory=ClientExtractionStats)
    producer_stats: ProducerExtractionStats = field(default_factory=ProducerExtractionStats)
    # Members grouped by declaring type FQN; rebuilt by `_build_member_indexes`.
    methods_by_type: dict[str, list[MemberEntry]] = field(default_factory=dict)
    # Pass-1 counters: files whose parse failed / files that could not be read.
    parse_errors: int = 0
    skipped_files: int = 0
    pass3_skipped_cross_service: int = 0
    pass3_unresolved_phantom_receiver: int = 0
    pass3_unresolved_chained: int = 0
    cross_service_resolution: str = "auto"
    # Populated in _write_nodes (same overrides + meta_chain as Symbol.role).
    type_role_by_node_id: dict[str, str] = field(default_factory=dict)
402
+
403
+
404
+ # ---------- file walk (see `path_filtering.iter_java_source_files`) ----------
405
+
406
+
407
+ # ---------- pass 1 ----------
408
+
409
+
410
def _register_type(
    tables: GraphTables,
    decl: TypeDecl,
    *,
    file_path: str,
    module: str,
    microservice: str,
    outer_fqn: str | None,
) -> TypeIndexEntry:
    """Index ``decl``, its methods/constructors, and (recursively) its nested types.

    Adds the type to ``tables.types``, ``tables.by_simple_name`` and
    ``tables.by_package``, appends one ``MemberEntry`` per method/constructor to
    ``tables.members``, then registers every nested type with ``outer_fqn`` set
    to this type's FQN.

    Args:
        tables: Shared graph state, mutated in place.
        decl: Parsed type declaration to register.
        file_path: Path of the declaring file, stored on every entry created.
        module: Module name stored on every entry created.
        microservice: Microservice name stored on every entry created.
        outer_fqn: FQN of the enclosing type, or ``None`` for a top-level type.

    Returns:
        The ``TypeIndexEntry`` created for ``decl`` itself.
    """
    if outer_fqn is None:
        # Top-level type: package is the FQN with the trailing ".<name>" removed.
        package = decl.fqn[: -(len(decl.name) + 1)] if decl.fqn.endswith("." + decl.name) else ""
    else:
        # Nested type: climb through the already-registered enclosing types to the
        # top-level one; the package is everything before its simple name.
        top = outer_fqn
        while top in tables.types and tables.types[top].outer_fqn:
            top = tables.types[top].outer_fqn  # type: ignore[assignment]
        package = top[: top.rfind(".")] if "." in top else ""

    node_id = symbol_id(decl.kind, decl.fqn, file_path, decl.start_byte)
    entry = TypeIndexEntry(
        decl=decl,
        file_path=file_path,
        module=module,
        microservice=microservice,
        package=package,
        outer_fqn=outer_fqn,
        node_id=node_id,
    )
    tables.types[decl.fqn] = entry
    tables.by_simple_name.setdefault(decl.name, []).append(entry)
    tables.by_package.setdefault(package, []).append(entry)

    for m in decl.methods:
        kind = "constructor" if m.is_constructor else "method"
        mid = symbol_id(kind, f"{decl.fqn}#{m.signature}", file_path, m.start_byte)
        tables.members.append(MemberEntry(
            kind=kind, decl=m, parent_id=node_id, parent_fqn=decl.fqn,
            file_path=file_path, module=module, microservice=microservice,
            node_id=mid,
        ))

    # Nested types are registered after this type, so the outward walk above can
    # always find their enclosing entries in `tables.types`.
    for nested in decl.nested:
        _register_type(
            tables, nested, file_path=file_path,
            module=module, microservice=microservice, outer_fqn=decl.fqn,
        )

    return entry
462
+
463
+
464
def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str, JavaFileAst]:
    """Walk files, parse them, populate node indexes. Returns path -> AST.

    Unreadable files increment ``tables.skipped_files``; files whose content is
    empty or whitespace-only are ignored. A file for which ``parse_java`` raises
    is counted in ``tables.parse_errors`` and skipped; a file whose AST reports
    ``parse_error`` is counted too but is still indexed.

    Args:
        root: Source root that is walked for Java files.
        tables: Shared graph state, mutated in place.
        verbose: When true, progress lines are written to stderr.

    Returns:
        Mapping from file path (relative to ``root`` when possible) to its AST.
    """
    asts: dict[str, JavaFileAst] = {}
    ignore = LayeredIgnore(root)
    t0 = time.time()
    n_files = 0
    if verbose:
        _verbose_stderr_line(_PASS1_START)
    # Optional artificial delay, read from the environment; only applied in verbose mode.
    slow_sec = 0.0
    raw_slow = os.environ.get("JAVA_CODEBASE_RAG_TEST_GRAPH_SLOW_SEC", "").strip()
    if raw_slow:
        try:
            slow_sec = float(raw_slow)
        except ValueError:
            slow_sec = 0.0
    with _VerbosePassHeartbeats("[pass1]", verbose=verbose):
        if verbose and slow_sec > 0:
            time.sleep(slow_sec)
        # Resolve the root once instead of once per file.
        root_resolved = root.resolve()
        for p in iter_java_source_files(root, ignore=ignore):
            n_files += 1
            try:
                content = p.read_bytes()
            except OSError:
                tables.skipped_files += 1
                continue
            if not content.strip():
                continue
            try:
                rel = p.resolve().relative_to(root_resolved).as_posix()
            except ValueError:
                # File is not under the resolved root: keep its path as given.
                rel = p.as_posix()
            try:
                ast = parse_java(content, filename=rel, verbose=verbose)
            except Exception:
                # Best-effort build: count the failure, keep a trace for debugging, move on.
                log.debug("parse_java raised for %s", rel, exc_info=True)
                tables.parse_errors += 1
                continue
            if ast.parse_error:
                tables.parse_errors += 1
                # Still index what tree-sitter gave us; robust to syntax errors.
            module = module_for_path(str(p), root)
            microservice = microservice_for_path(str(p), root)
            asts[rel] = ast

            # file node
            file_id = symbol_id("file", rel, rel, 0)
            tables.files[rel] = file_id

            # package node (created lazily; nodes deduped by id)
            if ast.package and ast.package not in tables.packages:
                tables.packages[ast.package] = symbol_id("package", ast.package, "", 0)

            for t in ast.top_level_types:
                _register_type(
                    tables, t, file_path=rel,
                    module=module, microservice=microservice, outer_fqn=None,
                )

    if verbose:
        elapsed = time.time() - t0
        _verbose_stderr_line(
            f"[pass1] parsed {n_files} files in {elapsed:.2f}s: "
            f"{len(tables.types)} types, {len(tables.members)} members, "
            f"{tables.parse_errors} parse errors, {tables.skipped_files} skipped",
        )
    return asts
529
+
530
+
531
+ # ---------- pass 2: resolution + edges ----------
532
+
533
+
534
+ def _resolve_simple(
535
+ name: str,
536
+ *,
537
+ current: TypeIndexEntry,
538
+ ast: JavaFileAst,
539
+ tables: GraphTables,
540
+ ) -> TypeIndexEntry | None:
541
+ """Java-ish name resolution. Returns a known TypeIndexEntry or None (phantom)."""
542
+ # Strip trailing generics the caller may have left in, defensively.
543
+ bare = name.split("<", 1)[0].strip()
544
+ if not bare:
545
+ return None
546
+
547
+ # 0. Nested inside the same top-level hierarchy — try `Outer.Bare` fqn.
548
+ outer = current.outer_fqn
549
+ while outer is not None and outer in tables.types:
550
+ candidate = f"{outer}.{bare}"
551
+ if candidate in tables.types:
552
+ return tables.types[candidate]
553
+ outer = tables.types[outer].outer_fqn
554
+
555
+ # 1. Same-file siblings (same outer as `current`).
556
+ same_outer = current.outer_fqn or current.package
557
+ for e in tables.by_simple_name.get(bare, ()):
558
+ e_parent = e.outer_fqn or e.package
559
+ if e.file_path == current.file_path and e_parent == same_outer:
560
+ return e
561
+
562
+ # 2. Explicit import.
563
+ if bare in ast.explicit_imports:
564
+ fq = ast.explicit_imports[bare]
565
+ if fq in tables.types:
566
+ return tables.types[fq]
567
+ # Known FQN (outside our codebase) → unresolved; caller will phantom-ise.
568
+ return None
569
+
570
+ # 3. Same package.
571
+ if current.package:
572
+ candidate = f"{current.package}.{bare}"
573
+ if candidate in tables.types:
574
+ return tables.types[candidate]
575
+
576
+ # 4. Wildcard imports.
577
+ for wild in ast.wildcard_imports:
578
+ candidate = f"{wild}.{bare}"
579
+ if candidate in tables.types:
580
+ return tables.types[candidate]
581
+
582
+ # 5. java.lang best-effort (unresolved but deterministic phantom).
583
+ return None
584
+
585
+
586
def _phantom_target(
    tables: GraphTables,
    simple: str,
    ast: JavaFileAst,
    *,
    current: TypeIndexEntry,
) -> tuple[str, str, str]:
    """Produce (id, simple, fqn-or-best-guess) for an unresolved type reference.

    The guessed FQN falls back through: explicit import → java.lang (for names in
    ``_JAVA_LANG_SIMPLE``) → first wildcard import → bare name.

    A phantom row keyed by ``phantom_id(guess)`` is added to ``tables.phantoms``
    the first time a given guess is seen; later calls reuse it.

    Args:
        tables: Shared graph state; ``tables.phantoms`` may gain one row.
        simple: Type name as written at the reference (generics are stripped).
        ast: AST of the referencing file; supplies the imports used for guessing.
        current: Type holding the reference. Not used by the current logic.

    Returns:
        ``(phantom id, bare simple name, guessed FQN)``.
    """
    bare = simple.split("<", 1)[0].strip()
    if bare in ast.explicit_imports:
        guess_fqn = ast.explicit_imports[bare]
    elif bare in _JAVA_LANG_SIMPLE:
        guess_fqn = f"java.lang.{bare}"
    elif ast.wildcard_imports:
        # Pick first wildcard as a hint (imperfect but useful for display).
        guess_fqn = f"{ast.wildcard_imports[0]}.{bare}"
    else:
        guess_fqn = bare

    pid = phantom_id(guess_fqn)
    if pid not in tables.phantoms:
        tables.phantoms[pid] = {
            "id": pid,
            "kind": "class",
            "name": bare,
            "fqn": guess_fqn,
            # Everything before the last dot; "" when the guess has no dot.
            "package": guess_fqn.rpartition(".")[0],
            "module": "",
            "microservice": "",
            "filename": "",
            "start_line": 0,
            "end_line": 0,
            "start_byte": 0,
            "end_byte": 0,
            "modifiers": [],
            "annotations": [],
            "capabilities": [],
            "role": "OTHER",
            "signature": "",
            "parent_id": "",
            "resolved": False,
        }
    return pid, bare, guess_fqn
631
+
632
+
633
def _edge_for(
    *,
    src: TypeIndexEntry,
    target_simple: str,
    ast: JavaFileAst,
    tables: GraphTables,
) -> tuple[str, str, str, bool]:
    """Return ``(dst id, dst simple name, dst fqn, resolved)`` for a type reference.

    An indexed type yields its own node id with ``resolved=True``; anything else
    yields a phantom target with ``resolved=False``.
    """
    hit = _resolve_simple(target_simple, current=src, ast=ast, tables=tables)
    if hit is None:
        phantom = _phantom_target(tables, target_simple, ast, current=src)
        return (*phantom, False)
    return hit.node_id, hit.decl.name, hit.decl.fqn, True
645
+
646
+
647
+ def _emit_extends_implements(
648
+ entry: TypeIndexEntry,
649
+ ast: JavaFileAst,
650
+ tables: GraphTables,
651
+ *,
652
+ seen_ext: set[tuple[str, str]],
653
+ seen_impl: set[tuple[str, str]],
654
+ ) -> None:
655
+ for name in entry.decl.extends:
656
+ dst_id, dst_simple, dst_fqn, ok = _edge_for(
657
+ src=entry, target_simple=name, ast=ast, tables=tables,
658
+ )
659
+ key = (entry.node_id, dst_id)
660
+ if key in seen_ext:
661
+ continue
662
+ seen_ext.add(key)
663
+ tables.extends_rows.append(EdgeRow(
664
+ src_id=entry.node_id, dst_id=dst_id,
665
+ dst_name=dst_simple, dst_fqn=dst_fqn, resolved=ok,
666
+ ))
667
+
668
+ for name in entry.decl.implements:
669
+ dst_id, dst_simple, dst_fqn, ok = _edge_for(
670
+ src=entry, target_simple=name, ast=ast, tables=tables,
671
+ )
672
+ key = (entry.node_id, dst_id)
673
+ if key in seen_impl:
674
+ continue
675
+ seen_impl.add(key)
676
+ tables.implements_rows.append(EdgeRow(
677
+ src_id=entry.node_id, dst_id=dst_id,
678
+ dst_name=dst_simple, dst_fqn=dst_fqn, resolved=ok,
679
+ ))
680
+
681
+
682
+ def _emit_injects(
683
+ entry: TypeIndexEntry,
684
+ ast: JavaFileAst,
685
+ tables: GraphTables,
686
+ *,
687
+ seen: set[tuple[str, str, str, str]],
688
+ ) -> None:
689
+ if entry.decl.kind == "interface":
690
+ return
691
+
692
+ ann_names = [a.name for a in entry.decl.annotations]
693
+ inject_set = injection_annotation_names()
694
+ lombok_rac = lombok_required_args_annotations()
695
+ has_lombok_rac = any(a in lombok_rac for a in ann_names)
696
+
697
+ def _add(
698
+ target: str, mechanism: str, annotation: str, slot: str,
699
+ ) -> None:
700
+ dst_id, dst_simple, dst_fqn, ok = _edge_for(
701
+ src=entry, target_simple=target, ast=ast, tables=tables,
702
+ )
703
+ key = (entry.node_id, dst_id, mechanism, slot)
704
+ if key in seen:
705
+ return
706
+ seen.add(key)
707
+ tables.injects_rows.append(InjectsRow(
708
+ src_id=entry.node_id, dst_id=dst_id,
709
+ dst_name=dst_simple, dst_fqn=dst_fqn, resolved=ok,
710
+ mechanism=mechanism, annotation=annotation, field_or_param=slot,
711
+ ))
712
+
713
+ # Field injection: @Autowired / @Inject / @Resource.
714
+ for f in entry.decl.fields:
715
+ annotated = next((a.name for a in f.annotations if a.name in inject_set), None)
716
+ if annotated:
717
+ _add(f.type_name, "field", annotated, f.name)
718
+
719
+ # Lombok: @RequiredArgsConstructor -> each `final` non-static field becomes an injection;
720
+ # @AllArgsConstructor -> every non-static field.
721
+ if has_lombok_rac:
722
+ all_args = "AllArgsConstructor" in ann_names
723
+ for f in entry.decl.fields:
724
+ if "static" in f.modifiers:
725
+ continue
726
+ if not all_args and "final" not in f.modifiers:
727
+ continue
728
+ _add(f.type_name, "lombok_required_args",
729
+ "AllArgsConstructor" if all_args else "RequiredArgsConstructor",
730
+ f.name)
731
+
732
+ # Constructor injection:
733
+ ctors = [m for m in entry.decl.methods if m.is_constructor]
734
+ if ctors:
735
+ chosen = None
736
+ autowired = [c for c in ctors if any(a.name == "Autowired" for a in c.annotations)]
737
+ if autowired:
738
+ chosen = autowired[0]
739
+ elif len(ctors) == 1 and ctors[0].parameters:
740
+ chosen = ctors[0]
741
+ if chosen is not None:
742
+ annotation = "Autowired" if any(a.name == "Autowired" for a in chosen.annotations) else ""
743
+ for p in chosen.parameters:
744
+ _add(p.type_name, "constructor", annotation, p.name)
745
+
746
+ # Setter injection: setXxx annotated @Autowired with 1 parameter.
747
+ for m in entry.decl.methods:
748
+ if m.is_constructor or not m.name.startswith("set") or len(m.parameters) != 1:
749
+ continue
750
+ if any(a.name == "Autowired" for a in m.annotations):
751
+ _add(m.parameters[0].type_name, "setter", "Autowired",
752
+ m.parameters[0].name)
753
+
754
+
755
def pass2_edges(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: bool) -> None:
    """Emit EXTENDS, IMPLEMENTS and INJECTS rows for every indexed type.

    Types whose file has no entry in ``asts`` are skipped. Unresolved targets
    add phantom rows to ``tables.phantoms`` as a side effect.

    Args:
        tables: Shared graph state, mutated in place.
        asts: Mapping from file path to AST, as returned by ``pass1_parse``.
        verbose: When true, progress lines are written to stderr.
    """
    t0 = time.time()
    seen_ext: set[tuple[str, str]] = set()
    seen_impl: set[tuple[str, str]] = set()
    seen_inj: set[tuple[str, str, str, str]] = set()
    if verbose:
        _verbose_stderr_line(_PASS2_START)
    with _VerbosePassHeartbeats("[pass2]", verbose=verbose):
        # Only the entries are needed; the FQN key is available as entry.decl.fqn.
        for entry in tables.types.values():
            ast = asts.get(entry.file_path)
            if ast is None:
                continue
            _emit_extends_implements(entry, ast, tables, seen_ext=seen_ext, seen_impl=seen_impl)
            _emit_injects(entry, ast, tables, seen=seen_inj)
    if verbose:
        elapsed = time.time() - t0
        _verbose_stderr_line(
            f"[pass2] emitted {len(tables.extends_rows)} EXTENDS, "
            f"{len(tables.implements_rows)} IMPLEMENTS, "
            f"{len(tables.injects_rows)} INJECTS, "
            f"{len(tables.phantoms)} phantoms in {elapsed:.2f}s",
        )
777
+
778
+
779
+ # ---------- pass 3: call graph ----------
780
+
781
+
782
+ def _build_member_indexes(tables: GraphTables) -> None:
783
+ tables.methods_by_type = {}
784
+ for m in tables.members:
785
+ tables.methods_by_type.setdefault(m.parent_fqn, []).append(m)
786
+
787
+
788
+ def _direct_supertype_fqns(entry: TypeIndexEntry, tables: GraphTables) -> list[str]:
789
+ out: list[str] = []
790
+ for r in tables.extends_rows:
791
+ if r.src_id == entry.node_id and r.dst_fqn in tables.types:
792
+ out.append(r.dst_fqn)
793
+ for r in tables.implements_rows:
794
+ if r.src_id == entry.node_id and r.dst_fqn in tables.types:
795
+ out.append(r.dst_fqn)
796
+ return out
797
+
798
+
799
+ def _first_supertype_fqn(tables: GraphTables, type_fqn: str) -> str | None:
800
+ entry = tables.types.get(type_fqn)
801
+ if entry is None:
802
+ return None
803
+ for r in tables.extends_rows:
804
+ if r.src_id == entry.node_id and r.dst_fqn in tables.types:
805
+ return r.dst_fqn
806
+ for r in tables.implements_rows:
807
+ if r.src_id == entry.node_id and r.dst_fqn in tables.types:
808
+ return r.dst_fqn
809
+ return None
810
+
811
+
812
+ def _is_chained_receiver_text(receiver_expr: str) -> bool:
813
+ """Heuristic: call chain or complex expr (contains a completed call)."""
814
+ s = receiver_expr.strip()
815
+ return "(" in s and ")" in s
816
+
817
+
818
+ def _resolve_this_super_field_chain(
819
+ expr: str,
820
+ *,
821
+ member: MemberEntry,
822
+ ast: JavaFileAst,
823
+ tables: GraphTables,
824
+ ) -> str | None:
825
+ """Resolve `this.a.b` / `super.a` (no calls) to the final field's type FQN."""
826
+ s = expr.strip()
827
+ if "(" in s or ")" in s or "." not in s:
828
+ return None
829
+ entry = tables.types.get(member.parent_fqn)
830
+ if entry is None:
831
+ return None
832
+ parts = s.split(".")
833
+ if len(parts) < 2:
834
+ return None
835
+ if parts[0] == "this":
836
+ cur = entry
837
+ elif parts[0] == "super":
838
+ sup = _first_supertype_fqn(tables, member.parent_fqn)
839
+ if sup is None or sup not in tables.types:
840
+ return None
841
+ cur = tables.types[sup]
842
+ else:
843
+ return None
844
+ for fname in parts[1:]:
845
+ fld = next((f for f in cur.decl.fields if f.name == fname), None)
846
+ if fld is None:
847
+ return None
848
+ resolved = _resolve_simple(fld.type_name, current=cur, ast=ast, tables=tables)
849
+ if resolved is None:
850
+ return None
851
+ cur = resolved
852
+ return cur.decl.fqn
853
+
854
+
855
+ def _scope_table(member: MemberEntry, ast: JavaFileAst, tables: GraphTables) -> dict[str, str]:
856
+ """Map simple variable/field/param name -> resolved declaring type FQN."""
857
+ scope: dict[str, str] = {}
858
+ entry = tables.types.get(member.parent_fqn)
859
+ if entry is None:
860
+ return scope
861
+
862
+ def add_fields(tentry: TypeIndexEntry) -> None:
863
+ for f in tentry.decl.fields:
864
+ resolved = _resolve_simple(f.type_name, current=tentry, ast=ast, tables=tables)
865
+ if resolved is not None:
866
+ scope[f.name] = resolved.decl.fqn
867
+
868
+ add_fields(entry)
869
+ seen: set[str] = {member.parent_fqn}
870
+ queue = list(_direct_supertype_fqns(entry, tables))
871
+ while queue:
872
+ sup = queue.pop()
873
+ if sup in seen or sup not in tables.types:
874
+ continue
875
+ seen.add(sup)
876
+ te = tables.types[sup]
877
+ for f in te.decl.fields:
878
+ if f.name not in scope:
879
+ resolved = _resolve_simple(f.type_name, current=te, ast=ast, tables=tables)
880
+ if resolved is not None:
881
+ scope[f.name] = resolved.decl.fqn
882
+ queue.extend(_direct_supertype_fqns(te, tables))
883
+
884
+ for p in member.decl.parameters:
885
+ resolved = _resolve_simple(p.type_name, current=entry, ast=ast, tables=tables)
886
+ if resolved is not None:
887
+ scope[p.name] = resolved.decl.fqn
888
+
889
+ # Locals shadow fields and parameters (same simple name → local wins).
890
+ for name, t_simple in member.decl.local_vars:
891
+ resolved = _resolve_simple(t_simple, current=entry, ast=ast, tables=tables)
892
+ if resolved is not None:
893
+ scope[name] = resolved.decl.fqn
894
+
895
+ return scope
896
+
897
+
898
+ def _lookup_method_candidates(
899
+ type_fqn: str,
900
+ callee_simple: str,
901
+ arg_count: int,
902
+ tables: GraphTables,
903
+ ast: JavaFileAst,
904
+ *,
905
+ visited: set[str] | None = None,
906
+ ) -> tuple[list[MemberEntry], bool]:
907
+ """Return (candidates, used_name_only_fallback). Walks type + supertypes.
908
+
909
+ When ``used_name_only_fallback`` is true and ``len(candidates) == 1``, the
910
+ caller may reuse the receiver-resolution strategy (see ``_resolve_and_emit_call``)
911
+ instead of tagging ``overload_ambiguous``.
912
+ """
913
+ if visited is None:
914
+ visited = set()
915
+ exact: list[MemberEntry] = []
916
+ name_only: list[MemberEntry] = []
917
+
918
+ def collect_on_type(tfqn: str) -> None:
919
+ nonlocal exact, name_only
920
+ for m in tables.methods_by_type.get(tfqn, ()):
921
+ if callee_simple == "<init>":
922
+ if not m.decl.is_constructor:
923
+ continue
924
+ np = len(m.decl.parameters)
925
+ if arg_count < 0:
926
+ name_only.append(m)
927
+ elif np == arg_count:
928
+ exact.append(m)
929
+ else:
930
+ name_only.append(m)
931
+ continue
932
+ if m.decl.is_constructor:
933
+ continue
934
+ if m.decl.name != callee_simple:
935
+ continue
936
+ np = len(m.decl.parameters)
937
+ if arg_count < 0:
938
+ name_only.append(m)
939
+ elif np == arg_count:
940
+ exact.append(m)
941
+ else:
942
+ name_only.append(m)
943
+
944
+ queue = [type_fqn]
945
+ while queue:
946
+ tfqn = queue.pop(0)
947
+ if tfqn in visited or tfqn not in tables.types:
948
+ continue
949
+ visited.add(tfqn)
950
+ collect_on_type(tfqn)
951
+ te = tables.types[tfqn]
952
+ for sup in _direct_supertype_fqns(te, tables):
953
+ if sup not in visited:
954
+ queue.append(sup)
955
+ # Synthetic anonymous classes (`….<anon:byte>`): unqualified instance calls
956
+ # may target the lexically enclosing type (D3), e.g. `pingFromAnon()` from
957
+ # `NestedCalls` inside `new Runnable() { void run() { … } }`.
958
+ if ".<anon:" in tfqn and te.outer_fqn and te.outer_fqn not in visited:
959
+ queue.append(te.outer_fqn)
960
+
961
+ if exact:
962
+ return exact, False
963
+ if name_only:
964
+ return name_only, True
965
+ return [], False
966
+
967
+
968
+ def _static_wildcard_resolve(
969
+ callee_simple: str,
970
+ ast: JavaFileAst,
971
+ tables: GraphTables,
972
+ current: TypeIndexEntry,
973
+ ) -> str | None:
974
+ for tw in ast.file_imports.static_wildcards:
975
+ if tw not in tables.types:
976
+ continue
977
+ for m in tables.methods_by_type.get(tw, ()):
978
+ if m.decl.name != callee_simple or m.decl.is_constructor:
979
+ continue
980
+ if "static" not in m.decl.modifiers:
981
+ continue
982
+ return tw
983
+ return None
984
+
985
+
986
+ def _unique_type_simple_resolve(simple: str, tables: GraphTables) -> str | None:
987
+ """Return the type FQN iff exactly one indexed type uses `simple` as `decl.name`.
988
+
989
+ Used only for receiver / static-qualifier disambiguation. Do not use the
990
+ method index here: an unresolved identifier that equals some method's
991
+ simple name elsewhere in the project is not evidence about the receiver type.
992
+ """
993
+ hits = tables.by_simple_name.get(simple, [])
994
+ if len(hits) != 1:
995
+ return None
996
+ return hits[0].decl.fqn
997
+
998
+
999
+ def _suffix_resolve(receiver_simple: str, tables: GraphTables) -> str | None:
1000
+ matches = [fq for fq in tables.types if fq.endswith("." + receiver_simple)]
1001
+ if len(matches) != 1:
1002
+ return None
1003
+ return matches[0]
1004
+
1005
+
1006
+ def _resolve_receiver_type(
1007
+ call: CallSite,
1008
+ *,
1009
+ scope: dict[str, str],
1010
+ member: MemberEntry,
1011
+ ast: JavaFileAst,
1012
+ tables: GraphTables,
1013
+ ) -> tuple[str | None, str, float]:
1014
+ """Returns (receiver_type_fqn_or_none, strategy, confidence)."""
1015
+ expr = call.receiver_expr.strip()
1016
+ callee = call.callee_simple
1017
+
1018
+ effective_static = call.is_static_call
1019
+ if call.is_static_call and expr and not _is_chained_receiver_text(expr):
1020
+ bare_for_static = expr.split("<", 1)[0].strip()
1021
+ if bare_for_static and "." not in bare_for_static and bare_for_static in scope:
1022
+ effective_static = False
1023
+
1024
+ if not expr and not call.is_static_call:
1025
+ if callee in ast.file_imports.static_methods:
1026
+ full = ast.file_imports.static_methods[callee]
1027
+ if "." in full:
1028
+ type_fqn = full.rsplit(".", 1)[0]
1029
+ return type_fqn, "static_import", 0.95
1030
+ sw = _static_wildcard_resolve(callee, ast, tables, tables.types[member.parent_fqn])
1031
+ if sw is not None:
1032
+ return sw, "static_import_wildcard", 0.85
1033
+
1034
+ if effective_static and expr:
1035
+ if _is_chained_receiver_text(expr):
1036
+ return None, "chained_receiver", 0.0
1037
+ entry = tables.types.get(member.parent_fqn)
1038
+ if entry is None:
1039
+ return None, "chained_receiver", 0.0
1040
+ bare_static = expr.split("<", 1)[0].strip()
1041
+ resolved = _resolve_simple(bare_static, current=entry, ast=ast, tables=tables)
1042
+ if resolved is not None:
1043
+ return resolved.decl.fqn, "import_map", 0.95
1044
+ # External type not in the index but FQN is deterministic via an explicit import.
1045
+ # e.g. `import java.util.Objects; Objects.requireNonNull(x)` — we know the FQN
1046
+ # is "java.util.Objects" even though the type isn't indexed; return it so the
1047
+ # edge carries the correct receiver-tier confidence rather than collapsing to phantom.
1048
+ if bare_static in ast.explicit_imports:
1049
+ return ast.explicit_imports[bare_static], "import_map", 0.95
1050
+ uq = _unique_type_simple_resolve(expr, tables)
1051
+ if uq is not None:
1052
+ return uq, "unique_type_name", 0.75
1053
+ sf = _suffix_resolve(expr, tables)
1054
+ if sf is not None:
1055
+ return sf, "suffix", 0.55
1056
+ return None, "phantom", 0.0
1057
+
1058
+ if expr in ("", "this"):
1059
+ return member.parent_fqn, "this_super", 0.95
1060
+
1061
+ if expr == "super":
1062
+ sup = _first_supertype_fqn(tables, member.parent_fqn)
1063
+ if sup is not None:
1064
+ return sup, "this_super", 0.95
1065
+ # No indexed supertype — implicit super to java.lang.Object.
1066
+ # Keep strategy='implicit_super' and confidence=0.90 so this path is
1067
+ # distinguishable from a genuinely unresolvable receiver.
1068
+ return "java.lang.Object", "implicit_super", 0.90
1069
+
1070
+ if _is_chained_receiver_text(expr):
1071
+ return None, "chained_receiver", 0.0
1072
+
1073
+ entry = tables.types.get(member.parent_fqn)
1074
+ if entry is None:
1075
+ return None, "phantom", 0.0
1076
+
1077
+ bare = expr.split("<", 1)[0].strip()
1078
+ if bare in scope:
1079
+ return scope[bare], "import_map", 0.95
1080
+
1081
+ chain = _resolve_this_super_field_chain(expr, member=member, ast=ast, tables=tables)
1082
+ if chain is not None:
1083
+ return chain, "import_map", 0.95
1084
+
1085
+ resolved = _resolve_simple(bare, current=entry, ast=ast, tables=tables)
1086
+ if resolved is not None:
1087
+ return resolved.decl.fqn, "import_map", 0.95
1088
+
1089
+ if entry.package:
1090
+ cand = f"{entry.package}.{bare}"
1091
+ if cand in tables.types:
1092
+ return cand, "same_module", 0.90
1093
+
1094
+ uq = _unique_type_simple_resolve(bare, tables)
1095
+ if uq is not None:
1096
+ return uq, "unique_type_name", 0.75
1097
+
1098
+ sf = _suffix_resolve(bare, tables)
1099
+ if sf is not None:
1100
+ return sf, "suffix", 0.55
1101
+
1102
+ return None, "phantom", 0.0
1103
+
1104
+
1105
def _phantom_method_id(
    tables: GraphTables,
    *,
    receiver_fqn: str | None,
    receiver_expr: str,
    callee: str,
    arg_count: int,
) -> str:
    """Return the phantom Symbol id for an unindexed callee, registering it once.

    With a resolved receiver the identity is ``<receiver_fqn>#<callee>(?)``:
    call-site arity is left out so method references (``arg_count == -1``) and
    ordinary invocations share one Symbol per (receiver_fqn, callee) (D1).
    Without a receiver type the identity is built from the first 50 characters
    of the receiver text (``?`` when empty) plus the callee and its arity
    (``(?)`` for negative ``arg_count``).

    The placeholder row is stored in ``tables.phantoms`` the first time an id
    is seen; later calls return the id without touching the stored row.
    """
    if receiver_fqn:
        sig = f"{callee}(?)"
        fqn = f"{receiver_fqn}#{sig}"
    else:
        arity = "(?)" if arg_count < 0 else f"({arg_count})"
        expr_short = receiver_expr[:50] if receiver_expr else "?"
        sig = f"{callee}{arity}"
        fqn = f"?{expr_short}#{sig}"

    pid = phantom_id(fqn)
    if pid in tables.phantoms:
        return pid

    tables.phantoms[pid] = {
        "id": pid,
        "kind": "method",
        "name": callee,
        "fqn": fqn,
        "package": "",
        "module": "",
        "microservice": "",
        "filename": "",
        "start_line": 0,
        "end_line": 0,
        "start_byte": 0,
        "end_byte": 0,
        "modifiers": [],
        "annotations": [],
        "capabilities": [],
        "role": "OTHER",
        "signature": sig,
        "parent_id": "",
        "resolved": False,
    }
    return pid
1148
+
1149
+
1150
+ def _method_signature_matches_call(member: MemberEntry, call: CallSite) -> bool:
1151
+ if call.arg_count < 0:
1152
+ return True
1153
+ return len(member.decl.parameters) == call.arg_count
1154
+
1155
+
1156
+ def _is_strict_supertype_of(tables: GraphTables, super_fqn: str, subtype_fqn: str) -> bool:
1157
+ if super_fqn == subtype_fqn:
1158
+ return False
1159
+ entry = tables.types.get(subtype_fqn)
1160
+ if entry is None:
1161
+ return False
1162
+ visited: set[str] = set()
1163
+ queue = list(_direct_supertype_fqns(entry, tables))
1164
+ while queue:
1165
+ tfqn = queue.pop(0)
1166
+ if tfqn == super_fqn:
1167
+ return True
1168
+ if tfqn in visited or tfqn not in tables.types:
1169
+ continue
1170
+ visited.add(tfqn)
1171
+ queue.extend(_direct_supertype_fqns(tables.types[tfqn], tables))
1172
+ return False
1173
+
1174
+
1175
+ def _callee_declaring_role_at_write(
1176
+ tables: GraphTables,
1177
+ dst_id: str,
1178
+ *,
1179
+ member_by_id: dict[str, MemberEntry],
1180
+ ) -> str:
1181
+ """Match parent declaring-type Symbol.role (brownfield + meta_chain included)."""
1182
+ if dst_id in tables.phantoms:
1183
+ return "OTHER"
1184
+ member = member_by_id.get(dst_id)
1185
+ if member is None:
1186
+ return "OTHER"
1187
+ return tables.type_role_by_node_id.get(member.parent_id, "OTHER")
1188
+
1189
+
1190
+ def _collapse_supertype_duplicates(
1191
+ candidates: list[MemberEntry],
1192
+ recv_type_fqn: str,
1193
+ call: CallSite,
1194
+ tables: GraphTables,
1195
+ ) -> list[MemberEntry]:
1196
+ """§3.3.1 supertype-walk dedup — collapse interface + concrete duplicate sites."""
1197
+ if len(candidates) <= 1:
1198
+ return candidates
1199
+ concrete_on_receiver = [
1200
+ c for c in candidates
1201
+ if c.parent_fqn == recv_type_fqn and _method_signature_matches_call(c, call)
1202
+ ]
1203
+ if len(concrete_on_receiver) != 1:
1204
+ return candidates
1205
+ concrete = concrete_on_receiver[0]
1206
+ supertypes = [
1207
+ c for c in candidates
1208
+ if c is not concrete
1209
+ and _is_strict_supertype_of(tables, c.parent_fqn, recv_type_fqn)
1210
+ and c.decl.signature == concrete.decl.signature
1211
+ ]
1212
+ if not supertypes:
1213
+ return candidates
1214
+ allowed_ids = {concrete.node_id, *(c.node_id for c in supertypes)}
1215
+ if any(c.node_id not in allowed_ids for c in candidates):
1216
+ return candidates
1217
+ log.debug(
1218
+ "pass3 supertype dedup %s -> %s",
1219
+ [c.node_id for c in candidates],
1220
+ concrete.node_id,
1221
+ )
1222
+ return [concrete]
1223
+
1224
+
1225
+ def _unresolved_call_site_id(caller_id: str, call: CallSite) -> str:
1226
+ return f"ucs:{caller_id}:{call.line}:{call.byte}"
1227
+
1228
+
1229
def _emit_unresolved_call_site(
    tables: GraphTables,
    stats: CallResolutionStats,
    *,
    caller_id: str,
    call: CallSite,
    reason: str,
) -> None:
    """Record a call site that yields no CALLS edge and count it.

    Appends one ``UnresolvedCallSiteRow`` to ``tables.unresolved_call_site_rows``
    (a missing receiver expression is stored as ``""``). ``stats.phantom_chained``
    is incremented when ``reason`` is ``"chained_receiver"``; every other reason
    increments ``stats.phantom_other``.
    """
    row = UnresolvedCallSiteRow(
        id=_unresolved_call_site_id(caller_id, call),
        caller_id=caller_id,
        call_site_line=call.line,
        call_site_byte=call.byte,
        arg_count=call.arg_count,
        callee_simple=call.callee_simple,
        receiver_expr=call.receiver_expr or "",
        reason=reason,
    )
    tables.unresolved_call_site_rows.append(row)

    if reason != "chained_receiver":
        stats.phantom_other += 1
        return
    stats.phantom_chained += 1
1251
+
1252
+
1253
def _emit_call_edge(
    tables: GraphTables,
    stats: CallResolutionStats,
    *,
    src_id: str,
    dst_id: str,
    call: CallSite,
    confidence: float,
    strategy: str,
    resolved: bool,
    edge_arg_count: int | None = None,
) -> None:
    """Append one CALLS row for ``call`` and update the resolution counters.

    The row's ``arg_count`` is ``edge_arg_count`` when given, otherwise the
    call site's own ``arg_count``; ``source`` is always ``"static"``.
    ``stats.total`` and ``stats.by_strategy[strategy]`` are always incremented;
    ``stats.callee_unresolved`` only for ``resolved=False`` edges.
    """
    row = CallsRow(
        src_id=src_id,
        dst_id=dst_id,
        call_site_line=call.line,
        call_site_byte=call.byte,
        arg_count=call.arg_count if edge_arg_count is None else edge_arg_count,
        confidence=confidence,
        strategy=strategy,
        source="static",
        resolved=resolved,
    )
    tables.calls_rows.append(row)

    stats.total += 1
    stats.by_strategy[strategy] += 1
    if not resolved:
        stats.callee_unresolved += 1
1281
+
1282
+
1283
def _resolve_and_emit_call(
    call: CallSite,
    member: MemberEntry,
    ast: JavaFileAst,
    tables: GraphTables,
    stats: CallResolutionStats,
    *,
    scope: dict[str, str],
) -> None:
    """Emit the CALLS rows (or an unresolved-site row) for one call site.

    Steps:

    1. Resolve the receiver type. A chained receiver, or no receiver type at
       all, is recorded as an unresolved call site and nothing else happens.
    2. Look up callee candidates with ``_lookup_method_candidates`` (exact
       arity first, then name-only fallback over the type + supertype walk).
    3. If the caller belongs to a microservice and some, but not all,
       candidates share it, keep only the same-service ones; each dropped
       candidate that names a different service is logged and counted.
    4. Pick the edge strategy: ``method_reference`` for negative arity,
       ``implicit_super`` (confidence 0.90) for the synthesized super
       constructor site, ``constructor`` for other ``<init>`` calls,
       ``overload_ambiguous`` for a name-only fallback with several
       candidates, otherwise the receiver-resolution strategy. A name-only
       fallback with exactly one candidate is not ambiguous and therefore
       also keeps the receiver strategy.
    5. No candidates: emit one ``resolved=False`` edge to a phantom method,
       keeping the receiver-tier strategy and confidence (B3 fix).
    6. Otherwise emit one edge per remaining candidate after the supertype
       dedup; several surviving candidates are all tagged
       ``overload_ambiguous``.
    """
    caller_id = member.node_id
    recv_type, strat, conf = _resolve_receiver_type(
        call, scope=scope, member=member, ast=ast, tables=tables,
    )

    if strat == "chained_receiver" or recv_type is None:
        reason = (
            "chained_receiver" if strat == "chained_receiver"
            else "phantom_unresolved_receiver"
        )
        _emit_unresolved_call_site(
            tables, stats, caller_id=caller_id, call=call, reason=reason,
        )
        return

    candidates, name_only_fb = _lookup_method_candidates(
        recv_type, call.callee_simple, call.arg_count, tables, ast,
    )

    # Guard relies on `_lookup_method_candidates` returning a same-ms candidate when one exists; revisit if pass3 scopes lookups per-microservice.
    caller_ms = member.microservice
    if caller_ms:
        same_ms = [c for c in candidates if c.microservice == caller_ms]
        if same_ms and len(same_ms) != len(candidates):
            for dropped in candidates:
                if dropped.microservice and dropped.microservice != caller_ms:
                    log.warning(
                        "skipping cross-microservice CALLS edge %s -> %s "
                        "(caller=%s, callee=%s)",
                        f"{member.parent_fqn}#{member.decl.signature}",
                        f"{dropped.parent_fqn}#{dropped.decl.signature}",
                        member.microservice, dropped.microservice,
                    )
                    stats.skipped_cross_service += 1
            candidates = same_ms

    # Compute the call-shape strategy / confidence override BEFORE the
    # empty-candidates check so they are preserved even when the callee cannot
    # be located on the resolved receiver type (B3 fix).
    is_method_ref = call.arg_count < 0
    edge_conf = conf
    if is_method_ref:
        edge_strat = "method_reference"
    elif call.callee_simple == "<init>":
        # A super(...) site located exactly at the declaration start is the
        # implicit-super call synthesized by _parse_method.
        synthesized_super = call.receiver_expr == "super" and (
            call.byte == member.decl.start_byte and call.line == member.decl.start_line
        )
        if synthesized_super:
            edge_strat = "implicit_super"
            edge_conf = 0.90
        else:
            # new Foo(…), this(…), super(…) — confidence inherited from receiver tier.
            edge_strat = "constructor"
    elif name_only_fb and len(candidates) > 1:
        edge_strat = "overload_ambiguous"
    else:
        # Includes the name-only fallback with a single candidate.
        edge_strat = strat

    if not candidates:
        # Receiver was resolved but the callee method isn't indexed on that type
        # (e.g. JDK / Spring / external library). Only resolved=False signals
        # the phantom callee; strategy and confidence stay at the receiver tier.
        pid = _phantom_method_id(
            tables, receiver_fqn=recv_type, receiver_expr=call.receiver_expr,
            callee=call.callee_simple, arg_count=call.arg_count,
        )
        _emit_call_edge(
            tables, stats, src_id=caller_id, dst_id=pid, call=call,
            confidence=edge_conf, strategy=edge_strat, resolved=False,
        )
        return

    if len(candidates) > 1 and edge_strat != "overload_ambiguous":
        candidates = _collapse_supertype_duplicates(candidates, recv_type, call, tables)

    final_strat = edge_strat if len(candidates) == 1 else "overload_ambiguous"
    for target in candidates:
        # Method references carry no arity of their own; record the callee's.
        ref_arity = len(target.decl.parameters) if is_method_ref else None
        _emit_call_edge(
            tables, stats, src_id=caller_id, dst_id=target.node_id, call=call,
            confidence=edge_conf, strategy=final_strat, resolved=True,
            edge_arg_count=ref_arity,
        )
1399
+
1400
+
1401
def _resolve_method_calls(
    member: MemberEntry,
    ast: JavaFileAst,
    tables: GraphTables,
    stats: CallResolutionStats,
) -> None:
    """Resolve every call site recorded on ``member``.

    The variable scope is computed once per member and shared by all of its
    call sites. A failure on one call site is logged as a warning and does not
    stop the remaining sites from being processed.
    """
    local_scope = _scope_table(member, ast, tables)
    for site in member.decl.call_sites:
        try:
            _resolve_and_emit_call(site, member, ast, tables, stats, scope=local_scope)
        except Exception as exc:
            log.warning("call resolution failed for %s: %s", member.decl.signature, exc)
1413
+
1414
+
1415
+ def _process_file_calls(
1416
+ file_ast: JavaFileAst,
1417
+ file_path: str,
1418
+ tables: GraphTables,
1419
+ stats: CallResolutionStats,
1420
+ ) -> None:
1421
+ for member in tables.members:
1422
+ if member.file_path != file_path:
1423
+ continue
1424
+ try:
1425
+ _resolve_method_calls(member, file_ast, tables, stats)
1426
+ except Exception as e:
1427
+ log.warning("Failed to extract calls from %s#%s: %s", member.parent_fqn, member.decl.signature, e)
1428
+
1429
+
1430
def pass3_calls(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: bool) -> None:
    """Pass 3: resolve the call sites of every member and publish the summary.

    Members are grouped by ``file_path`` once up front, so each file only
    visits its own members instead of rescanning all of ``tables.members`` per
    file (which cost O(files x members)). Files are processed in ``asts``
    order and members keep their ``tables.members`` order within a file. A
    failure on one member is logged as a warning and does not stop the pass.

    After the walk the unresolved / cross-service counters are copied onto
    ``tables`` and a one-line summary is logged (and echoed to stderr when
    ``verbose`` is set).
    """
    if verbose:
        _verbose_stderr_line(_PASS3_START)
    _build_member_indexes(tables)
    stats = CallResolutionStats()

    # Group after the member indexes are built; relative order is preserved.
    members_by_file: dict[str, list[MemberEntry]] = defaultdict(list)
    for member in tables.members:
        members_by_file[member.file_path].append(member)

    with _VerbosePassHeartbeats("[pass3]", verbose=verbose):
        for rel_path, file_ast in asts.items():
            for member in members_by_file.get(rel_path, ()):
                try:
                    _resolve_method_calls(member, file_ast, tables, stats)
                except Exception as e:
                    log.warning(
                        "Failed to extract calls from %s#%s: %s",
                        member.parent_fqn, member.decl.signature, e,
                    )

    # max(1, ...) keeps the percentages defined when nothing was emitted.
    denom_calls = max(1, stats.total)
    denom_sites = max(1, stats.total + stats.phantom_chained + stats.phantom_other)
    pct_chained = 100.0 * stats.phantom_chained / denom_sites
    pct_callee_unres = 100.0 * stats.callee_unresolved / denom_calls
    pct_phantom_recv = 100.0 * stats.phantom_other / denom_sites
    tables.pass3_skipped_cross_service = int(stats.skipped_cross_service)
    tables.pass3_unresolved_phantom_receiver = int(stats.phantom_other)
    tables.pass3_unresolved_chained = int(stats.phantom_chained)
    msg = (
        f"Call resolution: {stats.total} CALLS rows, {stats.phantom_chained} chained unresolved "
        f"({pct_chained:.1f}%), {stats.callee_unresolved} unresolved callee on CALLS "
        f"({pct_callee_unres:.1f}%), {stats.phantom_other} phantom-receiver unresolved "
        f"({pct_phantom_recv:.1f}%), {stats.skipped_cross_service} skipped cross-service, "
        f"strategies: {dict(stats.by_strategy)}"
    )
    log.info(msg)
    if verbose:
        _verbose_stderr_line(f"[pass3] {msg}")
1459
+
1460
+
1461
+ _PATH_VAR_SEG = re.compile(r"^\{([^:{}]+)(?::([^}]*))?\}$") # whole path segment
1462
+
1463
+
1464
+ def _normalize_path(raw_path: str) -> tuple[str, str]:
1465
+ """Return `(path_template, path_regex)` for a servlet-style path pattern.
1466
+
1467
+ `/api/users/{id}` → ``("/api/users/{}", "^/api/users/[^/]+/?$")``.
1468
+ `{id:\\d+}` constraints strip to ``{}`` in the template while preserving the
1469
+ regex constraint for that segment. Deterministic for shared use by B2b/B6.
1470
+ """
1471
+ raw_path = (raw_path or "").strip()
1472
+ if not raw_path:
1473
+ return "", ""
1474
+ p = raw_path if raw_path.startswith("/") else "/" + raw_path
1475
+ trimmed = p.rstrip("/")
1476
+ if trimmed == "":
1477
+ return "/", "^/?$"
1478
+ segments = [s for s in trimmed.split("/") if s != ""]
1479
+ tmpl_parts: list[str] = []
1480
+ re_parts: list[str] = []
1481
+ for seg in segments:
1482
+ m = _PATH_VAR_SEG.fullmatch(seg)
1483
+ if m:
1484
+ tmpl_parts.append("{}")
1485
+ constraint = m.group(2)
1486
+ re_parts.append(constraint if constraint else "[^/]+")
1487
+ else:
1488
+ tmpl_parts.append(seg)
1489
+ re_parts.append(re.escape(seg))
1490
+ tmpl = "/" + "/".join(tmpl_parts)
1491
+ body = "/".join(re_parts)
1492
+ if not body.startswith("/"):
1493
+ body = "/" + body
1494
+ return tmpl, f"^{body}/?$"
1495
+
1496
+
1497
+ def _route_id(
1498
+ framework: str,
1499
+ kind: str,
1500
+ http_method: str,
1501
+ path_template: str,
1502
+ path_raw: str,
1503
+ topic: str,
1504
+ broker: str,
1505
+ microservice: str,
1506
+ ) -> str:
1507
+ """Stable id; `path_raw` disambiguates HTTP routes when `path_template` is empty (SpEL / const)."""
1508
+ path_key = path_template if path_template else path_raw
1509
+ key = (
1510
+ f"{framework}|{kind}|{http_method}|{path_key}|"
1511
+ f"{topic}|{broker}|{microservice}"
1512
+ )
1513
+ return f"r:{hashlib.sha1(key.encode()).hexdigest()[:16]}"
1514
+
1515
+
1516
+ def _client_id(
1517
+ *,
1518
+ microservice: str,
1519
+ member_fqn: str,
1520
+ client_kind: str,
1521
+ path: str,
1522
+ method: str,
1523
+ ) -> str:
1524
+ key = f"{microservice}|{member_fqn}|{client_kind}|{path}|{method}"
1525
+ return f"c:{hashlib.sha1(key.encode()).hexdigest()[:16]}"
1526
+
1527
+
1528
+ def _producer_id(
1529
+ *,
1530
+ microservice: str,
1531
+ member_fqn: str,
1532
+ producer_kind: str,
1533
+ topic: str,
1534
+ ) -> str:
1535
+ # Topic-level identity per method+kind; broker is intentionally omitted so the same
1536
+ # resolved topic on one method shares one Producer node across call sites.
1537
+ key = f"{microservice}|{member_fqn}|{producer_kind}|{topic}"
1538
+ return f"p:{hashlib.sha1(key.encode()).hexdigest()[:16]}"
1539
+
1540
+
1541
+ def _client_source_layer(strategy: str) -> str:
1542
+ if strategy in {"layer_a_meta", "layer_b_ann", "layer_b_fqn", "layer_c_source"}:
1543
+ return strategy
1544
+ # Some caller extraction paths emit client kind as strategy; treat those
1545
+ # as builtin-source declarations instead of warning on every row.
1546
+ if strategy in VALID_CLIENT_KINDS:
1547
+ return "builtin"
1548
+ if strategy != "builtin":
1549
+ log.warning("unknown client source strategy %r, falling back to builtin", strategy)
1550
+ return "builtin"
1551
+
1552
+
1553
+ def _producer_source_layer(strategy: str) -> str:
1554
+ if strategy in {"layer_a_meta", "layer_b_ann", "layer_b_fqn", "layer_c_source"}:
1555
+ return strategy
1556
+ if strategy in VALID_PRODUCER_KINDS:
1557
+ return "builtin"
1558
+ if strategy != "builtin":
1559
+ log.warning("unknown producer source strategy %r, falling back to builtin", strategy)
1560
+ return "builtin"
1561
+
1562
+
1563
# Precedence of route source layers. When pass4_routes meets a second
# declaration that hashes to an already-known route id, the stored row keeps
# its data but takes over the new declaration's layer if that layer ranks
# strictly higher. Layers missing from this table are looked up with rank 0,
# i.e. they rank the same as "builtin".
_ROUTE_LAYER_RANK: dict[str, int] = {
    "builtin": 0,
    "layer_b_ann": 1,
    "layer_a_meta": 2,
    "layer_c_source": 3,
    "layer_b_fqn": 4,
}
1570
+
1571
+
1572
def pass4_routes(
    tables: GraphTables,
    asts: dict[str, JavaFileAst],
    *,
    source_root: Path,
    verbose: bool,
) -> None:
    """Pass 4: build Route rows and Symbol→Route EXPOSES edges.

    For every non-constructor member whose file has a parsed AST, the final
    route declarations come from ``resolve_routes_for_method`` (fed with the
    brownfield overrides, the annotation meta chain and the member's builtin
    routes). Each declaration is hashed into a route id; the first declaration
    seen for an id creates the ``RouteRow``, later ones can only raise its
    ``source_layer`` according to ``_ROUTE_LAYER_RANK``.

    Results are written to ``tables.routes_rows`` (sorted by id),
    ``tables.exposes_rows`` and ``tables.route_stats``;
    ``tables.cross_service_resolution`` is (re)loaded from the project config
    as well. A summary line is logged, and echoed to stderr when ``verbose``.
    """
    stats = tables.route_stats
    overrides = load_brownfield_overrides(source_root)
    # Fall back to the unresolved path text if the path cannot be resolved.
    try:
        prs = str(source_root.resolve())
    except OSError:
        prs = str(source_root)
    tables.cross_service_resolution = _load_config_cross_service_resolution(prs)
    meta_chain = collect_annotation_meta_chain(prs)
    if verbose:
        _verbose_stderr_line(_PASS4_START)
    with _VerbosePassHeartbeats("[pass4]", verbose=verbose):

        # Carry over the per-file counts of routes skipped as unresolved.
        for ast in asts.values():
            stats.routes_skipped_unresolved += ast.routes_skipped_unresolved

        routes_by_id: dict[str, RouteRow] = {}
        # (member node id, route id) pairs that already have an EXPOSES row.
        exposes_seen: set[tuple[str, str]] = set()

        http_kinds = frozenset({"http_endpoint", "http_consumer"})

        # Sorted by node id so the output order does not depend on member order.
        for member in sorted(tables.members, key=lambda m: m.node_id):
            if member.decl.is_constructor:
                continue
            # Only used as an "is this file parsed?" check.
            ast = asts.get(member.file_path)
            if ast is None:
                continue
            type_decl = tables.types[member.parent_fqn].decl
            final_routes = resolve_routes_for_method(
                method_decl=member.decl,
                enclosing_type=type_decl,
                overrides=overrides,
                meta_chain=meta_chain,
                builtin_routes=member.decl.routes,
            )
            if not final_routes:
                continue
            for decl in final_routes:
                # Template/regex are only computed for resolved HTTP routes that
                # came from the "annotation" or "codebase_route" strategies;
                # every other declaration keeps them empty.
                path_template, path_regex = ("", "")
                if decl.kind in http_kinds:
                    if decl.resolved and decl.resolution_strategy in (
                        "annotation",
                        "codebase_route",
                    ):
                        path_template, path_regex = _normalize_path(decl.path)
                    else:
                        path_template, path_regex = "", ""
                rid = _route_id(
                    decl.framework,
                    decl.kind,
                    decl.http_method,
                    path_template,
                    decl.path,
                    decl.topic,
                    decl.broker,
                    member.microservice,
                )
                layer = decl.route_source_layer
                if rid not in routes_by_id:
                    routes_by_id[rid] = RouteRow(
                        id=rid,
                        kind=decl.kind,
                        framework=decl.framework,
                        method=decl.http_method,
                        path=decl.path,
                        path_template=path_template,
                        path_regex=path_regex,
                        topic=decl.topic,
                        broker=decl.broker,
                        feign_name=decl.feign_name,
                        feign_url=decl.feign_url,
                        microservice=member.microservice,
                        module=member.module,
                        filename=decl.filename,
                        start_line=decl.start_line,
                        end_line=decl.end_line,
                        resolved=decl.resolved,
                        source_layer=layer,
                    )
                else:
                    # Same route declared again: keep the first row, but let a
                    # higher-ranked layer replace the recorded source_layer.
                    prev = routes_by_id[rid]
                    if _ROUTE_LAYER_RANK.get(layer, 0) > _ROUTE_LAYER_RANK.get(
                        prev.source_layer,
                        0,
                    ):
                        routes_by_id[rid] = replace(prev, source_layer=layer)
                ek = (member.node_id, rid)
                if ek not in exposes_seen:
                    route_kind = routes_by_id[rid].kind
                    if route_kind == "http_consumer":
                        # No EXPOSES edge for http_consumer routes; only counted.
                        # The pair is not added to exposes_seen, so a repeated
                        # declaration is counted again.
                        stats.exposes_suppressed_feign += 1
                        continue
                    exposes_seen.add(ek)
                    tables.exposes_rows.append(
                        ExposesRow(
                            symbol_id=member.node_id,
                            route_id=rid,
                            confidence=decl.confidence,
                            strategy=decl.resolution_strategy,
                        ),
                    )

    tables.routes_rows = sorted(routes_by_id.values(), key=lambda r: r.id)

    for row in tables.routes_rows:
        stats.by_framework[row.framework] += 1
        stats.by_kind[row.kind] += 1

    n_routes = len(tables.routes_rows)
    if n_routes:
        stats.routes_resolved_pct = 100.0 * sum(
            1 for r in tables.routes_rows if r.resolved
        ) / n_routes
        # Any source layer other than "builtin" counts as brownfield.
        stats.routes_from_brownfield_pct = 100.0 * sum(
            1 for r in tables.routes_rows if r.source_layer != "builtin"
        ) / n_routes
    else:
        # With no routes at all, report 100% resolved and 0% brownfield.
        stats.routes_resolved_pct = 100.0
        stats.routes_from_brownfield_pct = 0.0

    by_layer: dict[str, int] = defaultdict(int)
    for row in tables.routes_rows:
        by_layer[row.source_layer] += 1
    stats.routes_by_layer = dict(sorted(by_layer.items()))

    msg = (
        f"Route extraction: emitted={n_routes}, exposes={len(tables.exposes_rows)}, "
        f"exposes_suppressed_feign={stats.exposes_suppressed_feign}, "
        f"skipped_unresolved={stats.routes_skipped_unresolved}, "
        f"routes_resolved_pct={stats.routes_resolved_pct:.1f}, "
        f"routes_from_brownfield_pct={stats.routes_from_brownfield_pct:.1f}, "
        f"by_framework={dict(stats.by_framework)}"
    )
    log.info(msg)
    if verbose:
        _verbose_stderr_line(f"[pass4] {msg}")
1714
+
1715
+
1716
+ def pass5_imperative_edges(
1717
+ tables: GraphTables,
1718
+ asts: dict[str, JavaFileAst],
1719
+ *,
1720
+ source_root: Path,
1721
+ verbose: bool,
1722
+ ) -> None:
1723
+ del asts
1724
+ overrides = load_brownfield_overrides(source_root)
1725
+ try:
1726
+ prs = str(source_root.resolve())
1727
+ except OSError:
1728
+ prs = str(source_root)
1729
+ tables.cross_service_resolution = _load_config_cross_service_resolution(prs)
1730
+ meta_chain = collect_annotation_meta_chain(prs)
1731
+ routes_by_id = {r.id: r for r in tables.routes_rows}
1732
+ existing_route_ids = set(routes_by_id)
1733
+ http_seen: set[tuple[str, str]] = set()
1734
+ async_seen: set[tuple[str, str]] = set()
1735
+ client_seen: set[str] = set()
1736
+ producer_seen: set[str] = set()
1737
+ declares_client_seen: set[tuple[str, str]] = set()
1738
+ declares_producer_seen: set[tuple[str, str]] = set()
1739
+ route_rows = list(tables.routes_rows)
1740
+
1741
+ def _micro_factor(member: MemberEntry) -> float:
1742
+ ms = microservice_for_path(member.file_path, source_root)
1743
+ return 1.0 if ms else 0.85
1744
+
1745
+ def _append_route(row: RouteRow) -> None:
1746
+ if row.id in existing_route_ids:
1747
+ return
1748
+ existing_route_ids.add(row.id)
1749
+ routes_by_id[row.id] = row
1750
+ route_rows.append(row)
1751
+
1752
+ def _phantom_http_route_id(call: OutgoingCallDecl) -> str:
1753
+ if call.path_template_call and call.method_call:
1754
+ return _route_id("", "http_endpoint", call.method_call, call.path_template_call, call.path_template_call, "", "", "")
1755
+ uniq = hashlib.sha1(f"{call.filename}:{call.start_line}:{call.raw_uri}".encode()).hexdigest()[:12]
1756
+ return f"r:phantom:{uniq}"
1757
+
1758
+ def _phantom_async_route_id(call: OutgoingCallDecl) -> str:
1759
+ if call.topic_call:
1760
+ return _route_id("", "kafka_topic", "", "", "", call.topic_call, call.broker_call, "")
1761
+ uniq = hashlib.sha1(f"{call.filename}:{call.start_line}:{call.raw_topic}".encode()).hexdigest()[:12]
1762
+ return f"r:phantom:{uniq}"
1763
+
1764
+ if verbose:
1765
+ _verbose_stderr_line(_PASS5_START)
1766
+ with _VerbosePassHeartbeats("[pass5]", verbose=verbose):
1767
+ for member in sorted(tables.members, key=lambda x: x.node_id):
1768
+ if member.decl.is_constructor:
1769
+ continue
1770
+ type_decl = tables.types[member.parent_fqn].decl
1771
+ final_http_calls = resolve_http_client_for_method(
1772
+ method_decl=member.decl,
1773
+ enclosing_type=type_decl,
1774
+ overrides=overrides,
1775
+ meta_chain=meta_chain,
1776
+ builtin_calls=member.decl.outgoing_calls,
1777
+ )
1778
+ final_async_calls = resolve_async_producer_for_method(
1779
+ method_decl=member.decl,
1780
+ enclosing_type=type_decl,
1781
+ overrides=overrides,
1782
+ meta_chain=meta_chain,
1783
+ builtin_calls=member.decl.outgoing_calls,
1784
+ )
1785
+ micro_factor = _micro_factor(member)
1786
+ for call in final_http_calls + final_async_calls:
1787
+ if call.channel == "http":
1788
+ client_path = (call.path_template_call or "").strip()
1789
+ client_method = (call.method_call or "").strip().upper()
1790
+ # Keep normalized path fields on Client now so LC3 filter semantics
1791
+ # (`path_prefix`) can use persisted columns without extra transforms.
1792
+ client_path_template = ""
1793
+ client_path_regex = ""
1794
+ if client_path:
1795
+ client_path_template, client_path_regex = _normalize_path(client_path)
1796
+ cid = _client_id(
1797
+ microservice=member.microservice,
1798
+ member_fqn=call.method_fqn,
1799
+ client_kind=call.client_kind,
1800
+ path=client_path,
1801
+ method=client_method,
1802
+ )
1803
+ if cid not in client_seen:
1804
+ client_seen.add(cid)
1805
+ tables.client_rows.append(
1806
+ ClientRow(
1807
+ id=cid,
1808
+ client_kind=call.client_kind,
1809
+ target_service=call.feign_target_name,
1810
+ path=client_path,
1811
+ path_template=client_path_template,
1812
+ path_regex=client_path_regex,
1813
+ method=client_method,
1814
+ member_fqn=call.method_fqn,
1815
+ member_id=member.node_id,
1816
+ microservice=member.microservice,
1817
+ module=member.module,
1818
+ filename=call.filename,
1819
+ start_line=call.start_line,
1820
+ end_line=call.end_line,
1821
+ resolved=call.resolved,
1822
+ source_layer=_client_source_layer(call.resolution_strategy),
1823
+ ),
1824
+ )
1825
+ dkey = (member.node_id, cid)
1826
+ if dkey not in declares_client_seen:
1827
+ declares_client_seen.add(dkey)
1828
+ tables.declares_client_rows.append(
1829
+ DeclaresClientRow(
1830
+ symbol_id=member.node_id,
1831
+ client_id=cid,
1832
+ confidence=call.confidence_base,
1833
+ strategy=call.resolution_strategy,
1834
+ ),
1835
+ )
1836
+ rid = ""
1837
+ strategy = call.resolution_strategy
1838
+ if call.client_kind == "feign_method":
1839
+ exposing = next((e for e in tables.exposes_rows if e.symbol_id == member.node_id), None)
1840
+ if exposing is not None:
1841
+ rid = exposing.route_id
1842
+ if not rid:
1843
+ rid = _phantom_http_route_id(call)
1844
+ _append_route(
1845
+ RouteRow(
1846
+ id=rid,
1847
+ kind="http_endpoint",
1848
+ framework="",
1849
+ method=call.method_call,
1850
+ path=call.path_template_call,
1851
+ path_template=call.path_template_call,
1852
+ path_regex="",
1853
+ topic="",
1854
+ broker="",
1855
+ feign_name=call.feign_target_name,
1856
+ feign_url=call.feign_target_url,
1857
+ microservice="",
1858
+ module="",
1859
+ filename=call.filename,
1860
+ start_line=call.start_line,
1861
+ end_line=call.end_line,
1862
+ resolved=False,
1863
+ source_layer="builtin",
1864
+ )
1865
+ )
1866
+ key = (cid, rid)
1867
+ if key in http_seen:
1868
+ continue
1869
+ http_seen.add(key)
1870
+ conf = call.confidence_base * 0.3 * micro_factor
1871
+ tables.http_call_rows.append(
1872
+ HttpCallRow(
1873
+ client_id=cid,
1874
+ route_id=rid,
1875
+ confidence=conf,
1876
+ strategy=strategy,
1877
+ method_call=call.method_call,
1878
+ raw_uri=call.raw_uri,
1879
+ match="unresolved",
1880
+ )
1881
+ )
1882
+ tables.call_edge_stats.http_calls_total += 1
1883
+ tables.call_edge_stats.http_calls_by_client_kind[call.client_kind] += 1
1884
+ tables.call_edge_stats.http_calls_by_strategy[strategy] += 1
1885
+ elif call.channel == "async":
1886
+ topic_atom = (call.topic_call or "").strip()
1887
+ pid = _producer_id(
1888
+ microservice=member.microservice,
1889
+ member_fqn=call.method_fqn,
1890
+ producer_kind=call.client_kind,
1891
+ topic=topic_atom,
1892
+ )
1893
+ if pid not in producer_seen:
1894
+ producer_seen.add(pid)
1895
+ tables.producer_rows.append(
1896
+ ProducerRow(
1897
+ id=pid,
1898
+ producer_kind=call.client_kind,
1899
+ topic=topic_atom,
1900
+ broker=call.broker_call,
1901
+ direction="producer",
1902
+ member_fqn=call.method_fqn,
1903
+ member_id=member.node_id,
1904
+ microservice=member.microservice,
1905
+ module=member.module,
1906
+ filename=call.filename,
1907
+ start_line=call.start_line,
1908
+ end_line=call.end_line,
1909
+ resolved=call.resolved,
1910
+ source_layer=_producer_source_layer(call.resolution_strategy),
1911
+ ),
1912
+ )
1913
+ dpkey = (member.node_id, pid)
1914
+ if dpkey not in declares_producer_seen:
1915
+ declares_producer_seen.add(dpkey)
1916
+ tables.declares_producer_rows.append(
1917
+ DeclaresProducerRow(
1918
+ symbol_id=member.node_id,
1919
+ producer_id=pid,
1920
+ confidence=call.confidence_base,
1921
+ strategy=call.resolution_strategy,
1922
+ ),
1923
+ )
1924
+ rid = _phantom_async_route_id(call)
1925
+ _append_route(
1926
+ RouteRow(
1927
+ id=rid,
1928
+ kind="kafka_topic",
1929
+ framework="",
1930
+ method="",
1931
+ path="",
1932
+ path_template="",
1933
+ path_regex="",
1934
+ topic=call.topic_call,
1935
+ broker=call.broker_call,
1936
+ feign_name="",
1937
+ feign_url="",
1938
+ microservice="",
1939
+ module="",
1940
+ filename=call.filename,
1941
+ start_line=call.start_line,
1942
+ end_line=call.end_line,
1943
+ resolved=False,
1944
+ source_layer="builtin",
1945
+ )
1946
+ )
1947
+ key = (pid, rid)
1948
+ if key in async_seen:
1949
+ continue
1950
+ async_seen.add(key)
1951
+ conf = call.confidence_base * 0.3 * micro_factor
1952
+ strategy = call.resolution_strategy
1953
+ tables.async_call_rows.append(
1954
+ AsyncCallRow(
1955
+ producer_id=pid,
1956
+ route_id=rid,
1957
+ confidence=conf,
1958
+ strategy=strategy,
1959
+ direction="producer",
1960
+ raw_topic=call.raw_topic,
1961
+ match="unresolved",
1962
+ )
1963
+ )
1964
+ tables.call_edge_stats.async_calls_total += 1
1965
+ tables.call_edge_stats.async_calls_by_client_kind[call.client_kind] += 1
1966
+ tables.call_edge_stats.async_calls_by_strategy[strategy] += 1
1967
+
1968
+ tables.routes_rows = sorted(route_rows, key=lambda r: r.id)
1969
+ tables.client_rows = sorted(tables.client_rows, key=lambda c: c.id)
1970
+ tables.declares_client_rows = sorted(
1971
+ tables.declares_client_rows,
1972
+ key=lambda e: (e.symbol_id, e.client_id),
1973
+ )
1974
+ tables.client_stats.clients_total = len(tables.client_rows)
1975
+ tables.client_stats.declares_client_total = len(tables.declares_client_rows)
1976
+ tables.client_stats.clients_by_kind = defaultdict(int)
1977
+ for row in tables.client_rows:
1978
+ tables.client_stats.clients_by_kind[row.client_kind] += 1
1979
+ tables.producer_rows = sorted(tables.producer_rows, key=lambda p: p.id)
1980
+ tables.declares_producer_rows = sorted(
1981
+ tables.declares_producer_rows,
1982
+ key=lambda e: (e.symbol_id, e.producer_id),
1983
+ )
1984
+ tables.producer_stats.producers_total = len(tables.producer_rows)
1985
+ tables.producer_stats.declares_producer_total = len(tables.declares_producer_rows)
1986
+ tables.producer_stats.producers_by_kind = defaultdict(int)
1987
+ for row in tables.producer_rows:
1988
+ tables.producer_stats.producers_by_kind[row.producer_kind] += 1
1989
+ brownfield_strategies = frozenset(
1990
+ (
1991
+ "layer_b_ann",
1992
+ "layer_a_meta",
1993
+ "layer_c_source",
1994
+ "layer_b_fqn",
1995
+ "codebase_client",
1996
+ "codebase_producer",
1997
+ ),
1998
+ )
1999
+ if tables.call_edge_stats.http_calls_total:
2000
+ n_http = sum(
2001
+ v for k, v in tables.call_edge_stats.http_calls_by_strategy.items()
2002
+ if k in brownfield_strategies
2003
+ )
2004
+ tables.call_edge_stats.http_clients_from_brownfield_pct = (
2005
+ 100.0 * float(n_http) / float(tables.call_edge_stats.http_calls_total)
2006
+ )
2007
+ if tables.call_edge_stats.async_calls_total:
2008
+ n_async = sum(
2009
+ v for k, v in tables.call_edge_stats.async_calls_by_strategy.items()
2010
+ if k in brownfield_strategies
2011
+ )
2012
+ tables.call_edge_stats.async_producers_from_brownfield_pct = (
2013
+ 100.0 * float(n_async) / float(tables.call_edge_stats.async_calls_total)
2014
+ )
2015
+ if verbose:
2016
+ http_client = dict(sorted(tables.call_edge_stats.http_calls_by_client_kind.items()))
2017
+ async_client = dict(sorted(tables.call_edge_stats.async_calls_by_client_kind.items()))
2018
+ http_strategy = dict(sorted(tables.call_edge_stats.http_calls_by_strategy.items()))
2019
+ async_strategy = dict(sorted(tables.call_edge_stats.async_calls_by_strategy.items()))
2020
+ _verbose_stderr_line(
2021
+ f"[pass5] HTTP_CALLS: {len(tables.http_call_rows)} edges, "
2022
+ f"ASYNC_CALLS: {len(tables.async_call_rows)} edges; "
2023
+ f"http_by_client_kind={http_client}, async_by_client_kind={async_client}, "
2024
+ f"http_by_strategy={http_strategy}, async_by_strategy={async_strategy}",
2025
+ )
2026
+
2027
+
2028
+ def _match_call_edge(
2029
+ call: OutgoingCallDecl,
2030
+ routes: list[RouteRow],
2031
+ caller_microservice: str,
2032
+ ) -> tuple[str, list[RouteRow]]:
2033
+ """Return (match_outcome, candidate_routes) for an outgoing call."""
2034
+ if (
2035
+ (not call.resolved)
2036
+ and call.path_template_call == ""
2037
+ and call.topic_call == ""
2038
+ ):
2039
+ return "unresolved", []
2040
+
2041
+ candidates: list[RouteRow] = []
2042
+ if call.client_kind == "feign_method":
2043
+ # Prefer endpoint matching by target service + path/method for Feign declarations.
2044
+ path_value = call.path_template_call
2045
+ method_value = call.method_call
2046
+ if path_value:
2047
+ for r in routes:
2048
+ if r.kind != "http_endpoint":
2049
+ continue
2050
+ if call.feign_target_name and r.microservice != call.feign_target_name:
2051
+ continue
2052
+ if not (r.method == "" or method_value == "" or r.method == method_value):
2053
+ continue
2054
+ if not r.path_regex:
2055
+ continue
2056
+ try:
2057
+ if re.fullmatch(r.path_regex, path_value or "") is None:
2058
+ continue
2059
+ except re.error:
2060
+ continue
2061
+ candidates.append(r)
2062
+ if not candidates:
2063
+ # Fallback for legacy/manual routes that only expose Feign target names.
2064
+ candidates = [
2065
+ r for r in routes
2066
+ if r.feign_name and call.feign_target_name and r.feign_name == call.feign_target_name
2067
+ ]
2068
+ elif call.channel == "http":
2069
+ path_value = call.path_template_call
2070
+ method_value = call.method_call
2071
+ for r in routes:
2072
+ if r.kind != "http_endpoint":
2073
+ continue
2074
+ if not (r.method == "" or method_value == "" or r.method == method_value):
2075
+ continue
2076
+ if not r.path_regex:
2077
+ continue
2078
+ try:
2079
+ if re.fullmatch(r.path_regex, path_value or "") is None:
2080
+ continue
2081
+ except re.error:
2082
+ continue
2083
+ candidates.append(r)
2084
+ elif call.channel == "async":
2085
+ candidates = [
2086
+ r for r in routes
2087
+ if r.topic == call.topic_call and r.broker == call.broker_call
2088
+ ]
2089
+
2090
+ if not candidates:
2091
+ return "phantom", []
2092
+ if len(candidates) > 1:
2093
+ return "ambiguous", candidates
2094
+ if candidates[0].microservice and candidates[0].microservice == caller_microservice:
2095
+ return "intra_service", candidates
2096
+ return "cross_service", candidates
2097
+
2098
+
2099
+ _BROWNFIELD_LAYERS = frozenset({
2100
+ "layer_c_source",
2101
+ "layer_b_ann",
2102
+ "layer_b_fqn",
2103
+ "layer_a_meta",
2104
+ })
2105
+
2106
+
2107
+ def _is_brownfield_sourced(
2108
+ call_strategy: str,
2109
+ candidates: list[RouteRow],
2110
+ ) -> bool:
2111
+ """Both sides must come from brownfield layers for an edge to count as
2112
+ authoritative under brownfield_only mode."""
2113
+ if not candidates:
2114
+ return False
2115
+ if call_strategy not in _BROWNFIELD_LAYERS:
2116
+ return False
2117
+ return all(
2118
+ getattr(c, "source_layer", "builtin") in _BROWNFIELD_LAYERS
2119
+ for c in candidates
2120
+ )
2121
+
2122
+
2123
def pass6_match_edges(
    tables: GraphTables,
    *,
    verbose: bool,
) -> None:
    """Match every still-``unresolved`` HTTP/async call edge against known routes.

    For each row in ``tables.http_call_rows`` / ``tables.async_call_rows`` whose
    ``match`` is ``"unresolved"`` the pass rebuilds an ``OutgoingCallDecl`` from the
    persisted Client/Producer, member and source route, asks ``_match_call_edge``
    for an outcome, and then updates in place:

    * ``row.match`` with the outcome,
    * ``row.route_id`` when exactly one intra/cross-service route matched,
    * ``row.confidence`` as ``base * match_factor[match] * micro_factor`` where
      ``base`` is the row's current confidence with ``0.3 * micro_factor`` divided
      out again,
    * the match breakdown / cross-service counters on ``tables.call_edge_stats``.

    When ``tables.cross_service_resolution == "brownfield_only"``, a
    ``cross_service`` outcome that is not brownfield-sourced on both sides is
    downgraded to ``unresolved``. Finally, routes with no microservice, no
    framework, ``resolved`` false and no call row pointing at them are removed
    from ``tables.routes_rows``.

    Args:
        tables: Graph tables; mutated in place.
        verbose: Emit progress and summary lines on stderr.
    """
    match_factor: dict[str, float] = {
        "cross_service": 1.0,
        "intra_service": 0.6,
        "ambiguous": 0.5,
        "phantom": 0.4,
        "unresolved": 0.3,
    }
    route_by_id = {r.id: r for r in tables.routes_rows}
    # Only routes owned by a microservice are eligible match targets.
    all_routes = [r for r in tables.routes_rows if r.microservice]
    member_by_id = {m.node_id: m for m in tables.members}
    clients_by_id = {c.id: c for c in tables.client_rows}
    producers_by_id = {p.id: p for p in tables.producer_rows}
    client_hints_by_member: dict[str, list[ClientRow]] = defaultdict(list)
    for edge in tables.declares_client_rows:
        declared_client = clients_by_id.get(edge.client_id)
        if declared_client is None:
            continue
        # `DECLARES_CLIENT.symbol_id` targets `Symbol.id` for member symbols,
        # and member symbols are emitted with `id == MemberEntry.node_id`.
        client_hints_by_member[edge.symbol_id].append(declared_client)
    for hints in client_hints_by_member.values():
        # Deterministic fallback when a method carries multiple feign declarations.
        hints.sort(key=lambda c: c.id)

    # Pass 6 is idempotent for full rebuilds: each run fully re-derives match outcomes.
    # If incremental rebuild lands later (Tier-2 follow-up), this reset must remain pass-scoped.
    tables.call_edge_stats.http_calls_match_breakdown.clear()
    tables.call_edge_stats.async_calls_match_breakdown.clear()
    tables.call_edge_stats.cross_service_calls_total = 0

    brownfield_only = tables.cross_service_resolution == "brownfield_only"
    suppressed_auto_cross_http: list[str] = []
    suppressed_auto_cross_async: list[str] = []
    suppressed_auto_cross_count = 0

    def _micro_factor(member: MemberEntry | None) -> float:
        return 1.0 if (member and member.microservice) else 0.85

    if verbose:
        _verbose_stderr_line(_PASS6_START)
    with _VerbosePassHeartbeats("[pass6]", verbose=verbose):
        for row in tables.http_call_rows:
            if row.match != "unresolved":
                continue
            client = clients_by_id.get(row.client_id)
            member = member_by_id.get(client.member_id) if client else None
            # Undo the provisional `0.3 * micro_factor` scaling to get the base confidence.
            base = row.confidence / max(1e-9, (0.3 * _micro_factor(member)))
            src_route = route_by_id.get(row.route_id)
            if src_route is None and member is not None:
                # Recover feign caller hints from persisted caller-side Client declarations.
                # The Client this edge starts from is tried first so a method with several
                # feign declarations is matched on its own path; the member's sorted hints
                # remain the fallback. (`member` is only set when `client` was found.)
                hint_pool = [client, *client_hints_by_member.get(member.node_id, ())]
                feign_hint = next(
                    (h for h in hint_pool if h.client_kind == "feign_method"),
                    None,
                )
                if feign_hint is not None:
                    path_template, path_regex = _normalize_path(feign_hint.path)
                    src_route = RouteRow(
                        id="",
                        kind="http_consumer",
                        framework="feign",
                        method=feign_hint.method,
                        path=feign_hint.path,
                        path_template=path_template,
                        path_regex=path_regex,
                        topic="",
                        broker="",
                        feign_name=feign_hint.target_service,
                        # `Client` stores service-name hints, not feign URL; matcher keys off feign_name.
                        feign_url="",
                        microservice=member.microservice,
                        module=member.module,
                        filename=feign_hint.filename,
                        start_line=feign_hint.start_line,
                        end_line=feign_hint.end_line,
                        resolved=feign_hint.resolved,
                        source_layer=feign_hint.source_layer,
                    )
            # Feign caller hints are synthesized as transient `http_consumer` routes in pass6;
            # synthetic phantoms from imperative clients are `http_endpoint` even when `feign_name` is populated from
            # `@CodebaseHttpClient.targetService` / YAML hints — those must path-match like RestTemplate.
            _feign_like = (
                src_route is not None
                and src_route.kind == "http_consumer"
                and bool(src_route.feign_name)
            )
            call = OutgoingCallDecl(
                method_fqn=f"{member.parent_fqn}#{member.decl.signature}" if member else "",
                method_sig=member.decl.signature if member else "",
                client_kind="feign_method" if _feign_like else "rest_template",
                channel="http",
                feign_target_name=src_route.feign_name if src_route else "",
                feign_target_url=src_route.feign_url if src_route else "",
                path_template_call=src_route.path_template if src_route else "",
                method_call=row.method_call,
                topic_call="",
                broker_call="",
                raw_uri=row.raw_uri,
                raw_topic="",
                resolution_strategy=row.strategy,
                confidence_base=base,
                resolved=(row.strategy != "unresolved"),
                filename=member.file_path if member else "",
                start_line=member.decl.start_line if member else 0,
                end_line=member.decl.end_line if member else 0,
            )
            outcome, candidates = _match_call_edge(call, all_routes, member.microservice if member else "")
            if (
                brownfield_only
                and outcome == "cross_service"
                and not _is_brownfield_sourced(row.strategy, candidates)
            ):
                outcome = "unresolved"
                candidates = []
                suppressed_auto_cross_count += 1
                # Keep only a small sample for the verbose summary.
                if len(suppressed_auto_cross_http) < 5:
                    suppressed_auto_cross_http.append(call.method_fqn)
            if outcome in VALID_HTTP_CALL_MATCHES:
                row.match = outcome
            if outcome in ("cross_service", "intra_service") and len(candidates) == 1:
                row.route_id = candidates[0].id
            row.confidence = call.confidence_base * match_factor[row.match] * _micro_factor(member)
            tables.call_edge_stats.http_calls_match_breakdown[row.match] += 1
            if row.match == "cross_service":
                tables.call_edge_stats.cross_service_calls_total += 1

        for row in tables.async_call_rows:
            if row.match != "unresolved":
                continue
            producer = producers_by_id.get(row.producer_id)
            member = member_by_id.get(producer.member_id) if producer else None
            base = row.confidence / max(1e-9, (0.3 * _micro_factor(member)))
            src_route = route_by_id.get(row.route_id)
            async_kind = producer.producer_kind if producer else "kafka_send"
            call = OutgoingCallDecl(
                method_fqn=f"{member.parent_fqn}#{member.decl.signature}" if member else "",
                method_sig=member.decl.signature if member else "",
                client_kind=async_kind,
                channel="async",
                feign_target_name="",
                feign_target_url="",
                path_template_call="",
                method_call="",
                topic_call=src_route.topic if src_route else "",
                broker_call=src_route.broker if src_route else "",
                raw_uri="",
                raw_topic=row.raw_topic,
                resolution_strategy=row.strategy,
                confidence_base=base,
                resolved=(row.strategy != "unresolved"),
                filename=member.file_path if member else "",
                start_line=member.decl.start_line if member else 0,
                end_line=member.decl.end_line if member else 0,
            )
            outcome, candidates = _match_call_edge(call, all_routes, member.microservice if member else "")
            if (
                brownfield_only
                and outcome == "cross_service"
                and not _is_brownfield_sourced(row.strategy, candidates)
            ):
                outcome = "unresolved"
                candidates = []
                suppressed_auto_cross_count += 1
                if len(suppressed_auto_cross_async) < 5:
                    suppressed_auto_cross_async.append(call.method_fqn)
            # NOTE(review): async outcomes are validated against the HTTP match set;
            # confirm whether a dedicated async set is meant to be used here.
            if outcome in VALID_HTTP_CALL_MATCHES:
                row.match = outcome
            if outcome in ("cross_service", "intra_service") and len(candidates) == 1:
                row.route_id = candidates[0].id
            row.confidence = call.confidence_base * match_factor[row.match] * _micro_factor(member)
            tables.call_edge_stats.async_calls_match_breakdown[row.match] += 1
            if row.match == "cross_service":
                tables.call_edge_stats.cross_service_calls_total += 1

    # Drop synthetic placeholder routes that no call edge points at any more
    # (their edges were re-pointed to a real route above).
    inbound_route_ids = {r.route_id for r in tables.http_call_rows} | {r.route_id for r in tables.async_call_rows}
    tables.routes_rows = sorted(
        [
            r for r in tables.routes_rows
            if not (
                (r.microservice == "")
                and (r.framework == "")
                and (not r.resolved)
                and (r.id not in inbound_route_ids)
            )
        ],
        key=lambda r: r.id,
    )

    if verbose:
        if brownfield_only:
            n_bf = tables.call_edge_stats.cross_service_calls_total
            first_http = ", ".join(suppressed_auto_cross_http)
            first_async = ", ".join(suppressed_auto_cross_async)
            _verbose_stderr_line(
                f"[pass6] cross_service_resolution=brownfield_only:\n"
                f" {n_bf} cross_service edges from brownfield layers,\n"
                f" {suppressed_auto_cross_count} auto-cross-service candidates suppressed -> unresolved\n"
                f" (first 5 http: {first_http})\n"
                f" (first 5 async: {first_async})",
            )
        _verbose_stderr_line(
            f"[pass6] http_match={dict(sorted(tables.call_edge_stats.http_calls_match_breakdown.items()))}, "
            f"async_match={dict(sorted(tables.call_edge_stats.async_calls_match_breakdown.items()))}, "
            f"cross_service_calls_total={tables.call_edge_stats.cross_service_calls_total}",
        )
2331
+
2332
+
2333
+ # ---------- Kuzu write ----------
2334
+
2335
+
2336
+ _SCHEMA_NODE = (
2337
+ "CREATE NODE TABLE Symbol("
2338
+ "id STRING PRIMARY KEY, "
2339
+ "kind STRING, name STRING, fqn STRING, package STRING, "
2340
+ "module STRING, microservice STRING, "
2341
+ "filename STRING, start_line INT64, end_line INT64, "
2342
+ "start_byte INT64, end_byte INT64, "
2343
+ "modifiers STRING[], annotations STRING[], capabilities STRING[], "
2344
+ "role STRING, signature STRING, parent_id STRING, resolved BOOLEAN"
2345
+ ")"
2346
+ )
2347
+
2348
+ _SCHEMA_META = (
2349
+ "CREATE NODE TABLE GraphMeta("
2350
+ "key STRING PRIMARY KEY, "
2351
+ "ontology_version INT64, built_at INT64, source_root STRING, "
2352
+ "counts_json STRING, parse_errors INT64, "
2353
+ "routes_total INT64, exposes_total INT64, "
2354
+ # JSON map {framework: count}; STRING avoids Kuzu Python MAP↔STRUCT binder mismatch.
2355
+ "routes_by_framework STRING, "
2356
+ "routes_resolved_pct DOUBLE, "
2357
+ "routes_from_brownfield_pct DOUBLE, "
2358
+ "routes_by_layer STRING, "
2359
+ "clients_total INT64, "
2360
+ "declares_client_total INT64, "
2361
+ "clients_by_kind STRING, "
2362
+ "producers_total INT64, "
2363
+ "declares_producer_total INT64, "
2364
+ "producers_by_kind STRING, "
2365
+ "http_calls_total INT64, "
2366
+ "async_calls_total INT64, "
2367
+ "http_calls_by_strategy STRING, "
2368
+ "async_calls_by_strategy STRING, "
2369
+ "http_calls_resolved_pct DOUBLE, "
2370
+ "async_calls_resolved_pct DOUBLE, "
2371
+ "http_clients_from_brownfield_pct DOUBLE, "
2372
+ "async_producers_from_brownfield_pct DOUBLE, "
2373
+ "http_calls_match_breakdown STRING, "
2374
+ "async_calls_match_breakdown STRING, "
2375
+ "cross_service_calls_total INT64, "
2376
+ "pass3_skipped_cross_service INT64, "
2377
+ "pass3_unresolved_phantom_receiver INT64, "
2378
+ "pass3_unresolved_chained INT64, "
2379
+ "pass4_exposes_suppressed_feign INT64, "
2380
+ "cross_service_resolution STRING"
2381
+ ")"
2382
+ )
2383
+
2384
+ _SCHEMA_ROUTE = (
2385
+ "CREATE NODE TABLE Route("
2386
+ "id STRING, kind STRING, framework STRING, "
2387
+ "method STRING, path STRING, path_template STRING, path_regex STRING, "
2388
+ "topic STRING, broker STRING, "
2389
+ "feign_name STRING, feign_url STRING, "
2390
+ "microservice STRING, module STRING, "
2391
+ "filename STRING, start_line INT64, end_line INT64, "
2392
+ "resolved BOOLEAN, "
2393
+ "PRIMARY KEY(id))"
2394
+ )
2395
+
2396
+ _SCHEMA_CLIENT = (
2397
+ "CREATE NODE TABLE Client("
2398
+ "id STRING, client_kind STRING, target_service STRING, "
2399
+ "path STRING, path_template STRING, path_regex STRING, method STRING, "
2400
+ "member_fqn STRING, member_id STRING, "
2401
+ "microservice STRING, module STRING, filename STRING, "
2402
+ "start_line INT64, end_line INT64, resolved BOOLEAN, source_layer STRING, "
2403
+ "PRIMARY KEY(id))"
2404
+ )
2405
+
2406
+ _SCHEMA_PRODUCER = (
2407
+ "CREATE NODE TABLE Producer("
2408
+ "id STRING, producer_kind STRING, topic STRING, broker STRING, direction STRING, "
2409
+ "member_fqn STRING, member_id STRING, "
2410
+ "microservice STRING, module STRING, filename STRING, "
2411
+ "start_line INT64, end_line INT64, resolved BOOLEAN, source_layer STRING, "
2412
+ "PRIMARY KEY(id))"
2413
+ )
2414
+
2415
+ _SCHEMA_EXTENDS = (
2416
+ "CREATE REL TABLE EXTENDS(FROM Symbol TO Symbol, "
2417
+ "dst_name STRING, dst_fqn STRING, resolved BOOLEAN)"
2418
+ )
2419
+ _SCHEMA_IMPLEMENTS = (
2420
+ "CREATE REL TABLE IMPLEMENTS(FROM Symbol TO Symbol, "
2421
+ "dst_name STRING, dst_fqn STRING, resolved BOOLEAN)"
2422
+ )
2423
+ _SCHEMA_INJECTS = (
2424
+ "CREATE REL TABLE INJECTS(FROM Symbol TO Symbol, "
2425
+ "dst_name STRING, dst_fqn STRING, resolved BOOLEAN, "
2426
+ "mechanism STRING, annotation STRING, field_or_param STRING)"
2427
+ )
2428
+ _SCHEMA_DECLARES = "CREATE REL TABLE DECLARES(FROM Symbol TO Symbol)"
2429
+ _SCHEMA_OVERRIDES = "CREATE REL TABLE OVERRIDES(FROM Symbol TO Symbol)"
2430
+ _SCHEMA_CALLS = (
2431
+ "CREATE REL TABLE CALLS(FROM Symbol TO Symbol, "
2432
+ "call_site_line INT64, call_site_byte INT64, arg_count INT64, "
2433
+ "confidence DOUBLE, strategy STRING, source STRING, resolved BOOLEAN, "
2434
+ "callee_declaring_role STRING)"
2435
+ )
2436
+ _SCHEMA_UNRESOLVED_CALL_SITE = (
2437
+ "CREATE NODE TABLE UnresolvedCallSite("
2438
+ "id STRING, caller_id STRING, call_site_line INT64, call_site_byte INT64, "
2439
+ "arg_count INT64, callee_simple STRING, receiver_expr STRING, reason STRING, "
2440
+ "PRIMARY KEY(id))"
2441
+ )
2442
+ _SCHEMA_UNRESOLVED_AT = "CREATE REL TABLE UNRESOLVED_AT(FROM Symbol TO UnresolvedCallSite)"
2443
+ _SCHEMA_EXPOSES = (
2444
+ "CREATE REL TABLE EXPOSES(FROM Symbol TO Route, "
2445
+ "confidence DOUBLE, strategy STRING)"
2446
+ )
2447
+ _SCHEMA_DECLARES_CLIENT = (
2448
+ "CREATE REL TABLE DECLARES_CLIENT(FROM Symbol TO Client, "
2449
+ "confidence DOUBLE, strategy STRING)"
2450
+ )
2451
+ _SCHEMA_DECLARES_PRODUCER = (
2452
+ "CREATE REL TABLE DECLARES_PRODUCER(FROM Symbol TO Producer, "
2453
+ "confidence DOUBLE, strategy STRING)"
2454
+ )
2455
+ _SCHEMA_HTTP_CALLS = (
2456
+ "CREATE REL TABLE HTTP_CALLS(FROM Client TO Route, "
2457
+ "confidence DOUBLE, strategy STRING, "
2458
+ "method_call STRING, raw_uri STRING, match STRING)"
2459
+ )
2460
+ _SCHEMA_ASYNC_CALLS = (
2461
+ "CREATE REL TABLE ASYNC_CALLS(FROM Producer TO Route, "
2462
+ "confidence DOUBLE, strategy STRING, "
2463
+ "direction STRING, raw_topic STRING, match STRING)"
2464
+ )
2465
+
2466
+
2467
+ def _drop_all(conn: kuzu.Connection) -> None:
2468
+ for stmt in (
2469
+ "DROP TABLE IF EXISTS DECLARES_CLIENT",
2470
+ "DROP TABLE IF EXISTS DECLARES_PRODUCER",
2471
+ "DROP TABLE IF EXISTS HTTP_CALLS",
2472
+ "DROP TABLE IF EXISTS ASYNC_CALLS",
2473
+ "DROP TABLE IF EXISTS EXPOSES",
2474
+ "DROP TABLE IF EXISTS UNRESOLVED_AT",
2475
+ "DROP TABLE IF EXISTS EXTENDS",
2476
+ "DROP TABLE IF EXISTS IMPLEMENTS",
2477
+ "DROP TABLE IF EXISTS INJECTS",
2478
+ "DROP TABLE IF EXISTS CALLS",
2479
+ "DROP TABLE IF EXISTS OVERRIDES",
2480
+ "DROP TABLE IF EXISTS DECLARES",
2481
+ "DROP TABLE IF EXISTS UnresolvedCallSite",
2482
+ "DROP TABLE IF EXISTS Symbol",
2483
+ "DROP TABLE IF EXISTS Route",
2484
+ "DROP TABLE IF EXISTS Client",
2485
+ "DROP TABLE IF EXISTS Producer",
2486
+ "DROP TABLE IF EXISTS GraphMeta",
2487
+ ):
2488
+ try:
2489
+ conn.execute(stmt)
2490
+ except Exception:
2491
+ pass
2492
+
2493
+
2494
def _create_schema(conn: kuzu.Connection) -> None:
    """Execute every schema DDL statement: node tables first, then rel tables.

    Statements run in a fixed order; unlike the drop step, any failure propagates.
    """
    node_tables = (
        _SCHEMA_NODE,
        _SCHEMA_UNRESOLVED_CALL_SITE,
        _SCHEMA_ROUTE,
        _SCHEMA_CLIENT,
        _SCHEMA_PRODUCER,
        _SCHEMA_META,
    )
    rel_tables = (
        _SCHEMA_EXTENDS,
        _SCHEMA_IMPLEMENTS,
        _SCHEMA_INJECTS,
        _SCHEMA_DECLARES,
        _SCHEMA_OVERRIDES,
        _SCHEMA_CALLS,
        _SCHEMA_UNRESOLVED_AT,
        _SCHEMA_EXPOSES,
        _SCHEMA_DECLARES_CLIENT,
        _SCHEMA_DECLARES_PRODUCER,
        _SCHEMA_HTTP_CALLS,
        _SCHEMA_ASYNC_CALLS,
    )
    for ddl in (*node_tables, *rel_tables):
        conn.execute(ddl)
2516
+
2517
+
2518
+ def _node_row(**kwargs) -> dict:
2519
+ base = {
2520
+ "kind": "", "name": "", "fqn": "", "package": "",
2521
+ "module": "", "microservice": "",
2522
+ "filename": "", "start_line": 0, "end_line": 0,
2523
+ "start_byte": 0, "end_byte": 0,
2524
+ "modifiers": [], "annotations": [], "capabilities": [],
2525
+ "role": "OTHER", "signature": "", "parent_id": "", "resolved": True,
2526
+ }
2527
+ base.update(kwargs)
2528
+ return base
2529
+
2530
+
2531
+ _CREATE_SYMBOL = (
2532
+ "CREATE (:Symbol {id: $id, kind: $kind, name: $name, fqn: $fqn, "
2533
+ "package: $package, module: $module, microservice: $microservice, "
2534
+ "filename: $filename, "
2535
+ "start_line: $start_line, end_line: $end_line, "
2536
+ "start_byte: $start_byte, end_byte: $end_byte, "
2537
+ "modifiers: $modifiers, annotations: $annotations, capabilities: $capabilities, "
2538
+ "role: $role, signature: $signature, parent_id: $parent_id, resolved: $resolved})"
2539
+ )
2540
+
2541
+
2542
def _write_nodes(
    conn: kuzu.Connection,
    tables: GraphTables,
    *,
    project_root: Path,
    meta_chain: dict[str, frozenset[str]] | None,
) -> None:
    """Create one Symbol node per package, file, type, member and phantom.

    Besides writing to ``conn`` this also updates ``tables``:
    ``cross_service_resolution`` is loaded from the project configuration and
    ``type_role_by_node_id`` receives the role resolved for each type.

    Args:
        conn: Open Kuzu connection.
        tables: In-memory graph tables to persist.
        project_root: Root used to load brownfield overrides and configuration.
        meta_chain: Passed through to ``resolve_role_and_capabilities``.
    """
    overrides = load_brownfield_overrides(project_root)
    try:
        resolved_root = str(project_root.resolve())
    except OSError:
        # Fall back to the path as given when it cannot be resolved.
        resolved_root = str(project_root)
    tables.cross_service_resolution = _load_config_cross_service_resolution(resolved_root)

    def emit(**columns) -> None:
        conn.execute(_CREATE_SYMBOL, _node_row(**columns))

    # packages
    for package_name, package_id in tables.packages.items():
        emit(
            id=package_id,
            kind="package",
            name=package_name.rsplit(".", 1)[-1],
            fqn=package_name,
            package=package_name,
        )

    # files
    for file_path, file_id in tables.files.items():
        emit(
            id=file_id,
            kind="file",
            name=Path(file_path).name,
            fqn=file_path,
            filename=file_path,
        )

    # types
    for type_entry in tables.types.values():
        decl = type_entry.decl
        role, capabilities = resolve_role_and_capabilities(
            decl,
            overrides=overrides,
            meta_chain=meta_chain,
        )
        tables.type_role_by_node_id[type_entry.node_id] = role
        # Nested types point at their enclosing type when that type is known.
        if type_entry.outer_fqn and type_entry.outer_fqn in tables.types:
            outer_id = tables.types[type_entry.outer_fqn].node_id
        else:
            outer_id = ""
        emit(
            id=type_entry.node_id,
            kind=decl.kind,
            name=decl.name,
            fqn=decl.fqn,
            package=type_entry.package,
            module=type_entry.module,
            microservice=type_entry.microservice,
            filename=type_entry.file_path,
            start_line=decl.start_line,
            end_line=decl.end_line,
            start_byte=decl.start_byte,
            end_byte=decl.end_byte,
            modifiers=list(decl.modifiers),
            annotations=[a.name for a in decl.annotations],
            capabilities=capabilities,
            role=role,
            signature="",
            parent_id=outer_id,
        )

    # members (methods / constructors); role and capabilities keep the row defaults
    for member in tables.members:
        if member.parent_fqn in tables.types:
            member_package = tables.types[member.parent_fqn].package
        else:
            member_package = ""
        emit(
            id=member.node_id,
            kind=member.kind,
            name=member.decl.name,
            fqn=f"{member.parent_fqn}#{member.decl.signature}",
            package=member_package,
            module=member.module,
            microservice=member.microservice,
            filename=member.file_path,
            start_line=member.decl.start_line,
            end_line=member.decl.end_line,
            start_byte=member.decl.start_byte,
            end_byte=member.decl.end_byte,
            modifiers=list(member.decl.modifiers),
            annotations=[a.name for a in member.decl.annotations],
            signature=member.decl.signature,
            parent_id=member.parent_id,
        )

    # phantoms are stored as ready-made parameter rows
    for phantom_row in tables.phantoms.values():
        conn.execute(_CREATE_SYMBOL, phantom_row)
2606
+
2607
+
2608
+ _CREATE_EXT = (
2609
+ "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2610
+ "CREATE (a)-[:EXTENDS {dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)"
2611
+ )
2612
+ _CREATE_IMPL = (
2613
+ "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2614
+ "CREATE (a)-[:IMPLEMENTS {dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)"
2615
+ )
2616
# Parameterized Cypher statements consumed by the writer functions in this
# module. Relationship statements MATCH both endpoints by ``id`` and then
# CREATE the relationship; node statements CREATE the node directly.

# Symbol -> Symbol INJECTS edge with target naming and injection metadata.
_CREATE_INJ = (
    "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
    "CREATE (a)-[:INJECTS {dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved, "
    "mechanism: $mechanism, annotation: $annotation, field_or_param: $field_or_param}]->(b)"
)
# Symbol -> Symbol DECLARES edge (no properties).
_CREATE_DECL = (
    "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
    "CREATE (a)-[:DECLARES]->(b)"
)
# Symbol -> Symbol OVERRIDES edge (no properties).
_CREATE_OVERRIDES = (
    "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
    "CREATE (a)-[:OVERRIDES]->(b)"
)
# Symbol -> Symbol CALLS edge; parameter names ($line, $byte, $argc, $conf,
# $strat, $src_kind) are abbreviations of the property they populate.
_CREATE_CALL = (
    "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
    "CREATE (a)-[:CALLS {"
    "call_site_line: $line, call_site_byte: $byte, arg_count: $argc, "
    "confidence: $conf, strategy: $strat, source: $src_kind, resolved: $resolved, "
    "callee_declaring_role: $callee_declaring_role"
    "}]->(b)"
)

# Route node.
_CREATE_ROUTE = (
    "CREATE (:Route {"
    "id: $id, kind: $kind, framework: $framework, method: $method, "
    "path: $path, path_template: $path_template, path_regex: $path_regex, "
    "topic: $topic, broker: $broker, feign_name: $feign_name, feign_url: $feign_url, "
    "microservice: $microservice, module: $module, filename: $filename, "
    "start_line: $start_line, end_line: $end_line, resolved: $resolved"
    "})"
)
# Client node.
_CREATE_CLIENT = (
    "CREATE (:Client {"
    "id: $id, client_kind: $client_kind, target_service: $target_service, "
    "path: $path, path_template: $path_template, path_regex: $path_regex, method: $method, "
    "member_fqn: $member_fqn, member_id: $member_id, "
    "microservice: $microservice, module: $module, filename: $filename, "
    "start_line: $start_line, end_line: $end_line, resolved: $resolved, source_layer: $source_layer"
    "})"
)

# Symbol -> Route EXPOSES edge.
_CREATE_EXPOSES = (
    "MATCH (s:Symbol {id: $sid}), (r:Route {id: $rid}) "
    "CREATE (s)-[:EXPOSES {confidence: $confidence, strategy: $strategy}]->(r)"
)
# Symbol -> Client DECLARES_CLIENT edge.
_CREATE_DECLARES_CLIENT = (
    "MATCH (s:Symbol {id: $sid}), (c:Client {id: $cid}) "
    "CREATE (s)-[:DECLARES_CLIENT {confidence: $confidence, strategy: $strategy}]->(c)"
)
# Producer node.
_CREATE_PRODUCER = (
    "CREATE (:Producer {"
    "id: $id, producer_kind: $producer_kind, topic: $topic, broker: $broker, "
    "direction: $direction, member_fqn: $member_fqn, member_id: $member_id, "
    "microservice: $microservice, module: $module, filename: $filename, "
    "start_line: $start_line, end_line: $end_line, resolved: $resolved, "
    "source_layer: $source_layer"
    "})"
)
# Symbol -> Producer DECLARES_PRODUCER edge.
_CREATE_DECLARES_PRODUCER = (
    "MATCH (s:Symbol {id: $sid}), (p:Producer {id: $pid}) "
    "CREATE (s)-[:DECLARES_PRODUCER {confidence: $confidence, strategy: $strategy}]->(p)"
)
# Client -> Route HTTP_CALLS edge.
_CREATE_HTTP_CALL = (
    "MATCH (c:Client {id: $cid}), (r:Route {id: $rid}) "
    "CREATE (c)-[:HTTP_CALLS {confidence: $confidence, strategy: $strategy, "
    "method_call: $method_call, raw_uri: $raw_uri, match: $match}]->(r)"
)
# Producer -> Route ASYNC_CALLS edge.
_CREATE_ASYNC_CALL = (
    "MATCH (p:Producer {id: $pid}), (r:Route {id: $rid}) "
    "CREATE (p)-[:ASYNC_CALLS {confidence: $confidence, strategy: $strategy, "
    "direction: $direction, raw_topic: $raw_topic, match: $match}]->(r)"
)
2688
+
2689
+
2690
+ def _populate_declares_rows(tables: GraphTables) -> None:
2691
+ tables.declares_rows = [
2692
+ DeclaresRow(src_id=m.parent_id, dst_id=m.node_id) for m in tables.members
2693
+ ]
2694
+
2695
+
2696
+ def _direct_supertype_ids(tables: GraphTables, type_id: str) -> list[str]:
2697
+ out: list[str] = []
2698
+ for r in tables.extends_rows:
2699
+ if r.src_id == type_id:
2700
+ out.append(r.dst_id)
2701
+ for r in tables.implements_rows:
2702
+ if r.src_id == type_id:
2703
+ out.append(r.dst_id)
2704
+ return out
2705
+
2706
+
2707
+ def _populate_overrides_rows(tables: GraphTables) -> None:
2708
+ """Materialize (subtype_method)-[:OVERRIDES]->(supertype_method) for one supertype hop.
2709
+
2710
+ Matches ``KuzuGraph.override_axis_rollup_for`` (direct ``IMPLEMENTS`` / ``EXTENDS``
2711
+ only, same ``signature``, distinct method ids, non-static instance methods).
2712
+ """
2713
+ by_declaring_type: dict[str, list[MemberEntry]] = defaultdict(list)
2714
+ for m in tables.members:
2715
+ by_declaring_type[m.parent_id].append(m)
2716
+ pairs: set[tuple[str, str]] = set()
2717
+ for m in tables.members:
2718
+ if m.kind != "method" or "static" in m.decl.modifiers:
2719
+ continue
2720
+ impl_tid = m.parent_id
2721
+ for sup_id in _direct_supertype_ids(tables, impl_tid):
2722
+ for other in by_declaring_type.get(sup_id, ()):
2723
+ if other.kind != "method":
2724
+ continue
2725
+ if other.decl.signature != m.decl.signature:
2726
+ continue
2727
+ if other.node_id == m.node_id:
2728
+ continue
2729
+ pairs.add((m.node_id, other.node_id))
2730
+ tables.overrides_rows = [
2731
+ DeclaresRow(src_id=a, dst_id=b) for a, b in sorted(pairs)
2732
+ ]
2733
+
2734
+
2735
def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
    """Write every Symbol-to-Symbol edge and the unresolved call sites via ``conn``.

    Statements are issued in this order: EXTENDS, IMPLEMENTS, INJECTS, DECLARES,
    OVERRIDES, CALLS, then one ``UnresolvedCallSite`` node plus its
    ``UNRESOLVED_AT`` edge per distinct unresolved id.
    """
    unresolved_node_stmt = (
        "CREATE (:UnresolvedCallSite {"
        "id: $id, caller_id: $caller_id, call_site_line: $line, call_site_byte: $byte, "
        "arg_count: $argc, callee_simple: $callee, receiver_expr: $recv, reason: $reason"
        "})"
    )
    unresolved_edge_stmt = (
        "MATCH (a:Symbol {id: $caller}), (u:UnresolvedCallSite {id: $ucs}) "
        "CREATE (a)-[:UNRESOLVED_AT]->(u)"
    )

    def _target_params(edge):
        # Parameters shared by the EXTENDS / IMPLEMENTS / INJECTS statements.
        return {
            "src": edge.src_id,
            "dst": edge.dst_id,
            "dst_name": edge.dst_name,
            "dst_fqn": edge.dst_fqn,
            "resolved": edge.resolved,
        }

    for edge in tables.extends_rows:
        conn.execute(_CREATE_EXT, _target_params(edge))
    for edge in tables.implements_rows:
        conn.execute(_CREATE_IMPL, _target_params(edge))
    for edge in tables.injects_rows:
        params = _target_params(edge)
        params["mechanism"] = edge.mechanism
        params["annotation"] = edge.annotation
        params["field_or_param"] = edge.field_or_param
        conn.execute(_CREATE_INJ, params)

    for edge in tables.declares_rows:
        conn.execute(_CREATE_DECL, {"src": edge.src_id, "dst": edge.dst_id})

    for edge in tables.overrides_rows:
        conn.execute(_CREATE_OVERRIDES, {"src": edge.src_id, "dst": edge.dst_id})

    # Keep only the first CALLS row per (caller, callee, arg count, line);
    # dict insertion order preserves the original row order.
    first_call_by_key: dict[tuple[str, str, int, int], CallsRow] = {}
    for call in tables.calls_rows:
        first_call_by_key.setdefault(
            (call.src_id, call.dst_id, call.arg_count, call.call_site_line), call
        )

    members_by_id = {member.node_id: member for member in tables.members}
    for call in first_call_by_key.values():
        role = _callee_declaring_role_at_write(
            tables, call.dst_id, member_by_id=members_by_id,
        )
        conn.execute(_CREATE_CALL, {
            "src": call.src_id,
            "dst": call.dst_id,
            "line": call.call_site_line,
            "byte": call.call_site_byte,
            "argc": call.arg_count,
            "conf": call.confidence,
            "strat": call.strategy,
            "src_kind": call.source,
            "resolved": call.resolved,
            "callee_declaring_role": role,
        })

    # Each unresolved call-site id is written once, even if it repeats in the table.
    written_ids: set[str] = set()
    for site in tables.unresolved_call_site_rows:
        if site.id not in written_ids:
            written_ids.add(site.id)
            conn.execute(unresolved_node_stmt, {
                "id": site.id,
                "caller_id": site.caller_id,
                "line": site.call_site_line,
                "byte": site.call_site_byte,
                "argc": site.arg_count,
                "callee": site.callee_simple,
                "recv": site.receiver_expr,
                "reason": site.reason,
            })
            conn.execute(unresolved_edge_stmt, {"caller": site.caller_id, "ucs": site.id})
2810
+
2811
+
2812
def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> None:
    """Write Route, Client and Producer nodes and the edges that attach to them.

    Statement order: Route nodes, EXPOSES, Client nodes, DECLARES_CLIENT,
    Producer nodes, DECLARES_PRODUCER, HTTP_CALLS, ASYNC_CALLS.
    """
    route_fields = (
        "id", "kind", "framework", "method",
        "path", "path_template", "path_regex",
        "topic", "broker", "feign_name", "feign_url",
        "microservice", "module", "filename",
        "start_line", "end_line", "resolved",
    )
    for route in tables.routes_rows:
        route_params = {name: getattr(route, name) for name in route_fields}
        conn.execute(_CREATE_ROUTE, route_params)

    for link in tables.exposes_rows:
        conn.execute(_CREATE_EXPOSES, {
            "sid": link.symbol_id,
            "rid": link.route_id,
            "confidence": link.confidence,
            "strategy": link.strategy,
        })

    # Client and Producer rows are passed as-is: their dataclass fields are the parameters.
    for client in tables.client_rows:
        conn.execute(_CREATE_CLIENT, asdict(client))

    for link in tables.declares_client_rows:
        conn.execute(_CREATE_DECLARES_CLIENT, {
            "sid": link.symbol_id,
            "cid": link.client_id,
            "confidence": link.confidence,
            "strategy": link.strategy,
        })

    for producer in tables.producer_rows:
        conn.execute(_CREATE_PRODUCER, asdict(producer))

    for link in tables.declares_producer_rows:
        conn.execute(_CREATE_DECLARES_PRODUCER, {
            "sid": link.symbol_id,
            "pid": link.producer_id,
            "confidence": link.confidence,
            "strategy": link.strategy,
        })

    for call in tables.http_call_rows:
        conn.execute(_CREATE_HTTP_CALL, {
            "cid": call.client_id,
            "rid": call.route_id,
            "confidence": call.confidence,
            "strategy": call.strategy,
            "method_call": call.method_call,
            "raw_uri": call.raw_uri,
            "match": call.match,
        })

    for call in tables.async_call_rows:
        conn.execute(_CREATE_ASYNC_CALL, {
            "pid": call.producer_id,
            "rid": call.route_id,
            "confidence": call.confidence,
            "strategy": call.strategy,
            "direction": call.direction,
            "raw_topic": call.raw_topic,
            "match": call.match,
        })
2878
+
2879
+
2880
def _write_meta(conn: kuzu.Connection, tables: GraphTables, source_root: Path) -> None:
    """Insert one ``GraphMeta`` node (key ``"graph"``) summarising this build.

    The node records the ontology version, build timestamp, resolved source
    root, per-table counts (as JSON) and the route / client / producer / call
    statistics carried on ``tables``.
    """

    def _sorted_json(mapping) -> str:
        # Serialize a mapping with its keys in sorted order.
        return json.dumps(dict(sorted(mapping.items())))

    def _resolved_fraction(total, by_strategy) -> float:
        # PR-D1 definition: "resolved_pct" is strategy-based (strategy != 'unresolved'),
        # not match-based (all PR-D1 edges keep match='unresolved').
        if not total:
            return 0.0
        resolved = sum(
            count for strategy, count in by_strategy.items() if strategy != "unresolved"
        )
        return float(resolved) / float(total)

    route_stats = tables.route_stats
    edge_stats = tables.call_edge_stats
    client_stats = tables.client_stats
    producer_stats = tables.producer_stats

    # Same de-duplication key that the CALLS writer uses.
    distinct_call_keys = {
        (row.src_id, row.dst_id, row.arg_count, row.call_site_line)
        for row in tables.calls_rows
    }

    counts = {
        "packages": len(tables.packages),
        "files": len(tables.files),
        "types": len(tables.types),
        "members": len(tables.members),
        "phantoms": len(tables.phantoms),
        "extends": len(tables.extends_rows),
        "implements": len(tables.implements_rows),
        "injects": len(tables.injects_rows),
        "declares": len(tables.declares_rows),
        "overrides": len(tables.overrides_rows),
        "calls": len(distinct_call_keys),
        "routes": len(tables.routes_rows),
        "exposes": len(tables.exposes_rows),
        "clients": len(tables.client_rows),
        "declares_client": len(tables.declares_client_rows),
        "producers": len(tables.producer_rows),
        "declares_producer": len(tables.declares_producer_rows),
        "http_calls": len(tables.http_call_rows),
        "async_calls": len(tables.async_call_rows),
    }

    query = (
        "CREATE (:GraphMeta {key: $k, ontology_version: $ov, built_at: $t, "
        "source_root: $sr, counts_json: $cj, parse_errors: $pe, "
        "routes_total: $routes_total, exposes_total: $exposes_total, "
        "routes_by_framework: $routes_by_framework, routes_resolved_pct: $routes_resolved_pct, "
        "routes_from_brownfield_pct: $routes_from_brownfield_pct, routes_by_layer: $routes_by_layer, "
        "clients_total: $clients_total, declares_client_total: $declares_client_total, "
        "clients_by_kind: $clients_by_kind, "
        "producers_total: $producers_total, declares_producer_total: $declares_producer_total, "
        "producers_by_kind: $producers_by_kind, "
        "http_calls_total: $http_calls_total, async_calls_total: $async_calls_total, "
        "http_calls_by_strategy: $http_calls_by_strategy, async_calls_by_strategy: $async_calls_by_strategy, "
        "http_calls_resolved_pct: $http_calls_resolved_pct, async_calls_resolved_pct: $async_calls_resolved_pct, "
        "http_clients_from_brownfield_pct: $http_clients_from_brownfield_pct, "
        "async_producers_from_brownfield_pct: $async_producers_from_brownfield_pct, "
        "http_calls_match_breakdown: $http_calls_match_breakdown, "
        "async_calls_match_breakdown: $async_calls_match_breakdown, "
        "cross_service_calls_total: $cross_service_calls_total, "
        "pass3_skipped_cross_service: $pass3_skipped_cross_service, "
        "pass3_unresolved_phantom_receiver: $pass3_unresolved_phantom_receiver, "
        "pass3_unresolved_chained: $pass3_unresolved_chained, "
        "pass4_exposes_suppressed_feign: $pass4_exposes_suppressed_feign, "
        "cross_service_resolution: $cross_service_resolution})"
    )
    params = {
        "k": "graph",
        "ov": ONTOLOGY_VERSION,
        "t": int(time.time()),
        "sr": str(source_root.resolve()),
        "cj": json.dumps(counts),
        "pe": tables.parse_errors,
        "routes_total": len(tables.routes_rows),
        "exposes_total": len(tables.exposes_rows),
        "routes_by_framework": _sorted_json(route_stats.by_framework),
        "routes_resolved_pct": float(route_stats.routes_resolved_pct),
        "routes_from_brownfield_pct": float(route_stats.routes_from_brownfield_pct),
        "routes_by_layer": _sorted_json(route_stats.routes_by_layer),
        "clients_total": int(client_stats.clients_total),
        "declares_client_total": int(client_stats.declares_client_total),
        "clients_by_kind": _sorted_json(client_stats.clients_by_kind),
        "producers_total": int(producer_stats.producers_total),
        "declares_producer_total": int(producer_stats.declares_producer_total),
        "producers_by_kind": _sorted_json(producer_stats.producers_by_kind),
        "http_calls_total": edge_stats.http_calls_total,
        "async_calls_total": edge_stats.async_calls_total,
        "http_calls_by_strategy": _sorted_json(edge_stats.http_calls_by_strategy),
        "async_calls_by_strategy": _sorted_json(edge_stats.async_calls_by_strategy),
        "http_calls_resolved_pct": _resolved_fraction(
            edge_stats.http_calls_total, edge_stats.http_calls_by_strategy
        ),
        "async_calls_resolved_pct": _resolved_fraction(
            edge_stats.async_calls_total, edge_stats.async_calls_by_strategy
        ),
        "http_clients_from_brownfield_pct": edge_stats.http_clients_from_brownfield_pct,
        "async_producers_from_brownfield_pct": edge_stats.async_producers_from_brownfield_pct,
        "http_calls_match_breakdown": _sorted_json(edge_stats.http_calls_match_breakdown),
        "async_calls_match_breakdown": _sorted_json(edge_stats.async_calls_match_breakdown),
        "cross_service_calls_total": int(edge_stats.cross_service_calls_total),
        "pass3_skipped_cross_service": int(tables.pass3_skipped_cross_service),
        "pass3_unresolved_phantom_receiver": int(tables.pass3_unresolved_phantom_receiver),
        "pass3_unresolved_chained": int(tables.pass3_unresolved_chained),
        "pass4_exposes_suppressed_feign": int(route_stats.exposes_suppressed_feign),
        "cross_service_resolution": str(tables.cross_service_resolution),
    }
    conn.execute(query, params)
2991
+
2992
+
2993
def write_kuzu(
    db_path: Path,
    tables: GraphTables,
    *,
    source_root: Path,
    verbose: bool,
    meta_chain: dict[str, frozenset[str]] | None = None,
) -> None:
    """Write ``tables`` into the Kuzu database at ``db_path``.

    The parent directory of ``db_path`` is created if missing. After opening
    the database the function calls ``_drop_all`` and ``_create_schema``, then
    writes nodes, edges, routes/exposes and finally the ``GraphMeta`` row.
    ``tables.declares_rows`` and ``tables.overrides_rows`` are (re)populated as
    a side effect before the edges are written.

    Args:
        db_path: Path handed to ``kuzu.Database``.
        tables: In-memory graph produced by the earlier passes.
        source_root: Source tree root; passed to the node and meta writers.
        verbose: When true, progress and timing lines go to stderr.
        meta_chain: Annotation meta chain; computed from ``source_root`` via
            ``collect_annotation_meta_chain`` when omitted.
    """
    if meta_chain is None:
        meta_chain = collect_annotation_meta_chain(
            str(source_root.resolve()),
        )
    if verbose:
        _verbose_stderr_line(_WRITE_START)
    with _VerbosePassHeartbeats("[write]", verbose=verbose):
        db_path.parent.mkdir(parents=True, exist_ok=True)
        db = kuzu.Database(str(db_path))
        conn = kuzu.Connection(db)
        # Close the connection even when a write step raises, so a failed
        # build does not leave the connection open.
        try:
            _drop_all(conn)
            _create_schema(conn)
            t0 = time.time()
            _write_nodes(
                conn,
                tables,
                project_root=source_root,
                meta_chain=meta_chain,
            )
            if verbose:
                _verbose_stderr_line(f"[write] nodes written in {time.time() - t0:.2f}s")
            _populate_declares_rows(tables)
            _populate_overrides_rows(tables)
            t1 = time.time()
            _write_edges(conn, tables)
            if verbose:
                _verbose_stderr_line(f"[write] edges written in {time.time() - t1:.2f}s")
            t2 = time.time()
            _write_routes_and_exposes(conn, tables)
            if verbose:
                _verbose_stderr_line(f"[write] routes/exposes written in {time.time() - t2:.2f}s")
            _write_meta(conn, tables, source_root)
        finally:
            conn.close()
3034
+
3035
+
3036
+ # ---------- CLI ----------
3037
+
3038
+
3039
+ def _default_kuzu_path() -> Path:
3040
+ idx = os.environ.get("JAVA_CODEBASE_RAG_INDEX_DIR", "").strip()
3041
+ if idx and not idx.startswith(("s3://", "gs://", "az://")):
3042
+ return Path(os.path.expanduser(idx.rstrip("/"))) / "code_graph.kuzu"
3043
+ return Path.cwd() / ".java-codebase-rag" / "code_graph.kuzu"
3044
+
3045
+
3046
def main() -> int:
    """Command-line entry point: parse arguments, run the passes, write the graph.

    Returns:
        ``0`` after the graph has been written, or ``2`` when the source root
        is not a directory (a message is printed to stderr in that case).
    """
    cli = argparse.ArgumentParser(description="Build an AST-derived Kuzu graph for Java sources.")
    cli.add_argument(
        "--source-root",
        default=None,
        help="Repository / monorepo root to scan for .java (defaults to current working directory)",
    )
    cli.add_argument(
        "--kuzu-path",
        default=None,
        help=(
            "Kuzu database path (file/dir as used by kuzu.Database; "
            "default: $JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.kuzu or ./.java-codebase-rag/code_graph.kuzu)"
        ),
    )
    cli.add_argument("--verbose", action="store_true")
    opts = cli.parse_args()
    chatty = opts.verbose

    if opts.source_root:
        root = Path(opts.source_root).expanduser().resolve()
    else:
        root = Path.cwd().resolve()
    if not root.is_dir():
        print(f"source-root not a directory: {root}", file=sys.stderr)
        return 2

    if opts.kuzu_path:
        kuzu_path = Path(opts.kuzu_path).expanduser()
    else:
        kuzu_path = _default_kuzu_path()

    # Run the analysis passes in order against one shared GraphTables instance.
    tables = GraphTables()
    asts = pass1_parse(root, tables, verbose=chatty)
    pass2_edges(tables, asts, verbose=chatty)
    pass3_calls(tables, asts, verbose=chatty)
    pass4_routes(tables, asts, source_root=root, verbose=chatty)
    pass5_imperative_edges(tables, asts, source_root=root, verbose=chatty)
    pass6_match_edges(tables, verbose=chatty)

    write_kuzu(kuzu_path, tables, source_root=root, verbose=chatty)
    if chatty:
        _verbose_stderr_line(f"[done] kuzu at {kuzu_path}")
    return 0
3078
+
3079
+
3080
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())