java-codebase-rag 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
build_ast_graph.py CHANGED
@@ -401,6 +401,330 @@ class GraphTables:
401
401
  type_role_by_node_id: dict[str, str] = field(default_factory=dict)
402
402
 
403
403
 
404
@dataclass
class IncrementalResult:
    """Summary of a single incremental graph rebuild run."""

    # Which path the rebuild took: "incremental" | "full_fallback".
    mode: str
    # Per-category file counts for this run.
    files_changed: int
    files_added: int
    files_removed: int
    # Number of dependents that were reprocessed in this run.
    dependents_reprocessed: int
    # Duration of the rebuild, in seconds.
    elapsed_sec: float
413
+
414
+
415
class FileHashTracker:
    """Track content hashes for incremental graph rebuild.

    Hashes live in memory as ``rel_path -> sha256 hex digest`` and are
    persisted as JSON in ``<index_dir>/.graph_hashes.json``.
    """

    def __init__(self, index_dir: Path):
        self._path = index_dir / ".graph_hashes.json"
        self._hashes: dict[str, str] = {}  # rel_path -> sha256_hex

    def load(self) -> None:
        """Load hashes from disk.

        A missing file (first run) is a no-op. A file that cannot be read,
        decoded, or parsed, or whose payload is not a flat ``str -> str``
        mapping, resets the tracker to empty so later lookups cannot fail
        on an unexpected shape.
        """
        try:
            with open(self._path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            return
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError; either way the file is unusable.
            self._hashes = {}
            return

        well_formed = isinstance(data, dict) and all(
            isinstance(key, str) and isinstance(value, str)
            for key, value in data.items()
        )
        self._hashes = data if well_formed else {}

    def save(self) -> None:
        """Persist hashes to disk atomically (write .tmp, rename).

        Failures are logged rather than raised. The partially written
        temporary file, if any, is removed so it does not linger in the
        index directory.
        """
        tmp_path = self._path.with_suffix(".json.tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(self._hashes, f, sort_keys=True)
            os.replace(tmp_path, self._path)
        except OSError as e:
            log.warning("Failed to save hash file %s: %s; next run will rebuild from scratch", self._path, e)
            # Best-effort cleanup; the temp file may never have been created.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    def detect_changes(self, source_root: Path, ignore: LayeredIgnore) -> tuple[set[str], set[str], set[str]]:
        """Return (added, changed, removed) sets of relative POSIX paths.

        Paths are taken relative to the resolved ``source_root``. A file
        that lies outside the root after resolution is keyed by its own
        POSIX path instead. Files that cannot be hashed are reported as
        neither added nor changed.
        """
        # Resolve once so symlinked roots compare consistently.
        source_root_resolved = source_root.resolve()
        current_files: set[str] = set()
        for abs_path in iter_java_source_files(source_root, ignore=ignore):
            try:
                rel_path = abs_path.resolve().relative_to(source_root_resolved).as_posix()
            except ValueError:
                # Not under source_root: fall back to the path as given.
                rel_path = abs_path.as_posix()
            current_files.add(rel_path)

        added: set[str] = set()
        changed: set[str] = set()

        for rel_path in current_files:
            try:
                file_hash = _hash_file(source_root / rel_path)
            except FileNotFoundError:
                # Vanished between listing and hashing.
                continue
            except OSError as e:
                # Unreadable file (permissions, I/O error): skip it rather
                # than abort change detection for the whole tree.
                log.warning("Cannot hash %s: %s; skipping change detection for it", rel_path, e)
                continue
            stored_hash = self._hashes.get(rel_path)
            if stored_hash is None:
                added.add(rel_path)
            elif stored_hash != file_hash:
                changed.add(rel_path)

        # Anything we have a hash for but no longer see on disk was removed.
        removed = {rel_path for rel_path in self._hashes if rel_path not in current_files}

        return added, changed, removed

    def update(self, rel_paths: set[str], source_root: Path) -> None:
        """Compute and store hashes for the given paths.

        Paths that do not exist are skipped and keep whatever entry they
        already had.
        NOTE(review): entries for deleted files are never pruned here;
        confirm the caller removes them some other way.
        """
        for rel_path in rel_paths:
            try:
                self._hashes[rel_path] = _hash_file(source_root / rel_path)
            except FileNotFoundError:
                # Missing file: leave the existing entry untouched.
                continue


def _hash_file(abs_path: Path) -> str:
    """Compute the SHA-256 hex digest of a file's raw bytes."""
    hasher = hashlib.sha256()
    with open(abs_path, "rb") as f:
        # Read in 64 KiB chunks so large files are not loaded whole.
        for chunk in iter(lambda: f.read(65536), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
497
+
498
+
499
+ # ---------- incremental rebuild helpers ----------
500
+
501
+
502
def _load_existing_types(conn: kuzu.Connection, tables: GraphTables, exclude_files: set[str] | None = None) -> None:
    """Load type entries from existing Kuzu graph into tables for cross-file resolution.

    Reads every Symbol whose kind is class/interface/enum/annotation/record
    and registers it in ``tables.types``, ``tables.by_simple_name`` and
    ``tables.by_package``.

    Args:
        conn: Open Kuzu connection to query.
        tables: GraphTables whose type indexes are populated in place.
        exclude_files: When a non-empty set, types whose ``filename`` is in
            the set are not loaded. When ``None``, all types are loaded.
            An explicitly empty set makes this function a no-op.
            NOTE(review): the empty-set no-op means "nothing excluded" loads
            nothing; confirm that is what callers expect.
    """
    if exclude_files is not None and not exclude_files:
        return

    where = "WHERE s.kind IN ['class', 'interface', 'enum', 'annotation', 'record']"
    params: dict = {}
    if exclude_files:
        where += "\n AND NOT (s.filename IN $exclude_files)"
        params["exclude_files"] = list(exclude_files)

    query = f"""
        MATCH (s:Symbol)
        {where}
        RETURN s.kind, s.fqn, s.name, s.filename, s.module, s.microservice, s.id
    """
    result = conn.execute(query, params)
    while result.has_next():
        # The RETURN clause fixes the row at seven columns; individual
        # values may still be NULL in the graph.
        kind, fqn, name, filename, module, microservice, node_id = result.get_next()
        if fqn is None or name is None:
            # Without both there is no key to index the type under.
            continue
        module = module or ""
        microservice = microservice or ""
        node_id = node_id or ""

        decl = TypeDecl(name, kind, fqn)
        # Package is whatever precedes ".<name>" in the FQN.
        # NOTE(review): for a nested type this yields the outer type's FQN
        # rather than its package, and outer_fqn is always None; verify.
        package = fqn[: -(len(name) + 1)] if fqn.endswith("." + name) else ""

        entry = TypeIndexEntry(
            decl=decl,
            file_path=filename,
            module=module,
            microservice=microservice,
            package=package,
            outer_fqn=None,
            node_id=node_id,
        )
        tables.types[fqn] = entry
        tables.by_simple_name.setdefault(name, []).append(entry)
        tables.by_package.setdefault(package, []).append(entry)
544
+
545
+
546
def _load_existing_members(conn: kuzu.Connection, tables: GraphTables, exclude_files: set[str] | None = None) -> None:
    """Load member entries from existing Kuzu graph into tables.members.

    Reads every Symbol of kind method/constructor and appends a
    ``MemberEntry`` for it. ``module`` and ``microservice`` are left empty
    on the loaded entries.

    Args:
        conn: Open Kuzu connection to query.
        tables: GraphTables whose ``members`` list is appended to.
        exclude_files: When a non-empty set, members whose ``filename`` is
            in the set are not loaded. When ``None``, all members are
            loaded. An explicitly empty set makes this function a no-op.
    """
    if exclude_files is not None and not exclude_files:
        return

    where = "WHERE s.kind IN ['method', 'constructor']"
    params: dict = {}
    if exclude_files:
        where += "\n AND NOT (s.filename IN $exclude_files)"
        params["exclude_files"] = list(exclude_files)

    query = f"""
        MATCH (s:Symbol)
        {where}
        RETURN s.kind, s.name, s.filename, s.signature, s.parent_id, s.fqn, s.id
    """
    result = conn.execute(query, params)
    while result.has_next():
        # The RETURN clause fixes the row at seven columns; individual
        # values may still be NULL in the graph, so normalise the strings
        # that are processed below.
        kind, name, filename, signature, parent_id, fqn, node_id = result.get_next()
        signature = signature or ""
        parent_id = parent_id or ""
        fqn = fqn or ""
        node_id = node_id or ""

        # Member FQNs are written as "<parent_fqn>#<signature>".
        parent_fqn = fqn.split("#")[0] if "#" in fqn else ""

        decl = MethodDecl(name, "", kind == "constructor")
        decl.signature = signature

        tables.members.append(MemberEntry(
            kind=kind,
            decl=decl,
            parent_id=parent_id,
            parent_fqn=parent_fqn,
            file_path=filename,
            module="",
            microservice="",
            node_id=node_id,
        ))
589
+
590
+
591
def _find_dependents(conn: kuzu.Connection, changed_node_ids: set[str]) -> set[str]:
    """Find files whose nodes have edges pointing into changed nodes.

    Args:
        conn: Open Kuzu connection to query.
        changed_node_ids: Symbol ids to look for as edge destinations.

    Returns:
        Set of non-empty ``filename`` values of the source Symbols. Empty
        when ``changed_node_ids`` is empty (no query is issued).
    """
    dependent_files: set[str] = set()
    if not changed_node_ids:
        # Nothing can point at an empty id set; skip six graph scans.
        return dependent_files

    # Query each Symbol-to-Symbol edge table for incoming edges.
    edge_types = ["EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES"]
    params = {"changed_ids": list(changed_node_ids)}

    for edge_type in edge_types:
        # edge_type comes from the fixed list above, never from input.
        query = f"""
            MATCH (src:Symbol)-[e:{edge_type}]->(dst:Symbol)
            WHERE dst.id IN $changed_ids
            RETURN DISTINCT src.filename
        """
        result = conn.execute(query, params)
        while result.has_next():
            filename = result.get_next()[0]
            if filename:  # Skip phantom nodes (filename = "")
                dependent_files.add(filename)

    return dependent_files
613
+
614
+
615
def _delete_file_scope(conn: kuzu.Connection, filenames: set[str]) -> None:
    """Delete all nodes and edges originating from the given files.

    Skip phantom nodes (filename=""). Deletes ALL edge types in Phase 1,
    then nodes in subsequent phases. Route/Client/Producer nodes use
    DETACH DELETE as a safety net for any edges missed in Phase 1.

    Edges are deleted in batch across all filenames first to avoid Kuzu
    "has connected edges" errors when edges from one file point to nodes
    in another file within the same scope.

    Args:
        conn: Open Kuzu connection to run the deletions on.
        filenames: File names whose nodes and edges are removed. Empty
            strings are ignored; if nothing remains, no query is issued.
    """
    # An "" entry would match phantom nodes and edges written with an
    # empty source_file, so it is never allowed into the scope.
    filename_list = [name for name in filenames if name]
    if not filename_list:
        return
    scope = {"filenames": filename_list}

    # Phase 1: Delete ALL edges from ALL scope files at once.
    # This avoids ordering issues where file A has an edge from file B
    # pointing into it; if we delete A's nodes before B's edges, Kuzu
    # raises "has connected edges" errors.
    edge_tables = [
        "EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES",
        "UNRESOLVED_AT", "EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER",
        "HTTP_CALLS", "ASYNC_CALLS",
    ]
    for edge_type in edge_tables:
        # edge_type comes from the fixed list above, never from input.
        query = f"""
            MATCH (src)-[e:{edge_type}]->(dst)
            WHERE e.source_file IN $filenames
            DELETE e
        """
        conn.execute(query, scope)

    # Phase 2: Collect all Symbol node IDs for UnresolvedCallSite cleanup.
    symbol_ids: list[str] = []
    symbol_ids_query = """
        MATCH (s:Symbol)
        WHERE s.filename IN $filenames
        RETURN s.id
    """
    result = conn.execute(symbol_ids_query, scope)
    while result.has_next():
        symbol_ids.append(result.get_next()[0])

    # Delete UnresolvedCallSite nodes whose caller_id is in the collected set.
    if symbol_ids:
        unresolved_query = """
            MATCH (u:UnresolvedCallSite)
            WHERE u.caller_id IN $symbol_ids
            DELETE u
        """
        conn.execute(unresolved_query, {"symbol_ids": symbol_ids})

    # Phase 3: Delete Symbol nodes.
    delete_symbols_query = """
        MATCH (s:Symbol)
        WHERE s.filename IN $filenames
        DELETE s
    """
    conn.execute(delete_symbols_query, scope)

    # Phase 4: Delete Route, Client, Producer nodes.
    # Use DETACH DELETE as a safety net in case any edges were missed in Phase 1.
    for label in ["Route", "Client", "Producer"]:
        conn.execute(
            f"MATCH (n:{label}) WHERE n.filename IN $filenames DETACH DELETE n",
            scope,
        )
681
+
682
+
683
def _scoped_write(conn: kuzu.Connection, tables: GraphTables, *, project_root: Path, meta_chain: dict[str, frozenset[str]] | None) -> None:
    """Write nodes and edges into an existing Kuzu database.

    Runs three stages in order: nodes (via the MERGE-based writer), edges,
    then routes/exposes. Each stage is timed, and a verbose stderr line is
    emitted only when the stage took longer than 0.1 seconds.

    Unlike write_kuzu(), no _drop_all()/_create_schema() is performed. The
    caller is responsible for calling _populate_declares_rows() and
    _populate_overrides_rows() before invoking this function.
    """
    # Stage 1: nodes, using MERGE so already-present nodes are tolerated.
    nodes_started = time.time()
    _write_nodes_merge(conn, tables, project_root=project_root, meta_chain=meta_chain)
    nodes_sec = time.time() - nodes_started
    if nodes_sec > 0.1:  # Only log if significant
        _verbose_stderr_line(f"[graph] scoped write · nodes written in {nodes_sec:.2f}s")

    # Stage 2: edges. The node_id -> file lookup is built once here and
    # shared with stage 3.
    edges_started = time.time()
    file_by_node_id = _build_file_by_node_id(tables)
    _write_edges(conn, tables, file_by_node_id)
    edges_sec = time.time() - edges_started
    if edges_sec > 0.1:
        _verbose_stderr_line(f"[graph] scoped write · edges written in {edges_sec:.2f}s")

    # Stage 3: routes and their exposing edges.
    routes_started = time.time()
    _write_routes_and_exposes(conn, tables, file_by_node_id)
    routes_sec = time.time() - routes_started
    if routes_sec > 0.1:
        _verbose_stderr_line(f"[graph] scoped write · routes/exposes written in {routes_sec:.2f}s")
715
+
716
+
717
def _write_nodes_merge(
    conn: kuzu.Connection,
    tables: GraphTables,
    *,
    project_root: Path,
    meta_chain: dict[str, frozenset[str]] | None,
) -> None:
    """Write nodes into an existing Kuzu database with the MERGE symbol query.

    Delegates to ``_write_nodes_impl`` with ``_MERGE_SYMBOL`` as the symbol
    query, so nodes that already exist are handled.
    """
    _write_nodes_impl(
        conn,
        tables,
        project_root=project_root,
        meta_chain=meta_chain,
        symbol_query=_MERGE_SYMBOL,
    )
726
+
727
+
404
728
  # ---------- file walk (see `path_filtering.iter_java_source_files`) ----------
405
729
 
406
730
 
@@ -461,8 +785,15 @@ def _register_type(
461
785
  return entry
462
786
 
463
787
 
464
- def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str, JavaFileAst]:
465
- """Walk files, parse them, populate node indexes. Returns path -> AST."""
788
+ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool, scope_files: set[str] | None = None) -> dict[str, JavaFileAst]:
789
+ """Walk files, parse them, populate node indexes. Returns path -> AST.
790
+
791
+ Args:
792
+ root: Source root directory.
793
+ tables: GraphTables to populate.
794
+ verbose: Whether to emit progress output.
795
+ scope_files: Optional set of relative POSIX paths to parse. If None, parse all files.
796
+ """
466
797
  asts: dict[str, JavaFileAst] = {}
467
798
  ignore = LayeredIgnore(root)
468
799
  t0 = time.time()
@@ -480,6 +811,13 @@ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str,
480
811
  if verbose and slow_sec > 0:
481
812
  time.sleep(slow_sec)
482
813
  for p in iter_java_source_files(root, ignore=ignore):
814
+ # Skip files not in scope (if scope is provided)
815
+ try:
816
+ rel = p.resolve().relative_to(root.resolve()).as_posix()
817
+ except ValueError:
818
+ rel = p.as_posix()
819
+ if scope_files is not None and rel not in scope_files:
820
+ continue
483
821
  n_files += 1
484
822
  try:
485
823
  content = p.read_bytes()
@@ -488,10 +826,6 @@ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool) -> dict[str,
488
826
  continue
489
827
  if not content.strip():
490
828
  continue
491
- try:
492
- rel = p.resolve().relative_to(root.resolve()).as_posix()
493
- except ValueError:
494
- rel = p.as_posix()
495
829
  try:
496
830
  ast = parse_java(content, filename=rel, verbose=verbose)
497
831
  except Exception:
@@ -2414,22 +2748,22 @@ _SCHEMA_PRODUCER = (
2414
2748
 
2415
2749
  _SCHEMA_EXTENDS = (
2416
2750
  "CREATE REL TABLE EXTENDS(FROM Symbol TO Symbol, "
2417
- "dst_name STRING, dst_fqn STRING, resolved BOOLEAN)"
2751
+ "source_file STRING, dst_name STRING, dst_fqn STRING, resolved BOOLEAN)"
2418
2752
  )
2419
2753
  _SCHEMA_IMPLEMENTS = (
2420
2754
  "CREATE REL TABLE IMPLEMENTS(FROM Symbol TO Symbol, "
2421
- "dst_name STRING, dst_fqn STRING, resolved BOOLEAN)"
2755
+ "source_file STRING, dst_name STRING, dst_fqn STRING, resolved BOOLEAN)"
2422
2756
  )
2423
2757
  _SCHEMA_INJECTS = (
2424
2758
  "CREATE REL TABLE INJECTS(FROM Symbol TO Symbol, "
2425
- "dst_name STRING, dst_fqn STRING, resolved BOOLEAN, "
2759
+ "source_file STRING, dst_name STRING, dst_fqn STRING, resolved BOOLEAN, "
2426
2760
  "mechanism STRING, annotation STRING, field_or_param STRING)"
2427
2761
  )
2428
- _SCHEMA_DECLARES = "CREATE REL TABLE DECLARES(FROM Symbol TO Symbol)"
2429
- _SCHEMA_OVERRIDES = "CREATE REL TABLE OVERRIDES(FROM Symbol TO Symbol)"
2762
+ _SCHEMA_DECLARES = "CREATE REL TABLE DECLARES(FROM Symbol TO Symbol, source_file STRING)"
2763
+ _SCHEMA_OVERRIDES = "CREATE REL TABLE OVERRIDES(FROM Symbol TO Symbol, source_file STRING)"
2430
2764
  _SCHEMA_CALLS = (
2431
2765
  "CREATE REL TABLE CALLS(FROM Symbol TO Symbol, "
2432
- "call_site_line INT64, call_site_byte INT64, arg_count INT64, "
2766
+ "source_file STRING, call_site_line INT64, call_site_byte INT64, arg_count INT64, "
2433
2767
  "confidence DOUBLE, strategy STRING, source STRING, resolved BOOLEAN, "
2434
2768
  "callee_declaring_role STRING)"
2435
2769
  )
@@ -2439,27 +2773,27 @@ _SCHEMA_UNRESOLVED_CALL_SITE = (
2439
2773
  "arg_count INT64, callee_simple STRING, receiver_expr STRING, reason STRING, "
2440
2774
  "PRIMARY KEY(id))"
2441
2775
  )
2442
- _SCHEMA_UNRESOLVED_AT = "CREATE REL TABLE UNRESOLVED_AT(FROM Symbol TO UnresolvedCallSite)"
2776
+ _SCHEMA_UNRESOLVED_AT = "CREATE REL TABLE UNRESOLVED_AT(FROM Symbol TO UnresolvedCallSite, source_file STRING)"
2443
2777
  _SCHEMA_EXPOSES = (
2444
2778
  "CREATE REL TABLE EXPOSES(FROM Symbol TO Route, "
2445
- "confidence DOUBLE, strategy STRING)"
2779
+ "source_file STRING, confidence DOUBLE, strategy STRING)"
2446
2780
  )
2447
2781
  _SCHEMA_DECLARES_CLIENT = (
2448
2782
  "CREATE REL TABLE DECLARES_CLIENT(FROM Symbol TO Client, "
2449
- "confidence DOUBLE, strategy STRING)"
2783
+ "source_file STRING, confidence DOUBLE, strategy STRING)"
2450
2784
  )
2451
2785
  _SCHEMA_DECLARES_PRODUCER = (
2452
2786
  "CREATE REL TABLE DECLARES_PRODUCER(FROM Symbol TO Producer, "
2453
- "confidence DOUBLE, strategy STRING)"
2787
+ "source_file STRING, confidence DOUBLE, strategy STRING)"
2454
2788
  )
2455
2789
  _SCHEMA_HTTP_CALLS = (
2456
2790
  "CREATE REL TABLE HTTP_CALLS(FROM Client TO Route, "
2457
- "confidence DOUBLE, strategy STRING, "
2791
+ "source_file STRING, confidence DOUBLE, strategy STRING, "
2458
2792
  "method_call STRING, raw_uri STRING, match STRING)"
2459
2793
  )
2460
2794
  _SCHEMA_ASYNC_CALLS = (
2461
2795
  "CREATE REL TABLE ASYNC_CALLS(FROM Producer TO Route, "
2462
- "confidence DOUBLE, strategy STRING, "
2796
+ "source_file STRING, confidence DOUBLE, strategy STRING, "
2463
2797
  "direction STRING, raw_topic STRING, match STRING)"
2464
2798
  )
2465
2799
 
@@ -2538,13 +2872,25 @@ _CREATE_SYMBOL = (
2538
2872
  "role: $role, signature: $signature, parent_id: $parent_id, resolved: $resolved})"
2539
2873
  )
2540
2874
 
2875
+ _MERGE_SYMBOL = (
2876
+ "MERGE (n:Symbol {id: $id}) "
2877
+ "SET n.kind = $kind, n.name = $name, n.fqn = $fqn, "
2878
+ "n.package = $package, n.module = $module, n.microservice = $microservice, "
2879
+ "n.filename = $filename, "
2880
+ "n.start_line = $start_line, n.end_line = $end_line, "
2881
+ "n.start_byte = $start_byte, n.end_byte = $end_byte, "
2882
+ "n.modifiers = $modifiers, n.annotations = $annotations, n.capabilities = $capabilities, "
2883
+ "n.role = $role, n.signature = $signature, n.parent_id = $parent_id, n.resolved = $resolved"
2884
+ )
2541
2885
 
2542
- def _write_nodes(
2886
+
2887
+ def _write_nodes_impl(
2543
2888
  conn: kuzu.Connection,
2544
2889
  tables: GraphTables,
2545
2890
  *,
2546
2891
  project_root: Path,
2547
2892
  meta_chain: dict[str, frozenset[str]] | None,
2893
+ symbol_query: str,
2548
2894
  ) -> None:
2549
2895
  overrides = load_brownfield_overrides(project_root)
2550
2896
  try:
@@ -2555,12 +2901,12 @@ def _write_nodes(
2555
2901
  mch = meta_chain
2556
2902
  # packages
2557
2903
  for pkg, pid in tables.packages.items():
2558
- conn.execute(_CREATE_SYMBOL, _node_row(
2904
+ conn.execute(symbol_query, _node_row(
2559
2905
  id=pid, kind="package", name=pkg.rsplit(".", 1)[-1], fqn=pkg, package=pkg,
2560
2906
  ))
2561
2907
  # files
2562
2908
  for path, fid in tables.files.items():
2563
- conn.execute(_CREATE_SYMBOL, _node_row(
2909
+ conn.execute(symbol_query, _node_row(
2564
2910
  id=fid, kind="file", name=Path(path).name, fqn=path, filename=path,
2565
2911
  ))
2566
2912
  # types
@@ -2572,7 +2918,7 @@ def _write_nodes(
2572
2918
  meta_chain=mch,
2573
2919
  )
2574
2920
  tables.type_role_by_node_id[entry.node_id] = role
2575
- conn.execute(_CREATE_SYMBOL, _node_row(
2921
+ conn.execute(symbol_query, _node_row(
2576
2922
  id=entry.node_id, kind=d.kind, name=d.name, fqn=d.fqn,
2577
2923
  package=entry.package,
2578
2924
  module=entry.module, microservice=entry.microservice,
@@ -2588,7 +2934,7 @@ def _write_nodes(
2588
2934
  ))
2589
2935
  # members (methods / constructors)
2590
2936
  for m in tables.members:
2591
- conn.execute(_CREATE_SYMBOL, _node_row(
2937
+ conn.execute(symbol_query, _node_row(
2592
2938
  id=m.node_id, kind=m.kind, name=m.decl.name,
2593
2939
  fqn=f"{m.parent_fqn}#{m.decl.signature}",
2594
2940
  package=tables.types[m.parent_fqn].package if m.parent_fqn in tables.types else "",
@@ -2602,33 +2948,44 @@ def _write_nodes(
2602
2948
  ))
2603
2949
  # phantoms
2604
2950
  for pid, row in tables.phantoms.items():
2605
- conn.execute(_CREATE_SYMBOL, row)
2951
+ conn.execute(symbol_query, row)
2952
+
2953
+
2954
def _write_nodes(
    conn: kuzu.Connection,
    tables: GraphTables,
    *,
    project_root: Path,
    meta_chain: dict[str, frozenset[str]] | None,
) -> None:
    """Write nodes with the CREATE symbol query.

    Delegates to ``_write_nodes_impl`` with ``_CREATE_SYMBOL`` as the
    symbol query.
    """
    _write_nodes_impl(
        conn,
        tables,
        project_root=project_root,
        meta_chain=meta_chain,
        symbol_query=_CREATE_SYMBOL,
    )
2606
2962
 
2607
2963
 
2608
2964
  _CREATE_EXT = (
2609
2965
  "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2610
- "CREATE (a)-[:EXTENDS {dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)"
2966
+ "CREATE (a)-[:EXTENDS {source_file: $source_file, dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)"
2611
2967
  )
2612
2968
  _CREATE_IMPL = (
2613
2969
  "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2614
- "CREATE (a)-[:IMPLEMENTS {dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)"
2970
+ "CREATE (a)-[:IMPLEMENTS {source_file: $source_file, dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)"
2615
2971
  )
2616
2972
  _CREATE_INJ = (
2617
2973
  "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2618
- "CREATE (a)-[:INJECTS {dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved, "
2974
+ "CREATE (a)-[:INJECTS {source_file: $source_file, dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved, "
2619
2975
  "mechanism: $mechanism, annotation: $annotation, field_or_param: $field_or_param}]->(b)"
2620
2976
  )
2621
2977
  _CREATE_DECL = (
2622
2978
  "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2623
- "CREATE (a)-[:DECLARES]->(b)"
2979
+ "CREATE (a)-[:DECLARES {source_file: $source_file}]->(b)"
2624
2980
  )
2625
2981
  _CREATE_OVERRIDES = (
2626
2982
  "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2627
- "CREATE (a)-[:OVERRIDES]->(b)"
2983
+ "CREATE (a)-[:OVERRIDES {source_file: $source_file}]->(b)"
2628
2984
  )
2629
2985
  _CREATE_CALL = (
2630
2986
  "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
2631
2987
  "CREATE (a)-[:CALLS {"
2988
+ "source_file: $source_file, "
2632
2989
  "call_site_line: $line, call_site_byte: $byte, arg_count: $argc, "
2633
2990
  "confidence: $conf, strategy: $strat, source: $src_kind, resolved: $resolved, "
2634
2991
  "callee_declaring_role: $callee_declaring_role"
@@ -2656,11 +3013,11 @@ _CREATE_CLIENT = (
2656
3013
 
2657
3014
  _CREATE_EXPOSES = (
2658
3015
  "MATCH (s:Symbol {id: $sid}), (r:Route {id: $rid}) "
2659
- "CREATE (s)-[:EXPOSES {confidence: $confidence, strategy: $strategy}]->(r)"
3016
+ "CREATE (s)-[:EXPOSES {source_file: $source_file, confidence: $confidence, strategy: $strategy}]->(r)"
2660
3017
  )
2661
3018
  _CREATE_DECLARES_CLIENT = (
2662
3019
  "MATCH (s:Symbol {id: $sid}), (c:Client {id: $cid}) "
2663
- "CREATE (s)-[:DECLARES_CLIENT {confidence: $confidence, strategy: $strategy}]->(c)"
3020
+ "CREATE (s)-[:DECLARES_CLIENT {source_file: $source_file, confidence: $confidence, strategy: $strategy}]->(c)"
2664
3021
  )
2665
3022
  _CREATE_PRODUCER = (
2666
3023
  "CREATE (:Producer {"
@@ -2673,16 +3030,16 @@ _CREATE_PRODUCER = (
2673
3030
  )
2674
3031
  _CREATE_DECLARES_PRODUCER = (
2675
3032
  "MATCH (s:Symbol {id: $sid}), (p:Producer {id: $pid}) "
2676
- "CREATE (s)-[:DECLARES_PRODUCER {confidence: $confidence, strategy: $strategy}]->(p)"
3033
+ "CREATE (s)-[:DECLARES_PRODUCER {source_file: $source_file, confidence: $confidence, strategy: $strategy}]->(p)"
2677
3034
  )
2678
3035
  _CREATE_HTTP_CALL = (
2679
3036
  "MATCH (c:Client {id: $cid}), (r:Route {id: $rid}) "
2680
- "CREATE (c)-[:HTTP_CALLS {confidence: $confidence, strategy: $strategy, "
3037
+ "CREATE (c)-[:HTTP_CALLS {source_file: $source_file, confidence: $confidence, strategy: $strategy, "
2681
3038
  "method_call: $method_call, raw_uri: $raw_uri, match: $match}]->(r)"
2682
3039
  )
2683
3040
  _CREATE_ASYNC_CALL = (
2684
3041
  "MATCH (p:Producer {id: $pid}), (r:Route {id: $rid}) "
2685
- "CREATE (p)-[:ASYNC_CALLS {confidence: $confidence, strategy: $strategy, "
3042
+ "CREATE (p)-[:ASYNC_CALLS {source_file: $source_file, confidence: $confidence, strategy: $strategy, "
2686
3043
  "direction: $direction, raw_topic: $raw_topic, match: $match}]->(r)"
2687
3044
  )
2688
3045
 
@@ -2732,30 +3089,53 @@ def _populate_overrides_rows(tables: GraphTables) -> None:
2732
3089
  ]
2733
3090
 
2734
3091
 
2735
- def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
3092
def _build_file_by_node_id(tables: GraphTables) -> dict[str, str]:
    """Map each type and member node id to the file path it is recorded under.

    Types are indexed first and members second, so a member entry wins if
    the same node id appears in both.
    """
    by_node_id: dict[str, str] = {
        type_entry.node_id: type_entry.file_path
        for type_entry in tables.types.values()
    }
    by_node_id.update((member.node_id, member.file_path) for member in tables.members)
    return by_node_id
3100
+
3101
+
3102
+ def _write_edges(conn: kuzu.Connection, tables: GraphTables, _file_by_node_id: dict[str, str] | None = None) -> None:
3103
+ # Build node_id -> file_path lookup for source_file resolution.
3104
+ if _file_by_node_id is None:
3105
+ _file_by_node_id = _build_file_by_node_id(tables)
3106
+
2736
3107
  for r in tables.extends_rows:
2737
3108
  conn.execute(_CREATE_EXT, {
2738
3109
  "src": r.src_id, "dst": r.dst_id,
3110
+ "source_file": _file_by_node_id.get(r.src_id, ""),
2739
3111
  "dst_name": r.dst_name, "dst_fqn": r.dst_fqn, "resolved": r.resolved,
2740
3112
  })
2741
3113
  for r in tables.implements_rows:
2742
3114
  conn.execute(_CREATE_IMPL, {
2743
3115
  "src": r.src_id, "dst": r.dst_id,
3116
+ "source_file": _file_by_node_id.get(r.src_id, ""),
2744
3117
  "dst_name": r.dst_name, "dst_fqn": r.dst_fqn, "resolved": r.resolved,
2745
3118
  })
2746
3119
  for r in tables.injects_rows:
2747
3120
  conn.execute(_CREATE_INJ, {
2748
3121
  "src": r.src_id, "dst": r.dst_id,
3122
+ "source_file": _file_by_node_id.get(r.src_id, ""),
2749
3123
  "dst_name": r.dst_name, "dst_fqn": r.dst_fqn, "resolved": r.resolved,
2750
3124
  "mechanism": r.mechanism, "annotation": r.annotation,
2751
3125
  "field_or_param": r.field_or_param,
2752
3126
  })
2753
3127
 
2754
3128
  for row in tables.declares_rows:
2755
- conn.execute(_CREATE_DECL, {"src": row.src_id, "dst": row.dst_id})
3129
+ conn.execute(_CREATE_DECL, {
3130
+ "src": row.src_id, "dst": row.dst_id,
3131
+ "source_file": _file_by_node_id.get(row.src_id, ""),
3132
+ })
2756
3133
 
2757
3134
  for row in tables.overrides_rows:
2758
- conn.execute(_CREATE_OVERRIDES, {"src": row.src_id, "dst": row.dst_id})
3135
+ conn.execute(_CREATE_OVERRIDES, {
3136
+ "src": row.src_id, "dst": row.dst_id,
3137
+ "source_file": _file_by_node_id.get(row.src_id, ""),
3138
+ })
2759
3139
 
2760
3140
  seen_calls: set[tuple[str, str, int, int]] = set()
2761
3141
  unique_calls: list[CallsRow] = []
@@ -2769,6 +3149,7 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
2769
3149
  for row in unique_calls:
2770
3150
  conn.execute(_CREATE_CALL, {
2771
3151
  "src": row.src_id, "dst": row.dst_id,
3152
+ "source_file": _file_by_node_id.get(row.src_id, ""),
2772
3153
  "line": row.call_site_line,
2773
3154
  "byte": row.call_site_byte,
2774
3155
  "argc": row.arg_count,
@@ -2789,7 +3170,7 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
2789
3170
  )
2790
3171
  _CREATE_UNRESOLVED_AT = (
2791
3172
  "MATCH (a:Symbol {id: $caller}), (u:UnresolvedCallSite {id: $ucs}) "
2792
- "CREATE (a)-[:UNRESOLVED_AT]->(u)"
3173
+ "CREATE (a)-[:UNRESOLVED_AT {source_file: $source_file}]->(u)"
2793
3174
  )
2794
3175
  seen_ucs: set[str] = set()
2795
3176
  for row in tables.unresolved_call_site_rows:
@@ -2806,10 +3187,23 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
2806
3187
  "recv": row.receiver_expr,
2807
3188
  "reason": row.reason,
2808
3189
  })
2809
- conn.execute(_CREATE_UNRESOLVED_AT, {"caller": row.caller_id, "ucs": row.id})
3190
+ conn.execute(_CREATE_UNRESOLVED_AT, {
3191
+ "caller": row.caller_id, "ucs": row.id,
3192
+ "source_file": _file_by_node_id.get(row.caller_id, ""),
3193
+ })
2810
3194
 
2811
3195
 
2812
- def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> None:
3196
+ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables, _file_by_node_id: dict[str, str] | None = None) -> None:
3197
+ # Build node_id -> file_path lookup for source_file resolution (for Symbol sources).
3198
+ if _file_by_node_id is None:
3199
+ _file_by_node_id = _build_file_by_node_id(tables)
3200
+
3201
+ # Build client_id -> filename lookup for HTTP_CALLS source_file.
3202
+ _file_by_client_id: dict[str, str] = {row.id: row.filename for row in tables.client_rows}
3203
+
3204
+ # Build producer_id -> filename lookup for ASYNC_CALLS source_file.
3205
+ _file_by_producer_id: dict[str, str] = {row.id: row.filename for row in tables.producer_rows}
3206
+
2813
3207
  for row in tables.routes_rows:
2814
3208
  conn.execute(_CREATE_ROUTE, {
2815
3209
  "id": row.id,
@@ -2834,6 +3228,7 @@ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> Non
2834
3228
  conn.execute(_CREATE_EXPOSES, {
2835
3229
  "sid": row.symbol_id,
2836
3230
  "rid": row.route_id,
3231
+ "source_file": _file_by_node_id.get(row.symbol_id, ""),
2837
3232
  "confidence": row.confidence,
2838
3233
  "strategy": row.strategy,
2839
3234
  })
@@ -2843,6 +3238,7 @@ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> Non
2843
3238
  conn.execute(_CREATE_DECLARES_CLIENT, {
2844
3239
  "sid": row.symbol_id,
2845
3240
  "cid": row.client_id,
3241
+ "source_file": _file_by_node_id.get(row.symbol_id, ""),
2846
3242
  "confidence": row.confidence,
2847
3243
  "strategy": row.strategy,
2848
3244
  })
@@ -2852,6 +3248,7 @@ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> Non
2852
3248
  conn.execute(_CREATE_DECLARES_PRODUCER, {
2853
3249
  "sid": row.symbol_id,
2854
3250
  "pid": row.producer_id,
3251
+ "source_file": _file_by_node_id.get(row.symbol_id, ""),
2855
3252
  "confidence": row.confidence,
2856
3253
  "strategy": row.strategy,
2857
3254
  })
@@ -2859,6 +3256,7 @@ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> Non
2859
3256
  conn.execute(_CREATE_HTTP_CALL, {
2860
3257
  "cid": row.client_id,
2861
3258
  "rid": row.route_id,
3259
+ "source_file": _file_by_client_id.get(row.client_id, ""),
2862
3260
  "confidence": row.confidence,
2863
3261
  "strategy": row.strategy,
2864
3262
  "method_call": row.method_call,
@@ -2869,6 +3267,7 @@ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> Non
2869
3267
  conn.execute(_CREATE_ASYNC_CALL, {
2870
3268
  "pid": row.producer_id,
2871
3269
  "rid": row.route_id,
3270
+ "source_file": _file_by_producer_id.get(row.producer_id, ""),
2872
3271
  "confidence": row.confidence,
2873
3272
  "strategy": row.strategy,
2874
3273
  "direction": row.direction,
@@ -2929,28 +3328,29 @@ def _write_meta(conn: kuzu.Connection, tables: GraphTables, source_root: Path) -
2929
3328
  clients_by_kind = dict(sorted(client_stats.clients_by_kind.items()))
2930
3329
  producers_by_kind = dict(sorted(producer_stats.producers_by_kind.items()))
2931
3330
  conn.execute(
2932
- "CREATE (:GraphMeta {key: $k, ontology_version: $ov, built_at: $t, "
2933
- "source_root: $sr, counts_json: $cj, parse_errors: $pe, "
2934
- "routes_total: $routes_total, exposes_total: $exposes_total, "
2935
- "routes_by_framework: $routes_by_framework, routes_resolved_pct: $routes_resolved_pct, "
2936
- "routes_from_brownfield_pct: $routes_from_brownfield_pct, routes_by_layer: $routes_by_layer, "
2937
- "clients_total: $clients_total, declares_client_total: $declares_client_total, "
2938
- "clients_by_kind: $clients_by_kind, "
2939
- "producers_total: $producers_total, declares_producer_total: $declares_producer_total, "
2940
- "producers_by_kind: $producers_by_kind, "
2941
- "http_calls_total: $http_calls_total, async_calls_total: $async_calls_total, "
2942
- "http_calls_by_strategy: $http_calls_by_strategy, async_calls_by_strategy: $async_calls_by_strategy, "
2943
- "http_calls_resolved_pct: $http_calls_resolved_pct, async_calls_resolved_pct: $async_calls_resolved_pct, "
2944
- "http_clients_from_brownfield_pct: $http_clients_from_brownfield_pct, "
2945
- "async_producers_from_brownfield_pct: $async_producers_from_brownfield_pct, "
2946
- "http_calls_match_breakdown: $http_calls_match_breakdown, "
2947
- "async_calls_match_breakdown: $async_calls_match_breakdown, "
2948
- "cross_service_calls_total: $cross_service_calls_total, "
2949
- "pass3_skipped_cross_service: $pass3_skipped_cross_service, "
2950
- "pass3_unresolved_phantom_receiver: $pass3_unresolved_phantom_receiver, "
2951
- "pass3_unresolved_chained: $pass3_unresolved_chained, "
2952
- "pass4_exposes_suppressed_feign: $pass4_exposes_suppressed_feign, "
2953
- "cross_service_resolution: $cross_service_resolution})",
3331
+ "MERGE (m:GraphMeta {key: $k}) "
3332
+ "SET m.ontology_version = $ov, m.built_at = $t, "
3333
+ "m.source_root = $sr, m.counts_json = $cj, m.parse_errors = $pe, "
3334
+ "m.routes_total = $routes_total, m.exposes_total = $exposes_total, "
3335
+ "m.routes_by_framework = $routes_by_framework, m.routes_resolved_pct = $routes_resolved_pct, "
3336
+ "m.routes_from_brownfield_pct = $routes_from_brownfield_pct, m.routes_by_layer = $routes_by_layer, "
3337
+ "m.clients_total = $clients_total, m.declares_client_total = $declares_client_total, "
3338
+ "m.clients_by_kind = $clients_by_kind, "
3339
+ "m.producers_total = $producers_total, m.declares_producer_total = $declares_producer_total, "
3340
+ "m.producers_by_kind = $producers_by_kind, "
3341
+ "m.http_calls_total = $http_calls_total, m.async_calls_total = $async_calls_total, "
3342
+ "m.http_calls_by_strategy = $http_calls_by_strategy, m.async_calls_by_strategy = $async_calls_by_strategy, "
3343
+ "m.http_calls_resolved_pct = $http_calls_resolved_pct, m.async_calls_resolved_pct = $async_calls_resolved_pct, "
3344
+ "m.http_clients_from_brownfield_pct = $http_clients_from_brownfield_pct, "
3345
+ "m.async_producers_from_brownfield_pct = $async_producers_from_brownfield_pct, "
3346
+ "m.http_calls_match_breakdown = $http_calls_match_breakdown, "
3347
+ "m.async_calls_match_breakdown = $async_calls_match_breakdown, "
3348
+ "m.cross_service_calls_total = $cross_service_calls_total, "
3349
+ "m.pass3_skipped_cross_service = $pass3_skipped_cross_service, "
3350
+ "m.pass3_unresolved_phantom_receiver = $pass3_unresolved_phantom_receiver, "
3351
+ "m.pass3_unresolved_chained = $pass3_unresolved_chained, "
3352
+ "m.pass4_exposes_suppressed_feign = $pass4_exposes_suppressed_feign, "
3353
+ "m.cross_service_resolution = $cross_service_resolution",
2954
3354
  {
2955
3355
  "k": "graph",
2956
3356
  "ov": ONTOLOGY_VERSION,
@@ -2990,6 +3390,359 @@ def _write_meta(conn: kuzu.Connection, tables: GraphTables, source_root: Path) -
2990
3390
  )
2991
3391
 
2992
3392
 
3393
def _apply_incremental_scope(
    conn: kuzu.Connection,
    source_root: Path,
    scope_files: set[str],
    removed: set[str],
    tracker: FileHashTracker,
    *,
    verbose: bool,
) -> None:
    """Delete, re-parse and rewrite ``scope_files``, then refresh global data.

    Runs steps 4-7 of the incremental rebuild on an open connection:
    scoped deletion, scoped passes 1-4, global passes 5-6, and the hash
    store / GraphMeta update. Any exception propagates to the caller.
    """
    # Step 4: Scoped deletion
    if verbose:
        _verbose_stderr_line("[increment] deleting outdated nodes and edges")
    _delete_file_scope(conn, scope_files)

    # Force deletion to be applied by running a dummy query
    conn.execute("MATCH (s:Symbol) RETURN count(*)")

    # Step 5: Scoped pass 1-4
    if verbose:
        _verbose_stderr_line("[increment] rebuilding scoped files (passes 1-4)")

    tables = GraphTables()
    asts = pass1_parse(source_root, tables, verbose=verbose, scope_files=scope_files)

    # Load existing types and members for cross-file resolution (only from unchanged files)
    _load_existing_types(conn, tables, exclude_files=scope_files)
    _load_existing_members(conn, tables, exclude_files=scope_files)

    pass2_edges(tables, asts, verbose=verbose)
    pass3_calls(tables, asts, verbose=verbose)
    pass4_routes(tables, asts, source_root=source_root, verbose=verbose)

    # Populate declares and overrides rows
    _populate_declares_rows(tables)
    _populate_overrides_rows(tables)

    # Write scoped nodes and edges
    meta_chain = collect_annotation_meta_chain(str(source_root.resolve()))
    _scoped_write(conn, tables, project_root=source_root, meta_chain=meta_chain)

    # Step 6: Global pass 5-6
    if verbose:
        _verbose_stderr_line("[increment] running global passes 5-6")

    # Rebuild full tables for global pass 5-6 (pass1 populates members from scratch)
    tables_for_global = GraphTables()
    global_asts = pass1_parse(source_root, tables_for_global, verbose=verbose)

    pass5_imperative_edges(tables_for_global, global_asts, source_root=source_root, verbose=verbose)

    # Delete existing Client, Producer, and their edges
    conn.execute("MATCH (c:Client) DETACH DELETE c")
    conn.execute("MATCH (p:Producer) DETACH DELETE p")

    pass6_match_edges(tables_for_global, verbose=verbose)

    # Write Client, Producer, and cross-service edges
    _write_clients_producers_and_calls(conn, tables_for_global)

    # Step 7: Update hash store and metadata
    if verbose:
        _verbose_stderr_line("[increment] updating hash store and metadata")

    # Update hashes for processed files, then forget the deleted ones
    tracker.update(scope_files, source_root)
    for filename in removed:
        tracker._hashes.pop(filename, None)
    tracker.save()

    # Update GraphMeta
    _write_meta(conn, tables_for_global, source_root)


def incremental_rebuild(
    source_root: Path,
    kuzu_path: Path,
    *,
    verbose: bool,
    expansion_cap: int = 50,
) -> IncrementalResult:
    """Incrementally rebuild the Kuzu graph, processing only changed files and their dependents.

    Args:
        source_root: Root directory of the Java sources.
        kuzu_path: Location of the Kuzu database; the hash store and the
            crash marker live in its parent directory.
        verbose: Emit progress lines through ``_verbose_stderr_line``.
        expansion_cap: Maximum number of files (changed + dependents) that
            may be reprocessed incrementally.

    Returns:
        IncrementalResult with statistics about the rebuild.

    Falls back to a full rebuild if:
    - No previous graph exists
    - Crash marker exists (previous incremental run did not finish)
    - Ontology version < 17 (missing source_file on edges), or it cannot be read
    - Dependent expansion exceeds expansion_cap
    - Any step of the incremental rebuild raises
    """
    t_start = time.time()
    min_ontology_version = 17  # first version with source_file on edges

    def _note(message: str) -> None:
        if verbose:
            _verbose_stderr_line(message)

    def _full(reason: str) -> IncrementalResult:
        _note(f"[increment] {reason}; falling back to full rebuild")
        return _fallback_to_full(source_root, kuzu_path, verbose, t_start)

    def _close_quietly(connection) -> None:
        # Best effort: the connection may already be closed or broken, and
        # a close failure must not mask the fallback that follows.
        try:
            connection.close()
        except Exception:
            pass

    # Step 1: Cheap filesystem checks that need no database handle
    if not kuzu_path.exists():
        return _full("no existing graph")

    index_dir = kuzu_path.parent
    crash_marker_path = index_dir / ".graph_increment_in_progress"
    # Checked before change detection: a marker means the previous run died
    # mid-write, so the graph cannot be trusted even when no file differs
    # from the hash store.
    if crash_marker_path.exists():
        crash_marker_path.unlink(missing_ok=True)
        return _full("crash marker exists")

    db = kuzu.Database(str(kuzu_path))
    conn = kuzu.Connection(db)

    # Check ontology version
    fallback_reason = None
    try:
        meta_result = conn.execute("MATCH (m:GraphMeta) RETURN m.ontology_version AS version")
        if meta_result.has_next():
            row = meta_result.get_next()
            version = row[0] if row else 0
            if version < min_ontology_version:
                fallback_reason = f"ontology version {version} < {min_ontology_version}"
    except Exception as e:
        fallback_reason = f"failed to read ontology version: {e}"
    if fallback_reason is not None:
        _close_quietly(conn)
        # Drop both handles before the full rebuild writes to the same path.
        del conn, db
        return _full(fallback_reason)

    # Step 2: Detect changes against the stored hashes
    tracker = FileHashTracker(index_dir)
    tracker.load()

    ignore = LayeredIgnore(source_root)
    added, changed, removed = tracker.detect_changes(source_root, ignore=ignore)
    changed_files = added | changed | removed

    if not changed_files:
        _note("[increment] no changes detected; no-op")
        conn.close()
        return IncrementalResult(
            mode="incremental",
            files_changed=0,
            files_added=0,
            files_removed=0,
            dependents_reprocessed=0,
            elapsed_sec=time.time() - t_start,
        )

    _note(f"[increment] detected {len(added)} added, {len(changed)} changed, {len(removed)} removed files")

    # Write crash marker; it is removed on every controlled exit below.
    crash_marker_path.write_text("", encoding="utf-8")

    try:
        # Step 3: Dependent expansion
        # Collect node IDs for changed files (single query instead of N+1)
        changed_node_ids: set[str] = set()
        result = conn.execute(
            "MATCH (s:Symbol) WHERE s.filename IN $filenames RETURN s.id",
            {"filenames": list(changed_files)},
        )
        while result.has_next():
            changed_node_ids.add(result.get_next()[0])

        dependent_files = _find_dependents(conn, changed_node_ids)
        scope_files = changed_files | dependent_files

        if len(scope_files) > expansion_cap:
            # Only record the reason here; the full rebuild itself runs
            # after the try block so its own failure is not retried.
            fallback_reason = (
                f"dependent expansion cap ({expansion_cap}) exceeded ({len(scope_files)} files)"
            )
        else:
            _note(f"[increment] processing {len(scope_files)} files ({len(changed_files)} changed + {len(dependent_files)} dependents)")

            # Steps 4-7
            _apply_incremental_scope(
                conn, source_root, scope_files, removed, tracker, verbose=verbose
            )

            # Remove crash marker
            crash_marker_path.unlink(missing_ok=True)
            conn.close()

            elapsed = time.time() - t_start
            _note(f"[increment] completed in {elapsed:.2f}s")

            return IncrementalResult(
                mode="incremental",
                files_changed=len(changed),
                files_added=len(added),
                files_removed=len(removed),
                dependents_reprocessed=len(dependent_files),
                elapsed_sec=elapsed,
            )
    except Exception as e:
        fallback_reason = f"error during incremental rebuild: {e}"

    # Shared fallback exit: release the database, clear the marker, rebuild.
    _close_quietly(conn)
    del conn, db
    crash_marker_path.unlink(missing_ok=True)
    return _full(fallback_reason)
3618
+
3619
+
3620
def _init_hash_tracker(source_root: Path, kuzu_path: Path) -> int:
    """Re-seed the hash store with every Java source file under ``source_root``.

    The store lives next to the database (``kuzu_path.parent``). It is
    rebuilt from an empty tracker rather than from the file on disk, so the
    saved store lists only files that exist now.

    Returns:
        Number of files hashed.
    """
    tracker = FileHashTracker(kuzu_path.parent)
    # Deliberately no tracker.load(): entries loaded from a previous run
    # would keep files that have since been deleted, and the next
    # incremental run would then report them as removed.
    # NOTE(review): assumes update() only writes entries for the paths it
    # is given -- confirm against FileHashTracker.update.
    ignore = LayeredIgnore(source_root)
    root_resolved = source_root.resolve()

    all_files: set[str] = set()
    for path in iter_java_source_files(source_root, ignore=ignore):
        try:
            rel_path = path.resolve().relative_to(root_resolved).as_posix()
        except ValueError:
            # Not under the resolved root; fall back to the path as yielded.
            rel_path = path.as_posix()
        all_files.add(rel_path)

    tracker.update(all_files, source_root)
    tracker.save()
    return len(all_files)
3638
+
3639
+
3640
def _fallback_to_full(source_root: Path, kuzu_path: Path, verbose: bool, t_start: float) -> IncrementalResult:
    """Run all six passes from scratch, rewrite the graph and re-seed the hash store.

    ``t_start`` is the ``time.time()`` stamp taken by the caller; the
    reported ``elapsed_sec`` is measured from it. Every hashed file is
    reported as added.
    """
    full_tables = GraphTables()
    parsed = pass1_parse(source_root, full_tables, verbose=verbose)

    pass2_edges(full_tables, parsed, verbose=verbose)
    pass3_calls(full_tables, parsed, verbose=verbose)
    # Passes 4 and 5 share a signature; run them in order.
    for source_aware_pass in (pass4_routes, pass5_imperative_edges):
        source_aware_pass(full_tables, parsed, source_root=source_root, verbose=verbose)
    pass6_match_edges(full_tables, verbose=verbose)

    write_kuzu(kuzu_path, full_tables, source_root=source_root, verbose=verbose)

    hashed_count = _init_hash_tracker(source_root, kuzu_path)
    duration = time.time() - t_start

    return IncrementalResult(
        mode="full_fallback",
        files_changed=0,
        files_added=hashed_count,
        files_removed=0,
        dependents_reprocessed=0,
        elapsed_sec=duration,
    )
3661
+
3662
+
3663
def _write_clients_producers_and_calls(conn: kuzu.Connection, tables: GraphTables) -> None:
    """Write Route, Client, Producer, and cross-service edges to Kuzu.

    Used by the incremental rebuild's global pass 5-6 step. Routes are
    written with MERGE so that rows already written during the scoped step
    are updated in place, while phantom Route nodes (created by pass5 for
    cross-service calls) are created if missing.

    Each edge carries a ``source_file``: the declaring member's file for
    DECLARES_* edges, the client's / producer's file for call edges, and
    ``""`` when the owning row is not present in ``tables``.
    """
    merge_route = (
        "MERGE (r:Route {id: $id}) "
        "SET r.kind = $kind, r.framework = $framework, r.method = $method, "
        "r.path = $path, r.path_template = $path_template, r.path_regex = $path_regex, "
        "r.topic = $topic, r.broker = $broker, r.feign_name = $feign_name, r.feign_url = $feign_url, "
        "r.microservice = $microservice, r.module = $module, r.filename = $filename, "
        "r.start_line = $start_line, r.end_line = $end_line, r.resolved = $resolved"
    )
    for row in tables.routes_rows:
        conn.execute(merge_route, asdict(row))

    # Write clients and producers using asdict (same pattern as _write_routes_and_exposes)
    for row in tables.client_rows:
        conn.execute(_CREATE_CLIENT, asdict(row))
    for row in tables.producer_rows:
        conn.execute(_CREATE_PRODUCER, asdict(row))

    # Flat id -> file maps, built once. Only members are indexed, so an
    # edge whose symbol_id is not a member gets an empty source_file.
    file_by_member_id = {m.node_id: m.file_path for m in tables.members}
    file_by_client_id = {c.id: c.filename for c in tables.client_rows}
    file_by_producer_id = {p.id: p.filename for p in tables.producer_rows}

    # Write declares_client edges
    for row in tables.declares_client_rows:
        conn.execute(_CREATE_DECLARES_CLIENT, {
            "sid": row.symbol_id,
            "cid": row.client_id,
            "source_file": file_by_member_id.get(row.symbol_id, ""),
            "confidence": row.confidence,
            "strategy": row.strategy,
        })

    # Write declares_producer edges
    for row in tables.declares_producer_rows:
        conn.execute(_CREATE_DECLARES_PRODUCER, {
            "sid": row.symbol_id,
            "pid": row.producer_id,
            "source_file": file_by_member_id.get(row.symbol_id, ""),
            "confidence": row.confidence,
            "strategy": row.strategy,
        })

    # Write HTTP_CALLS edges
    for row in tables.http_call_rows:
        conn.execute(_CREATE_HTTP_CALL, {
            "cid": row.client_id,
            "rid": row.route_id,
            "source_file": file_by_client_id.get(row.client_id, ""),
            "confidence": row.confidence,
            "strategy": row.strategy,
            "method_call": row.method_call,
            "raw_uri": row.raw_uri,
            "match": row.match,
        })

    # Write ASYNC_CALLS edges
    for row in tables.async_call_rows:
        conn.execute(_CREATE_ASYNC_CALL, {
            "pid": row.producer_id,
            "rid": row.route_id,
            "source_file": file_by_producer_id.get(row.producer_id, ""),
            "confidence": row.confidence,
            "strategy": row.strategy,
            "direction": row.direction,
            "raw_topic": row.raw_topic,
            "match": row.match,
        })
3744
+
3745
+
2993
3746
  def write_kuzu(
2994
3747
  db_path: Path,
2995
3748
  tables: GraphTables,
@@ -3022,11 +3775,12 @@ def write_kuzu(
3022
3775
  _populate_declares_rows(tables)
3023
3776
  _populate_overrides_rows(tables)
3024
3777
  t1 = time.time()
3025
- _write_edges(conn, tables)
3778
+ _fbyid = _build_file_by_node_id(tables)
3779
+ _write_edges(conn, tables, _fbyid)
3026
3780
  if verbose:
3027
3781
  _verbose_stderr_line(f"[graph] writing · edges written in {time.time() - t1:.2f}s")
3028
3782
  t2 = time.time()
3029
- _write_routes_and_exposes(conn, tables)
3783
+ _write_routes_and_exposes(conn, tables, _fbyid)
3030
3784
  if verbose:
3031
3785
  _verbose_stderr_line(f"[graph] writing · routes/exposes written in {time.time() - t2:.2f}s")
3032
3786
  _write_meta(conn, tables, source_root)
@@ -3055,6 +3809,7 @@ def main() -> int:
3055
3809
  ),
3056
3810
  )
3057
3811
  parser.add_argument("--verbose", action="store_true")
3812
+ parser.add_argument("--incremental", action="store_true", help="Run incremental rebuild instead of full rebuild")
3058
3813
  args = parser.parse_args()
3059
3814
 
3060
3815
  root = Path(args.source_root).expanduser().resolve() if args.source_root else Path.cwd().resolve()
@@ -3064,6 +3819,20 @@ def main() -> int:
3064
3819
 
3065
3820
  kuzu_path = Path(args.kuzu_path).expanduser() if args.kuzu_path else _default_kuzu_path()
3066
3821
 
3822
+ if args.incremental:
3823
+ result = incremental_rebuild(root, kuzu_path, verbose=args.verbose)
3824
+ print(json.dumps({
3825
+ "mode": result.mode,
3826
+ "files_changed": result.files_changed,
3827
+ "files_added": result.files_added,
3828
+ "files_removed": result.files_removed,
3829
+ "dependents_reprocessed": result.dependents_reprocessed,
3830
+ "elapsed_sec": result.elapsed_sec,
3831
+ }))
3832
+ if args.verbose:
3833
+ _verbose_stderr_line(f"[graph] done · mode={result.mode} files_changed={result.files_changed} files_added={result.files_added} files_removed={result.files_removed} dependents={result.dependents_reprocessed} elapsed={result.elapsed_sec:.2f}s")
3834
+ return 0
3835
+
3067
3836
  tables = GraphTables()
3068
3837
  asts = pass1_parse(root, tables, verbose=args.verbose)
3069
3838
  pass2_edges(tables, asts, verbose=args.verbose)