codegraph-gen 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,747 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import networkx as nx
4
+ from codegraph_gen.parser.base import ExtractionResult
5
+
6
logger = logging.getLogger(__name__)

# Per-language names of builtin / standard-library callables. A call whose
# leading symbol appears here is not resolved, which keeps these ubiquitous
# names from polluting the call graph.
BUILTIN_FUNCTIONS: dict[str, set[str]] = {
    "python": {
        "print", "len", "range", "str", "int", "dict", "list", "set",
        "tuple", "open", "sum", "min", "max", "abs", "enumerate", "zip",
        "any", "all", "map", "filter", "super", "repr", "type",
        "isinstance", "issubclass", "dir", "id", "hash", "input",
    },
    "go": {
        "print", "println", "panic", "recover", "make", "new", "len", "cap",
        "append", "copy", "delete", "complex", "real", "imag", "close",
    },
    "javascript": {
        "console", "require", "module", "exports", "process", "window",
        "document", "eval", "parseInt", "parseFloat", "isNaN", "isFinite",
        "decodeURI", "encodeURI", "Object", "Array", "String", "Number",
        "Boolean", "Date", "RegExp", "Error", "Map", "Set", "Promise",
        "JSON", "Math", "setTimeout", "clearTimeout", "setInterval",
        "clearInterval",
    },
    # Same names as "javascript"; filled in right after the literal.
    "typescript": set(),
    "rust": {
        "println!", "print!", "format!", "panic!", "vec!", "assert!",
        "assert_eq!", "Option", "Result", "Some", "None", "Ok", "Err",
        "Default",
    },
    "swift": {
        "print", "min", "max", "abs", "count", "fatalError", "precondition",
        "assert",
    },
    "kotlin": {
        "print", "println", "listOf", "mapOf", "setOf", "mutableListOf",
        "mutableMapOf", "mutableSetOf", "arrayOf", "emptyList", "emptyMap",
        "emptySet", "run", "let", "also", "apply", "takeIf", "takeUnless",
        "repeat", "require", "check", "error",
    },
    "c": {
        "printf", "scanf", "malloc", "free", "calloc", "realloc", "memcpy",
        "memset", "strcpy", "strlen", "strcmp", "strcat", "exit", "fopen",
        "fclose", "fprintf", "sprintf", "sizeof",
    },
    # Only the C++-specific additions are listed here; the C names are
    # merged in right after the literal.
    "cpp": {
        "std", "cout", "cin", "endl", "vector", "string", "map", "set",
        "list", "shared_ptr", "unique_ptr", "make_shared", "make_unique",
        "move",
    },
}
# In-place unions keep every language's set a distinct object.
BUILTIN_FUNCTIONS["typescript"] |= BUILTIN_FUNCTIONS["javascript"]
BUILTIN_FUNCTIONS["cpp"] |= BUILTIN_FUNCTIONS["c"]
229
+
230
+
231
# Method names so common on builtin / standard-library types that a dotted
# call ending in one of them is never resolved through the global
# label-based fallback.
COMMON_BUILTIN_METHODS: set[str] = {
    "append", "decode", "encode", "insert", "remove", "contains", "push",
    "pop", "split", "join", "map", "filter", "reduce", "forEach", "sorted",
    "count", "length", "size", "isEmpty",
    "resume", "cancel", "suspend", "start", "stop", "send", "receive",
    # Added common programming language method/constructor names
    "len", "new", "is_empty", "clone", "default", "parse", "format",
    "read", "write", "close", "flush",
    "to_string", "to_str", "as_str", "as_ref", "as_mut", "unwrap", "expect",
    "iter", "iter_mut", "into_iter", "next", "into", "from", "ok", "err",
    "clear", "get", "set", "add", "keys", "values", "items", "update",
    "copy", "find", "index", "last", "first",
}
300
+
301
+
302
class FileSymbolScope:
    """Symbol table for a single source file.

    Stores the file's path and language together with three containers that
    start out empty: locally declared symbols, explicitly imported symbols
    and wildcard-imported files.
    """

    def __init__(self, file_path: str, language: str):
        self.file_path, self.language = file_path, language
        # local symbol name -> fully qualified node ID,
        # e.g. {"MyClass": "foo.py::MyClass"}
        self.declared_symbols: dict[str, str] = {}
        # import alias or local name -> (target_file_id, original_name)
        self.imported_symbols: dict[str, tuple[str, str]] = {}
        # target files pulled in wholesale (e.g. ``from X import *``)
        self.wildcard_imports: list[str] = []
312
+
313
+
314
def build_graph(extractions: list[ExtractionResult], workspace_dir: Path) -> nx.DiGraph:
    """Assemble extraction results into one directed graph with resolved edges.

    Every extracted node becomes a graph node keyed by ``node.id`` and carrying
    ``node.model_dump()`` as attributes. The raw ``contains``, ``imports``,
    ``inherits``/``implements`` and ``calls`` edges are then resolved in two
    passes: pass 1 builds a symbol scope per file node (declared symbols,
    named imports, wildcard imports); pass 2 resolves each edge target and
    adds the edge only when both endpoints are known nodes.

    Args:
        extractions: Extraction results providing ``nodes`` and ``edges``.
        workspace_dir: Workspace root. Relative import targets are resolved
            against it and compared with file node IDs as workspace-relative
            paths.

    Returns:
        The assembled graph. Every edge carries a ``relation`` attribute;
        ``imports`` edges additionally keep the unresolved target string as
        ``raw_target``.
    """
    c_family_suffixes = (".h", ".hpp", ".hxx", ".c", ".cpp", ".cc", ".cxx")
    module_suffixes = (".py", ".ts", ".js", ".go", ".rs", ".swift")
    type_kinds = ("class", "struct", "interface", "enum")
    suffix_languages = {
        ".py": "python",
        ".js": "javascript",
        ".mjs": "javascript",
        ".cjs": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
        ".kt": "kotlin",
        ".kts": "kotlin",
        ".go": "go",
        ".rs": "rust",
        ".swift": "swift",
        ".c": "c",
        ".h": "c",
        ".cpp": "cpp",
        ".cc": "cpp",
        ".cxx": "cpp",
        ".hpp": "cpp",
        ".hxx": "cpp",
    }
    # Leading symbols that are never resolved through the global fallback.
    external_roots = frozenset(
        {
            "os", "sys", "json", "time", "math", "re", "pathlib", "logging",
            "subprocess", "shutil", "hashlib", "urllib", "socket",
            "threading", "multiprocessing", "typing", "collections",
            "itertools", "functools", "logger", "log", "console", "pytest",
            "unittest", "fmt", "sync", "context", "strings", "bytes",
            "errors", "net", "http", "process", "document", "window",
            "global", "fs", "path", "std", "core", "env", "Logger",
        }
    )
    logging_names = frozenset({"logger", "log", "logging", "console"})

    # Resolve the root once: import targets are ``.resolve()``d before being
    # made workspace-relative, so comparing against an unresolved (relative
    # or symlinked) root would always fail.
    workspace_root = Path(workspace_dir).resolve()

    G = nx.DiGraph()

    # 1. Add all nodes to the graph
    for ext in extractions:
        for node in ext.nodes:
            G.add_node(node.id, **node.model_dump())

    node_ids = set(G.nodes)

    # Insertion-ordered lookup tables. Scanning these instead of the
    # ``node_ids`` set keeps "first match wins" searches deterministic and
    # avoids a full node scan for every edge.
    file_ids: list[str] = []
    ids_by_label: dict[str, list[str]] = {}
    ids_by_source_file: dict[str, list[str]] = {}
    symbol_ids_by_dir: dict[Path, list[str]] = {}
    for nid, data in G.nodes(data=True):
        is_file = data.get("type") == "file"
        if is_file:
            file_ids.append(nid)
        label = data.get("label")
        if label is not None:
            ids_by_label.setdefault(label, []).append(nid)
        node_file = data.get("source_file")
        if node_file:
            ids_by_source_file.setdefault(node_file, []).append(nid)
            if not is_file:
                symbol_ids_by_dir.setdefault(Path(node_file).parent, []).append(nid)

    def workspace_relative(source_file: str, target: str) -> str | None:
        """Resolve ``target`` next to ``source_file``; None if outside the workspace."""
        source_dir = (workspace_root / source_file).parent
        try:
            return str((source_dir / target).resolve().relative_to(workspace_root))
        except (ValueError, OSError, RuntimeError):
            # ValueError: path escapes the workspace (or is malformed);
            # OSError / RuntimeError: filesystem trouble while resolving.
            return None

    # Helper: resolve local file path from Go/Python/C/C++ import targets
    def resolve_import_to_file_node(source_file: str, target: str) -> str | None:
        if not target:
            # An empty target would otherwise match the first file via endswith("").
            return None

        # A target is path-like when it starts with '.', contains a path
        # separator, or (in C/C++ sources) carries a C/C++ file extension.
        is_path_target = target.startswith(".") or "/" in target or "\\" in target
        if not is_path_target and file_languages.get(source_file) in ("c", "cpp"):
            is_path_target = target.endswith(c_family_suffixes)

        if is_path_target:
            rel_path = workspace_relative(source_file, target)
            if rel_path is not None:
                if rel_path in node_ids:
                    return rel_path
                for suffix in c_family_suffixes:
                    if rel_path + suffix in node_ids:
                        return rel_path + suffix
                # Extension-less module paths (e.g. "./utils"). These probes
                # used to live in a later branch that was never reached.
                for suffix in module_suffixes:
                    if rel_path + suffix in node_ids:
                        return rel_path + suffix
                    package_init = str(Path(rel_path) / f"__init__{suffix}")
                    if package_init in node_ids:
                        return package_init

            # Global fallback search for this filename in the workspace (for C/C++ includes)
            target_name = Path(target).name
            for fid in file_ids:
                if Path(fid).name == target_name:
                    return fid
            return None

        # Dotted module name: match it as a path suffix of any file node.
        target_path_part = target.replace(".", "/")
        endings = (
            target_path_part,
            target_path_part + ".py",
            target_path_part + "/__init__.py",
            target_path_part + ".go",
            target_path_part + ".rs",
        )
        for fid in file_ids:
            if fid.replace("\\", "/").endswith(endings):
                return fid
        return None

    # Pass 1: Build Symbol Scopes
    scopes: dict[str, FileSymbolScope] = {}
    file_languages: dict[str, str] = {}

    for fid in file_ids:
        # Unknown extensions are treated as Python.
        lang = suffix_languages.get(Path(fid).suffix.lower(), "python")
        file_languages[fid] = lang
        scopes[fid] = FileSymbolScope(fid, lang)

    # Populate declared symbols for each scope
    for nid, data in G.nodes(data=True):
        sf = data.get("source_file")
        label = data.get("label")
        if sf and data.get("type") != "file" and label and sf in scopes:
            scopes[sf].declared_symbols[label] = nid

    # Populate imported symbols for each scope
    for ext in extractions:
        file_node = next((n for n in ext.nodes if n.type == "file"), None)
        if not file_node:
            continue
        file_id = file_node.id
        if file_id not in scopes:
            continue
        scope = scopes[file_id]

        for edge in ext.edges:
            if edge.relation != "imports":
                continue
            target_file_id = resolve_import_to_file_node(file_id, edge.target)
            if not target_file_id:
                continue

            # In C/C++, including a header imports all its symbols as wildcard imports
            if scope.language in ("c", "cpp"):
                scope.wildcard_imports.append(target_file_id)

            if edge.import_map:
                for local_name, original_name in edge.import_map.items():
                    if original_name == "*":
                        scope.wildcard_imports.append(target_file_id)
                    else:
                        scope.imported_symbols[local_name] = (
                            target_file_id,
                            original_name,
                        )
            else:
                # Direct import of a module name (e.g. import module_b)
                stem = Path(target_file_id).stem
                scope.imported_symbols[stem] = (target_file_id, stem)

    # Resolve symbol helper using the scope chain
    def resolve_symbol(caller_id: str, callee_name: str) -> str | None:
        caller_data = G.nodes.get(caller_id)
        if not caller_data:
            return None
        source_file = caller_data["source_file"]
        lang = file_languages.get(source_file, "python")

        callee_clean = callee_name.replace("::", ".")
        parts = [p.strip() for p in callee_clean.split(".") if p.strip()]
        if not parts:
            return None

        main_symbol = parts[0]
        member_name = parts[-1]
        # Built from the cleaned parts so that leading separators
        # ("::a::b"), doubled separators and stray whitespace do not leak
        # into the tail.
        rest_of_callee = ".".join(parts[1:])

        def member_or_self(base_id: str) -> str:
            """Return ``base_id``'s member named by the callee tail, else ``base_id``."""
            if rest_of_callee:
                sub_target = f"{base_id}.{rest_of_callee}"
                if sub_target in node_ids:
                    return sub_target
            return base_id

        # 1. Builtins / Stdlib Check
        if main_symbol in BUILTIN_FUNCTIONS.get(lang, set()):
            return None

        scope = scopes.get(source_file)
        if not scope:
            return None

        # Local Scope Type Binding resolution: the receiver is a local whose
        # type name was recorded on the caller node.
        local_bindings = caller_data.get("local_bindings") or {}
        if len(parts) > 1 and main_symbol in local_bindings:
            receiver_type = local_bindings[main_symbol]
            typed_candidates = [
                nid
                for nid in ids_by_label.get(receiver_type, ())
                if G.nodes[nid].get("type") in type_kinds
            ]
            resolved_class_id = None

            same_file_candidate = f"{source_file}::{receiver_type}"
            if same_file_candidate in node_ids:
                # Declared in the same file
                resolved_class_id = same_file_candidate
            elif receiver_type in scope.imported_symbols:
                # Explicitly imported
                target_file_id, original_name = scope.imported_symbols[receiver_type]
                resolved_class_id = f"{target_file_id}::{original_name}"
            elif lang in ("go", "swift"):
                # Package siblings: a type declared in the caller's directory
                caller_dir = Path(source_file).parent
                for nid in typed_candidates:
                    node_file = G.nodes[nid].get("source_file", "")
                    if node_file and Path(node_file).parent == caller_dir:
                        resolved_class_id = nid
                        break

            # Global fallback for class/struct name if not found in current module/scope
            if not resolved_class_id and typed_candidates:
                resolved_class_id = typed_candidates[0]

            if not resolved_class_id:
                # Known type but not defined in the workspace -> external/standard library type.
                # Bypassing global fallback to prevent incorrect resolution of its methods.
                return None

            for tail in (rest_of_callee, member_name):
                target_method_id = f"{resolved_class_id}.{tail}"
                if target_method_id in node_ids:
                    return target_method_id

            # Cross-file / implementation-to-header fallback for C++ and Python binding
            # boundaries: any method/function with the right label whose ID places it
            # under a parent named like the receiver type.
            for nid in ids_by_label.get(member_name, ()):
                if G.nodes[nid].get("type") not in ("method", "function"):
                    continue
                parent_part = nid.rsplit(".", 1)[0] if "." in nid else ""
                parent_name = (
                    parent_part.rsplit("::", 1)[-1] if "::" in parent_part else parent_part
                )
                if parent_name == receiver_type or parent_name.endswith(
                    f".{receiver_type}"
                ):
                    return nid
            # Nothing matched on the bound type: continue with the lexical checks.

        # 2. Local lexical scope check
        if "." in caller_id:
            parent_class_id = caller_id.rsplit(".", 1)[0]

            # self / this / cls references
            if main_symbol in ("self", "this", "cls") and rest_of_callee:
                for tail in (rest_of_callee, member_name):
                    target_candidate = f"{parent_class_id}.{tail}"
                    if target_candidate in node_ids:
                        return target_candidate

            # Inside current class context
            target_candidate = f"{parent_class_id}.{main_symbol}"
            if target_candidate in node_ids:
                return member_or_self(target_candidate)

        # File-level scope check
        file_candidate = f"{source_file}::{main_symbol}"
        if file_candidate in node_ids:
            return member_or_self(file_candidate)

        # 3. Package scope check (for Go, Swift sibling files)
        if lang in ("go", "swift"):
            for nid in symbol_ids_by_dir.get(Path(source_file).parent, ()):
                if nid.endswith(f"::{main_symbol}"):
                    return member_or_self(nid)

        # 4. Explicit imports and aliases check
        if main_symbol in scope.imported_symbols:
            target_file_id, original_name = scope.imported_symbols[main_symbol]
            if original_name == "*" or original_name == Path(target_file_id).stem:
                # The imported name is the module itself.
                if rest_of_callee:
                    target_candidate = f"{target_file_id}::{rest_of_callee}"
                    if target_candidate in node_ids:
                        return target_candidate
                    for nid in ids_by_source_file.get(target_file_id, ()):
                        if nid.endswith(f".{member_name}"):
                            return nid
                else:
                    target_candidate = f"{target_file_id}::{main_symbol}"
                    if target_candidate in node_ids:
                        return target_candidate
                # NOTE(review): the reviewed source had lost its indentation, so it is
                # unclear whether this fallback applied to both cases above or only to
                # the bare-module case -- confirm against the original layout.
                return target_file_id

            target_candidate = f"{target_file_id}::{original_name}"
            if target_candidate in node_ids:
                return member_or_self(target_candidate)
            # Unknown ID: the caller drops it, which ends resolution for this edge.
            return target_candidate

        # 5. Wildcard imports check
        for target_file_id in scope.wildcard_imports:
            target_candidate = f"{target_file_id}::{main_symbol}"
            if target_candidate in node_ids:
                return member_or_self(target_candidate)

        # 6. Global fallback check
        if main_symbol in external_roots or any(p in logging_names for p in parts):
            return None

        search_label = member_name if len(parts) > 1 else main_symbol
        if len(parts) > 1 and search_label in COMMON_BUILTIN_METHODS:
            return None

        candidates = [
            nid
            for nid in ids_by_label.get(search_label, ())
            if G.nodes[nid].get("type") != "file"
        ]
        if len(candidates) == 1:
            return candidates[0]
        if len(candidates) > 1:
            # Ambiguous label: accept only a unique candidate from the caller's directory.
            caller_parent_dir = Path(source_file).parent
            near_candidates = []
            for cand in candidates:
                cand_file = G.nodes[cand].get("source_file")
                if cand_file and Path(cand_file).parent == caller_parent_dir:
                    near_candidates.append(cand)
            if len(near_candidates) == 1:
                return near_candidates[0]

        return None

    # Pass 2: Process and resolve edges
    for ext in extractions:
        for edge in ext.edges:
            src = edge.source
            tgt = edge.target
            rel = edge.relation

            if src == tgt or src not in node_ids:
                continue

            resolved_tgt = None
            if rel == "contains":
                if tgt in node_ids:
                    resolved_tgt = tgt
            elif rel == "imports":
                resolved_tgt = resolve_import_to_file_node(
                    G.nodes[src]["source_file"], tgt
                )
            elif rel in ("inherits", "implements", "calls"):
                resolved_tgt = resolve_symbol(src, tgt)

            if resolved_tgt and resolved_tgt in node_ids:
                if rel == "imports":
                    G.add_edge(src, resolved_tgt, relation=rel, raw_target=tgt)
                else:
                    G.add_edge(src, resolved_tgt, relation=rel)

    return G