@vyuhlabs/dxkit 2.9.4 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. package/CHANGELOG.md +236 -0
  2. package/dist/allowlist/annotate.d.ts +71 -0
  3. package/dist/allowlist/annotate.d.ts.map +1 -0
  4. package/dist/allowlist/annotate.js +105 -0
  5. package/dist/allowlist/annotate.js.map +1 -0
  6. package/dist/allowlist/cli.d.ts +29 -23
  7. package/dist/allowlist/cli.d.ts.map +1 -1
  8. package/dist/allowlist/cli.js +141 -70
  9. package/dist/allowlist/cli.js.map +1 -1
  10. package/dist/allowlist/file.d.ts +7 -1
  11. package/dist/allowlist/file.d.ts.map +1 -1
  12. package/dist/allowlist/file.js +7 -1
  13. package/dist/allowlist/file.js.map +1 -1
  14. package/dist/analysis-result.d.ts +10 -0
  15. package/dist/analysis-result.d.ts.map +1 -1
  16. package/dist/analyzers/cache.d.ts +1 -0
  17. package/dist/analyzers/cache.d.ts.map +1 -1
  18. package/dist/analyzers/cache.js +69 -0
  19. package/dist/analyzers/cache.js.map +1 -1
  20. package/dist/analyzers/dashboard/index.d.ts.map +1 -1
  21. package/dist/analyzers/dashboard/index.js +6 -1
  22. package/dist/analyzers/dashboard/index.js.map +1 -1
  23. package/dist/analyzers/health.d.ts.map +1 -1
  24. package/dist/analyzers/health.js +17 -2
  25. package/dist/analyzers/health.js.map +1 -1
  26. package/dist/analyzers/security/actions.d.ts.map +1 -1
  27. package/dist/analyzers/security/actions.js +13 -0
  28. package/dist/analyzers/security/actions.js.map +1 -1
  29. package/dist/analyzers/security/aggregator.d.ts +97 -79
  30. package/dist/analyzers/security/aggregator.d.ts.map +1 -1
  31. package/dist/analyzers/security/aggregator.js +168 -56
  32. package/dist/analyzers/security/aggregator.js.map +1 -1
  33. package/dist/analyzers/security/gather.d.ts +2 -0
  34. package/dist/analyzers/security/gather.d.ts.map +1 -1
  35. package/dist/analyzers/security/gather.js +36 -4
  36. package/dist/analyzers/security/gather.js.map +1 -1
  37. package/dist/analyzers/security/index.d.ts.map +1 -1
  38. package/dist/analyzers/security/index.js +81 -2
  39. package/dist/analyzers/security/index.js.map +1 -1
  40. package/dist/analyzers/security/scanner-drift.d.ts +21 -0
  41. package/dist/analyzers/security/scanner-drift.d.ts.map +1 -0
  42. package/dist/analyzers/security/scanner-drift.js +113 -0
  43. package/dist/analyzers/security/scanner-drift.js.map +1 -0
  44. package/dist/analyzers/security/shallow.d.ts.map +1 -1
  45. package/dist/analyzers/security/shallow.js +24 -2
  46. package/dist/analyzers/security/shallow.js.map +1 -1
  47. package/dist/analyzers/security/types.d.ts +64 -4
  48. package/dist/analyzers/security/types.d.ts.map +1 -1
  49. package/dist/analyzers/tools/fingerprint.d.ts +133 -20
  50. package/dist/analyzers/tools/fingerprint.d.ts.map +1 -1
  51. package/dist/analyzers/tools/fingerprint.js +194 -20
  52. package/dist/analyzers/tools/fingerprint.js.map +1 -1
  53. package/dist/analyzers/tools/gitleaks.d.ts +2 -2
  54. package/dist/analyzers/tools/gitleaks.d.ts.map +1 -1
  55. package/dist/analyzers/tools/gitleaks.js +7 -1
  56. package/dist/analyzers/tools/gitleaks.js.map +1 -1
  57. package/dist/analyzers/tools/graphify.d.ts +11 -0
  58. package/dist/analyzers/tools/graphify.d.ts.map +1 -1
  59. package/dist/analyzers/tools/graphify.js +457 -413
  60. package/dist/analyzers/tools/graphify.js.map +1 -1
  61. package/dist/analyzers/tools/grep-secrets.d.ts.map +1 -1
  62. package/dist/analyzers/tools/grep-secrets.js +31 -12
  63. package/dist/analyzers/tools/grep-secrets.js.map +1 -1
  64. package/dist/analyzers/tools/osv-scanner-fix.d.ts.map +1 -1
  65. package/dist/analyzers/tools/osv-scanner-fix.js +12 -1
  66. package/dist/analyzers/tools/osv-scanner-fix.js.map +1 -1
  67. package/dist/analyzers/tools/salt.d.ts +68 -0
  68. package/dist/analyzers/tools/salt.d.ts.map +1 -0
  69. package/dist/{baseline → analyzers/tools}/salt.js +59 -18
  70. package/dist/analyzers/tools/salt.js.map +1 -0
  71. package/dist/analyzers/tools/semgrep.d.ts +7 -7
  72. package/dist/analyzers/tools/semgrep.d.ts.map +1 -1
  73. package/dist/analyzers/tools/semgrep.js +14 -7
  74. package/dist/analyzers/tools/semgrep.js.map +1 -1
  75. package/dist/analyzers/tools/tool-registry.d.ts.map +1 -1
  76. package/dist/analyzers/tools/tool-registry.js +78 -43
  77. package/dist/analyzers/tools/tool-registry.js.map +1 -1
  78. package/dist/analyzers/tools/walk-source-files.d.ts +10 -0
  79. package/dist/analyzers/tools/walk-source-files.d.ts.map +1 -1
  80. package/dist/analyzers/tools/walk-source-files.js +14 -0
  81. package/dist/analyzers/tools/walk-source-files.js.map +1 -1
  82. package/dist/analyzers/types.d.ts +9 -0
  83. package/dist/analyzers/types.d.ts.map +1 -1
  84. package/dist/baseline/baseline-file.d.ts +9 -2
  85. package/dist/baseline/baseline-file.d.ts.map +1 -1
  86. package/dist/baseline/baseline-file.js.map +1 -1
  87. package/dist/baseline/check-renderers.d.ts.map +1 -1
  88. package/dist/baseline/check-renderers.js +14 -0
  89. package/dist/baseline/check-renderers.js.map +1 -1
  90. package/dist/baseline/check.d.ts +33 -0
  91. package/dist/baseline/check.d.ts.map +1 -1
  92. package/dist/baseline/check.js +78 -2
  93. package/dist/baseline/check.js.map +1 -1
  94. package/dist/baseline/create.d.ts +1 -1
  95. package/dist/baseline/create.d.ts.map +1 -1
  96. package/dist/baseline/create.js +3 -1
  97. package/dist/baseline/create.js.map +1 -1
  98. package/dist/baseline/entry-to-located.d.ts +12 -5
  99. package/dist/baseline/entry-to-located.d.ts.map +1 -1
  100. package/dist/baseline/entry-to-located.js +21 -7
  101. package/dist/baseline/entry-to-located.js.map +1 -1
  102. package/dist/baseline/finding-identity.d.ts +20 -13
  103. package/dist/baseline/finding-identity.d.ts.map +1 -1
  104. package/dist/baseline/finding-identity.js +51 -20
  105. package/dist/baseline/finding-identity.js.map +1 -1
  106. package/dist/baseline/git-aware-match.d.ts +7 -5
  107. package/dist/baseline/git-aware-match.d.ts.map +1 -1
  108. package/dist/baseline/git-aware-match.js +78 -5
  109. package/dist/baseline/git-aware-match.js.map +1 -1
  110. package/dist/baseline/migrate.d.ts +94 -0
  111. package/dist/baseline/migrate.d.ts.map +1 -0
  112. package/dist/baseline/migrate.js +238 -0
  113. package/dist/baseline/migrate.js.map +1 -0
  114. package/dist/baseline/producers/security.d.ts +9 -9
  115. package/dist/baseline/producers/security.d.ts.map +1 -1
  116. package/dist/baseline/producers/security.js +16 -4
  117. package/dist/baseline/producers/security.js.map +1 -1
  118. package/dist/baseline/types.d.ts +145 -95
  119. package/dist/baseline/types.d.ts.map +1 -1
  120. package/dist/baseline/types.js +30 -26
  121. package/dist/baseline/types.js.map +1 -1
  122. package/dist/explore/context-hook.d.ts +49 -29
  123. package/dist/explore/context-hook.d.ts.map +1 -1
  124. package/dist/explore/context-hook.js +304 -29
  125. package/dist/explore/context-hook.js.map +1 -1
  126. package/dist/explore/finding-context.d.ts +17 -0
  127. package/dist/explore/finding-context.d.ts.map +1 -1
  128. package/dist/explore/finding-context.js +34 -0
  129. package/dist/explore/finding-context.js.map +1 -1
  130. package/dist/explore/queries.d.ts +32 -15
  131. package/dist/explore/queries.d.ts.map +1 -1
  132. package/dist/explore/queries.js +36 -6
  133. package/dist/explore/queries.js.map +1 -1
  134. package/dist/generator.d.ts.map +1 -1
  135. package/dist/generator.js +13 -7
  136. package/dist/generator.js.map +1 -1
  137. package/dist/ingest/normalize.d.ts +1 -1
  138. package/dist/ingest/normalize.d.ts.map +1 -1
  139. package/dist/ingest/normalize.js +5 -1
  140. package/dist/ingest/normalize.js.map +1 -1
  141. package/dist/ingest/sarif.d.ts.map +1 -1
  142. package/dist/ingest/sarif.js +16 -7
  143. package/dist/ingest/sarif.js.map +1 -1
  144. package/dist/ingest/snyk-policy.d.ts +22 -1
  145. package/dist/ingest/snyk-policy.d.ts.map +1 -1
  146. package/dist/ingest/snyk-policy.js +75 -18
  147. package/dist/ingest/snyk-policy.js.map +1 -1
  148. package/dist/ingest/types.d.ts +23 -12
  149. package/dist/ingest/types.d.ts.map +1 -1
  150. package/dist/languages/capabilities/types.d.ts +64 -53
  151. package/dist/languages/capabilities/types.d.ts.map +1 -1
  152. package/dist/languages/capabilities/types.js +4 -4
  153. package/dist/languages/index.d.ts +28 -5
  154. package/dist/languages/index.d.ts.map +1 -1
  155. package/dist/languages/index.js +38 -7
  156. package/dist/languages/index.js.map +1 -1
  157. package/dist/languages/typescript.d.ts.map +1 -1
  158. package/dist/languages/typescript.js +19 -0
  159. package/dist/languages/typescript.js.map +1 -1
  160. package/dist/scoring/dimensions/security.d.ts +17 -0
  161. package/dist/scoring/dimensions/security.d.ts.map +1 -1
  162. package/dist/scoring/dimensions/security.js +12 -0
  163. package/dist/scoring/dimensions/security.js.map +1 -1
  164. package/dist/update.d.ts.map +1 -1
  165. package/dist/update.js +49 -0
  166. package/dist/update.js.map +1 -1
  167. package/dist/upgrade.d.ts.map +1 -1
  168. package/dist/upgrade.js +2 -1
  169. package/dist/upgrade.js.map +1 -1
  170. package/package.json +6 -3
  171. package/templates/.claude/skills/dxkit-action/SKILL.md +11 -2
  172. package/templates/.claude/skills/dxkit-allowlist/SKILL.md +9 -0
  173. package/templates/.claude/skills/dxkit-onboard/SKILL.md +2 -2
  174. package/templates/.claude/skills/dxkit-update/SKILL.md +45 -4
  175. package/dist/baseline/salt.d.ts +0 -45
  176. package/dist/baseline/salt.d.ts.map +0 -1
  177. package/dist/baseline/salt.js.map +0 -1
@@ -34,6 +34,7 @@ var __importStar = (this && this.__importStar) || (function () {
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
36
  exports.graphifyProvider = void 0;
37
+ exports.buildGraphifyScript = buildGraphifyScript;
37
38
  exports.gatherGraphifyResult = gatherGraphifyResult;
38
39
  exports.gatherGraphifyGraph = gatherGraphifyGraph;
39
40
  exports.buildGraphifyEnvelope = buildGraphifyEnvelope;
@@ -60,19 +61,42 @@ const path = __importStar(require("path"));
60
61
  const runner_1 = require("./runner");
61
62
  const tool_registry_1 = require("./tool-registry");
62
63
  const exclusions_1 = require("./exclusions");
64
+ const languages_1 = require("../../languages");
63
65
  const paths_1 = require("./paths");
64
66
  const types_1 = require("../../explore/types");
65
- /** Build the graphify Python script with cwd-specific exclusions baked in. */
67
+ /**
68
+ * Build the graphify Python script with cwd-specific exclusions baked in.
69
+ *
70
+ * Exported so the structural contract of the generated script — the
71
+ * `if __name__ == '__main__'` guard that keeps ProcessPoolExecutor workers
72
+ * from re-running extraction under spawn/forkserver (Python 3.14's Linux
73
+ * default), and the public `extract(cache_root=...)` cache redirect that
74
+ * replaced the fragile `cache_dir` monkeypatch — is unit-testable without a
75
+ * Python interpreter or graphify installed (mirrors `buildGraphifyEnvelope`).
76
+ */
66
77
  function buildGraphifyScript(cwd) {
67
78
  const { dirsSet, pathsList, fileGlobsList } = (0, exclusions_1.getPythonExcludeFilter)(cwd);
79
+ // Source-extension allowlist for the CODE graph. graphify's collect_files
80
+ // enumerates everything its _DISPATCH table can parse — including .md / .mdx
81
+ // (markdown headings → "module" nodes) and .json (config + lockfile keys →
82
+ // nodes). On NodeGoat that produced a graph that was ~92% non-code:
83
+ // package-lock.json alone contributed 137 nodes, .claude/**/*.md (dxkit's
84
+ // own scaffolding) 205, .vyuh-dxkit.json 53 — versus 51 nodes of real app
85
+ // code. Doc/config nodes pollute every graph-derived surface (communities,
86
+ // hot-files, api-surface, god-node ranking) and the context-hook's file
87
+ // summaries. Restrict the walk to the pack-declared source extensions
88
+ // (Rule 3/6: "what counts as source" is a language fact). graphify's TS
89
+ // import resolution reads tsconfig.json / package.json by direct path, not
90
+ // from the collected set, so dropping config files from the walk does not
91
+ // affect import-edge resolution.
92
+ const includeExtsSet = `set([${(0, languages_1.allSourceExtensions)()
93
+ .map((e) => `'${e.toLowerCase()}'`)
94
+ .join(', ')}])`;
68
95
  return `# Exclusion set derived from src/analyzers/tools/exclusions.ts
69
- import json, sys, os, tempfile
96
+ import json, sys, os
70
97
  from pathlib import Path
71
98
  from collections import Counter
72
99
 
73
- # Redirect graphify cache to /tmp so we don't pollute the target repo
74
- _cache_dir = Path(tempfile.mkdtemp(prefix='dxkit-graphify-'))
75
-
76
100
  try:
77
101
  from graphify.extract import extract, collect_files
78
102
  from graphify.build import build
@@ -82,17 +106,6 @@ except ImportError:
82
106
  print(json.dumps({"error": "graphify not installed"}))
83
107
  sys.exit(0)
84
108
 
85
- # Redirect graphify's on-disk cache BEFORE any graphify function runs.
86
- # collect_files() eagerly resolves cache_dir() during enumeration, so
87
- # the patch has to land before the first graphify call — not after.
88
- # Pre-patch, a 'graphify-out/cache/' directory was created in the
89
- # customer's repo every time the analyzer touched a project.
90
- import graphify.cache as _gc
91
- _gc.cache_dir = lambda root=None: _cache_dir / "cache"
92
- (_cache_dir / "cache").mkdir(parents=True, exist_ok=True)
93
-
94
- target = Path(sys.argv[1])
95
-
96
109
  # Three-axis exclusion. EXCLUDE_DIRS is basename-only (any path
97
110
  # segment matching skips the file). EXCLUDE_PATHS holds multi-segment
98
111
  # relative paths from .dxkit-ignore (e.g. 'app/modules/plugins/VendorPlugin')
@@ -106,6 +119,12 @@ EXCLUDE_DIRS = ${dirsSet}
106
119
  EXCLUDE_PATHS = ${pathsList}
107
120
  EXCLUDE_FILE_GLOBS = ${fileGlobsList}
108
121
 
122
+ # Source-extension allowlist (pack-declared via allSourceExtensions()).
123
+ # Keeps the CODE graph to actual source files — graphify also parses .md /
124
+ # .json into nodes, which is noise for code navigation. Empty set would be a
125
+ # bug (no files pass); the TS builder always emits a non-empty literal.
126
+ INCLUDE_EXTS = ${includeExtsSet}
127
+
109
128
  # Bytes-per-line floor above which a file is almost certainly minified
110
129
  # / bundled output. Mirrors the heuristic in
111
130
  # src/analyzers/tools/minified-detection.ts so graphify's enumeration
@@ -132,6 +151,11 @@ def _is_likely_minified(f):
132
151
  return False
133
152
 
134
153
  def _is_excluded(f):
154
+ # Source-extension allowlist first: anything that isn't a pack-declared
155
+ # source file (markdown, JSON config, lockfiles, plain text) is not part
156
+ # of the code graph.
157
+ if f.suffix.lower() not in INCLUDE_EXTS:
158
+ return True
135
159
  if any(seg in EXCLUDE_DIRS for seg in f.parts):
136
160
  return True
137
161
  name = f.name
@@ -274,407 +298,418 @@ def _strip_paren_suffix(label):
274
298
  s = s.rsplit('.', 1)[1]
275
299
  return s
276
300
 
277
- all_files = collect_files(target)
278
- files = [f for f in all_files if not _is_excluded(f)]
279
- if not files:
280
- print(json.dumps({"error": "no files found"}))
281
- sys.exit(0)
282
-
283
- # Suppress progress output by redirecting stdout during extraction
284
- import io
285
- _real_stdout = sys.stdout
286
- sys.stdout = io.StringIO()
287
- result = extract(files)
288
- sys.stdout = _real_stdout
289
- G = build([result], directed=True)
290
- communities = cluster(G)
291
-
292
- # Functions vs modules
293
- nodes = list(G.nodes(data=True))
294
- functions = [(n, d) for n, d in nodes if "()" in d.get("label", "")]
295
- modules = [(n, d) for n, d in nodes if "()" not in d.get("label", "")]
296
-
297
- # Functions per file
298
- file_funcs = Counter()
299
- for n, d in functions:
300
- sf = d.get("source_file", "")
301
- file_funcs[sf] += 1
302
-
303
- max_file = file_funcs.most_common(1)[0] if file_funcs else ("", 0)
304
-
305
- # God nodes: graphifyy@0.5.0 renamed the result key "edges" → "degree".
306
- gods = god_nodes(G, top_n=50)
307
- god_count = sum(1 for g in gods if g["degree"] > 15)
308
-
309
- # Cohesion
310
- scores = score_all(G, communities) if communities else {}
311
- avg_cohesion = sum(scores.values()) / len(scores) if scores else 0.0
312
-
313
- # Orphan modules (no inbound imports)
314
- import_targets = set()
315
- for u, v, data in G.edges(data=True):
316
- if data.get("relation") == "imports_from":
317
- import_targets.add(v)
318
- module_ids = set(n for n, d in modules)
319
- orphans = module_ids - import_targets
320
-
321
- # Dead imports (imported but never called)
322
- call_targets = set()
323
- for u, v, data in G.edges(data=True):
324
- if data.get("relation") == "calls":
325
- call_targets.add(v)
326
- dead = import_targets - call_targets - module_ids
327
-
328
- # Commented code ratio: source files with 0 function/class AST nodes
329
- source_files_set = set()
330
- files_with_nodes = set()
331
- for n, d in nodes:
332
- sf = d.get("source_file", "")
333
- if sf:
334
- source_files_set.add(sf)
335
- if "()" in d.get("label", "") or any(
336
- data.get("relation") == "method"
337
- for _, _, data in G.edges(n, data=True)
338
- ):
339
- files_with_nodes.add(sf)
340
-
341
- total_src = len(source_files_set)
342
- empty_files = total_src - len(files_with_nodes)
343
- commented_ratio = empty_files / total_src if total_src > 0 else 0.0
344
-
345
-
346
- # ── Build the full graph artifact ────────────────────────────────────────────
347
- # 2.7 Sprint 1: emit nodes / edges / communities / symbolIndex alongside
348
- # the aggregate metrics. Consumers (explore CLI, dashboard viz, future
349
- # 2.8 context CLI + reachability) read this via src/explore/load.ts.
350
- # Schema contract documented in tmp/2.7-graph-json-schema.md.
351
-
352
- # Determine class membership: a module-shaped node is a CLASS if it has
353
- # outbound 'method' edges to other nodes (it's the owner). A function-
354
- # shaped node ("()" in label) is a METHOD if it has inbound 'method'
355
- # edges from a class node; otherwise it's a free FUNCTION.
356
- _class_owners = set()
357
- _method_members = set()
358
- for u, v, data in G.edges(data=True):
359
- if data.get("relation") == "method":
360
- _class_owners.add(u)
361
- _method_members.add(v)
362
-
363
- def _node_kind(nid, attrs):
364
- label = attrs.get('label', '')
365
- is_callable = '()' in label
366
- if is_callable:
367
- return 'method' if nid in _method_members else 'function'
368
- return 'class' if nid in _class_owners else 'module'
369
-
370
- # Make node sourceFile paths project-relative (graphify emits absolute
371
- # paths derived from \`target = sys.argv[1]\`). Mirrors the existing
372
- # maxFunctionsFilePath path-normalization at the TS layer.
373
- def _rel(p):
374
- if not p:
375
- return ''
376
- s = str(p).replace(os.sep, '/')
377
- t = str(target).replace(os.sep, '/').rstrip('/')
378
- if s.startswith(t + '/'):
379
- return s[len(t) + 1:]
380
- if s == t:
381
- return ''
382
- return s
383
-
384
- # Assign stable in-run ids: n0, n1, n2, ... in extraction order. The
385
- # graphify-internal id strings (long underscored slugs) work but bloat
386
- # the JSON by ~20 bytes per node; the n<idx> shortening saves ~50KB on
387
- # a 13k-node repo. IDs are NOT stable across runs (per schema doc).
388
- _id_remap = {}
389
- graph_nodes = []
390
- for idx, (nid, attrs) in enumerate(nodes):
391
- short_id = f'n{idx}'
392
- _id_remap[nid] = short_id
393
- line_no = _parse_line_no(attrs)
394
- rel_source = _rel(attrs.get('source_file', ''))
395
- label = attrs.get('label', '')
396
- name = _strip_paren_suffix(label)
397
- kind = _node_kind(nid, attrs)
398
- node_obj = {
399
- 'id': short_id,
400
- 'kind': kind,
401
- 'label': label,
402
- 'sourceFile': rel_source,
403
- }
404
- if line_no:
405
- node_obj['line'] = line_no
406
- # Export detection only meaningful for symbol-bearing kinds
407
- # (functions, classes, methods). Module-level "is this file
408
- # exported?" isn't a useful question — exclude.
409
- if kind in ('function', 'class', 'method'):
410
- # Resolve to absolute path for the file-line cache (we read
411
- # the raw source content; the cache key is the actual path
412
- # on disk, not the project-relative form).
413
- abs_source = attrs.get('source_file', '')
414
- exported = _detect_exported(abs_source, line_no, name)
415
- if exported is not None:
416
- node_obj['exported'] = exported
417
- graph_nodes.append(node_obj)
418
-
419
- # Edges remapped to short ids. Drop self-loops and edges where either
420
- # endpoint was filtered out (defensive — graphify shouldn't produce them
421
- # but be tolerant). Graphify emits both 'imports' (broad form: \`import X\`)
422
- # and 'imports_from' (\`from X import Y\` / \`import {Y} from X\`); both
423
- # carry the same semantic for our schema ("A imports from B"). Merge
424
- # both into the canonical 'imports_from' edge relation. The 'contains'
425
- # and 'inherits' relations graphify also produces are intentionally
426
- # dropped — 'contains' duplicates the file/symbol-membership info
427
- # already encoded in nodes' sourceFile field, and 'inherits' is
428
- # class-inheritance which isn't yet a first-class schema relation.
429
- graph_edges = []
430
- for u, v, data in G.edges(data=True):
431
- if u not in _id_remap or v not in _id_remap:
432
- continue
433
- graphify_relation = data.get('relation', '')
434
- if graphify_relation == 'calls':
435
- relation = 'calls'
436
- elif graphify_relation in ('imports', 'imports_from'):
437
- relation = 'imports_from'
438
- elif graphify_relation == 'method':
439
- relation = 'method'
440
- else:
441
- continue
442
- edge_obj = {
443
- 'from': _id_remap[u],
444
- 'to': _id_remap[v],
445
- 'relation': relation,
301
+ if __name__ == '__main__':
302
+ # ProcessPoolExecutor workers re-import this module under spawn/
303
+ # forkserver (the Python 3.14 default on Linux); the __main__ guard
304
+ # keeps extraction from re-running per worker. graphify's own
305
+ # _extract_parallel requires this guard (it warns BrokenProcessPool
306
+ # and dies without it). See graphify/extract.py:_extract_parallel.
307
+ target = Path(sys.argv[1])
308
+ # graphify's on-disk cache is redirected here (the public cache_root
309
+ # param passed to extract() below) so it never lands in the target
310
+ # repo. The TS caller owns this dir's lifecycle — it lives under the
311
+ # ephemeral scriptDir and is removed after this process fully exits,
312
+ # which is the only point that survives graphify's atexit stat-index
313
+ # flush (graphify/cache.py registers _flush_stat_index at exit, so a
314
+ # Python-side rmtree here would be undone by that post-exit write).
315
+ _cache_dir = Path(sys.argv[2])
316
+ all_files = collect_files(target)
317
+ files = [f for f in all_files if not _is_excluded(f)]
318
+ if not files:
319
+ print(json.dumps({"error": "no files found"}))
320
+ sys.exit(0)
321
+
322
+ # Suppress progress output by redirecting stdout during extraction
323
+ import io
324
+ _real_stdout = sys.stdout
325
+ sys.stdout = io.StringIO()
326
+ result = extract(files, cache_root=_cache_dir)
327
+ sys.stdout = _real_stdout
328
+ G = build([result], directed=True)
329
+ communities = cluster(G)
330
+
331
+ # Functions vs modules
332
+ nodes = list(G.nodes(data=True))
333
+ functions = [(n, d) for n, d in nodes if "()" in d.get("label", "")]
334
+ modules = [(n, d) for n, d in nodes if "()" not in d.get("label", "")]
335
+
336
+ # Functions per file
337
+ file_funcs = Counter()
338
+ for n, d in functions:
339
+ sf = d.get("source_file", "")
340
+ file_funcs[sf] += 1
341
+
342
+ max_file = file_funcs.most_common(1)[0] if file_funcs else ("", 0)
343
+
344
+ # God nodes: graphifyy@0.5.0 renamed the result key "edges" → "degree".
345
+ gods = god_nodes(G, top_n=50)
346
+ god_count = sum(1 for g in gods if g["degree"] > 15)
347
+
348
+ # Cohesion
349
+ scores = score_all(G, communities) if communities else {}
350
+ avg_cohesion = sum(scores.values()) / len(scores) if scores else 0.0
351
+
352
+ # Orphan modules (no inbound imports)
353
+ import_targets = set()
354
+ for u, v, data in G.edges(data=True):
355
+ if data.get("relation") == "imports_from":
356
+ import_targets.add(v)
357
+ module_ids = set(n for n, d in modules)
358
+ orphans = module_ids - import_targets
359
+
360
+ # Dead imports (imported but never called)
361
+ call_targets = set()
362
+ for u, v, data in G.edges(data=True):
363
+ if data.get("relation") == "calls":
364
+ call_targets.add(v)
365
+ dead = import_targets - call_targets - module_ids
366
+
367
+ # Commented code ratio: source files with 0 function/class AST nodes
368
+ source_files_set = set()
369
+ files_with_nodes = set()
370
+ for n, d in nodes:
371
+ sf = d.get("source_file", "")
372
+ if sf:
373
+ source_files_set.add(sf)
374
+ if "()" in d.get("label", "") or any(
375
+ data.get("relation") == "method"
376
+ for _, _, data in G.edges(n, data=True)
377
+ ):
378
+ files_with_nodes.add(sf)
379
+
380
+ total_src = len(source_files_set)
381
+ empty_files = total_src - len(files_with_nodes)
382
+ commented_ratio = empty_files / total_src if total_src > 0 else 0.0
383
+
384
+
385
+ # ── Build the full graph artifact ────────────────────────────────────────────
386
+ # 2.7 Sprint 1: emit nodes / edges / communities / symbolIndex alongside
387
+ # the aggregate metrics. Consumers (explore CLI, dashboard viz, future
388
+ # 2.8 context CLI + reachability) read this via src/explore/load.ts.
389
+ # Schema contract documented in tmp/2.7-graph-json-schema.md.
390
+
391
+ # Determine class membership: a module-shaped node is a CLASS if it has
392
+ # outbound 'method' edges to other nodes (it's the owner). A function-
393
+ # shaped node ("()" in label) is a METHOD if it has inbound 'method'
394
+ # edges from a class node; otherwise it's a free FUNCTION.
395
+ _class_owners = set()
396
+ _method_members = set()
397
+ for u, v, data in G.edges(data=True):
398
+ if data.get("relation") == "method":
399
+ _class_owners.add(u)
400
+ _method_members.add(v)
401
+
402
+ def _node_kind(nid, attrs):
403
+ label = attrs.get('label', '')
404
+ is_callable = '()' in label
405
+ if is_callable:
406
+ return 'method' if nid in _method_members else 'function'
407
+ return 'class' if nid in _class_owners else 'module'
408
+
409
+ # Make node sourceFile paths project-relative (graphify emits absolute
410
+ # paths derived from \`target = sys.argv[1]\`). Mirrors the existing
411
+ # maxFunctionsFilePath path-normalization at the TS layer.
412
+ def _rel(p):
413
+ if not p:
414
+ return ''
415
+ s = str(p).replace(os.sep, '/')
416
+ t = str(target).replace(os.sep, '/').rstrip('/')
417
+ if s.startswith(t + '/'):
418
+ return s[len(t) + 1:]
419
+ if s == t:
420
+ return ''
421
+ return s
422
+
423
+ # Assign stable in-run ids: n0, n1, n2, ... in extraction order. The
424
+ # graphify-internal id strings (long underscored slugs) work but bloat
425
+ # the JSON by ~20 bytes per node; the n<idx> shortening saves ~50KB on
426
+ # a 13k-node repo. IDs are NOT stable across runs (per schema doc).
427
+ _id_remap = {}
428
+ graph_nodes = []
429
+ for idx, (nid, attrs) in enumerate(nodes):
430
+ short_id = f'n{idx}'
431
+ _id_remap[nid] = short_id
432
+ line_no = _parse_line_no(attrs)
433
+ rel_source = _rel(attrs.get('source_file', ''))
434
+ label = attrs.get('label', '')
435
+ name = _strip_paren_suffix(label)
436
+ kind = _node_kind(nid, attrs)
437
+ node_obj = {
438
+ 'id': short_id,
439
+ 'kind': kind,
440
+ 'label': label,
441
+ 'sourceFile': rel_source,
442
+ }
443
+ if line_no:
444
+ node_obj['line'] = line_no
445
+ # Export detection only meaningful for symbol-bearing kinds
446
+ # (functions, classes, methods). Module-level "is this file
447
+ # exported?" isn't a useful question — exclude.
448
+ if kind in ('function', 'class', 'method'):
449
+ # Resolve to absolute path for the file-line cache (we read
450
+ # the raw source content; the cache key is the actual path
451
+ # on disk, not the project-relative form).
452
+ abs_source = attrs.get('source_file', '')
453
+ exported = _detect_exported(abs_source, line_no, name)
454
+ if exported is not None:
455
+ node_obj['exported'] = exported
456
+ graph_nodes.append(node_obj)
457
+
458
+ # Edges remapped to short ids. Drop self-loops and edges where either
459
+ # endpoint was filtered out (defensive — graphify shouldn't produce them
460
+ # but be tolerant). Graphify emits both 'imports' (broad form: \`import X\`)
461
+ # and 'imports_from' (\`from X import Y\` / \`import {Y} from X\`); both
462
+ # carry the same semantic for our schema ("A imports from B"). Merge
463
+ # both into the canonical 'imports_from' edge relation. The 'contains'
464
+ # and 'inherits' relations graphify also produces are intentionally
465
+ # dropped — 'contains' duplicates the file/symbol-membership info
466
+ # already encoded in nodes' sourceFile field, and 'inherits' is
467
+ # class-inheritance which isn't yet a first-class schema relation.
468
+ graph_edges = []
469
+ for u, v, data in G.edges(data=True):
470
+ if u not in _id_remap or v not in _id_remap:
471
+ continue
472
+ graphify_relation = data.get('relation', '')
473
+ if graphify_relation == 'calls':
474
+ relation = 'calls'
475
+ elif graphify_relation in ('imports', 'imports_from'):
476
+ relation = 'imports_from'
477
+ elif graphify_relation == 'method':
478
+ relation = 'method'
479
+ else:
480
+ continue
481
+ edge_obj = {
482
+ 'from': _id_remap[u],
483
+ 'to': _id_remap[v],
484
+ 'relation': relation,
485
+ }
486
+ graph_edges.append(edge_obj)
487
+
488
+ # Communities: for each cluster compute dominantSourceDir + dominantPack.
489
+ # dominantSourceDir = most common ancestor directory (the longest
490
+ # leading-segment path that >= 40% of members share); empty string when
491
+ # no clear dominant. dominantPack = most common pack id among member
492
+ # files' extensions; empty when no dominant pack.
493
+ def _ancestor_dir(rel_path):
494
+ if not rel_path or '/' not in rel_path:
495
+ return ''
496
+ return rel_path.rsplit('/', 1)[0] + '/'
497
+
498
+ graph_communities = []
499
+ # Graphify's cluster() returns dict[community_id: list[node_id]].
500
+ # Iterate via .items(); the community_id is the actual cluster
501
+ # identifier (used to look up cohesion in scores), members is the
502
+ # node-id list.
503
+ _node_attrs_by_id = dict(nodes)
504
+ for cidx, member_list in communities.items():
505
+ member_ids = sorted(_id_remap.get(n, '') for n in member_list if n in _id_remap)
506
+ member_ids = [m for m in member_ids if m]
507
+ if not member_ids:
508
+ continue
509
+ # Per-member source files (project-relative)
510
+ member_files = []
511
+ for nid in member_list:
512
+ if nid in _id_remap:
513
+ sf = _rel(_node_attrs_by_id.get(nid, {}).get('source_file', ''))
514
+ if sf:
515
+ member_files.append(sf)
516
+ # Dominant directory: longest common ancestor that >= 40% of
517
+ # members share (or empty if no clear winner).
518
+ dir_counter = Counter(_ancestor_dir(f) for f in member_files)
519
+ dir_counter.pop('', None)
520
+ dominant_dir = ''
521
+ if dir_counter:
522
+ top_dir, top_count = dir_counter.most_common(1)[0]
523
+ if top_count / len(member_files) >= 0.4:
524
+ dominant_dir = top_dir
525
+ # Dominant pack
526
+ pack_counter = Counter()
527
+ for f in member_files:
528
+ pk = _EXT_TO_PACK.get(_ext_of(f))
529
+ if pk:
530
+ pack_counter[pk] += 1
531
+ dominant_pack = ''
532
+ if pack_counter:
533
+ top_pack, top_pack_count = pack_counter.most_common(1)[0]
534
+ if top_pack_count / max(1, len(member_files)) >= 0.5:
535
+ dominant_pack = top_pack
536
+ cohesion = float(scores.get(cidx, 0.0)) if scores else 0.0
537
+ graph_communities.append({
538
+ 'id': cidx,
539
+ 'nodeIds': member_ids,
540
+ 'cohesion': round(cohesion, 3),
541
+ 'dominantSourceDir': dominant_dir,
542
+ 'dominantPack': dominant_pack,
543
+ })
544
+
545
+ # Symbol index: lowercased label (without trailing ()) → list of nodeIds.
546
+ _symbol_index = {}
547
+ for node_obj in graph_nodes:
548
+ key = _strip_paren_suffix(node_obj['label']).lower()
549
+ if not key:
550
+ continue
551
+ _symbol_index.setdefault(key, []).append(node_obj['id'])
552
+
553
+ # Active-pack detection: derive from extensions seen in source files.
554
+ _packs_seen = sorted({_EXT_TO_PACK[e] for e in (_ext_of(_rel(d.get('source_file', '')))
555
+ for _, d in nodes)
556
+ if e in _EXT_TO_PACK})
557
+
558
+ # Size-budget enforcement. Hard cap 50MB serialized. If we exceed,
559
+ # drop method edges first (densest class — structural noise, doesn't
560
+ # affect call-graph queries).
561
+ import datetime as _dt
562
+ _meta = {
563
+ 'tool': 'graphify',
564
+ 'graphifyVersion': '', # filled by TS-side post-parse (read from graphifyy package version)
565
+ 'dxkitVersion': '', # filled by TS-side post-parse (read from package.json)
566
+ 'generatedAt': _dt.datetime.now(_dt.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
567
+ 'sourceFilesInGraph': total_src,
568
+ 'excludedFileCount': len(all_files) - len(files),
569
+ 'packs': _packs_seen,
570
+ 'truncated': False,
571
+ 'truncatedReason': '',
446
572
  }
447
- graph_edges.append(edge_obj)
448
-
449
- # Communities: for each cluster compute dominantSourceDir + dominantPack.
450
- # dominantSourceDir = most common ancestor directory (the longest
451
- # leading-segment path that >= 40% of members share); empty string when
452
- # no clear dominant. dominantPack = most common pack id among member
453
- # files' extensions; empty when no dominant pack.
454
- def _ancestor_dir(rel_path):
455
- if not rel_path or '/' not in rel_path:
456
- return ''
457
- return rel_path.rsplit('/', 1)[0] + '/'
458
-
459
- graph_communities = []
460
- # Graphify's cluster() returns dict[community_id: list[node_id]].
461
- # Iterate via .items(); the community_id is the actual cluster
462
- # identifier (used to look up cohesion in scores), members is the
463
- # node-id list.
464
- _node_attrs_by_id = dict(nodes)
465
- for cidx, member_list in communities.items():
466
- member_ids = sorted(_id_remap.get(n, '') for n in member_list if n in _id_remap)
467
- member_ids = [m for m in member_ids if m]
468
- if not member_ids:
469
- continue
470
- # Per-member source files (project-relative)
471
- member_files = []
472
- for nid in member_list:
473
- if nid in _id_remap:
474
- sf = _rel(_node_attrs_by_id.get(nid, {}).get('source_file', ''))
475
- if sf:
476
- member_files.append(sf)
477
- # Dominant directory: longest common ancestor that >= 40% of
478
- # members share (or empty if no clear winner).
479
- dir_counter = Counter(_ancestor_dir(f) for f in member_files)
480
- dir_counter.pop('', None)
481
- dominant_dir = ''
482
- if dir_counter:
483
- top_dir, top_count = dir_counter.most_common(1)[0]
484
- if top_count / len(member_files) >= 0.4:
485
- dominant_dir = top_dir
486
- # Dominant pack
487
- pack_counter = Counter()
488
- for f in member_files:
489
- pk = _EXT_TO_PACK.get(_ext_of(f))
490
- if pk:
491
- pack_counter[pk] += 1
492
- dominant_pack = ''
493
- if pack_counter:
494
- top_pack, top_pack_count = pack_counter.most_common(1)[0]
495
- if top_pack_count / max(1, len(member_files)) >= 0.5:
496
- dominant_pack = top_pack
497
- cohesion = float(scores.get(cidx, 0.0)) if scores else 0.0
498
- graph_communities.append({
499
- 'id': cidx,
500
- 'nodeIds': member_ids,
501
- 'cohesion': round(cohesion, 3),
502
- 'dominantSourceDir': dominant_dir,
503
- 'dominantPack': dominant_pack,
504
- })
505
-
506
- # Symbol index: lowercased label (without trailing ()) → list of nodeIds.
507
- _symbol_index = {}
508
- for node_obj in graph_nodes:
509
- key = _strip_paren_suffix(node_obj['label']).lower()
510
- if not key:
511
- continue
512
- _symbol_index.setdefault(key, []).append(node_obj['id'])
513
-
514
- # Active-pack detection: derive from extensions seen in source files.
515
- _packs_seen = sorted({_EXT_TO_PACK[e] for e in (_ext_of(_rel(d.get('source_file', '')))
516
- for _, d in nodes)
517
- if e in _EXT_TO_PACK})
518
-
519
- # Size-budget enforcement. Hard cap 50MB serialized. If we exceed,
520
- # drop method edges first (densest class — structural noise, doesn't
521
- # affect call-graph queries).
522
- import datetime as _dt
523
- _meta = {
524
- 'tool': 'graphify',
525
- 'graphifyVersion': '', # filled by TS-side post-parse (read from graphifyy package version)
526
- 'dxkitVersion': '', # filled by TS-side post-parse (read from package.json)
527
- 'generatedAt': _dt.datetime.now(_dt.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
528
- 'sourceFilesInGraph': total_src,
529
- 'excludedFileCount': len(all_files) - len(files),
530
- 'packs': _packs_seen,
531
- 'truncated': False,
532
- 'truncatedReason': '',
533
- }
534
573
 
535
- _graph_payload = {
536
- 'schemaVersion': 1,
537
- 'meta': _meta,
538
- 'nodes': graph_nodes,
539
- 'edges': graph_edges,
540
- 'communities': graph_communities,
541
- 'symbolIndex': _symbol_index,
542
- }
574
+ _graph_payload = {
575
+ 'schemaVersion': 1,
576
+ 'meta': _meta,
577
+ 'nodes': graph_nodes,
578
+ 'edges': graph_edges,
579
+ 'communities': graph_communities,
580
+ 'symbolIndex': _symbol_index,
581
+ }
543
582
 
544
- # Cheap pre-check on size: serialize once, measure, drop method edges
545
- # if over the cap, re-serialize. The 50MB cap matches the schema
546
- # contract; 10MB soft target is informational only (no enforcement).
547
- _BYTES_HARD_CAP = 50 * 1024 * 1024
548
-
549
- def _serialize(payload):
550
- return json.dumps(payload, separators=(',', ':'))
551
-
552
- _graph_json = _serialize(_graph_payload)
553
- if len(_graph_json.encode('utf-8')) > _BYTES_HARD_CAP:
554
- # Drop method edges first; they're structural (class-owns-method),
555
- # not behavioral. Call + import edges carry the actionable info.
556
- pre_count = len(_graph_payload['edges'])
557
- _graph_payload['edges'] = [e for e in _graph_payload['edges']
558
- if e['relation'] != 'method']
559
- post_count = len(_graph_payload['edges'])
560
- _meta['truncated'] = True
561
- _meta['truncatedReason'] = (
562
- f"dropped {pre_count - post_count} method edges to fit under "
563
- f"the 50MB hard cap"
564
- )
565
-
566
- # Render the interactive viewer alongside graph.json so the dashboard
567
- # Graph tab can embed it. graphify ships its own vis.js-based renderer
568
- # (graphify.export.to_html). Two emission paths:
569
- #
570
- # - Full graph (G.number_of_nodes() <= MAX_NODES_FOR_VIZ = 5000):
571
- # pass the original G + communities. The viewer renders every
572
- # symbol; the user can zoom + drill.
573
- #
574
- # - Aggregated community view (G > MAX_NODES_FOR_VIZ): build a
575
- # networkx super-graph whose nodes ARE the communities. Sized by
576
- # member count via graphify member_counts parameter. Inter-
577
- # community edges aggregated to weighted edges. This lets a
578
- # customer-scale repo still get a meaningful "what does this
579
- # codebase look like" viz instead of a dead empty-state.
580
- #
581
- # Either way failures are non-fatal: the dashboard surfaces a clear
582
- # empty-state when graph.html isn't on disk.
583
- try:
584
- from graphify.export import to_html as _to_html, MAX_NODES_FOR_VIZ as _MAX_VIZ
585
- import networkx as _nx
586
- _html_dir = target / '.dxkit' / 'reports'
587
- _html_dir.mkdir(parents=True, exist_ok=True)
588
- _html_path = _html_dir / 'graph.html'
589
-
590
- if G.number_of_nodes() <= _MAX_VIZ:
591
- _labels = {
592
- c['id']: (c.get('dominantSourceDir') or f"community-{c['id']}")
593
- for c in graph_communities
594
- }
595
- _to_html(G, communities, str(_html_path), community_labels=_labels)
596
- _viz_mode = 'full'
597
- else:
598
- # Aggregated community super-graph.
599
- _node_to_comm = {}
600
- for _cid, _members in communities.items():
601
- for _nid in _members:
602
- _node_to_comm[_nid] = _cid
603
-
604
- _G_agg = _nx.DiGraph()
605
- _member_counts = {}
606
- _labels = {}
607
- for _c in graph_communities:
608
- _cid = _c['id']
609
- _label = _c.get('dominantSourceDir') or f"community-{_cid}"
610
- # vis.js node attrs: label drives display; file_type is
611
- # surfaced in graphify's sidebar so we set a sentinel
612
- # value the dashboard can grep on.
613
- _G_agg.add_node(_cid, label=_label, source_file='', file_type='community')
614
- _member_counts[_cid] = len(_c['nodeIds'])
615
- _labels[_cid] = _label
616
-
617
- # Cross-community edge aggregation. Counter keyed on
618
- # (smaller_id, larger_id) for undirected aggregation; we then
619
- # add a directed edge in one canonical direction so vis.js
620
- # has a definite source/target. The viewer doesn't show
621
- # arrows on these (they're community connections, not calls).
622
- from collections import Counter as _CommCounter
623
- _edge_w = _CommCounter()
624
- for _u, _v, _ in G.edges(data=True):
625
- _cu = _node_to_comm.get(_u)
626
- _cv = _node_to_comm.get(_v)
627
- if _cu is None or _cv is None or _cu == _cv:
628
- continue
629
- _key = (_cu, _cv) if _cu < _cv else (_cv, _cu)
630
- _edge_w[_key] += 1
631
- for (_a, _b), _w in _edge_w.items():
632
- _G_agg.add_edge(_a, _b, relation='inter_community', occurrences=_w)
633
-
634
- # to_html requires a communities dict; one-element groups
635
- # treat each aggregated node as its own community so each
636
- # community keeps a distinct color in graphify's palette.
637
- _agg_groups = {_cid: [_cid] for _cid in communities}
638
-
639
- _to_html(
640
- _G_agg, _agg_groups, str(_html_path),
641
- community_labels=_labels, member_counts=_member_counts,
583
+ # Cheap pre-check on size: serialize once, measure, drop method edges
584
+ # if over the cap, re-serialize. The 50MB cap matches the schema
585
+ # contract; 10MB soft target is informational only (no enforcement).
586
+ _BYTES_HARD_CAP = 50 * 1024 * 1024
587
+
588
+ def _serialize(payload):
589
+ return json.dumps(payload, separators=(',', ':'))
590
+
591
+ _graph_json = _serialize(_graph_payload)
592
+ if len(_graph_json.encode('utf-8')) > _BYTES_HARD_CAP:
593
+ # Drop method edges first; they're structural (class-owns-method),
594
+ # not behavioral. Call + import edges carry the actionable info.
595
+ pre_count = len(_graph_payload['edges'])
596
+ _graph_payload['edges'] = [e for e in _graph_payload['edges']
597
+ if e['relation'] != 'method']
598
+ post_count = len(_graph_payload['edges'])
599
+ _meta['truncated'] = True
600
+ _meta['truncatedReason'] = (
601
+ f"dropped {pre_count - post_count} method edges to fit under "
602
+ f"the 50MB hard cap"
642
603
  )
643
- _viz_mode = 'aggregated'
644
-
645
- # Sidecar so the dashboard renderer can label the view honestly.
646
- # JSON is tiny (~120B); avoids parsing graph.json twice from TS.
647
- _meta_path = _html_dir / 'graph.html.meta.json'
648
- _meta_path.write_text(json.dumps({
649
- 'mode': _viz_mode,
650
- 'totalNodes': G.number_of_nodes(),
651
- 'totalEdges': G.number_of_edges(),
652
- 'communities': len(communities),
653
- 'aggregatedNodeCount': len(communities) if _viz_mode == 'aggregated' else None,
604
+
605
+ # Render the interactive viewer alongside graph.json so the dashboard
606
+ # Graph tab can embed it. graphify ships its own vis.js-based renderer
607
+ # (graphify.export.to_html). Two emission paths:
608
+ #
609
+ # - Full graph (G.number_of_nodes() <= MAX_NODES_FOR_VIZ = 5000):
610
+ # pass the original G + communities. The viewer renders every
611
+ # symbol; the user can zoom + drill.
612
+ #
613
+ # - Aggregated community view (G > MAX_NODES_FOR_VIZ): build a
614
+ # networkx super-graph whose nodes ARE the communities. Sized by
615
+ # member count via graphify member_counts parameter. Inter-
616
+ # community edges aggregated to weighted edges. This lets a
617
+ # customer-scale repo still get a meaningful "what does this
618
+ # codebase look like" viz instead of a dead empty-state.
619
+ #
620
+ # Either way failures are non-fatal: the dashboard surfaces a clear
621
+ # empty-state when graph.html isn't on disk.
622
+ try:
623
+ from graphify.export import to_html as _to_html, MAX_NODES_FOR_VIZ as _MAX_VIZ
624
+ import networkx as _nx
625
+ _html_dir = target / '.dxkit' / 'reports'
626
+ _html_dir.mkdir(parents=True, exist_ok=True)
627
+ _html_path = _html_dir / 'graph.html'
628
+
629
+ if G.number_of_nodes() <= _MAX_VIZ:
630
+ _labels = {
631
+ c['id']: (c.get('dominantSourceDir') or f"community-{c['id']}")
632
+ for c in graph_communities
633
+ }
634
+ _to_html(G, communities, str(_html_path), community_labels=_labels)
635
+ _viz_mode = 'full'
636
+ else:
637
+ # Aggregated community super-graph.
638
+ _node_to_comm = {}
639
+ for _cid, _members in communities.items():
640
+ for _nid in _members:
641
+ _node_to_comm[_nid] = _cid
642
+
643
+ _G_agg = _nx.DiGraph()
644
+ _member_counts = {}
645
+ _labels = {}
646
+ for _c in graph_communities:
647
+ _cid = _c['id']
648
+ _label = _c.get('dominantSourceDir') or f"community-{_cid}"
649
+ # vis.js node attrs: label drives display; file_type is
650
+ # surfaced in graphify's sidebar so we set a sentinel
651
+ # value the dashboard can grep on.
652
+ _G_agg.add_node(_cid, label=_label, source_file='', file_type='community')
653
+ _member_counts[_cid] = len(_c['nodeIds'])
654
+ _labels[_cid] = _label
655
+
656
+ # Cross-community edge aggregation. Counter keyed on
657
+ # (smaller_id, larger_id) for undirected aggregation; we then
658
+ # add a directed edge in one canonical direction so vis.js
659
+ # has a definite source/target. The viewer doesn't show
660
+ # arrows on these (they're community connections, not calls).
661
+ from collections import Counter as _CommCounter
662
+ _edge_w = _CommCounter()
663
+ for _u, _v, _ in G.edges(data=True):
664
+ _cu = _node_to_comm.get(_u)
665
+ _cv = _node_to_comm.get(_v)
666
+ if _cu is None or _cv is None or _cu == _cv:
667
+ continue
668
+ _key = (_cu, _cv) if _cu < _cv else (_cv, _cu)
669
+ _edge_w[_key] += 1
670
+ for (_a, _b), _w in _edge_w.items():
671
+ _G_agg.add_edge(_a, _b, relation='inter_community', occurrences=_w)
672
+
673
+ # to_html requires a communities dict; one-element groups
674
+ # treat each aggregated node as its own community so each
675
+ # community keeps a distinct color in graphify's palette.
676
+ _agg_groups = {_cid: [_cid] for _cid in communities}
677
+
678
+ _to_html(
679
+ _G_agg, _agg_groups, str(_html_path),
680
+ community_labels=_labels, member_counts=_member_counts,
681
+ )
682
+ _viz_mode = 'aggregated'
683
+
684
+ # Sidecar so the dashboard renderer can label the view honestly.
685
+ # JSON is tiny (~120B); avoids parsing graph.json twice from TS.
686
+ _meta_path = _html_dir / 'graph.html.meta.json'
687
+ _meta_path.write_text(json.dumps({
688
+ 'mode': _viz_mode,
689
+ 'totalNodes': G.number_of_nodes(),
690
+ 'totalEdges': G.number_of_edges(),
691
+ 'communities': len(communities),
692
+ 'aggregatedNodeCount': len(communities) if _viz_mode == 'aggregated' else None,
693
+ }))
694
+ except Exception as _html_err:
695
+ sys.stderr.write(f"dxkit: graph.html not generated ({_html_err})\\n")
696
+
697
+ print(json.dumps({
698
+ "functionCount": len(functions),
699
+ "classCount": len([n for n, d in modules if any(
700
+ data.get("relation") == "method" for _, _, data in G.edges(n, data=True)
701
+ )]),
702
+ "maxFunctionsInFile": max_file[1] if max_file else 0,
703
+ "maxFunctionsFilePath": str(max_file[0]) if max_file else "",
704
+ "godNodeCount": god_count,
705
+ "communityCount": len(communities),
706
+ "avgCohesion": round(avg_cohesion, 3),
707
+ "orphanModuleCount": len(orphans),
708
+ "deadImportCount": len(dead),
709
+ "commentedCodeRatio": round(commented_ratio, 3),
710
+ "sourceFilesInGraph": total_src,
711
+ "graph": _graph_payload,
654
712
  }))
655
- except Exception as _html_err:
656
- sys.stderr.write(f"dxkit: graph.html not generated ({_html_err})\\n")
657
-
658
- # Clean up temp cache
659
- import shutil
660
- shutil.rmtree(str(_cache_dir), ignore_errors=True)
661
-
662
- print(json.dumps({
663
- "functionCount": len(functions),
664
- "classCount": len([n for n, d in modules if any(
665
- data.get("relation") == "method" for _, _, data in G.edges(n, data=True)
666
- )]),
667
- "maxFunctionsInFile": max_file[1] if max_file else 0,
668
- "maxFunctionsFilePath": str(max_file[0]) if max_file else "",
669
- "godNodeCount": god_count,
670
- "communityCount": len(communities),
671
- "avgCohesion": round(avg_cohesion, 3),
672
- "orphanModuleCount": len(orphans),
673
- "deadImportCount": len(dead),
674
- "commentedCodeRatio": round(commented_ratio, 3),
675
- "sourceFilesInGraph": total_src,
676
- "graph": _graph_payload,
677
- }))
678
713
  `;
679
714
  }
680
715
  /**
@@ -781,6 +816,15 @@ async function computeAndCache(cwd) {
781
816
  // don't litter /tmp across runs.
782
817
  const scriptDir = fs.mkdtempSync(path.join(os.tmpdir(), 'dxkit-graphify-'));
783
818
  const scriptPath = path.join(scriptDir, 'run.py');
819
+ // graphify's on-disk AST cache is redirected here (passed to the script
820
+ // as argv[2] → extract(cache_root=...)), keeping it out of the target
821
+ // repo. It lives under scriptDir so the single `fs.rmSync(scriptDir)`
822
+ // below reclaims it — crucially AFTER the Python process and its atexit
823
+ // handlers exit. graphify flushes a stat-index via atexit
824
+ // (graphify/cache.py), so cleaning the cache from inside the script
825
+ // would be undone by that post-exit write; owning the lifecycle here is
826
+ // the only leak-free point.
827
+ const cacheDir = path.join(scriptDir, 'graphify-cache');
784
828
  fs.writeFileSync(scriptPath, buildGraphifyScript(cwd));
785
829
  // Spawn-with-process-group so the Python interpreter + any
786
830
  // tree-sitter worker subprocesses it starts are all killed
@@ -793,7 +837,7 @@ async function computeAndCache(cwd) {
793
837
  //
794
838
  // runDetached captures stderr natively so the tempfile redirect
795
839
  // pattern is no longer needed — same effect, fewer moving parts.
796
- const outcome = await (0, runner_1.runDetached)(pythonCmd, [scriptPath, cwd], {
840
+ const outcome = await (0, runner_1.runDetached)(pythonCmd, [scriptPath, cwd, cacheDir], {
797
841
  cwd: scriptDir,
798
842
  timeoutMs: 300000, // 5 min — bumped from 120000 in 2.4.7 for multi-thousand-file frontend repos
799
843
  });