tricoder-1.2.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tricoder/graph_view.py ADDED
@@ -0,0 +1,583 @@
+ """Graph view: adjacency matrix, PPMI, and SVD."""
+ import os
+
+ # Pin BLAS/OpenMP/NumExpr to one thread per process. This module parallelizes
+ # with multiprocessing, so letting every worker spawn its own thread pool would
+ # oversubscribe the CPU. The variables must be set before numpy/scipy are
+ # imported, because the underlying libraries read them at load time.
+ os.environ['OMP_NUM_THREADS'] = '1'
+ os.environ['MKL_NUM_THREADS'] = '1'
+ os.environ['NUMEXPR_NUM_THREADS'] = '1'
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
+
+ from collections import defaultdict, deque
+ from multiprocessing import Pool, cpu_count
+ from typing import Tuple, List, Dict
+
+ import numpy as np
+ from scipy import sparse
+ from sklearn.decomposition import TruncatedSVD
+
+
+ def _get_num_workers() -> int:
+     """Get number of workers (all cores - 1, minimum 1)."""
+     return max(1, cpu_count() - 1)
+
+
+ def expand_call_graph(edges: List[Tuple[int, int, str, float]], num_nodes: int,
+                       max_depth: int = 3) -> List[Tuple[int, int, str, float]]:
+     """
+     Expand call graph by propagating call edges to depth 2-3.
+
+     Args:
+         edges: list of (src_idx, dst_idx, relation, weight) tuples
+         num_nodes: number of nodes
+         max_depth: maximum depth for propagation (2 or 3)
+
+     Returns:
+         Expanded list of edges including propagated calls
+     """
+     # Build call graph (only "calls" relations)
+     call_graph = defaultdict(list)
+     call_edges = []
+     other_edges = []
+
+     for src_idx, dst_idx, rel, weight in edges:
+         if rel == "calls":
+             call_graph[src_idx].append((dst_idx, weight))
+             call_edges.append((src_idx, dst_idx, rel, weight))
+         else:
+             other_edges.append((src_idx, dst_idx, rel, weight))
+
+     # BFS to find transitive calls. Seed the seen-set with the direct call
+     # edges so they are not appended a second time during the depth-0 pass.
+     expanded_call_set = {(src_idx, dst_idx) for src_idx, dst_idx, _, _ in call_edges}
+     expanded_call_edges = list(call_edges)
+
+     for start_node in call_graph:
+         # BFS with depth control
+         queue = deque([(start_node, 0, 1.0)])  # (node, depth, cumulative_weight)
+         visited = {start_node}
+
+         while queue:
+             curr_node, depth, cum_weight = queue.popleft()
+
+             if depth >= max_depth:
+                 continue
+
+             for next_node, edge_weight in call_graph.get(curr_node, []):
+                 if next_node == start_node:
+                     continue  # Avoid self-loops
+
+                 # Compute propagated weight
+                 propagated_weight = cum_weight * edge_weight
+
+                 if depth == 1:
+                     propagated_weight *= 0.5  # Depth-2 edge: weight *= 0.5
+                 elif depth == 2:
+                     propagated_weight *= 0.25  # Depth-3 edge: weight *= 0.25
+
+                 # Add edge if not already present (avoid cycles)
+                 edge_key = (start_node, next_node)
+                 if edge_key not in expanded_call_set and propagated_weight > 1e-6:
+                     expanded_call_set.add(edge_key)
+                     expanded_call_edges.append((start_node, next_node, "calls", propagated_weight))
+
+                 # Continue the BFS only for unvisited nodes within the depth budget
+                 if next_node not in visited and depth < max_depth - 1:
+                     visited.add(next_node)
+                     queue.append((next_node, depth + 1, propagated_weight))
+
+     # Combine all edges
+     return other_edges + expanded_call_edges
+
+
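+ # Illustrative sketch of the propagation above (toy indices, not taken from a
+ # real index): with direct calls 0 -> 1 -> 2 -> 3, all weight 1.0, the
+ # expansion adds a depth-2 edge at 0.5 and a depth-3 edge at 0.5 * 0.25 = 0.125.
+ #
+ #     edges = [(0, 1, "calls", 1.0), (1, 2, "calls", 1.0), (2, 3, "calls", 1.0)]
+ #     expanded = expand_call_graph(edges, num_nodes=4, max_depth=3)
+ #     assert (0, 2, "calls", 0.5) in expanded
+ #     assert (0, 3, "calls", 0.125) in expanded
+
+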
+ def add_subtoken_edges(edges: List[Tuple[int, int, str, float]],
+                        node_to_idx: Dict[str, int],
+                        node_subtokens: Dict[str, List[str]],
+                        num_nodes: int) -> Tuple[List[Tuple[int, int, str, float]], int, Dict[str, int]]:
+     """
+     Add subtoken nodes and edges to the graph.
+
+     Args:
+         edges: existing edges
+         node_to_idx: mapping from node_id to index
+         node_subtokens: mapping from node_id to list of normalized subtokens
+         num_nodes: current number of nodes
+
+     Returns:
+         Tuple of (expanded_edges, new_num_nodes, subtoken_to_idx)
+     """
+     # Create subtoken nodes
+     subtoken_to_idx = {}
+     new_edges = list(edges)
+     current_num_nodes = num_nodes
+
+     # First pass: assign an index to every distinct subtoken
+     for node_id, subtokens in node_subtokens.items():
+         if node_id not in node_to_idx:
+             continue
+
+         for subtoken in subtokens:
+             if subtoken not in subtoken_to_idx:
+                 subtoken_to_idx[subtoken] = current_num_nodes
+                 current_num_nodes += 1
+
+     # Second pass: add edges
+     for node_id, subtokens in node_subtokens.items():
+         if node_id not in node_to_idx:
+             continue
+
+         node_idx = node_to_idx[node_id]
+
+         # Add edges: symbol ↔ subtoken (weight = 1.0)
+         for subtoken in subtokens:
+             subtoken_idx = subtoken_to_idx[subtoken]
+             new_edges.append((node_idx, subtoken_idx, "has_subtoken", 1.0))
+             new_edges.append((subtoken_idx, node_idx, "subtoken_of", 1.0))
+
+         # Add edges between subtokens from the same symbol (weight = 0.25)
+         for i, subtoken1 in enumerate(subtokens):
+             for subtoken2 in subtokens[i + 1:]:
+                 idx1 = subtoken_to_idx[subtoken1]
+                 idx2 = subtoken_to_idx[subtoken2]
+                 new_edges.append((idx1, idx2, "co_subtoken", 0.25))
+                 new_edges.append((idx2, idx1, "co_subtoken", 0.25))
+
+     return new_edges, current_num_nodes, subtoken_to_idx
+
+
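+ # Illustrative sketch (toy symbol name and indices, not taken from a real
+ # index): a symbol "parseJson" at index 0 with subtokens ["parse", "json"]
+ # gains two subtoken nodes and bidirectional edges to each of them.
+ #
+ #     edges, n, sub_idx = add_subtoken_edges(
+ #         [], {"parseJson": 0}, {"parseJson": ["parse", "json"]}, num_nodes=1)
+ #     assert n == 3 and sub_idx == {"parse": 1, "json": 2}
+ #     assert (0, 1, "has_subtoken", 1.0) in edges
+ #     assert (1, 2, "co_subtoken", 0.25) in edges
+
+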
+ def add_file_hierarchy_edges(edges: List[Tuple[int, int, str, float]],
+                              node_to_idx: Dict[str, int],
+                              node_file_info: Dict[str, Tuple[str, str, str]],
+                              idx_to_node: Dict[int, str]) -> List[Tuple[int, int, str, float]]:
+     """
+     Add file hierarchy edges based on file/directory relationships.
+
+     Args:
+         edges: existing edges
+         node_to_idx: mapping from node_id to index
+         node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
+         idx_to_node: reverse mapping from index to node_id
+
+     Returns:
+         Expanded list of edges
+     """
+     new_edges = list(edges)
+
+     # Group nodes by file, directory, and package
+     nodes_by_file = defaultdict(list)
+     nodes_by_directory = defaultdict(list)
+     nodes_by_package = defaultdict(list)
+
+     for node_id, (file_name, directory_path, top_level_package) in node_file_info.items():
+         if node_id not in node_to_idx:
+             continue
+
+         node_idx = node_to_idx[node_id]
+
+         if file_name:
+             nodes_by_file[file_name].append(node_idx)
+         if directory_path:
+             nodes_by_directory[directory_path].append(node_idx)
+         if top_level_package:
+             nodes_by_package[top_level_package].append(node_idx)
+
+     # Precompute the existing edge weight per unordered node pair so the pair
+     # loops below do not rescan the full edge list for every pair.
+     edge_weights = defaultdict(float)
+     for src, dst, rel, w in edges:
+         key = (min(src, dst), max(src, dst))
+         edge_weights[key] = max(edge_weights[key], w)
+
+     # Add edges for same file (weight += 3.0)
+     for file_nodes in nodes_by_file.values():
+         for i, node1 in enumerate(file_nodes):
+             for node2 in file_nodes[i + 1:]:
+                 existing_weight = edge_weights.get((min(node1, node2), max(node1, node2)), 0.0)
+                 new_weight = existing_weight + 3.0
+                 new_edges.append((node1, node2, "same_file", new_weight))
+                 new_edges.append((node2, node1, "same_file", new_weight))
+
+     # Add edges for same directory (weight += 2.0)
+     for dir_nodes in nodes_by_directory.values():
+         for i, node1 in enumerate(dir_nodes):
+             for node2 in dir_nodes[i + 1:]:
+                 existing_weight = edge_weights.get((min(node1, node2), max(node1, node2)), 0.0)
+                 new_weight = existing_weight + 2.0
+                 new_edges.append((node1, node2, "same_directory", new_weight))
+                 new_edges.append((node2, node1, "same_directory", new_weight))
+
+     # Add edges for same package (weight += 1.0)
+     for pkg_nodes in nodes_by_package.values():
+         for i, node1 in enumerate(pkg_nodes):
+             for node2 in pkg_nodes[i + 1:]:
+                 existing_weight = edge_weights.get((min(node1, node2), max(node1, node2)), 0.0)
+                 new_weight = existing_weight + 1.0
+                 new_edges.append((node1, node2, "same_package", new_weight))
+                 new_edges.append((node2, node1, "same_package", new_weight))
+
+     return new_edges
+
+
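+ # Illustrative sketch (toy node ids and paths): two symbols indexed 0 and 1
+ # that share file "a.py", directory "pkg", and package "pkg" pick up all
+ # three hierarchy bonuses when no prior edge connects them.
+ #
+ #     info = {"a": ("a.py", "pkg", "pkg"), "b": ("a.py", "pkg", "pkg")}
+ #     out = add_file_hierarchy_edges([], {"a": 0, "b": 1}, info, {0: "a", 1: "b"})
+ #     assert (0, 1, "same_file", 3.0) in out
+ #     assert (0, 1, "same_directory", 2.0) in out
+ #     assert (0, 1, "same_package", 1.0) in out
+
+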
+ def _process_file_context(args):
+     """Process a single file for context window edges."""
+     file_path, node_lines, window_size, edge_weights = args
+     new_edges = []
+
+     # Sort by line number
+     node_lines.sort(key=lambda x: x[1])
+
+     for i, (node1_idx, line1) in enumerate(node_lines):
+         for j in range(i + 1, len(node_lines)):
+             node2_idx, line2 = node_lines[j]
+
+             # Check if within window
+             if abs(line2 - line1) <= window_size:
+                 # Find existing edge weight or use 0
+                 key = (min(node1_idx, node2_idx), max(node1_idx, node2_idx))
+                 existing_weight = edge_weights.get(key, 0.0)
+
+                 new_weight = existing_weight + 1.0
+                 new_edges.append((node1_idx, node2_idx, "context_window", new_weight))
+                 new_edges.append((node2_idx, node1_idx, "context_window", new_weight))
+             else:
+                 # Lines are sorted, so we can break early
+                 break
+
+     return new_edges
+
+
+ def add_context_window_edges(edges: List[Tuple[int, int, str, float]],
+                              node_metadata: List[Dict],
+                              window_size: int = 5,
+                              n_jobs: int = -1) -> List[Tuple[int, int, str, float]]:
+     """
+     Add edges for symbols appearing within ±W lines in the same file.
+
+     Args:
+         edges: existing edges
+         node_metadata: list of node metadata dictionaries
+         window_size: context window size (default 5)
+         n_jobs: number of parallel jobs (-1 for all cores - 1)
+
+     Returns:
+         Expanded list of edges
+     """
+     new_edges = list(edges)
+
+     # Group nodes by file and line number
+     nodes_by_file = defaultdict(list)  # file_path -> [(node_idx, lineno), ...]
+
+     for idx, node_meta in enumerate(node_metadata):
+         meta = node_meta.get('meta', {})
+         if isinstance(meta, dict):
+             file_path = meta.get('file', '')
+             lineno = meta.get('lineno', -1)
+             if file_path and lineno >= 0:
+                 nodes_by_file[file_path].append((idx, lineno))
+
+     # Build edge weight map for efficient lookup
+     edge_weights = defaultdict(float)
+     for edge in edges:
+         if len(edge) == 4:
+             src, dst, rel, w = edge
+             key = (min(src, dst), max(src, dst))
+             edge_weights[key] = max(edge_weights[key], w)
+
+     # Parallelize file processing
+     if n_jobs == -1:
+         n_jobs = _get_num_workers()
+
+     args_list = [(file_path, node_lines, window_size, edge_weights)
+                  for file_path, node_lines in nodes_by_file.items()]
+
+     if len(nodes_by_file) > 10 and n_jobs > 1:
+         # Parallel processing for multiple files
+         chunksize = max(1, len(args_list) // n_jobs)
+
+         with Pool(processes=n_jobs) as pool:
+             results = pool.map(_process_file_context, args_list, chunksize=chunksize)
+     else:
+         # Sequential processing for small cases; reuse the same per-file worker
+         results = [_process_file_context(args) for args in args_list]
+
+     # Flatten results
+     for file_edges in results:
+         new_edges.extend(file_edges)
+
+     return new_edges
+
+
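+ # Illustrative sketch (toy metadata): with the default ±5-line window, the
+ # symbols on lines 10 and 12 of "a.py" become connected, while the symbol on
+ # line 30 stays unconnected.
+ #
+ #     meta = [{"meta": {"file": "a.py", "lineno": 10}},
+ #             {"meta": {"file": "a.py", "lineno": 12}},
+ #             {"meta": {"file": "a.py", "lineno": 30}}]
+ #     out = add_context_window_edges([], meta, window_size=5, n_jobs=1)
+ #     assert (0, 1, "context_window", 1.0) in out
+ #     assert not any(e[:2] == (0, 2) for e in out)
+
+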
+ def _process_edge_chunk(args):
+     """Process a chunk of edges for parallel aggregation."""
+     edge_chunk, start_idx = args
+     edge_weights = {}
+     rows = []
+     cols = []
+     data = []
+
+     for src_idx, dst_idx, rel, weight in edge_chunk:
+         key = (src_idx, dst_idx)
+         if key not in edge_weights:
+             edge_weights[key] = weight
+         else:
+             edge_weights[key] = max(edge_weights[key], weight)
+
+     for (src_idx, dst_idx), weight in edge_weights.items():
+         rows.append(src_idx)
+         cols.append(dst_idx)
+         data.append(weight)
+
+     return rows, cols, data
+
+
+ def build_adjacency_matrix(edges: List[Tuple[int, int, str, float]], num_nodes: int,
+                            n_jobs: int = -1) -> sparse.csr_matrix:
+     """
+     Build weighted adjacency matrix from edges with optional parallelization.
+
+     Args:
+         edges: list of (src_idx, dst_idx, relation, weight) tuples
+         num_nodes: number of nodes
+         n_jobs: number of parallel jobs (-1 for all cores - 1)
+
+     Returns:
+         Sparse CSR adjacency matrix
+     """
+     if n_jobs == -1:
+         n_jobs = _get_num_workers()
+
+     # For small edge lists, use sequential processing
+     if len(edges) < 10000 or n_jobs == 1:
+         rows = []
+         cols = []
+         data = []
+
+         # Aggregate duplicate edges (same src, dst pair) by keeping the max weight
+         edge_weights = defaultdict(float)
+         for src_idx, dst_idx, rel, weight in edges:
+             key = (src_idx, dst_idx)
+             edge_weights[key] = max(edge_weights[key], weight)
+
+         for (src_idx, dst_idx), weight in edge_weights.items():
+             rows.append(src_idx)
+             cols.append(dst_idx)
+             data.append(weight)
+     else:
+         # Parallel processing for large edge lists
+         chunk_size = max(1, len(edges) // n_jobs)
+         chunks = [edges[i:i + chunk_size] for i in range(0, len(edges), chunk_size)]
+         args_list = [(chunk, i) for i, chunk in enumerate(chunks)]
+
+         with Pool(processes=n_jobs) as pool:
+             results = pool.map(_process_edge_chunk, args_list)
+
+         # Merge per-chunk results, again keeping the max weight per pair
+         edge_weights = defaultdict(float)
+         for chunk_rows, chunk_cols, chunk_data in results:
+             for i, (src_idx, dst_idx) in enumerate(zip(chunk_rows, chunk_cols)):
+                 key = (src_idx, dst_idx)
+                 edge_weights[key] = max(edge_weights[key], chunk_data[i])
+
+         rows = []
+         cols = []
+         data = []
+         for (src_idx, dst_idx), weight in edge_weights.items():
+             rows.append(src_idx)
+             cols.append(dst_idx)
+             data.append(weight)
+
+     # Symmetrize (undirected graph); scipy sums duplicate entries, so an edge
+     # already stored in both directions contributes to both (i, j) and (j, i)
+     rows_sym = rows + cols
+     cols_sym = cols + rows
+     data_sym = data + data
+
+     adj = sparse.csr_matrix((data_sym, (rows_sym, cols_sym)), shape=(num_nodes, num_nodes))
+     return adj
+
+
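+ # Illustrative sketch (toy edges): a single directed edge is mirrored into a
+ # symmetric matrix, and an edge that is already stored in both directions is
+ # summed by scipy's duplicate handling.
+ #
+ #     adj = build_adjacency_matrix([(0, 1, "calls", 2.0)], num_nodes=2, n_jobs=1)
+ #     assert adj[0, 1] == adj[1, 0] == 2.0
+ #     adj2 = build_adjacency_matrix(
+ #         [(0, 1, "calls", 2.0), (1, 0, "calls", 2.0)], num_nodes=2, n_jobs=1)
+ #     assert adj2[0, 1] == 4.0
+
+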
+ def compute_ppmi(adj: sparse.csr_matrix, k: float = 1.0) -> sparse.csr_matrix:
+     """
+     Compute Positive Pointwise Mutual Information (PPMI) matrix.
+
+     Args:
+         adj: adjacency matrix
+         k: shift parameter; PMI values are shifted down by log(k) (k=1.0 means no shift)
+
+     Returns:
+         PPMI matrix (sparse)
+     """
+     # Convert to co-occurrence matrix (symmetric)
+     cooc = adj + adj.T
+     cooc.data = np.maximum(cooc.data, 0)  # Ensure non-negative
+     cooc.eliminate_zeros()  # Keep .data aligned with .nonzero() below
+
+     # Compute marginals
+     row_sums = np.array(cooc.sum(axis=1)).flatten()
+     col_sums = np.array(cooc.sum(axis=0)).flatten()
+     total = cooc.sum()
+
+     # Avoid division by zero
+     row_sums = np.maximum(row_sums, 1e-10)
+     col_sums = np.maximum(col_sums, 1e-10)
+     total = max(total, 1e-10)
+
+     # Compute PMI for the stored (nonzero) entries
+     rows, cols = cooc.nonzero()
+     values = cooc.data
+
+     # PMI(i,j) = log(P(i,j) / (P(i) * P(j)))
+     p_ij = values / total
+     p_i = row_sums[rows] / total
+     p_j = col_sums[cols] / total
+
+     pmi = np.log(p_ij / (p_i * p_j + 1e-10) + 1e-10)
+     ppmi = np.maximum(pmi - np.log(k), 0.0)  # Shifted positive PMI (shift is 0 for k=1)
+
+     # Create PPMI matrix
+     ppmi_matrix = sparse.csr_matrix((ppmi, (rows, cols)), shape=cooc.shape)
+
+     return ppmi_matrix
+
+
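+ # Worked example (tiny 2-node graph): a single undirected co-occurrence gives
+ # P(0,1) = 0.5 and P(0) = P(1) = 0.5, so PMI = log(0.5 / 0.25) = log 2 ≈ 0.693
+ # (uniform scaling from symmetrization cancels out of the PMI ratio).
+ #
+ #     adj = build_adjacency_matrix([(0, 1, "calls", 1.0)], num_nodes=2, n_jobs=1)
+ #     ppmi = compute_ppmi(adj)
+ #     assert np.isclose(ppmi[0, 1], np.log(2), atol=1e-6)
+
+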
+ def reduce_dimensions_ppmi(ppmi: sparse.csr_matrix, dim: int, random_state: int = 42,
+                            n_jobs: int = -1, gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Reduce PPMI matrix dimensionality using Truncated SVD (GPU-accelerated if available).
+
+     Args:
+         ppmi: PPMI matrix
+         dim: target dimensionality
+         random_state: random seed
+         n_jobs: number of parallel jobs (not used for SVD, but kept for API consistency)
+         gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration
+
+     Returns:
+         Reduced embeddings matrix and SVD components
+     """
+     num_features = ppmi.shape[1]
+     actual_dim = min(dim, num_features)
+
+     # Try GPU acceleration if available
+     if gpu_accelerator and gpu_accelerator.use_gpu:
+         try:
+             # Convert the sparse matrix to dense for GPU SVD (CuPy's sparse SVD
+             # support is limited), but only if the matrix is reasonably sized
+             if ppmi.shape[0] * ppmi.shape[1] < 50_000_000:  # ~50M elements threshold
+                 ppmi_dense = ppmi.toarray()
+                 U, S, Vt = gpu_accelerator.svd(ppmi_dense, actual_dim, random_state)
+
+                 # Transform: U @ diag(S)
+                 embeddings = U @ np.diag(S)
+                 components = Vt
+             else:
+                 # Too large, fall back to CPU sparse SVD
+                 raise ValueError("Matrix too large for GPU dense SVD")
+         except Exception:
+             # Fall back to CPU
+             svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
+             embeddings = svd.fit_transform(ppmi)
+             components = svd.components_
+     else:
+         # CPU path
+         svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
+         embeddings = svd.fit_transform(ppmi)
+         components = svd.components_
+
+     # Pad embeddings if needed to match the requested dimension
+     if actual_dim < dim:
+         padding = np.zeros((embeddings.shape[0], dim - actual_dim))
+         embeddings = np.hstack([embeddings, padding])
+         # Pad components similarly
+         component_padding = np.zeros((dim - actual_dim, components.shape[1]))
+         components = np.vstack([components, component_padding])
+
+     return embeddings, components
+
+
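+ # Illustrative sketch (a random sparse matrix stands in for a real PPMI
+ # matrix): reducing a 20x20 input to dim=8 yields (20, 8) embeddings and
+ # (8, 20) components; if dim exceeded the feature count, the extra columns
+ # would be zero-padded as shown in the function above.
+ #
+ #     mat = sparse.random(20, 20, density=0.2, random_state=0, format="csr")
+ #     emb, comps = reduce_dimensions_ppmi(mat, dim=8, n_jobs=1)
+ #     assert emb.shape == (20, 8) and comps.shape == (8, 20)
+
+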
+ def compute_graph_view(edges: List[Tuple[int, int, str, float]], num_nodes: int,
+                        dim: int, random_state: int = 42, n_jobs: int = -1,
+                        node_to_idx: Dict[str, int] = None,
+                        node_subtokens: Dict[str, List[str]] = None,
+                        node_file_info: Dict[str, Tuple[str, str, str]] = None,
+                        node_metadata: List[Dict] = None,
+                        idx_to_node: Dict[int, str] = None,
+                        expand_calls: bool = True,
+                        add_subtokens: bool = True,
+                        add_hierarchy: bool = True,
+                        add_context: bool = True,
+                        context_window: int = 5,
+                        max_depth: int = 3,
+                        gpu_accelerator=None) -> Tuple[
+         np.ndarray, np.ndarray, int, Dict[str, int], List[Tuple[int, int, str, float]]]:
+     """
+     Compute graph view embeddings with all enhancements.
+
+     Args:
+         max_depth: maximum depth for call graph expansion (default 3, use 2 for faster runs)
+         (the remaining arguments are forwarded to the helper functions above)
+
+     Returns:
+         embeddings: node embeddings from graph view
+         svd_components: SVD components for reconstruction
+         final_num_nodes: final number of nodes (after adding subtokens)
+         subtoken_to_idx: mapping from subtoken to index (if subtokens added)
+         expanded_edges: expanded edge list including all enhancements
+     """
+     expanded_edges = list(edges)
+     current_num_nodes = num_nodes
+     subtoken_to_idx = {}
+
+     # Step 1: Expand call graph (depth 2-3)
+     if expand_calls:
+         expanded_edges = expand_call_graph(expanded_edges, current_num_nodes, max_depth=max_depth)
+
+     # Step 2: Add context window co-occurrence (must run before PPMI)
+     if add_context and node_metadata is not None:
+         expanded_edges = add_context_window_edges(expanded_edges, node_metadata,
+                                                   window_size=context_window, n_jobs=n_jobs)
+
+     # Step 3: Add subtoken nodes and edges
+     if add_subtokens and node_to_idx is not None and node_subtokens is not None:
+         expanded_edges, current_num_nodes, subtoken_to_idx = add_subtoken_edges(
+             expanded_edges, node_to_idx, node_subtokens, current_num_nodes
+         )
+
+     # Step 4: Add file hierarchy edges
+     if add_hierarchy and node_to_idx is not None and node_file_info is not None and idx_to_node is not None:
+         expanded_edges = add_file_hierarchy_edges(
+             expanded_edges, node_to_idx, node_file_info, idx_to_node
+         )
+
+     # Step 5: Build adjacency matrix, compute PPMI, and reduce with SVD
+     adj = build_adjacency_matrix(expanded_edges, current_num_nodes, n_jobs=n_jobs)
+     ppmi = compute_ppmi(adj)
+     embeddings, svd_components = reduce_dimensions_ppmi(ppmi, dim, random_state, n_jobs, gpu_accelerator)
+
+     return embeddings, svd_components, current_num_nodes, subtoken_to_idx, expanded_edges
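+
+
+ # End-to-end sketch of the pipeline above (toy call graph; the indices and the
+ # tiny dimensionality are illustrative only): two direct calls are expanded,
+ # turned into a PPMI matrix, and reduced to 2-dimensional embeddings.
+ #
+ #     edges = [(0, 1, "calls", 1.0), (1, 2, "calls", 1.0)]
+ #     emb, comps, n, sub_idx, expanded = compute_graph_view(
+ #         edges, num_nodes=3, dim=2, n_jobs=1)
+ #     assert emb.shape == (3, 2) and n == 3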