tricoder-1.2.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tricoder/__about__.py +6 -0
- tricoder/__init__.py +19 -0
- tricoder/calibration.py +276 -0
- tricoder/cli.py +890 -0
- tricoder/context_view.py +228 -0
- tricoder/data_loader.py +144 -0
- tricoder/extract.py +622 -0
- tricoder/fusion.py +203 -0
- tricoder/git_tracker.py +203 -0
- tricoder/gpu_utils.py +414 -0
- tricoder/graph_view.py +583 -0
- tricoder/model.py +476 -0
- tricoder/optimize.py +263 -0
- tricoder/subtoken_utils.py +196 -0
- tricoder/train.py +857 -0
- tricoder/typed_view.py +313 -0
- tricoder-1.2.8.dist-info/METADATA +306 -0
- tricoder-1.2.8.dist-info/RECORD +22 -0
- tricoder-1.2.8.dist-info/WHEEL +4 -0
- tricoder-1.2.8.dist-info/entry_points.txt +3 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE +56 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md +68 -0
tricoder/graph_view.py
ADDED
@@ -0,0 +1,583 @@
"""Graph view: adjacency matrix, PPMI, and SVD."""
import os
from collections import defaultdict, deque
from multiprocessing import Pool, cpu_count
from typing import Tuple, List, Dict

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

# Set threading appropriately for multiprocessing
# Each process should use 1 thread to avoid oversubscription when using multiprocessing
# numpy/scipy operations release GIL so threading can help, but with multiprocessing
# we want to avoid thread contention
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['NUMEXPR_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'


def _get_num_workers() -> int:
    """Get number of workers (all cores - 1, minimum 1)."""
    return max(1, cpu_count() - 1)


def expand_call_graph(edges: List[Tuple[int, int, str, float]], num_nodes: int,
                      max_depth: int = 3) -> List[Tuple[int, int, str, float]]:
    """
    Expand call graph by propagating call edges to depth 2-3.

    Args:
        edges: list of (src_idx, dst_idx, relation, weight) tuples
        num_nodes: number of nodes
        max_depth: maximum depth for propagation (2 or 3)

    Returns:
        Expanded list of edges including propagated calls
    """
    # Build call graph (only "calls" relations)
    call_graph = defaultdict(list)
    call_edges = []
    other_edges = []

    for src_idx, dst_idx, rel, weight in edges:
        if rel == "calls":
            call_graph[src_idx].append((dst_idx, weight))
            call_edges.append((src_idx, dst_idx, rel, weight))
        else:
            other_edges.append((src_idx, dst_idx, rel, weight))

    # BFS to find transitive calls
    expanded_call_set = set()
    expanded_call_edges = list(call_edges)

    for start_node in call_graph:
        # BFS with depth control
        queue = deque([(start_node, 0, 1.0)])  # (node, depth, cumulative_weight)
        visited = {start_node}

        while queue:
            curr_node, depth, cum_weight = queue.popleft()

            if depth >= max_depth:
                continue

            for next_node, edge_weight in call_graph.get(curr_node, []):
                if next_node == start_node:
                    continue  # Avoid self-loops

                # Compute propagated weight
                propagated_weight = cum_weight * edge_weight

                if depth == 1:
                    propagated_weight *= 0.5  # Depth 2: weight *= 0.5
                elif depth == 2:
                    propagated_weight *= 0.25  # Depth 3: weight *= 0.25

                # Add edge if not already present (avoid cycles)
                edge_key = (start_node, next_node)
                if edge_key not in expanded_call_set and propagated_weight > 1e-6:
                    expanded_call_set.add(edge_key)
                    expanded_call_edges.append((start_node, next_node, "calls", propagated_weight))

                # Continue BFS if not visited at this depth
                if next_node not in visited and depth < max_depth - 1:
                    visited.add(next_node)
                    queue.append((next_node, depth + 1, propagated_weight))

    # Combine all edges
    return other_edges + expanded_call_edges
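
As an illustration (a minimal sketch with made-up edges, not part of the packaged module): for a two-hop call chain, expand_call_graph adds a discounted transitive edge between the endpoints; any duplicate direct edges it re-emits are collapsed later by the max-aggregation in build_adjacency_matrix.

>>> from tricoder.graph_view import expand_call_graph
>>> edges = [(0, 1, "calls", 1.0), (1, 2, "calls", 1.0)]
>>> expanded = expand_call_graph(edges, num_nodes=3)
>>> (0, 2, "calls", 0.5) in expanded  # two-hop edge, discounted by 0.5
True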


def add_subtoken_edges(edges: List[Tuple[int, int, str, float]],
                       node_to_idx: Dict[str, int],
                       node_subtokens: Dict[str, List[str]],
                       num_nodes: int) -> Tuple[List[Tuple[int, int, str, float]], int, Dict[str, int]]:
    """
    Add subtoken nodes and edges to the graph.

    Args:
        edges: existing edges
        node_to_idx: mapping from node_id to index
        node_subtokens: mapping from node_id to list of normalized subtokens
        num_nodes: current number of nodes

    Returns:
        Tuple of (expanded_edges, new_num_nodes, subtoken_to_idx)
    """
    # Create subtoken nodes
    subtoken_to_idx = {}
    new_edges = list(edges)
    current_num_nodes = num_nodes

    # First pass: create all subtoken nodes
    for node_id, subtokens in node_subtokens.items():
        if node_id not in node_to_idx:
            continue

        node_idx = node_to_idx[node_id]

        for subtoken in subtokens:
            if subtoken not in subtoken_to_idx:
                subtoken_to_idx[subtoken] = current_num_nodes
                current_num_nodes += 1

    # Second pass: add edges
    for node_id, subtokens in node_subtokens.items():
        if node_id not in node_to_idx:
            continue

        node_idx = node_to_idx[node_id]

        # Add edges: symbol ↔ subtoken (weight = 1.0)
        for subtoken in subtokens:
            subtoken_idx = subtoken_to_idx[subtoken]
            new_edges.append((node_idx, subtoken_idx, "has_subtoken", 1.0))
            new_edges.append((subtoken_idx, node_idx, "subtoken_of", 1.0))

        # Add edges between subtokens from same symbol (weight = 0.25)
        for i, subtoken1 in enumerate(subtokens):
            for subtoken2 in subtokens[i + 1:]:
                idx1 = subtoken_to_idx[subtoken1]
                idx2 = subtoken_to_idx[subtoken2]
                new_edges.append((idx1, idx2, "co_subtoken", 0.25))
                new_edges.append((idx2, idx1, "co_subtoken", 0.25))

    return new_edges, current_num_nodes, subtoken_to_idx
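
A small illustrative sketch with a made-up symbol name: a node with subtokens "parse" and "config" gains two new subtoken nodes, bidirectional has_subtoken/subtoken_of edges, and a weaker co_subtoken pair.

>>> from tricoder.graph_view import add_subtoken_edges
>>> edges, n, sub_idx = add_subtoken_edges(
...     [], {"pkg.parse_config": 0},
...     {"pkg.parse_config": ["parse", "config"]}, num_nodes=1)
>>> n, sub_idx
(3, {'parse': 1, 'config': 2})
>>> len(edges)  # 4 symbol<->subtoken edges + 2 co_subtoken edges
6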


def add_file_hierarchy_edges(edges: List[Tuple[int, int, str, float]],
                             node_to_idx: Dict[str, int],
                             node_file_info: Dict[str, Tuple[str, str, str]],
                             idx_to_node: Dict[int, str]) -> List[Tuple[int, int, str, float]]:
    """
    Add file hierarchy edges based on file/directory relationships.

    Args:
        edges: existing edges
        node_to_idx: mapping from node_id to index
        node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
        idx_to_node: reverse mapping from index to node_id

    Returns:
        Expanded list of edges
    """
    new_edges = list(edges)

    # Group nodes by file, directory, and package
    nodes_by_file = defaultdict(list)
    nodes_by_directory = defaultdict(list)
    nodes_by_package = defaultdict(list)

    for node_id, (file_name, directory_path, top_level_package) in node_file_info.items():
        if node_id not in node_to_idx:
            continue

        node_idx = node_to_idx[node_id]

        if file_name:
            nodes_by_file[file_name].append(node_idx)
        if directory_path:
            nodes_by_directory[directory_path].append(node_idx)
        if top_level_package:
            nodes_by_package[top_level_package].append(node_idx)

    # Add edges for same file (weight += 3.0)
    for file_nodes in nodes_by_file.values():
        for i, node1 in enumerate(file_nodes):
            for node2 in file_nodes[i + 1:]:
                # Find existing edge weight or use 0
                existing_weight = 0.0
                for src, dst, rel, w in edges:
                    if (src == node1 and dst == node2) or (src == node2 and dst == node1):
                        existing_weight = max(existing_weight, w)
                        break

                new_weight = existing_weight + 3.0
                new_edges.append((node1, node2, "same_file", new_weight))
                new_edges.append((node2, node1, "same_file", new_weight))

    # Add edges for same directory (weight += 2.0)
    for dir_nodes in nodes_by_directory.values():
        for i, node1 in enumerate(dir_nodes):
            for node2 in dir_nodes[i + 1:]:
                existing_weight = 0.0
                for src, dst, rel, w in edges:
                    if (src == node1 and dst == node2) or (src == node2 and dst == node1):
                        existing_weight = max(existing_weight, w)
                        break

                new_weight = existing_weight + 2.0
                new_edges.append((node1, node2, "same_directory", new_weight))
                new_edges.append((node2, node1, "same_directory", new_weight))

    # Add edges for same package (weight += 1.0)
    for pkg_nodes in nodes_by_package.values():
        for i, node1 in enumerate(pkg_nodes):
            for node2 in pkg_nodes[i + 1:]:
                existing_weight = 0.0
                for src, dst, rel, w in edges:
                    if (src == node1 and dst == node2) or (src == node2 and dst == node1):
                        existing_weight = max(existing_weight, w)
                        break

                new_weight = existing_weight + 1.0
                new_edges.append((node1, node2, "same_package", new_weight))
                new_edges.append((node2, node1, "same_package", new_weight))

    return new_edges
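
Another illustrative sketch (hypothetical node ids and paths): two symbols sharing file, directory, and top-level package receive same_file, same_directory, and same_package edge pairs with the corresponding weight bonuses.

>>> from tricoder.graph_view import add_file_hierarchy_edges
>>> info = {"a": ("pkg/mod.py", "pkg", "pkg"), "b": ("pkg/mod.py", "pkg", "pkg")}
>>> new = add_file_hierarchy_edges([], {"a": 0, "b": 1}, info, {0: "a", 1: "b"})
>>> sorted({rel for _, _, rel, _ in new})
['same_directory', 'same_file', 'same_package']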


def _process_file_context(args):
    """Process a single file for context window edges."""
    file_path, node_lines, window_size, edge_weights = args
    new_edges = []

    # Sort by line number
    node_lines.sort(key=lambda x: x[1])

    for i, (node1_idx, line1) in enumerate(node_lines):
        for j in range(i + 1, len(node_lines)):
            node2_idx, line2 = node_lines[j]

            # Check if within window
            if abs(line2 - line1) <= window_size:
                # Find existing edge weight or use 0
                key = (min(node1_idx, node2_idx), max(node1_idx, node2_idx))
                existing_weight = edge_weights.get(key, 0.0)

                new_weight = existing_weight + 1.0
                new_edges.append((node1_idx, node2_idx, "context_window", new_weight))
                new_edges.append((node2_idx, node1_idx, "context_window", new_weight))
            else:
                # Lines are sorted, so we can break early
                break

    return new_edges


def add_context_window_edges(edges: List[Tuple[int, int, str, float]],
                             node_metadata: List[Dict],
                             window_size: int = 5,
                             n_jobs: int = -1) -> List[Tuple[int, int, str, float]]:
    """
    Add edges for symbols appearing within ±W lines in the same file.

    Args:
        edges: existing edges
        node_metadata: list of node metadata dictionaries
        window_size: context window size (default 5)
        n_jobs: number of parallel jobs (-1 for all cores - 1)

    Returns:
        Expanded list of edges
    """
    new_edges = list(edges)

    # Group nodes by file and line number
    nodes_by_file = defaultdict(list)  # file_path -> [(node_idx, lineno), ...]

    for idx, node_meta in enumerate(node_metadata):
        meta = node_meta.get('meta', {})
        if isinstance(meta, dict):
            file_path = meta.get('file', '')
            lineno = meta.get('lineno', -1)
            if file_path and lineno >= 0:
                nodes_by_file[file_path].append((idx, lineno))

    # Build edge weight map for efficient lookup
    edge_weights = defaultdict(float)
    for edge in edges:
        if len(edge) == 4:
            src, dst, rel, w = edge
            key = (min(src, dst), max(src, dst))
            edge_weights[key] = max(edge_weights[key], w)

    # Parallelize file processing
    if n_jobs == -1:
        n_jobs = _get_num_workers()

    if len(nodes_by_file) > 10 and n_jobs > 1:
        # Parallel processing for multiple files
        args_list = [(file_path, node_lines, window_size, edge_weights)
                     for file_path, node_lines in nodes_by_file.items()]
        chunksize = max(1, len(args_list) // n_jobs)

        with Pool(processes=n_jobs) as pool:
            results = pool.map(_process_file_context, args_list, chunksize=chunksize)

        # Flatten results
        for file_edges in results:
            new_edges.extend(file_edges)
    else:
        # Sequential processing for small cases
        for file_path, node_lines in nodes_by_file.items():
            # Sort by line number
            node_lines.sort(key=lambda x: x[1])

            for i, (node1_idx, line1) in enumerate(node_lines):
                for j in range(i + 1, len(node_lines)):
                    node2_idx, line2 = node_lines[j]

                    # Check if within window
                    if abs(line2 - line1) <= window_size:
                        # Find existing edge weight or use 0
                        key = (min(node1_idx, node2_idx), max(node1_idx, node2_idx))
                        existing_weight = edge_weights.get(key, 0.0)

                        new_weight = existing_weight + 1.0
                        new_edges.append((node1_idx, node2_idx, "context_window", new_weight))
                        new_edges.append((node2_idx, node1_idx, "context_window", new_weight))
                    else:
                        # Lines are sorted, so we can break early
                        break

    return new_edges
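
Illustrative sketch with made-up metadata (following the meta/file/lineno keys read above): symbols three lines apart in the same file get a context_window edge pair, while a distant symbol does not.

>>> from tricoder.graph_view import add_context_window_edges
>>> meta = [{"meta": {"file": "a.py", "lineno": 10}},
...         {"meta": {"file": "a.py", "lineno": 13}},
...         {"meta": {"file": "a.py", "lineno": 100}}]
>>> out = add_context_window_edges([], meta, window_size=5, n_jobs=1)
>>> [(s, d) for s, d, rel, w in out]
[(0, 1), (1, 0)]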


def _process_edge_chunk(args):
    """Process a chunk of edges for parallel aggregation."""
    edge_chunk, start_idx = args
    edge_weights = {}
    rows = []
    cols = []
    data = []

    for src_idx, dst_idx, rel, weight in edge_chunk:
        key = (src_idx, dst_idx)
        if key not in edge_weights:
            edge_weights[key] = weight
        else:
            edge_weights[key] = max(edge_weights[key], weight)

    for (src_idx, dst_idx), weight in edge_weights.items():
        rows.append(src_idx)
        cols.append(dst_idx)
        data.append(weight)

    return rows, cols, data


def build_adjacency_matrix(edges: List[Tuple[int, int, str, float]], num_nodes: int,
                           n_jobs: int = -1) -> sparse.csr_matrix:
    """
    Build weighted adjacency matrix from edges with optional parallelization.

    Args:
        edges: list of (src_idx, dst_idx, relation, weight) tuples
        num_nodes: number of nodes
        n_jobs: number of parallel jobs (-1 for all cores - 1)

    Returns:
        Sparse CSR adjacency matrix
    """
    if n_jobs == -1:
        n_jobs = _get_num_workers()

    # For small edge lists, use sequential processing
    if len(edges) < 10000 or n_jobs == 1:
        rows = []
        cols = []
        data = []

        # Aggregate weights for duplicate edges (same src, dst pair)
        edge_weights = defaultdict(float)
        for src_idx, dst_idx, rel, weight in edges:
            key = (src_idx, dst_idx)
            edge_weights[key] = max(edge_weights[key], weight)

        for (src_idx, dst_idx), weight in edge_weights.items():
            rows.append(src_idx)
            cols.append(dst_idx)
            data.append(weight)
    else:
        # Parallel processing for large edge lists
        chunk_size = max(1, len(edges) // n_jobs)
        chunks = [edges[i:i + chunk_size] for i in range(0, len(edges), chunk_size)]
        args_list = [(chunk, i) for i, chunk in enumerate(chunks)]

        with Pool(processes=n_jobs) as pool:
            results = pool.map(_process_edge_chunk, args_list)

        # Merge results
        edge_weights = defaultdict(float)
        for chunk_rows, chunk_cols, chunk_data in results:
            for i, (src_idx, dst_idx) in enumerate(zip(chunk_rows, chunk_cols)):
                key = (src_idx, dst_idx)
                edge_weights[key] = max(edge_weights[key], chunk_data[i])

        rows = []
        cols = []
        data = []
        for (src_idx, dst_idx), weight in edge_weights.items():
            rows.append(src_idx)
            cols.append(dst_idx)
            data.append(weight)

    # Create symmetric matrix (undirected graph)
    rows_sym = rows + cols
    cols_sym = cols + rows
    data_sym = data + data

    adj = sparse.csr_matrix((data_sym, (rows_sym, cols_sym)), shape=(num_nodes, num_nodes))
    return adj
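
A quick illustrative check with toy edges: duplicate (src, dst) pairs keep only their maximum weight, and every aggregated edge is mirrored so the matrix is symmetric.

>>> from tricoder.graph_view import build_adjacency_matrix
>>> A = build_adjacency_matrix([(0, 1, "calls", 1.0), (0, 1, "same_file", 3.0)],
...                            num_nodes=2, n_jobs=1)
>>> A.toarray().tolist()
[[0.0, 3.0], [3.0, 0.0]]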


def compute_ppmi(adj: sparse.csr_matrix, k: float = 1.0) -> sparse.csr_matrix:
    """
    Compute Positive Pointwise Mutual Information (PPMI) matrix.

    Args:
        adj: adjacency matrix
        k: shift parameter (typically 1.0)

    Returns:
        PPMI matrix (sparse)
    """
    # Convert to cooccurrence matrix (symmetric)
    cooc = adj + adj.T
    cooc.data = np.maximum(cooc.data, 0)  # Ensure non-negative

    # Compute marginals
    row_sums = np.array(cooc.sum(axis=1)).flatten()
    col_sums = np.array(cooc.sum(axis=0)).flatten()
    total = cooc.sum()

    # Avoid division by zero
    row_sums = np.maximum(row_sums, 1e-10)
    col_sums = np.maximum(col_sums, 1e-10)
    total = max(total, 1e-10)

    # Compute PMI
    rows, cols = cooc.nonzero()
    values = cooc.data

    # PMI(i,j) = log(P(i,j) / (P(i) * P(j)))
    p_ij = values / total
    p_i = row_sums[rows] / total
    p_j = col_sums[cols] / total

    pmi = np.log(p_ij / (p_i * p_j + 1e-10) + 1e-10)
    ppmi = np.maximum(pmi, 0.0)  # Positive PMI

    # Create PPMI matrix
    ppmi_matrix = sparse.csr_matrix((ppmi, (rows, cols)), shape=cooc.shape)

    return ppmi_matrix
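
And a numeric sanity check of the PPMI formula on a toy matrix (illustrative only): the single off-diagonal cell below occurs twice as often as chance, so its PPMI is log 2.

>>> from tricoder.graph_view import build_adjacency_matrix, compute_ppmi
>>> A = build_adjacency_matrix([(0, 1, "calls", 3.0)], num_nodes=2, n_jobs=1)
>>> P = compute_ppmi(A)
>>> round(float(P[0, 1]), 2)  # P(i,j)=0.5, P(i)=P(j)=0.5, so PPMI = log 2
0.69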


def reduce_dimensions_ppmi(ppmi: sparse.csr_matrix, dim: int, random_state: int = 42,
                           n_jobs: int = -1, gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Reduce PPMI matrix dimensionality using Truncated SVD (GPU-accelerated if available).

    Args:
        ppmi: PPMI matrix
        dim: target dimensionality
        random_state: random seed
        n_jobs: number of parallel jobs (not used for SVD, but kept for API consistency)
        gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration

    Returns:
        Reduced embeddings matrix and SVD components
    """
    num_features = ppmi.shape[1]
    actual_dim = min(dim, num_features)

    # Try GPU acceleration if available
    if gpu_accelerator and gpu_accelerator.use_gpu:
        try:
            # Convert sparse matrix to dense for GPU SVD (CuPy doesn't support sparse SVD well)
            # Only do this if matrix is reasonably sized
            if ppmi.shape[0] * ppmi.shape[1] < 50_000_000:  # ~50M elements threshold
                ppmi_dense = ppmi.toarray()
                U, S, Vt = gpu_accelerator.svd(ppmi_dense, actual_dim, random_state)

                # Transform: U @ diag(S)
                embeddings = U @ np.diag(S)
                components = Vt
            else:
                # Too large, fall back to CPU sparse SVD
                raise ValueError("Matrix too large for GPU dense SVD")
        except Exception as e:
            # Fall back to CPU
            svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
            embeddings = svd.fit_transform(ppmi)
            components = svd.components_
    else:
        # CPU path
        svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
        embeddings = svd.fit_transform(ppmi)
        components = svd.components_

    # Pad embeddings if needed to match requested dimension
    if actual_dim < dim:
        padding = np.zeros((embeddings.shape[0], dim - actual_dim))
        embeddings = np.hstack([embeddings, padding])
        # Pad components similarly
        component_padding = np.zeros((dim - actual_dim, components.shape[1]))
        components = np.vstack([components, component_padding])

    return embeddings, components
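
On the CPU path (no gpu_accelerator) the reduction is a thin wrapper around sklearn's TruncatedSVD; a toy sketch with a random sparse matrix standing in for the PPMI input:

>>> from tricoder.graph_view import reduce_dimensions_ppmi
>>> from scipy import sparse
>>> X = sparse.random(6, 6, density=0.5, random_state=0, format="csr")
>>> emb, comps = reduce_dimensions_ppmi(X, dim=3, random_state=42)
>>> emb.shape, comps.shape
((6, 3), (3, 6))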


def compute_graph_view(edges: List[Tuple[int, int, str, float]], num_nodes: int,
                       dim: int, random_state: int = 42, n_jobs: int = -1,
                       node_to_idx: Dict[str, int] = None,
                       node_subtokens: Dict[str, List[str]] = None,
                       node_file_info: Dict[str, Tuple[str, str, str]] = None,
                       node_metadata: List[Dict] = None,
                       idx_to_node: Dict[int, str] = None,
                       expand_calls: bool = True,
                       add_subtokens: bool = True,
                       add_hierarchy: bool = True,
                       add_context: bool = True,
                       context_window: int = 5,
                       max_depth: int = 3,
                       gpu_accelerator=None) -> Tuple[
        np.ndarray, np.ndarray, int, Dict[str, int], List[Tuple[int, int, str, float]]]:
    """
    Compute graph view embeddings with all enhancements.

    Args:
        max_depth: maximum depth for call graph expansion (default 3, use 2 for faster)

    Returns:
        embeddings: node embeddings from graph view
        svd_components: SVD components for reconstruction
        final_num_nodes: final number of nodes (after adding subtokens)
        subtoken_to_idx: mapping from subtoken to index (if subtokens added)
        expanded_edges: expanded edge list including all enhancements
    """
    expanded_edges = list(edges)
    current_num_nodes = num_nodes
    subtoken_to_idx = {}

    # Step 1: Expand call graph (depth 2-3)
    if expand_calls:
        expanded_edges = expand_call_graph(expanded_edges, current_num_nodes, max_depth=max_depth)

    # Step 2: Add context window co-occurrence (must be before PPMI)
    if add_context and node_metadata is not None:
        expanded_edges = add_context_window_edges(expanded_edges, node_metadata, window_size=context_window,
                                                  n_jobs=n_jobs)

    # Step 3: Add subtoken nodes and edges
    if add_subtokens and node_to_idx is not None and node_subtokens is not None:
        expanded_edges, current_num_nodes, subtoken_to_idx = add_subtoken_edges(
            expanded_edges, node_to_idx, node_subtokens, current_num_nodes
        )

    # Step 4: Add file hierarchy edges
    if add_hierarchy and node_to_idx is not None and node_file_info is not None and idx_to_node is not None:
        expanded_edges = add_file_hierarchy_edges(
            expanded_edges, node_to_idx, node_file_info, idx_to_node
        )

    # Step 5: Build adjacency matrix and compute PPMI (with parallelization)
    adj = build_adjacency_matrix(expanded_edges, current_num_nodes, n_jobs=n_jobs)
    ppmi = compute_ppmi(adj)
    embeddings, svd_components = reduce_dimensions_ppmi(ppmi, dim, random_state, n_jobs, gpu_accelerator)

    return embeddings, svd_components, current_num_nodes, subtoken_to_idx, expanded_edges
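
Finally, an end-to-end usage sketch with made-up edges (illustrative; real callers elsewhere in the package supply the node metadata and mapping arguments documented above, which are simply disabled here):

>>> from tricoder.graph_view import compute_graph_view
>>> edges = [(0, 1, "calls", 1.0), (1, 2, "calls", 1.0),
...          (2, 3, "calls", 1.0), (3, 4, "imports", 1.0)]
>>> emb, comps, n_nodes, sub_idx, expanded = compute_graph_view(
...     edges, num_nodes=5, dim=2, n_jobs=1,
...     add_subtokens=False, add_hierarchy=False, add_context=False)
>>> emb.shape, n_nodes
((5, 2), 5)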