tricoder-1.2.8-py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tricoder/__about__.py +6 -0
- tricoder/__init__.py +19 -0
- tricoder/calibration.py +276 -0
- tricoder/cli.py +890 -0
- tricoder/context_view.py +228 -0
- tricoder/data_loader.py +144 -0
- tricoder/extract.py +622 -0
- tricoder/fusion.py +203 -0
- tricoder/git_tracker.py +203 -0
- tricoder/gpu_utils.py +414 -0
- tricoder/graph_view.py +583 -0
- tricoder/model.py +476 -0
- tricoder/optimize.py +263 -0
- tricoder/subtoken_utils.py +196 -0
- tricoder/train.py +857 -0
- tricoder/typed_view.py +313 -0
- tricoder-1.2.8.dist-info/METADATA +306 -0
- tricoder-1.2.8.dist-info/RECORD +22 -0
- tricoder-1.2.8.dist-info/WHEEL +4 -0
- tricoder-1.2.8.dist-info/entry_points.txt +3 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE +56 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md +68 -0
tricoder/__about__.py
ADDED
tricoder/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""TriVector Code Intelligence - Multi-view code relationship model.
+
+Copyright (c) 2024 Jiri Otoupal
+Licensed under Non-Commercial License. Commercial use requires a license.
+See LICENSE file for details.
+"""
+from .model import SymbolModel
+from .train import train_model
+
+try:
+    from .__about__ import __version__, __author__, __email__, __license__, __copyright__
+except ImportError:
+    __version__ = "1.1.9"
+    __author__ = "Jiri Otoupal"
+    __email__ = "j.f.otoupal@gmail.com"
+    __license__ = "Non-Commercial License"
+    __copyright__ = "Copyright (c) 2024 Jiri Otoupal"
+
+__all__ = ['SymbolModel', 'train_model']
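For orientation, a minimal import sketch of the public API exported above (assumes the wheel is installed; only names defined in this `__init__` are used, nothing else is inferred about their signatures):

from tricoder import SymbolModel, train_model, __version__

print(__version__)  # falls back to the hardcoded "1.1.9" only if the packaged __about__ module fails to import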
tricoder/calibration.py
ADDED
@@ -0,0 +1,276 @@
+"""Temperature calibration using held-out edges."""
+import random
+from collections import defaultdict
+from multiprocessing import Pool, cpu_count
+from typing import List, Tuple, Dict
+
+import numpy as np
+
+
+def _get_num_workers() -> int:
+    """Get number of workers (all cores - 1, minimum 1)."""
+    return max(1, cpu_count() - 1)
+
+
+def _evaluate_single_tau(args):
+    """Evaluate a single tau value (helper for multiprocessing)."""
+    tau, embeddings, positive_pairs, negative_pairs = args
+    return tau, evaluate_tau(embeddings, positive_pairs, negative_pairs, tau)
+
+
+def split_edges(edges: List[Tuple[int, int, str, float]],
+                train_ratio: float = 0.8,
+                random_state: int = 42) -> Tuple[List[Tuple[int, int, str, float]],
+                                                 List[Tuple[int, int, str, float]]]:
+    """
+    Split edges into training and validation sets.
+
+    Args:
+        edges: list of edges
+        train_ratio: proportion of edges for training
+        random_state: random seed
+
+    Returns:
+        train_edges, val_edges
+    """
+    random.seed(random_state)
+    np.random.seed(random_state)
+
+    shuffled = edges.copy()
+    random.shuffle(shuffled)
+
+    split_idx = int(len(shuffled) * train_ratio)
+    train_edges = shuffled[:split_idx]
+    val_edges = shuffled[split_idx:]
+
+    return train_edges, val_edges
+
+
+def sample_negatives(num_nodes: int, positive_pairs: List[Tuple[int, int]],
+                     num_negatives: int = 5, random_state: int = 42,
+                     node_metadata: List[Dict] = None,
+                     node_file_info: Dict[str, Tuple[str, str, str]] = None,
+                     idx_to_node: Dict[int, str] = None) -> List[Tuple[int, int]]:
+    """
+    Sample negative pairs with biased sampling (50% same directory, 25% same file, 25% random).
+
+    Args:
+        num_nodes: number of nodes
+        positive_pairs: list of (src_idx, dst_idx) positive pairs
+        num_negatives: number of negatives per positive
+        random_state: random seed
+        node_metadata: list of node metadata dictionaries
+        node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
+        idx_to_node: reverse mapping from index to node_id
+
+    Returns:
+        List of negative pairs
+    """
+    np.random.seed(random_state)
+    positive_set = set(positive_pairs)
+    negative_pairs = []
+
+    # Build directory and file groupings if metadata available
+    nodes_by_directory = defaultdict(list)
+    nodes_by_file = defaultdict(list)
+
+    if node_metadata is not None and node_file_info is not None and idx_to_node is not None:
+        for idx in range(num_nodes):
+            node_id = idx_to_node.get(idx)
+            if node_id and node_id in node_file_info:
+                file_name, directory_path, _ = node_file_info[node_id]
+                if directory_path:
+                    nodes_by_directory[directory_path].append(idx)
+                if file_name:
+                    nodes_by_file[file_name].append(idx)
+
+    for src, dst in positive_pairs:
+        src_node_id = idx_to_node.get(src) if idx_to_node else None
+        src_dir = None
+        src_file = None
+
+        if src_node_id and node_file_info and src_node_id in node_file_info:
+            src_file, src_dir, _ = node_file_info[src_node_id]
+
+        for neg_idx in range(num_negatives):
+            neg_dst = None
+
+            # Biased sampling: 50% same directory, 25% same file, 25% random
+            rand_val = np.random.random()
+
+            if rand_val < 0.5 and src_dir and src_dir in nodes_by_directory:
+                # 50%: sample from same directory
+                candidates = [n for n in nodes_by_directory[src_dir] if
+                              n != src and (src, n) not in positive_set]
+                if candidates:
+                    neg_dst = np.random.choice(candidates)
+
+            elif rand_val < 0.75 and src_file and src_file in nodes_by_file:
+                # 25%: sample from same file
+                candidates = [n for n in nodes_by_file[src_file] if n != src and (src, n) not in positive_set]
+                if candidates:
+                    neg_dst = np.random.choice(candidates)
+
+            # 25%: random sample (or fallback if biased sampling failed)
+            if neg_dst is None:
+                attempts = 0
+                while attempts < 100:  # Limit attempts to avoid infinite loop
+                    neg_dst = np.random.randint(0, num_nodes)
+                    if (src, neg_dst) not in positive_set and src != neg_dst:
+                        break
+                    attempts += 1
+                else:
+                    # Fallback: just pick any node that's not src
+                    neg_dst = (src + 1) % num_nodes
+
+            if neg_dst is not None:
+                negative_pairs.append((src, neg_dst))
+
+    return negative_pairs
+
+
+def compute_calibrated_score(emb_u: np.ndarray, emb_v: np.ndarray, tau: float) -> float:
+    """
+    Compute calibrated similarity score.
+
+    Args:
+        emb_u: embedding vector for node u
+        emb_v: embedding vector for node v
+        tau: temperature parameter
+
+    Returns:
+        Calibrated score
+    """
+    dot_product = np.dot(emb_u, emb_v)
+    return dot_product / tau
+
+
+def softmax_scores(scores: np.ndarray) -> np.ndarray:
+    """
+    Compute softmax probabilities from scores.
+
+    Args:
+        scores: array of scores
+
+    Returns:
+        Softmax probabilities
+    """
+    # Numerical stability: subtract max
+    exp_scores = np.exp(scores - np.max(scores))
+    return exp_scores / exp_scores.sum()
+
+
+def evaluate_tau(embeddings: np.ndarray,
+                 positive_pairs: List[Tuple[int, int]],
+                 negative_pairs: List[Tuple[int, int]],
+                 tau: float) -> float:
+    """
+    Evaluate temperature tau on validation pairs.
+
+    Returns:
+        Average probability assigned to positive pairs
+    """
+    pos_scores = []
+    for src, dst in positive_pairs:
+        score = compute_calibrated_score(embeddings[src], embeddings[dst], tau)
+        pos_scores.append(score)
+
+    neg_scores = []
+    for src, dst in negative_pairs:
+        score = compute_calibrated_score(embeddings[src], embeddings[dst], tau)
+        neg_scores.append(score)
+
+    # For each positive, compute softmax over positive + negatives
+    total_prob = 0.0
+    for i, (src, dst) in enumerate(positive_pairs):
+        # Get corresponding negatives
+        start_idx = i * len(negative_pairs) // len(positive_pairs)
+        end_idx = (i + 1) * len(negative_pairs) // len(positive_pairs)
+        if end_idx == start_idx:
+            end_idx = start_idx + 1
+
+        relevant_negatives = negative_pairs[start_idx:end_idx]
+        all_scores = [pos_scores[i]]
+        for neg_src, neg_dst in relevant_negatives:
+            neg_score = compute_calibrated_score(embeddings[neg_src], embeddings[neg_dst], tau)
+            all_scores.append(neg_score)
+
+        probs = softmax_scores(np.array(all_scores))
+        total_prob += probs[0]  # Probability of positive
+
+    return total_prob / len(positive_pairs)
+
+
+def learn_temperature(embeddings: np.ndarray,
+                      val_edges: List[Tuple[int, int, str, float]],
+                      num_nodes: int,
+                      num_negatives: int = 5,
+                      tau_candidates: np.ndarray = None,
+                      random_state: int = 42,
+                      n_jobs: int = -1,
+                      node_metadata: List[Dict] = None,
+                      node_file_info: Dict[str, Tuple[str, str, str]] = None,
+                      idx_to_node: Dict[int, str] = None) -> float:
+    """
+    Learn optimal temperature parameter via parallel grid search with improved negative sampling.
+
+    Args:
+        embeddings: fused embeddings
+        val_edges: validation edges
+        num_nodes: number of nodes
+        num_negatives: number of negatives per positive
+        tau_candidates: candidate tau values (if None, use logspace)
+        random_state: random seed
+        n_jobs: number of parallel jobs (-1 for all cores - 1)
+        node_metadata: list of node metadata dictionaries
+        node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
+        idx_to_node: reverse mapping from index to node_id
+
+    Returns:
+        Optimal temperature tau
+    """
+    # Handle empty validation set
+    if not val_edges:
+        return 1.0
+
+    if tau_candidates is None:
+        # Reduced from 50 to 30 for faster training (still good coverage)
+        tau_candidates = np.logspace(-2, 2, num=30)
+
+    positive_pairs = [(src, dst) for src, dst, _, _ in val_edges]
+    negative_pairs = sample_negatives(
+        num_nodes, positive_pairs, num_negatives, random_state,
+        node_metadata, node_file_info, idx_to_node
+    )
+
+    # Handle case where no negatives could be sampled
+    if not negative_pairs:
+        return 1.0
+
+    if n_jobs == -1:
+        n_jobs = _get_num_workers()
+
+    # Parallel evaluation of tau candidates
+    if n_jobs > 1 and len(tau_candidates) > 4:
+        args_list = [(tau, embeddings, positive_pairs, negative_pairs) for tau in tau_candidates]
+        chunksize = max(1, len(args_list) // (n_jobs * 2))
+        with Pool(processes=n_jobs) as pool:
+            results = pool.map(_evaluate_single_tau, args_list, chunksize=chunksize)
+
+        best_tau = 1.0
+        best_score = -np.inf
+        for tau, score in results:
+            if score > best_score:
+                best_score = score
+                best_tau = tau
+    else:
+        # Sequential evaluation for small cases
+        best_tau = 1.0
+        best_score = -np.inf
+        for tau in tau_candidates:
+            score = evaluate_tau(embeddings, positive_pairs, negative_pairs, tau)
+            if score > best_score:
+                best_score = score
+                best_tau = tau
+
+    return best_tau
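Taken together, the module splits edges, grid-searches a temperature tau that maximizes the softmax probability exp(u.v/tau) of each held-out positive against its sampled negatives, and then scales scores by the learned tau. A minimal end-to-end sketch using toy data (the embeddings and edges below are made up for illustration; only the function names and signatures come from this file):

import numpy as np
from tricoder.calibration import split_edges, learn_temperature, compute_calibrated_score

# Toy graph: 20 nodes with 16-dim embeddings and a synthetic ring of "calls" edges (assumed data).
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(20, 16))
edges = [(i, (i + 1) % 20, "calls", 1.0) for i in range(20)]

# Hold out 20% of edges, learn tau on them sequentially, then score a pair with it.
train_edges, val_edges = split_edges(edges, train_ratio=0.8)
tau = learn_temperature(embeddings, val_edges, num_nodes=20, n_jobs=1)
score = compute_calibrated_score(embeddings[0], embeddings[1], tau)
print(f"learned tau={tau:.3f}, calibrated score(0,1)={score:.3f}")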