tricoder-1.2.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tricoder/__about__.py ADDED
@@ -0,0 +1,6 @@
+ """Package metadata for TriCoder."""
+ __version__ = "1.2.8"
+ __author__ = "Jiri Otoupal"
+ __email__ = "j.f.otoupal@gmail.com"
+ __license__ = "Non-Commercial License"
+ __copyright__ = "Copyright (c) 2024 Jiri Otoupal"
tricoder/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """TriVector Code Intelligence - Multi-view code relationship model.
+
+ Copyright (c) 2024 Jiri Otoupal
+ Licensed under Non-Commercial License. Commercial use requires a license.
+ See LICENSE file for details.
+ """
+ from .model import SymbolModel
+ from .train import train_model
+
+ try:
+     from .__about__ import __version__, __author__, __email__, __license__, __copyright__
+ except ImportError:
+     __version__ = "1.1.9"
+     __author__ = "Jiri Otoupal"
+     __email__ = "j.f.otoupal@gmail.com"
+     __license__ = "Non-Commercial License"
+     __copyright__ = "Copyright (c) 2024 Jiri Otoupal"
+
+ __all__ = ['SymbolModel', 'train_model']
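
A minimal, illustrative sketch of the package-level API re-exported above. It only references names visible in this diff (SymbolModel, train_model, and the __about__ metadata); their actual signatures live in tricoder/model.py and tricoder/train.py, which are not part of this hunk, so they are not called here.

import tricoder

print(tricoder.__version__)   # "1.2.8" when __about__.py imports cleanly
print(tricoder.__all__)       # ['SymbolModel', 'train_model']

# The two entry points are re-exported at package level from .model and .train.
model_cls, train_fn = tricoder.SymbolModel, tricoder.train_model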
@@ -0,0 +1,276 @@
+ """Temperature calibration using held-out edges."""
+ import random
+ from collections import defaultdict
+ from multiprocessing import Pool, cpu_count
+ from typing import List, Tuple, Dict
+
+ import numpy as np
+
+
+ def _get_num_workers() -> int:
+     """Get number of workers (all cores - 1, minimum 1)."""
+     return max(1, cpu_count() - 1)
+
+
+ def _evaluate_single_tau(args):
+     """Evaluate a single tau value (helper for multiprocessing)."""
+     tau, embeddings, positive_pairs, negative_pairs = args
+     return tau, evaluate_tau(embeddings, positive_pairs, negative_pairs, tau)
+
+
+ def split_edges(edges: List[Tuple[int, int, str, float]],
+                 train_ratio: float = 0.8,
+                 random_state: int = 42) -> Tuple[List[Tuple[int, int, str, float]],
+                                                  List[Tuple[int, int, str, float]]]:
+     """
+     Split edges into training and validation sets.
+
+     Args:
+         edges: list of edges
+         train_ratio: proportion of edges for training
+         random_state: random seed
+
+     Returns:
+         train_edges, val_edges
+     """
+     random.seed(random_state)
+     np.random.seed(random_state)
+
+     shuffled = edges.copy()
+     random.shuffle(shuffled)
+
+     split_idx = int(len(shuffled) * train_ratio)
+     train_edges = shuffled[:split_idx]
+     val_edges = shuffled[split_idx:]
+
+     return train_edges, val_edges
+
+
+ def sample_negatives(num_nodes: int, positive_pairs: List[Tuple[int, int]],
+                      num_negatives: int = 5, random_state: int = 42,
+                      node_metadata: List[Dict] = None,
+                      node_file_info: Dict[str, Tuple[str, str, str]] = None,
+                      idx_to_node: Dict[int, str] = None) -> List[Tuple[int, int]]:
+     """
+     Sample negative pairs with biased sampling (50% same directory, 25% same file, 25% random).
+
+     Args:
+         num_nodes: number of nodes
+         positive_pairs: list of (src_idx, dst_idx) positive pairs
+         num_negatives: number of negatives per positive
+         random_state: random seed
+         node_metadata: list of node metadata dictionaries
+         node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
+         idx_to_node: reverse mapping from index to node_id
+
+     Returns:
+         List of negative pairs
+     """
+     np.random.seed(random_state)
+     positive_set = set(positive_pairs)
+     negative_pairs = []
+
+     # Build directory and file groupings if metadata available
+     nodes_by_directory = defaultdict(list)
+     nodes_by_file = defaultdict(list)
+
+     if node_metadata is not None and node_file_info is not None and idx_to_node is not None:
+         for idx in range(num_nodes):
+             node_id = idx_to_node.get(idx)
+             if node_id and node_id in node_file_info:
+                 file_name, directory_path, _ = node_file_info[node_id]
+                 if directory_path:
+                     nodes_by_directory[directory_path].append(idx)
+                 if file_name:
+                     nodes_by_file[file_name].append(idx)
+
+     for src, dst in positive_pairs:
+         src_node_id = idx_to_node.get(src) if idx_to_node else None
+         src_dir = None
+         src_file = None
+
+         if src_node_id and node_file_info and src_node_id in node_file_info:
+             src_file, src_dir, _ = node_file_info[src_node_id]
+
+         for neg_idx in range(num_negatives):
+             neg_dst = None
+
+             # Biased sampling: 50% same directory, 25% same file, 25% random
+             rand_val = np.random.random()
+
+             if rand_val < 0.5 and src_dir and src_dir in nodes_by_directory:
+                 # 50%: sample from same directory
+                 candidates = [n for n in nodes_by_directory[src_dir] if
+                               n != src and (src, n) not in positive_set]
+                 if candidates:
+                     neg_dst = np.random.choice(candidates)
+
+             elif rand_val < 0.75 and src_file and src_file in nodes_by_file:
+                 # 25%: sample from same file
+                 candidates = [n for n in nodes_by_file[src_file] if n != src and (src, n) not in positive_set]
+                 if candidates:
+                     neg_dst = np.random.choice(candidates)
+
+             # 25%: random sample (or fallback if biased sampling failed)
+             if neg_dst is None:
+                 attempts = 0
+                 while attempts < 100:  # Limit attempts to avoid infinite loop
+                     neg_dst = np.random.randint(0, num_nodes)
+                     if (src, neg_dst) not in positive_set and src != neg_dst:
+                         break
+                     attempts += 1
+                 else:
+                     # Fallback: just pick any node that's not src
+                     neg_dst = (src + 1) % num_nodes
+
+             if neg_dst is not None:
+                 negative_pairs.append((src, neg_dst))
+
+     return negative_pairs
+
+
+ def compute_calibrated_score(emb_u: np.ndarray, emb_v: np.ndarray, tau: float) -> float:
+     """
+     Compute calibrated similarity score.
+
+     Args:
+         emb_u: embedding vector for node u
+         emb_v: embedding vector for node v
+         tau: temperature parameter
+
+     Returns:
+         Calibrated score
+     """
+     dot_product = np.dot(emb_u, emb_v)
+     return dot_product / tau
+
+
+ def softmax_scores(scores: np.ndarray) -> np.ndarray:
+     """
+     Compute softmax probabilities from scores.
+
+     Args:
+         scores: array of scores
+
+     Returns:
+         Softmax probabilities
+     """
+     # Numerical stability: subtract max
+     exp_scores = np.exp(scores - np.max(scores))
+     return exp_scores / exp_scores.sum()
+
+
+ def evaluate_tau(embeddings: np.ndarray,
+                  positive_pairs: List[Tuple[int, int]],
+                  negative_pairs: List[Tuple[int, int]],
+                  tau: float) -> float:
+     """
+     Evaluate temperature tau on validation pairs.
+
+     Returns:
+         Average probability assigned to positive pairs
+     """
+     pos_scores = []
+     for src, dst in positive_pairs:
+         score = compute_calibrated_score(embeddings[src], embeddings[dst], tau)
+         pos_scores.append(score)
+
+     neg_scores = []
+     for src, dst in negative_pairs:
+         score = compute_calibrated_score(embeddings[src], embeddings[dst], tau)
+         neg_scores.append(score)
+
+     # For each positive, compute softmax over positive + negatives
+     total_prob = 0.0
+     for i, (src, dst) in enumerate(positive_pairs):
+         # Get corresponding negatives
+         start_idx = i * len(negative_pairs) // len(positive_pairs)
+         end_idx = (i + 1) * len(negative_pairs) // len(positive_pairs)
+         if end_idx == start_idx:
+             end_idx = start_idx + 1
+
+         relevant_negatives = negative_pairs[start_idx:end_idx]
+         all_scores = [pos_scores[i]]
+         for neg_src, neg_dst in relevant_negatives:
+             neg_score = compute_calibrated_score(embeddings[neg_src], embeddings[neg_dst], tau)
+             all_scores.append(neg_score)
+
+         probs = softmax_scores(np.array(all_scores))
+         total_prob += probs[0]  # Probability of positive
+
+     return total_prob / len(positive_pairs)
+
+
+ def learn_temperature(embeddings: np.ndarray,
+                       val_edges: List[Tuple[int, int, str, float]],
+                       num_nodes: int,
+                       num_negatives: int = 5,
+                       tau_candidates: np.ndarray = None,
+                       random_state: int = 42,
+                       n_jobs: int = -1,
+                       node_metadata: List[Dict] = None,
+                       node_file_info: Dict[str, Tuple[str, str, str]] = None,
+                       idx_to_node: Dict[int, str] = None) -> float:
+     """
+     Learn optimal temperature parameter via parallel grid search with improved negative sampling.
+
+     Args:
+         embeddings: fused embeddings
+         val_edges: validation edges
+         num_nodes: number of nodes
+         num_negatives: number of negatives per positive
+         tau_candidates: candidate tau values (if None, use logspace)
+         random_state: random seed
+         n_jobs: number of parallel jobs (-1 for all cores - 1)
+         node_metadata: list of node metadata dictionaries
+         node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
+         idx_to_node: reverse mapping from index to node_id
+
+     Returns:
+         Optimal temperature tau
+     """
+     # Handle empty validation set
+     if not val_edges:
+         return 1.0
+
+     if tau_candidates is None:
+         # Reduced from 50 to 30 for faster training (still good coverage)
+         tau_candidates = np.logspace(-2, 2, num=30)
+
+     positive_pairs = [(src, dst) for src, dst, _, _ in val_edges]
+     negative_pairs = sample_negatives(
+         num_nodes, positive_pairs, num_negatives, random_state,
+         node_metadata, node_file_info, idx_to_node
+     )
+
+     # Handle case where no negatives could be sampled
+     if not negative_pairs:
+         return 1.0
+
+     if n_jobs == -1:
+         n_jobs = _get_num_workers()
+
+     # Parallel evaluation of tau candidates
+     if n_jobs > 1 and len(tau_candidates) > 4:
+         args_list = [(tau, embeddings, positive_pairs, negative_pairs) for tau in tau_candidates]
+         chunksize = max(1, len(args_list) // (n_jobs * 2))
+         with Pool(processes=n_jobs) as pool:
+             results = pool.map(_evaluate_single_tau, args_list, chunksize=chunksize)
+
+         best_tau = 1.0
+         best_score = -np.inf
+         for tau, score in results:
+             if score > best_score:
+                 best_score = score
+                 best_tau = tau
+     else:
+         # Sequential evaluation for small cases
+         best_tau = 1.0
+         best_score = -np.inf
+         for tau in tau_candidates:
+             score = evaluate_tau(embeddings, positive_pairs, negative_pairs, tau)
+             if score > best_score:
+                 best_score = score
+                 best_tau = tau
+
+     return best_tau
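
To make the calibration flow above concrete, here is a small sketch that wires split_edges and learn_temperature together on synthetic data. The node count, embedding dimension, and edge list are made up for illustration, and because this hunk does not show the module's file path, the calibration functions are assumed to already be in scope rather than imported.

import numpy as np

# Synthetic stand-ins: 50 nodes with 16-dimensional fused embeddings and 200
# random (src_idx, dst_idx, edge_type, weight) edges -- illustrative values only.
num_nodes = 50
rng = np.random.default_rng(0)
embeddings = rng.standard_normal((num_nodes, 16))
edges = [(int(rng.integers(num_nodes)), int(rng.integers(num_nodes)), "calls", 1.0)
         for _ in range(200)]

# 80/20 split of edges, then grid-search the temperature on the held-out 20%.
train_edges, val_edges = split_edges(edges, train_ratio=0.8, random_state=42)
tau = learn_temperature(embeddings, val_edges, num_nodes,
                        num_negatives=5, n_jobs=1)  # n_jobs=1 takes the sequential path
print(f"calibrated temperature: {tau:.3f}")

# The learned tau scales dot-product similarities before they enter the softmax:
score = compute_calibrated_score(embeddings[0], embeddings[1], tau)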