tricoder-1.2.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tricoder/__about__.py +6 -0
- tricoder/__init__.py +19 -0
- tricoder/calibration.py +276 -0
- tricoder/cli.py +890 -0
- tricoder/context_view.py +228 -0
- tricoder/data_loader.py +144 -0
- tricoder/extract.py +622 -0
- tricoder/fusion.py +203 -0
- tricoder/git_tracker.py +203 -0
- tricoder/gpu_utils.py +414 -0
- tricoder/graph_view.py +583 -0
- tricoder/model.py +476 -0
- tricoder/optimize.py +263 -0
- tricoder/subtoken_utils.py +196 -0
- tricoder/train.py +857 -0
- tricoder/typed_view.py +313 -0
- tricoder-1.2.8.dist-info/METADATA +306 -0
- tricoder-1.2.8.dist-info/RECORD +22 -0
- tricoder-1.2.8.dist-info/WHEEL +4 -0
- tricoder-1.2.8.dist-info/entry_points.txt +3 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE +56 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md +68 -0
tricoder/context_view.py
ADDED
@@ -0,0 +1,228 @@
"""Context view: Node2Vec-style random walks and Word2Vec."""
import random
from multiprocessing import Pool, cpu_count
from typing import List, Tuple

import numpy as np
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors


def _get_num_workers() -> int:
    """Get number of workers (all cores - 1, minimum 1)."""
    return max(1, cpu_count() - 1)


def _generate_walks_for_node(args):
    """Generate walks for a single node (helper for multiprocessing)."""
    start_node, adj_list, num_walks, walk_length, p, q, seed_offset = args
    # Set seed for this worker
    random.seed(42 + seed_offset + start_node)
    np.random.seed(42 + seed_offset + start_node)

    walks = []
    for walk_idx in range(num_walks):
        if not adj_list[start_node]:
            continue

        walk = [start_node]

        for _ in range(walk_length - 1):
            curr = walk[-1]
            neighbors = adj_list[curr]

            if not neighbors:
                break

            if len(walk) == 1:
                next_node = random.choice(neighbors)[0]
            else:
                prev = walk[-2]
                probs = []
                nodes = []

                for neighbor, weight in neighbors:
                    nodes.append(neighbor)
                    if neighbor == prev:
                        prob = weight / p
                    elif neighbor in adj_list[prev]:
                        prob = weight
                    else:
                        prob = weight / q
                    probs.append(max(prob, 1e-10))

                probs = np.array(probs)
                probs = probs / probs.sum()
                next_node = np.random.choice(nodes, p=probs)

            walk.append(next_node)

        walks.append([str(node) for node in walk])

    return walks


def generate_random_walks(edges: List[Tuple[int, int, str, float]],
                          num_nodes: int,
                          num_walks: int = 10,
                          walk_length: int = 80,
                          p: float = 1.0,
                          q: float = 1.0,
                          random_state: int = 42,
                          n_jobs: int = -1) -> List[List[int]]:
    """
    Generate Node2Vec-style random walks with multiprocessing support.

    Args:
        edges: list of (src_idx, dst_idx, relation, weight) tuples
        num_nodes: number of nodes
        num_walks: number of walks per node
        walk_length: length of each walk
        p: return parameter (1/p controls likelihood of returning)
        q: in-out parameter (1/q controls likelihood of going further)
        random_state: random seed
        n_jobs: number of parallel jobs (-1 for all cores - 1)

    Returns:
        List of walks (each walk is a list of node indices)
    """
    # Build adjacency list
    adj_list = {i: [] for i in range(num_nodes)}
    for src_idx, dst_idx, rel, weight in edges:
        adj_list[src_idx].append((dst_idx, weight))
        adj_list[dst_idx].append((src_idx, weight))

    # Determine number of workers
    if n_jobs == -1:
        n_jobs = _get_num_workers()
    n_jobs = max(1, min(n_jobs, num_nodes))

    # Prepare arguments for parallel processing
    nodes_to_process = [node for node in range(num_nodes) if adj_list[node]]

    if n_jobs == 1 or len(nodes_to_process) < n_jobs:
        # Sequential processing for small cases
        random.seed(random_state)
        np.random.seed(random_state)
        walks = []
        for _ in range(num_walks):
            for start_node in nodes_to_process:
                walk = [start_node]
                for _ in range(walk_length - 1):
                    curr = walk[-1]
                    neighbors = adj_list[curr]
                    if not neighbors:
                        break

                    if len(walk) == 1:
                        next_node = random.choice(neighbors)[0]
                    else:
                        prev = walk[-2]
                        probs = []
                        nodes = []
                        for neighbor, weight in neighbors:
                            nodes.append(neighbor)
                            if neighbor == prev:
                                prob = weight / p
                            elif neighbor in adj_list[prev]:
                                prob = weight
                            else:
                                prob = weight / q
                            probs.append(max(prob, 1e-10))
                        probs = np.array(probs)
                        probs = probs / probs.sum()
                        next_node = np.random.choice(nodes, p=probs)
                    walk.append(next_node)
                walks.append([str(node) for node in walk])
    else:
        # Parallel processing
        args_list = [
            (node, adj_list, num_walks, walk_length, p, q, random_state + i)
            for i, node in enumerate(nodes_to_process)
        ]

        # Parallel processing - use standard Pool which works on all platforms
        chunksize = max(1, len(args_list) // (n_jobs * 2))
        with Pool(processes=n_jobs) as pool:
            results = pool.map(_generate_walks_for_node, args_list, chunksize=chunksize)

        # Flatten results
        walks = []
        for result in results:
            walks.extend(result)

    return walks


def train_word2vec(walks: List[List[str]], dim: int, window: int = 7,
                   negative: int = 3, epochs: int = 3, random_state: int = 42,
                   n_jobs: int = -1, batch_words: int = 10000) -> KeyedVectors:
    """
    Train Word2Vec SkipGram model on random walks with multiprocessing.
    Optimized defaults: window=7 (was 10), negative=3 (was 5) for faster training.

    Args:
        walks: list of walks (each walk is a list of node ID strings)
        dim: embedding dimensionality
        window: context window size (reduced default: 7)
        negative: number of negative samples (reduced default: 3)
        epochs: number of training epochs
        random_state: random seed
        n_jobs: number of parallel workers (-1 for all cores - 1)
        batch_words: words per batch (larger = faster but more memory)

    Returns:
        Trained KeyedVectors model
    """
    if n_jobs == -1:
        n_jobs = _get_num_workers()

    # For gensim 4.x, use workers parameter
    workers = max(1, n_jobs)

    model = Word2Vec(
        sentences=walks,
        vector_size=dim,
        window=window,
        min_count=1,
        workers=workers,
        sg=1,  # SkipGram
        negative=negative,
        epochs=epochs,
        seed=random_state,
        batch_words=batch_words  # Larger batches for faster training
    )

    return model.wv


def compute_context_view(edges: List[Tuple[int, int, str, float]],
                         num_nodes: int,
                         dim: int,
                         num_walks: int = 10,
                         walk_length: int = 80,
                         random_state: int = 42,
                         n_jobs: int = -1) -> Tuple[np.ndarray, KeyedVectors]:
    """
    Compute context view embeddings using Node2Vec + Word2Vec with multiprocessing.

    Returns:
        embeddings: node embeddings from context view
        keyed_vectors: Word2Vec KeyedVectors model
    """
    walks = generate_random_walks(edges, num_nodes, num_walks, walk_length,
                                  random_state=random_state, n_jobs=n_jobs)
    kv = train_word2vec(walks, dim, random_state=random_state, n_jobs=n_jobs)

    # Extract embeddings for all nodes
    embeddings = np.zeros((num_nodes, dim))
    for i in range(num_nodes):
        node_str = str(i)
        if node_str in kv:
            embeddings[i] = kv[node_str]
        else:
            # Initialize with small random values if node not seen
            np.random.seed(random_state + i)
            embeddings[i] = np.random.normal(0, 0.01, dim)

    return embeddings, kv
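Usage note (not part of the package diff): a minimal sketch of how compute_context_view might be called, assuming the package and its declared dependencies (numpy, gensim) are installed. The toy edge list and parameter values below are invented for illustration.

# Hypothetical 4-node graph: (src_idx, dst_idx, relation, weight)
from tricoder.context_view import compute_context_view

edges = [
    (0, 1, "calls", 1.0),
    (1, 2, "calls", 1.0),
    (2, 3, "imports", 0.5),
    (3, 0, "imports", 0.5),
]

# n_jobs=1 forces the sequential walk generator, avoiding a multiprocessing
# Pool for a graph this small.
embeddings, kv = compute_context_view(
    edges, num_nodes=4, dim=16, num_walks=5, walk_length=10, n_jobs=1
)
print(embeddings.shape)      # (4, 16)
print(kv.most_similar("0"))  # nearest neighbours of node 0 in walk space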
tricoder/data_loader.py
ADDED
@@ -0,0 +1,144 @@
"""Data loading utilities for TriVector Code Intelligence."""
import json
from collections import defaultdict
from typing import Dict, List, Tuple

from .subtoken_utils import extract_subtokens, get_file_hierarchy


def load_nodes(nodes_path: str) -> Tuple[
    Dict[str, int], List[Dict], Dict[str, List[str]], Dict[str, Tuple[str, str, str]]]:
    """
    Load nodes from JSONL file.

    Returns:
        node_to_idx: mapping from node ID to index
        node_metadata: list of node metadata dictionaries
        node_subtokens: mapping from node_id to list of normalized subtokens
        node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
    """
    node_to_idx = {}
    node_metadata = []
    node_subtokens = {}
    node_file_info = {}

    with open(nodes_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            node = json.loads(line)
            node_id = node['id']
            if node_id not in node_to_idx:
                idx = len(node_to_idx)
                node_to_idx[node_id] = idx

                # Extract name and metadata
                name = node.get('name', '')
                meta = node.get('meta', {})

                # Extract subtokens
                raw_subtokens, normalized_subtokens = extract_subtokens(name, normalize=True)
                node_subtokens[node_id] = normalized_subtokens

                # Extract file hierarchy
                file_path = meta.get('file', '') if isinstance(meta, dict) else ''
                file_name, directory_path, top_level_package = get_file_hierarchy(file_path)
                node_file_info[node_id] = (file_name, directory_path, top_level_package)

                # Store metadata with subtoken info for debugging
                meta_with_subtokens = meta.copy() if isinstance(meta, dict) else {}
                meta_with_subtokens['_raw_subtokens'] = raw_subtokens
                meta_with_subtokens['_normalized_subtokens'] = normalized_subtokens

                node_metadata.append({
                    'id': node_id,
                    'kind': node.get('kind', 'unknown'),
                    'name': name,
                    'meta': meta_with_subtokens
                })

    return node_to_idx, node_metadata, node_subtokens, node_file_info


def load_edges(edges_path: str, node_to_idx: Dict[str, int]) -> Tuple[List[Tuple[int, int, str, float]], int]:
    """
    Load edges from JSONL file.

    Returns:
        edges: list of (src_idx, dst_idx, relation, weight) tuples
        num_nodes: number of unique nodes
    """
    edges = []
    seen_nodes = set()

    with open(edges_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            edge = json.loads(line)
            src_id = edge['src']
            dst_id = edge['dst']

            if src_id in node_to_idx and dst_id in node_to_idx:
                src_idx = node_to_idx[src_id]
                dst_idx = node_to_idx[dst_id]
                rel = edge.get('rel', 'unknown')
                weight = float(edge.get('weight', 1.0))
                edges.append((src_idx, dst_idx, rel, weight))
                seen_nodes.add(src_idx)
                seen_nodes.add(dst_idx)

    num_nodes = len(node_to_idx)
    return edges, num_nodes


def load_types(types_path: str, node_to_idx: Dict[str, int]) -> Tuple[
    Dict[int, Dict[str, int]], Dict[str, int]]:
    """
    Load type tokens from JSONL file.

    Returns:
        node_types: mapping from node_idx to {type_token: count}
        type_to_idx: mapping from type token to index
    """
    node_types = defaultdict(lambda: defaultdict(int))
    type_to_idx = {}

    with open(types_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            entry = json.loads(line)
            symbol_id = entry['symbol']
            type_token = entry['type_token']
            count = int(entry.get('count', 1))

            if symbol_id in node_to_idx:
                node_idx = node_to_idx[symbol_id]
                node_types[node_idx][type_token] += count

                if type_token not in type_to_idx:
                    type_to_idx[type_token] = len(type_to_idx)

    return dict(node_types), type_to_idx


def build_node_location_map(node_metadata: List[Dict]) -> Dict[int, Tuple[str, int]]:
    """
    Build mapping from node_idx to (file_path, line_number) for context window co-occurrence.

    Args:
        node_metadata: list of node metadata dictionaries

    Returns:
        Mapping from node_idx to (file_path, line_number)
    """
    location_map = {}
    for idx, node_meta in enumerate(node_metadata):
        meta = node_meta.get('meta', {})
        if isinstance(meta, dict):
            file_path = meta.get('file', '')
            lineno = meta.get('lineno', -1)
            if file_path and lineno >= 0:
                location_map[idx] = (file_path, lineno)
    return location_map
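Usage note (not part of the package diff): the record shapes these loaders expect can be inferred from the field accessors above (nodes: id/kind/name/meta with meta.file and meta.lineno; edges: src/dst/rel/weight; types: symbol/type_token/count). The sketch below writes two invented JSONL files and loads them; it assumes the tricoder package is installed so that load_nodes can use subtoken_utils.

import json
import os
import tempfile

from tricoder.data_loader import load_nodes, load_edges

# Invented example records matching the fields read by the loaders.
nodes = [
    {"id": "pkg.mod.foo", "kind": "function", "name": "parse_config",
     "meta": {"file": "pkg/mod.py", "lineno": 12}},
    {"id": "pkg.mod.Bar", "kind": "class", "name": "ConfigLoader",
     "meta": {"file": "pkg/mod.py", "lineno": 40}},
]
edge_records = [
    {"src": "pkg.mod.Bar", "dst": "pkg.mod.foo", "rel": "calls", "weight": 1.0},
]

tmp = tempfile.mkdtemp()
nodes_path = os.path.join(tmp, "nodes.jsonl")
edges_path = os.path.join(tmp, "edges.jsonl")
with open(nodes_path, "w") as f:
    f.writelines(json.dumps(n) + "\n" for n in nodes)
with open(edges_path, "w") as f:
    f.writelines(json.dumps(e) + "\n" for e in edge_records)

node_to_idx, node_metadata, node_subtokens, node_file_info = load_nodes(nodes_path)
edges, num_nodes = load_edges(edges_path, node_to_idx)
print(node_to_idx)  # {'pkg.mod.foo': 0, 'pkg.mod.Bar': 1}
print(edges)        # [(1, 0, 'calls', 1.0)]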