tricoder-1.2.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,228 @@
+ """Context view: Node2Vec-style random walks and Word2Vec."""
+ import random
+ from multiprocessing import Pool, cpu_count
+ from typing import List, Tuple
+
+ import numpy as np
+ from gensim.models import Word2Vec
+ from gensim.models.keyedvectors import KeyedVectors
+
+
+ def _get_num_workers() -> int:
+     """Get number of workers (all cores - 1, minimum 1)."""
+     return max(1, cpu_count() - 1)
+
+
+ def _generate_walks_for_node(args):
+     """Generate walks for a single node (helper for multiprocessing)."""
+     start_node, adj_list, num_walks, walk_length, p, q, seed_offset = args
+     # Set seed for this worker
+     random.seed(42 + seed_offset + start_node)
+     np.random.seed(42 + seed_offset + start_node)
+
+     walks = []
+     for walk_idx in range(num_walks):
+         if not adj_list[start_node]:
+             continue
+
+         walk = [start_node]
+
+         for _ in range(walk_length - 1):
+             curr = walk[-1]
+             neighbors = adj_list[curr]
+
+             if not neighbors:
+                 break
+
+             if len(walk) == 1:
+                 next_node = random.choice(neighbors)[0]
+             else:
+                 prev = walk[-2]
+                 probs = []
+                 nodes = []
+
+                 for neighbor, weight in neighbors:
+                     nodes.append(neighbor)
+                     if neighbor == prev:
+                         prob = weight / p
+                     elif any(n == neighbor for n, _ in adj_list[prev]):  # neighbor adjacent to prev
+                         prob = weight
+                     else:
+                         prob = weight / q
+                     probs.append(max(prob, 1e-10))
+
+                 probs = np.array(probs)
+                 probs = probs / probs.sum()
+                 next_node = np.random.choice(nodes, p=probs)
+
+             walk.append(next_node)
+
+         walks.append([str(node) for node in walk])
+
+     return walks
+
+
+ def generate_random_walks(edges: List[Tuple[int, int, str, float]],
+                           num_nodes: int,
+                           num_walks: int = 10,
+                           walk_length: int = 80,
+                           p: float = 1.0,
+                           q: float = 1.0,
+                           random_state: int = 42,
+                           n_jobs: int = -1) -> List[List[str]]:
+     """
+     Generate Node2Vec-style random walks with multiprocessing support.
+
+     Args:
+         edges: list of (src_idx, dst_idx, relation, weight) tuples
+         num_nodes: number of nodes
+         num_walks: number of walks per node
+         walk_length: length of each walk
+         p: return parameter (1/p controls likelihood of returning)
+         q: in-out parameter (1/q controls likelihood of going further)
+         random_state: random seed
+         n_jobs: number of parallel jobs (-1 for all cores - 1)
+
+     Returns:
+         List of walks (each walk is a list of node indices as strings)
+     """
+     # Build adjacency list
+     adj_list = {i: [] for i in range(num_nodes)}
+     for src_idx, dst_idx, rel, weight in edges:
+         adj_list[src_idx].append((dst_idx, weight))
+         adj_list[dst_idx].append((src_idx, weight))
+
+     # Determine number of workers
+     if n_jobs == -1:
+         n_jobs = _get_num_workers()
+     n_jobs = max(1, min(n_jobs, num_nodes))
+
+     # Prepare arguments for parallel processing
+     nodes_to_process = [node for node in range(num_nodes) if adj_list[node]]
+
+     if n_jobs == 1 or len(nodes_to_process) < n_jobs:
+         # Sequential processing for small cases
+         random.seed(random_state)
+         np.random.seed(random_state)
+         walks = []
+         for _ in range(num_walks):
+             for start_node in nodes_to_process:
+                 walk = [start_node]
+                 for _ in range(walk_length - 1):
+                     curr = walk[-1]
+                     neighbors = adj_list[curr]
+                     if not neighbors:
+                         break
+
+                     if len(walk) == 1:
+                         next_node = random.choice(neighbors)[0]
+                     else:
+                         prev = walk[-2]
+                         probs = []
+                         nodes = []
+                         for neighbor, weight in neighbors:
+                             nodes.append(neighbor)
+                             if neighbor == prev:
+                                 prob = weight / p
+                             elif any(n == neighbor for n, _ in adj_list[prev]):  # neighbor adjacent to prev
+                                 prob = weight
+                             else:
+                                 prob = weight / q
+                             probs.append(max(prob, 1e-10))
+                         probs = np.array(probs)
+                         probs = probs / probs.sum()
+                         next_node = np.random.choice(nodes, p=probs)
+                     walk.append(next_node)
+                 walks.append([str(node) for node in walk])
+     else:
+         # Parallel processing
+         args_list = [
+             (node, adj_list, num_walks, walk_length, p, q, random_state + i)
+             for i, node in enumerate(nodes_to_process)
+         ]
+
+         # Use the standard Pool, which works on all platforms
+         chunksize = max(1, len(args_list) // (n_jobs * 2))
+         with Pool(processes=n_jobs) as pool:
+             results = pool.map(_generate_walks_for_node, args_list, chunksize=chunksize)
+
+         # Flatten results
+         walks = []
+         for result in results:
+             walks.extend(result)
+
+     return walks
+
+
+ def train_word2vec(walks: List[List[str]], dim: int, window: int = 7,
+                    negative: int = 3, epochs: int = 3, random_state: int = 42,
+                    n_jobs: int = -1, batch_words: int = 10000) -> KeyedVectors:
+     """
+     Train Word2Vec SkipGram model on random walks with multiprocessing.
+     Optimized defaults: window=7 (was 10), negative=3 (was 5) for faster training.
+
+     Args:
+         walks: list of walks (each walk is a list of node ID strings)
+         dim: embedding dimensionality
+         window: context window size (reduced default: 7)
+         negative: number of negative samples (reduced default: 3)
+         epochs: number of training epochs
+         random_state: random seed
+         n_jobs: number of parallel workers (-1 for all cores - 1)
+         batch_words: words per batch (larger = faster but more memory)
+
+     Returns:
+         Trained KeyedVectors model
+     """
+     if n_jobs == -1:
+         n_jobs = _get_num_workers()
+
+     # For gensim 4.x, use workers parameter
+     workers = max(1, n_jobs)
+
+     model = Word2Vec(
+         sentences=walks,
+         vector_size=dim,
+         window=window,
+         min_count=1,
+         workers=workers,
+         sg=1,  # SkipGram
+         negative=negative,
+         epochs=epochs,
+         seed=random_state,
+         batch_words=batch_words  # Larger batches for faster training
+     )
+
+     return model.wv
+
+
+ def compute_context_view(edges: List[Tuple[int, int, str, float]],
+                          num_nodes: int,
+                          dim: int,
+                          num_walks: int = 10,
+                          walk_length: int = 80,
+                          random_state: int = 42,
+                          n_jobs: int = -1) -> Tuple[np.ndarray, KeyedVectors]:
+     """
+     Compute context view embeddings using Node2Vec + Word2Vec with multiprocessing.
+
+     Returns:
+         embeddings: node embeddings from context view
+         keyed_vectors: Word2Vec KeyedVectors model
+     """
+     walks = generate_random_walks(edges, num_nodes, num_walks, walk_length,
+                                   random_state=random_state, n_jobs=n_jobs)
+     kv = train_word2vec(walks, dim, random_state=random_state, n_jobs=n_jobs)
+
+     # Extract embeddings for all nodes
+     embeddings = np.zeros((num_nodes, dim))
+     for i in range(num_nodes):
+         node_str = str(i)
+         if node_str in kv:
+             embeddings[i] = kv[node_str]
+         else:
+             # Initialize with small random values if node not seen
+             np.random.seed(random_state + i)
+             embeddings[i] = np.random.normal(0, 0.01, dim)
+
+     return embeddings, kv
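
As a point of reference, here is a minimal usage sketch for the context-view module in the hunk above. It is not part of the packaged code: the module path tricoder.context, the toy 4-node edge list, and the chosen dimensions are assumptions for illustration; only the (src_idx, dst_idx, relation, weight) tuple format and the function signatures come from the source.

# Hypothetical usage sketch (not part of the package). The module path
# "tricoder.context" is an assumption; function and argument names match
# the hunk above.
from tricoder.context import compute_context_view

# Edge tuples in the documented (src_idx, dst_idx, relation, weight) format
edges = [
    (0, 1, "calls", 1.0),
    (1, 2, "imports", 0.5),
    (2, 3, "contains", 1.0),
    (0, 3, "calls", 2.0),
]

embeddings, kv = compute_context_view(
    edges,
    num_nodes=4,
    dim=32,
    num_walks=10,     # walks started from every connected node
    walk_length=80,   # steps per walk; default p=q=1.0 gives plain weight-proportional walks
    random_state=42,
    n_jobs=1,         # forces the sequential path, avoiding Pool overhead on a tiny graph
)

assert embeddings.shape == (4, 32)
print(kv.most_similar("0", topn=2))  # node keys are stringified indices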
@@ -0,0 +1,144 @@
+ """Data loading utilities for TriVector Code Intelligence."""
+ import json
+ from collections import defaultdict
+ from typing import Dict, List, Tuple
+
+ from .subtoken_utils import extract_subtokens, get_file_hierarchy
+
+
+ def load_nodes(nodes_path: str) -> Tuple[
+         Dict[str, int], List[Dict], Dict[str, List[str]], Dict[str, Tuple[str, str, str]]]:
+     """
+     Load nodes from JSONL file.
+
+     Returns:
+         node_to_idx: mapping from node ID to index
+         node_metadata: list of node metadata dictionaries
+         node_subtokens: mapping from node_id to list of normalized subtokens
+         node_file_info: mapping from node_id to (file_name, directory_path, top_level_package)
+     """
+     node_to_idx = {}
+     node_metadata = []
+     node_subtokens = {}
+     node_file_info = {}
+
+     with open(nodes_path, 'r') as f:
+         for line in f:
+             if not line.strip():
+                 continue
+             node = json.loads(line)
+             node_id = node['id']
+             if node_id not in node_to_idx:
+                 idx = len(node_to_idx)
+                 node_to_idx[node_id] = idx
+
+                 # Extract name and metadata
+                 name = node.get('name', '')
+                 meta = node.get('meta', {})
+
+                 # Extract subtokens
+                 raw_subtokens, normalized_subtokens = extract_subtokens(name, normalize=True)
+                 node_subtokens[node_id] = normalized_subtokens
+
+                 # Extract file hierarchy
+                 file_path = meta.get('file', '') if isinstance(meta, dict) else ''
+                 file_name, directory_path, top_level_package = get_file_hierarchy(file_path)
+                 node_file_info[node_id] = (file_name, directory_path, top_level_package)
+
+                 # Store metadata with subtoken info for debugging
+                 meta_with_subtokens = meta.copy() if isinstance(meta, dict) else {}
+                 meta_with_subtokens['_raw_subtokens'] = raw_subtokens
+                 meta_with_subtokens['_normalized_subtokens'] = normalized_subtokens
+
+                 node_metadata.append({
+                     'id': node_id,
+                     'kind': node.get('kind', 'unknown'),
+                     'name': name,
+                     'meta': meta_with_subtokens
+                 })
+
+     return node_to_idx, node_metadata, node_subtokens, node_file_info
+
+
+ def load_edges(edges_path: str, node_to_idx: Dict[str, int]) -> Tuple[List[Tuple[int, int, str, float]], int]:
+     """
+     Load edges from JSONL file.
+
+     Returns:
+         edges: list of (src_idx, dst_idx, relation, weight) tuples
+         num_nodes: number of unique nodes
+     """
+     edges = []
+     seen_nodes = set()
+
+     with open(edges_path, 'r') as f:
+         for line in f:
+             if not line.strip():
+                 continue
+             edge = json.loads(line)
+             src_id = edge['src']
+             dst_id = edge['dst']
+
+             if src_id in node_to_idx and dst_id in node_to_idx:
+                 src_idx = node_to_idx[src_id]
+                 dst_idx = node_to_idx[dst_id]
+                 rel = edge.get('rel', 'unknown')
+                 weight = float(edge.get('weight', 1.0))
+                 edges.append((src_idx, dst_idx, rel, weight))
+                 seen_nodes.add(src_idx)
+                 seen_nodes.add(dst_idx)
+
+     num_nodes = len(node_to_idx)
+     return edges, num_nodes
+
+
+ def load_types(types_path: str, node_to_idx: Dict[str, int]) -> Tuple[
+         Dict[int, Dict[str, int]], Dict[str, int]]:
+     """
+     Load type tokens from JSONL file.
+
+     Returns:
+         node_types: mapping from node_idx to {type_token: count}
+         type_to_idx: mapping from type token to index
+     """
+     node_types = defaultdict(lambda: defaultdict(int))
+     type_to_idx = {}
+
+     with open(types_path, 'r') as f:
+         for line in f:
+             if not line.strip():
+                 continue
+             entry = json.loads(line)
+             symbol_id = entry['symbol']
+             type_token = entry['type_token']
+             count = int(entry.get('count', 1))
+
+             if symbol_id in node_to_idx:
+                 node_idx = node_to_idx[symbol_id]
+                 node_types[node_idx][type_token] += count
+
+                 if type_token not in type_to_idx:
+                     type_to_idx[type_token] = len(type_to_idx)
+
+     return dict(node_types), type_to_idx
+
+
+ def build_node_location_map(node_metadata: List[Dict]) -> Dict[int, Tuple[str, int]]:
+     """
+     Build mapping from node_idx to (file_path, line_number) for context window co-occurrence.
+
+     Args:
+         node_metadata: list of node metadata dictionaries
+
+     Returns:
+         Mapping from node_idx to (file_path, line_number)
+     """
+     location_map = {}
+     for idx, node_meta in enumerate(node_metadata):
+         meta = node_meta.get('meta', {})
+         if isinstance(meta, dict):
+             file_path = meta.get('file', '')
+             lineno = meta.get('lineno', -1)
+             if file_path and lineno >= 0:
+                 location_map[idx] = (file_path, lineno)
+     return location_map
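
To make the expected inputs concrete, the sketch below shows the JSONL record shapes these loaders consume and how their outputs feed the context view from the first file. It is illustrative only: the file names, record values, and the module paths tricoder.data and tricoder.context are assumptions; the field names ('id', 'kind', 'name', 'meta', 'src', 'dst', 'rel', 'weight', 'symbol', 'type_token', 'count') are taken from the code above.

# Hypothetical end-to-end sketch (not part of the package). Module paths are
# assumptions; the JSONL field names match the loaders above.
from tricoder.data import (
    load_nodes, load_edges, load_types, build_node_location_map
)
from tricoder.context import compute_context_view

# nodes.jsonl  -> {"id": "pkg.mod.parse_config", "kind": "function",
#                  "name": "parse_config",
#                  "meta": {"file": "pkg/mod.py", "lineno": 42}}
# edges.jsonl  -> {"src": "pkg.mod.parse_config", "dst": "pkg.util.read_file",
#                  "rel": "calls", "weight": 1.0}
# types.jsonl  -> {"symbol": "pkg.mod.parse_config",
#                  "type_token": "Dict[str, str]", "count": 2}

node_to_idx, node_metadata, node_subtokens, node_file_info = load_nodes("nodes.jsonl")
edges, num_nodes = load_edges("edges.jsonl", node_to_idx)
node_types, type_to_idx = load_types("types.jsonl", node_to_idx)
location_map = build_node_location_map(node_metadata)

# The (src_idx, dst_idx, relation, weight) edge list plugs straight into the
# context view defined in the first file of this diff.
embeddings, kv = compute_context_view(edges, num_nodes, dim=64, n_jobs=1)
print(embeddings.shape, len(type_to_idx), len(location_map))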