tricoder-1.2.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tricoder/__about__.py +6 -0
- tricoder/__init__.py +19 -0
- tricoder/calibration.py +276 -0
- tricoder/cli.py +890 -0
- tricoder/context_view.py +228 -0
- tricoder/data_loader.py +144 -0
- tricoder/extract.py +622 -0
- tricoder/fusion.py +203 -0
- tricoder/git_tracker.py +203 -0
- tricoder/gpu_utils.py +414 -0
- tricoder/graph_view.py +583 -0
- tricoder/model.py +476 -0
- tricoder/optimize.py +263 -0
- tricoder/subtoken_utils.py +196 -0
- tricoder/train.py +857 -0
- tricoder/typed_view.py +313 -0
- tricoder-1.2.8.dist-info/METADATA +306 -0
- tricoder-1.2.8.dist-info/RECORD +22 -0
- tricoder-1.2.8.dist-info/WHEEL +4 -0
- tricoder-1.2.8.dist-info/entry_points.txt +3 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE +56 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md +68 -0
tricoder/optimize.py
ADDED
@@ -0,0 +1,263 @@
"""Optimization utilities for filtering nodes and edges."""
import json
from collections import defaultdict
from typing import Dict, List, Optional, Set, Tuple
from .model import DEFAULT_EXCLUDED_KEYWORDS


def is_generic_name(name: str, kind: str) -> bool:
    """Check if a symbol name is too generic to be useful."""
    name_lower = name.lower().strip()

    # Single-character names (except for classes/functions, which might be intentionally short)
    if len(name_lower) <= 1 and kind in ['var', 'import']:
        return True

    # Very short names for variables
    if len(name_lower) <= 2 and kind == 'var':
        return True

    # Common generic names
    generic_names = {
        'var', 'variable', 'val', 'value', 'item', 'obj', 'object', 'data', 'result',
        'temp', 'tmp', 'arg', 'args', 'kwarg', 'kwargs', 'param', 'params', 'elem',
        'element', 'entry', 'record', 'row', 'col', 'column', 'idx', 'index', 'i', 'j', 'k',
        'x', 'y', 'z', 'n', 'm', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'a', 'b', 'c', 'd',
        'e', 'f', 'g', 'h', 'l', 'o', 'helper', 'util', 'utils', 'func', 'fn', 'cb', 'callback'
    }

    if name_lower in generic_names:
        return True

    return False
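For illustration, a few calls against this helper (a sketch, assuming the wheel is installed and importable as tricoder):

from tricoder.optimize import is_generic_name

print(is_generic_name("tmp", "var"))                 # True: in the generic-name set
print(is_generic_name("x", "function"))              # True: 'x' is listed as generic
print(is_generic_name("ab", "var"))                  # True: two-character variable name
print(is_generic_name("parse_config", "function"))   # False: descriptive name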
def optimize_nodes_and_edges(
    nodes_path: str,
    edges_path: str,
    types_path: Optional[str] = None,
    output_nodes: Optional[str] = None,
    output_edges: Optional[str] = None,
    output_types: Optional[str] = None,
    min_edge_weight: float = 0.3,
    remove_isolated: bool = True,
    remove_generic_names: bool = True,
    excluded_keywords: Optional[Set[str]] = None
) -> Tuple[int, int, int, Dict]:
    """
    Optimize nodes and edges by filtering out low-value entries.

    Args:
        nodes_path: Path to input nodes.jsonl
        edges_path: Path to input edges.jsonl
        types_path: Path to input types.jsonl (optional)
        output_nodes: Path to output nodes.jsonl (default: overwrites input)
        output_edges: Path to output edges.jsonl (default: overwrites input)
        output_types: Path to output types.jsonl (default: overwrites input)
        min_edge_weight: Minimum edge weight to keep
        remove_isolated: Whether to remove nodes with no edges
        remove_generic_names: Whether to remove nodes with generic names
        excluded_keywords: Additional keywords to exclude

    Returns:
        Tuple of (nodes_removed, edges_removed, types_removed, stats), where
        stats is a nested dict of original/final/removed counts and removal reasons.
    """
    if excluded_keywords is None:
        excluded_keywords = DEFAULT_EXCLUDED_KEYWORDS

    # Load all data
    nodes = []
    node_ids = set()

    with open(nodes_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            node = json.loads(line)
            nodes.append(node)
            node_ids.add(node['id'])

    edges = []
    with open(edges_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            edge = json.loads(line)
            edges.append(edge)

    types = []
    if types_path:
        try:
            with open(types_path, 'r') as f:
                for line in f:
                    if not line.strip():
                        continue
                    type_entry = json.loads(line)
                    types.append(type_entry)
        except FileNotFoundError:
            types = []

    original_node_count = len(nodes)
    original_edge_count = len(edges)
    original_type_count = len(types)

    # Build edge statistics
    node_edge_count = defaultdict(int)
    node_in_degree = defaultdict(int)
    node_out_degree = defaultdict(int)

    for edge in edges:
        src = edge['src']
        dst = edge['dst']
        weight = float(edge.get('weight', 1.0))

        if src in node_ids and dst in node_ids:
            node_edge_count[src] += 1
            node_edge_count[dst] += 1
            node_out_degree[src] += 1
            node_in_degree[dst] += 1

    # Filter nodes
    nodes_to_keep = set()
    nodes_to_remove = set()

    for node in nodes:
        node_id = node['id']
        name = node.get('name', '')
        kind = node.get('kind', '')

        # Check if node should be removed
        should_remove = False

        # Remove if name is in excluded keywords
        if name.lower() in excluded_keywords:
            should_remove = True

        # Remove generic names
        if remove_generic_names and is_generic_name(name, kind):
            should_remove = True

        # Remove isolated nodes (no edges)
        if remove_isolated and node_edge_count[node_id] == 0:
            # Keep file nodes and important structural nodes
            if kind not in ['file', 'class', 'function']:
                should_remove = True

        if should_remove:
            nodes_to_remove.add(node_id)
        else:
            nodes_to_keep.add(node_id)

    # Filter edges
    edges_to_keep = []
    for edge in edges:
        src = edge['src']
        dst = edge['dst']
        weight = float(edge.get('weight', 1.0))

        # Remove if either node is removed
        if src in nodes_to_remove or dst in nodes_to_remove:
            continue

        # Remove low-weight edges
        if weight < min_edge_weight:
            continue

        edges_to_keep.append(edge)

    # Filter types (only keep types for nodes that are kept)
    types_to_keep = []
    for type_entry in types:
        symbol_id = type_entry.get('symbol', '')
        if symbol_id in nodes_to_keep:
            types_to_keep.append(type_entry)

    # Filter nodes (only keep nodes that are kept)
    filtered_nodes = [node for node in nodes if node['id'] in nodes_to_keep]

    # Write optimized files
    output_nodes_path = output_nodes or nodes_path
    output_edges_path = output_edges or edges_path
    output_types_path = output_types or types_path

    with open(output_nodes_path, 'w') as f:
        for node in filtered_nodes:
            f.write(json.dumps(node) + '\n')

    with open(output_edges_path, 'w') as f:
        for edge in edges_to_keep:
            f.write(json.dumps(edge) + '\n')

    if output_types_path and types_to_keep:
        with open(output_types_path, 'w') as f:
            for type_entry in types_to_keep:
                f.write(json.dumps(type_entry) + '\n')

    nodes_removed = original_node_count - len(filtered_nodes)
    edges_removed = original_edge_count - len(edges_to_keep)
    types_removed = original_type_count - len(types_to_keep)

    # Calculate detailed statistics
    stats = {
        'original': {
            'nodes': original_node_count,
            'edges': original_edge_count,
            'types': original_type_count
        },
        'final': {
            'nodes': len(filtered_nodes),
            'edges': len(edges_to_keep),
            'types': len(types_to_keep)
        },
        'removed': {
            'nodes': nodes_removed,
            'edges': edges_removed,
            'types': types_removed
        },
        'removal_reasons': {
            'excluded_keywords': 0,
            'generic_names': 0,
            'isolated': 0,
            'low_weight_edges': 0,
            'orphaned_edges': 0  # Edges removed because nodes were removed
        },
        'by_kind': defaultdict(lambda: {'original': 0, 'removed': 0, 'final': 0})
    }

    # Count removal reasons
    for node in nodes:
        node_id = node['id']
        name = node.get('name', '')
        kind = node.get('kind', '')

        stats['by_kind'][kind]['original'] += 1

        if node_id in nodes_to_remove:
            stats['by_kind'][kind]['removed'] += 1
            if name.lower() in excluded_keywords:
                stats['removal_reasons']['excluded_keywords'] += 1
            if remove_generic_names and is_generic_name(name, kind):
                stats['removal_reasons']['generic_names'] += 1
            if remove_isolated and node_edge_count[node_id] == 0:
                stats['removal_reasons']['isolated'] += 1
        else:
            stats['by_kind'][kind]['final'] += 1

    # Count edge removal reasons
    orphaned_edges = 0
    low_weight_edges = 0
    for edge in edges:
        src = edge['src']
        dst = edge['dst']
        weight = float(edge.get('weight', 1.0))

        if src in nodes_to_remove or dst in nodes_to_remove:
            orphaned_edges += 1
        elif weight < min_edge_weight:
            low_weight_edges += 1

    stats['removal_reasons']['orphaned_edges'] = orphaned_edges
    stats['removal_reasons']['low_weight_edges'] = low_weight_edges

    return nodes_removed, edges_removed, types_removed, stats
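Taken together, the module reads JSONL graph files, drops low-value symbols and edges, and rewrites the files in place (or to the given output paths). A minimal usage sketch; the fixture file names are hypothetical, and it assumes the package is installed:

import json
from tricoder.optimize import optimize_nodes_and_edges

# Hypothetical fixture: two connected nodes plus one generic, isolated variable.
with open("nodes.jsonl", "w") as f:
    for node in [
        {"id": "n1", "name": "load_config", "kind": "function"},
        {"id": "n2", "name": "ConfigLoader", "kind": "class"},
        {"id": "n3", "name": "tmp", "kind": "var"},  # generic name, expected to be dropped
    ]:
        f.write(json.dumps(node) + "\n")

with open("edges.jsonl", "w") as f:
    f.write(json.dumps({"src": "n1", "dst": "n2", "weight": 0.9}) + "\n")

nodes_removed, edges_removed, types_removed, stats = optimize_nodes_and_edges(
    "nodes.jsonl", "edges.jsonl", min_edge_weight=0.3
)
print(nodes_removed, edges_removed, types_removed)  # expect: 1 0 0
print(stats["removal_reasons"])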
tricoder/subtoken_utils.py
ADDED
@@ -0,0 +1,196 @@
"""Subtoken extraction and normalization utilities."""
import re
import string
from typing import List, Tuple


class SimpleStemmer:
    """Lightweight Porter-like stemmer for code subtokens."""

    def __init__(self):
        # Common suffixes to remove
        self.suffixes = [
            ('ing', ''),
            ('ed', ''),
            ('er', ''),
            ('est', ''),
            ('ly', ''),
            ('tion', ''),
            ('sion', ''),
            ('ness', ''),
            ('ment', ''),
            ('able', ''),
            ('ible', ''),
            ('ful', ''),
            ('less', ''),
            ('ize', ''),
            ('ise', ''),
        ]

    def stem(self, word: str) -> str:
        """Apply stemming to a word."""
        if len(word) <= 3:
            return word

        word_lower = word.lower()

        # Try to remove suffixes
        for suffix, replacement in self.suffixes:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 2:
                return word_lower[:-len(suffix)] + replacement

        return word_lower
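The stemmer is intentionally crude: it strips at most one suffix and never recurses. A few illustrative calls (a sketch, assuming the package is importable):

from tricoder.subtoken_utils import SimpleStemmer

stemmer = SimpleStemmer()
print(stemmer.stem("parsing"))    # 'pars'    ('ing' stripped)
print(stemmer.stem("loader"))     # 'load'    ('er' stripped)
print(stemmer.stem("run"))        # 'run'     (3 characters or fewer are left alone)
print(stemmer.stem("Tokenizer"))  # 'tokeniz' (lowercased, then 'er' stripped)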
def split_camel_case(name: str) -> List[str]:
    """Split camelCase or PascalCase identifier into tokens."""
    # Accumulate characters, starting a new token at lowercase-to-uppercase boundaries.
    # Handle sequences of uppercase letters (e.g., "XMLParser" -> "XML", "Parser")
    tokens = []
    current_token = ""

    for char in name:
        if char.isupper():
            if current_token and not current_token[-1].isupper():
                # End of lowercase token, start new uppercase token
                tokens.append(current_token)
                current_token = char
            else:
                # Continue uppercase sequence
                current_token += char
        elif char.islower() or char.isdigit():
            if current_token and current_token[-1].isupper() and len(current_token) > 1:
                # End of uppercase sequence (except last char), start new token
                tokens.append(current_token[:-1])
                current_token = current_token[-1] + char
            else:
                current_token += char
        else:
            # Non-alphanumeric character: flush the current token
            if current_token:
                tokens.append(current_token)
                current_token = ""

    if current_token:
        tokens.append(current_token)

    return [t for t in tokens if t]


def split_snake_case(name: str) -> List[str]:
    """Split snake_case identifier into tokens."""
    return [t for t in name.split('_') if t]


def split_kebab_case(name: str) -> List[str]:
    """Split kebab-case identifier into tokens."""
    return [t for t in name.split('-') if t]


def split_numeric(name: str) -> List[str]:
    """Split identifier at numeric boundaries."""
    # Split on transitions between digits and non-digits
    parts = re.split(r'(\d+)', name)
    tokens = []
    for part in parts:
        if part:
            if part.isdigit():
                tokens.append(part)
            else:
                # Further split non-numeric parts by case changes
                camel_tokens = split_camel_case(part)
                tokens.extend(camel_tokens)
    return [t for t in tokens if t]
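The splitters compose: split_numeric defers to split_camel_case for the non-digit runs. For example (assuming the package is importable):

from tricoder.subtoken_utils import split_camel_case, split_numeric, split_snake_case

print(split_camel_case("XMLParser"))    # ['XML', 'Parser']
print(split_camel_case("getHTTPCode"))  # ['get', 'HTTP', 'Code']
print(split_numeric("sha256Hash"))      # ['sha', '256', 'Hash']
print(split_snake_case("load_config"))  # ['load', 'config']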
def extract_subtokens(symbol_name: str, normalize: bool = True) -> Tuple[List[str], List[str]]:
    """
    Extract subtokens from a symbol name.

    Args:
        symbol_name: name of the symbol
        normalize: whether to normalize subtokens (lowercase + stem)

    Returns:
        Tuple of (raw_subtokens, normalized_subtokens)
    """
    if not symbol_name:
        return [], []

    # Try different splitting strategies
    tokens = set()

    # Split by common delimiters first
    parts = re.split(r'[_\-\s\.]+', symbol_name)
    for part in parts:
        if not part:
            continue

        # Try camelCase splitting
        camel_tokens = split_camel_case(part)
        tokens.update(camel_tokens)

        # Try numeric splitting
        numeric_tokens = split_numeric(part)
        tokens.update(numeric_tokens)

    # Also try direct snake_case and kebab-case
    snake_tokens = split_snake_case(symbol_name)
    tokens.update(snake_tokens)

    kebab_tokens = split_kebab_case(symbol_name)
    tokens.update(kebab_tokens)

    # Filter out empty tokens
    raw_tokens = [t for t in tokens if len(t) > 0]

    # Normalize if requested
    normalized_tokens = []
    if normalize:
        stemmer = SimpleStemmer()
        for token in raw_tokens:
            # Remove punctuation
            cleaned = token.strip(string.punctuation)
            if cleaned:
                # Lowercase and stem
                normalized = stemmer.stem(cleaned.lower())
                if normalized:
                    normalized_tokens.append(normalized)
    else:
        normalized_tokens = raw_tokens

    return raw_tokens, normalized_tokens
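Note that the raw token set also keeps the whole identifier whenever the snake/kebab splits find no delimiter, so callers get both fine-grained and whole-name tokens. Roughly (token order is set-dependent, hence the sorting):

from tricoder.subtoken_utils import extract_subtokens

raw, normalized = extract_subtokens("getHTTPResponse")
print(sorted(raw))         # ['HTTP', 'Response', 'get', 'getHTTPResponse']
print(sorted(normalized))  # ['get', 'gethttpresponse', 'http', 'response']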
def get_file_hierarchy(file_path: str) -> Tuple[str, str, str]:
    """
    Extract file hierarchy information.

    Args:
        file_path: path to file

    Returns:
        Tuple of (file_name, directory_path, top_level_package)
    """
    if not file_path:
        return "", "", ""

    # Normalize path separators
    normalized = file_path.replace('\\', '/')

    # Extract components
    parts = normalized.split('/')
    file_name = parts[-1] if parts else ""

    # Directory is everything except the filename
    directory_path = '/'.join(parts[:-1]) if len(parts) > 1 else ""

    # Top-level package is the first non-empty component
    top_level = ""
    for part in parts:
        if part and part not in ['.', '..']:
            top_level = part
            break

    return file_name, directory_path, top_level
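get_file_hierarchy is the module's one path-oriented helper; a short sketch of its behavior (the Windows-style path is hypothetical):

from tricoder.subtoken_utils import get_file_hierarchy

print(get_file_hierarchy("tricoder/gpu_utils.py"))
# ('gpu_utils.py', 'tricoder', 'tricoder')
print(get_file_hierarchy(r"src\app\main.py"))
# ('main.py', 'src/app', 'src')  -- backslashes are normalized first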