tricoder 1.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tricoder/__about__.py +6 -0
- tricoder/__init__.py +19 -0
- tricoder/calibration.py +276 -0
- tricoder/cli.py +890 -0
- tricoder/context_view.py +228 -0
- tricoder/data_loader.py +144 -0
- tricoder/extract.py +622 -0
- tricoder/fusion.py +203 -0
- tricoder/git_tracker.py +203 -0
- tricoder/gpu_utils.py +414 -0
- tricoder/graph_view.py +583 -0
- tricoder/model.py +476 -0
- tricoder/optimize.py +263 -0
- tricoder/subtoken_utils.py +196 -0
- tricoder/train.py +857 -0
- tricoder/typed_view.py +313 -0
- tricoder-1.2.8.dist-info/METADATA +306 -0
- tricoder-1.2.8.dist-info/RECORD +22 -0
- tricoder-1.2.8.dist-info/WHEEL +4 -0
- tricoder-1.2.8.dist-info/entry_points.txt +3 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE +56 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md +68 -0
tricoder/typed_view.py
ADDED
@@ -0,0 +1,313 @@
"""Typed view: symbol × type-token matrix, PPMI, and SVD."""

import re
from multiprocessing import cpu_count
from typing import Dict, Tuple, List

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD


def _get_num_workers() -> int:
    """Get number of workers (all cores - 1, minimum 1)."""
    return max(1, cpu_count() - 1)

def parse_composite_type(type_token: str) -> Tuple[List[str], List[str]]:
    """
    Parse composite type into constructor tokens and primitive tokens.

    Examples:
        List[int] -> (['List'], ['int'])
        Dict[str, int] -> (['Dict'], ['str', 'int'])
        Optional[T] -> (['Optional'], ['T'])
        List[Dict[str, int]] -> (['List', 'Dict'], ['str', 'int'])

    Args:
        type_token: type token string

    Returns:
        Tuple of (constructor_tokens, primitive_tokens)
    """
    constructors = []
    primitives = []

    # Extract generic type constructors (List, Dict, Optional, Set, Tuple, etc.)
    # Pattern: ConstructorName[content]. The bracket content is matched greedily
    # so that nested generics like List[Dict[str, int]] keep their full inner
    # content for the recursive parse below.
    generic_pattern = r'([A-Z][a-zA-Z0-9_]*)\s*\[(.+)\]'

    # Find all generic types
    matches = list(re.finditer(generic_pattern, type_token))

    if matches:
        # Extract constructors
        for match in matches:
            constructor = match.group(1)
            constructors.append(constructor)

            # Recursively parse inner content
            inner_content = match.group(2)
            inner_constructors, inner_primitives = parse_composite_type(inner_content)
            constructors.extend(inner_constructors)
            primitives.extend(inner_primitives)

        # Extract remaining primitives (not in generic brackets)
        remaining = type_token
        for match in reversed(matches):
            remaining = remaining[:match.start()] + remaining[match.end():]

        # Split by comma and extract primitives
        for part in remaining.split(','):
            part = part.strip()
            if part and not re.match(r'^[A-Z][a-zA-Z0-9_]*\s*\[', part):
                # Check if it's a primitive (lowercase or single letter)
                if part[0].islower() or (len(part) == 1 and part.isalpha()):
                    primitives.append(part)
    else:
        # No generic types, check if it's a primitive or simple type
        parts = [p.strip() for p in type_token.split(',')]
        for part in parts:
            if part:
                # If it starts with uppercase and is not a known primitive, treat
                # it as a constructor. Single letters (type variables such as T)
                # count as primitives, per the docstring examples above.
                if part[0].isupper() and len(part) > 1 and part not in ['int', 'str', 'float', 'bool', 'None']:
                    constructors.append(part)
                else:
                    primitives.append(part)

    return constructors, primitives

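
# Editor's sketch (not part of the released file): the docstring examples
# can be checked interactively; expected results follow the docstring:
#
#   >>> parse_composite_type("Dict[str, int]")
#   (['Dict'], ['str', 'int'])
#   >>> parse_composite_type("List[Dict[str, int]]")
#   (['List', 'Dict'], ['str', 'int'])
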
def expand_type_semantics(node_types: Dict[int, Dict[str, int]],
                          type_to_idx: Dict[str, int],
                          num_nodes: int) -> Tuple[Dict[int, Dict[str, int]], Dict[str, int]]:
    """
    Expand type tokens into constructors and primitives.

    Args:
        node_types: mapping from node_idx to {type_token: count}
        type_to_idx: mapping from type token to index
        num_nodes: number of nodes (currently unused)

    Returns:
        Tuple of (expanded_node_types, expanded_type_to_idx)
    """
    expanded_node_types = {}
    expanded_type_to_idx = dict(type_to_idx)

    # Start with existing type indices
    current_idx = len(expanded_type_to_idx)

    # Process each node's types
    for node_idx, types_dict in node_types.items():
        expanded_types = dict(types_dict)  # Start with original types

        for type_token, count in types_dict.items():
            # Parse composite type
            constructors, primitives = parse_composite_type(type_token)

            # Add constructor tokens at half the original weight
            for constructor in constructors:
                if constructor not in expanded_type_to_idx:
                    expanded_type_to_idx[constructor] = current_idx
                    current_idx += 1
                expanded_types[constructor] = expanded_types.get(constructor, 0) + int(count * 0.5)

            # Add primitive tokens at a quarter of the original weight
            for primitive in primitives:
                if primitive not in expanded_type_to_idx:
                    expanded_type_to_idx[primitive] = current_idx
                    current_idx += 1
                expanded_types[primitive] = expanded_types.get(primitive, 0) + int(count * 0.25)

        expanded_node_types[node_idx] = expanded_types

    return expanded_node_types, expanded_type_to_idx

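
# Editor's sketch (not part of the released file): with the weights above, a
# node seen 4 times with the annotation 'List[int]' gains fractional credit
# for its parts:
#
#   {'List[int]': 4}  ->  {'List[int]': 4, 'List': int(4 * 0.5), 'int': int(4 * 0.25)}
#                      =  {'List[int]': 4, 'List': 2, 'int': 1}
#
# Note that int() truncation zeroes out the credit at low counts: a single
# 'List[int]' occurrence adds 'List': 0 and 'int': 0.
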
def build_type_matrix(node_types: Dict[int, Dict[str, int]],
                      type_to_idx: Dict[str, int],
                      num_nodes: int,
                      expand_types: bool = True) -> Tuple[sparse.csr_matrix, Dict[str, int]]:
    """
    Build sparse symbol × type-token matrix with optional type expansion.

    Args:
        node_types: mapping from node_idx to {type_token: count}
        type_to_idx: mapping from type token to index
        num_nodes: number of nodes (may include subtokens, but node_types only has original nodes)
        expand_types: whether to expand composite types

    Returns:
        Tuple of (type_matrix, final_type_to_idx)
    """
    final_node_types = node_types
    final_type_to_idx = type_to_idx

    if expand_types:
        # Use original num_nodes (before subtokens) for expansion
        original_num_nodes = max(node_types.keys()) + 1 if node_types else 0
        final_node_types, final_type_to_idx = expand_type_semantics(
            node_types, type_to_idx, original_num_nodes
        )

    num_types = len(final_type_to_idx)
    rows = []
    cols = []
    data = []

    for node_idx, types_dict in final_node_types.items():
        for type_token, count in types_dict.items():
            if type_token in final_type_to_idx:
                type_idx = final_type_to_idx[type_token]
                rows.append(node_idx)
                cols.append(type_idx)
                data.append(float(count))

    # Pad matrix to num_nodes rows (subtokens will have zero rows)
    type_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(num_nodes, num_types))
    return type_matrix, final_type_to_idx

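
# Editor's sketch (not part of the released file): a two-node toy input and
# the dense view of the matrix it produces (expansion disabled for brevity):
#
#   node_types  = {0: {'List[int]': 2}, 1: {'str': 1}}
#   type_to_idx = {'List[int]': 0, 'str': 1}
#   m, idx = build_type_matrix(node_types, type_to_idx, num_nodes=3, expand_types=False)
#   m.toarray()  ->  [[2., 0.],
#                     [0., 1.],
#                     [0., 0.]]   # row 2 padded for a subtoken node
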
def compute_ppmi_types(type_matrix: sparse.csr_matrix, k: float = 1.0) -> sparse.csr_matrix:
    """
    Compute PPMI for type matrix.

    Args:
        type_matrix: sparse symbol × type matrix
        k: shift parameter (currently unused)

    Returns:
        PPMI matrix (sparse)
    """
    cooc = type_matrix.copy()
    cooc.data = np.maximum(cooc.data, 0)
    # Drop explicit zeros created by the clamp so that nonzero() and .data
    # below stay aligned (otherwise rows/cols and values can drift apart).
    cooc.eliminate_zeros()

    # Compute marginals
    row_sums = np.array(cooc.sum(axis=1)).flatten()
    col_sums = np.array(cooc.sum(axis=0)).flatten()
    total = float(cooc.sum())

    # Ensure row_sums and col_sums match matrix dimensions exactly
    if len(row_sums) != cooc.shape[0]:
        # Create full-size array and fill with row_sums
        full_row_sums = np.zeros(cooc.shape[0])
        full_row_sums[:len(row_sums)] = row_sums
        row_sums = full_row_sums
    if len(col_sums) != cooc.shape[1]:
        # Create full-size array and fill with col_sums
        full_col_sums = np.zeros(cooc.shape[1])
        full_col_sums[:len(col_sums)] = col_sums
        col_sums = full_col_sums

    # Avoid division by zero
    row_sums = np.maximum(row_sums, 1e-10)
    col_sums = np.maximum(col_sums, 1e-10)
    total = max(total, 1e-10)

    # Compute PMI
    rows, cols = cooc.nonzero()
    values = cooc.data

    # Ensure indices are within bounds
    rows = np.clip(rows, 0, len(row_sums) - 1).astype(int)
    cols = np.clip(cols, 0, len(col_sums) - 1).astype(int)

    # Ensure all arrays have the same length
    min_len = min(len(values), len(rows), len(cols))
    if min_len < len(values):
        values = values[:min_len]
        rows = rows[:min_len]
        cols = cols[:min_len]

    p_ij = values / total
    p_i = row_sums[rows] / total
    p_j = col_sums[cols] / total

    # Ensure all have same shape
    assert len(p_ij) == len(p_i) == len(p_j), \
        f"Shape mismatch: p_ij={len(p_ij)}, p_i={len(p_i)}, p_j={len(p_j)}"

    pmi = np.log(p_ij / (p_i * p_j + 1e-10) + 1e-10)
    ppmi = np.maximum(pmi, 0.0)

    ppmi_matrix = sparse.csr_matrix((ppmi, (rows, cols)), shape=cooc.shape)
    return ppmi_matrix

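
# Editor's sketch (not part of the released file): PPMI on a tiny count
# matrix, following the formula above (pmi = log(p_ij / (p_i * p_j))):
#
#   counts = [[2, 0],          total = 4
#             [1, 1]]
#   cell (0, 0): p_ij = 2/4, p_i = 2/4, p_j = 3/4
#                pmi  = log(0.5 / (0.5 * 0.75)) = log(4/3) ≈ 0.288  -> kept
#   cell (1, 0): p_ij = 1/4, p_i = 2/4, p_j = 3/4
#                pmi  = log(0.25 / 0.375) = log(2/3) ≈ -0.405       -> clamped to 0
#   cell (1, 1): p_ij = 1/4, p_i = 2/4, p_j = 1/4
#                pmi  = log(0.25 / 0.125) = log(2) ≈ 0.693          -> kept
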
def compute_typed_view(node_types: Dict[int, Dict[str, int]],
                       type_to_idx: Dict[str, int],
                       num_nodes: int,
                       dim: int,
                       random_state: int = 42,
                       n_jobs: int = -1,
                       expand_types: bool = True,
                       gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray, Dict[str, int]]:
    """
    Compute typed view embeddings with optional type expansion.

    Returns:
        embeddings: node embeddings from typed view
        svd_components: SVD components for reconstruction
        final_type_to_idx: expanded type token mapping
    """
    type_matrix, final_type_to_idx = build_type_matrix(
        node_types, type_to_idx, num_nodes, expand_types=expand_types
    )
    ppmi = compute_ppmi_types(type_matrix)
    embeddings, svd_components = reduce_dimensions_ppmi(ppmi, dim, random_state, n_jobs, gpu_accelerator)
    return embeddings, svd_components, final_type_to_idx

def reduce_dimensions_ppmi(ppmi: sparse.csr_matrix, dim: int, random_state: int = 42,
                           n_jobs: int = -1, gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Reduce PPMI matrix dimensionality using Truncated SVD (GPU-accelerated if available).

    Args:
        ppmi: PPMI matrix
        dim: target dimensionality
        random_state: random seed
        n_jobs: number of parallel jobs (not used, kept for API consistency)
        gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration

    Returns:
        Reduced embeddings matrix and SVD components
    """
    num_features = ppmi.shape[1]
    actual_dim = min(dim, num_features)

    # Try GPU acceleration if available
    if gpu_accelerator and gpu_accelerator.use_gpu:
        try:
            # Convert sparse matrix to dense for GPU SVD
            if ppmi.shape[0] * ppmi.shape[1] < 50_000_000:  # ~50M elements threshold
                ppmi_dense = ppmi.toarray()
                U, S, Vt = gpu_accelerator.svd(ppmi_dense, actual_dim, random_state)
                embeddings = U @ np.diag(S)
                components = Vt
            else:
                raise ValueError("Matrix too large for GPU dense SVD")
        except Exception:
            # Fall back to CPU
            svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
            embeddings = svd.fit_transform(ppmi)
            components = svd.components_
    else:
        # CPU path
        svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
        embeddings = svd.fit_transform(ppmi)
        components = svd.components_

    # Pad embeddings if needed to match requested dimension
    if actual_dim < dim:
        padding = np.zeros((embeddings.shape[0], dim - actual_dim))
        embeddings = np.hstack([embeddings, padding])
        # Pad components similarly
        component_padding = np.zeros((dim - actual_dim, components.shape[1]))
        components = np.vstack([components, component_padding])

    return embeddings, components
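A minimal end-to-end sketch of the typed-view pipeline defined above (an editor's illustration, not shipped in the wheel; it assumes the module is importable as `tricoder.typed_view` and runs on CPU):

```python
from tricoder.typed_view import compute_typed_view

# Two symbols with toy type annotations; num_nodes=3 leaves one padded row
# for a subtoken node that carries no type information.
node_types = {0: {"List[int]": 2}, 1: {"Dict[str, int]": 1}}
type_to_idx = {"List[int]": 0, "Dict[str, int]": 1}

embeddings, svd_components, final_type_to_idx = compute_typed_view(
    node_types, type_to_idx, num_nodes=3, dim=2, expand_types=True
)

print(embeddings.shape)           # (3, 2)
print(sorted(final_type_to_idx))  # original tokens plus 'Dict', 'List', 'int', 'str'
```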
tricoder-1.2.8.dist-info/METADATA
ADDED
@@ -0,0 +1,306 @@
Metadata-Version: 2.4
Name: tricoder
Version: 1.2.8
Summary: TriVector Code Intelligence - Multi-view code relationship model with advanced semantic embeddings
License: Non-Commercial
License-File: LICENSE
License-File: LICENSE_COMMERCIAL.md
Keywords: code-intelligence,embeddings,semantic-analysis,code-search
Author: Jiri Otoupal
Author-email: j.f.otoupal@gmail.com
Requires-Python: >=3.8.1,<4.0.0
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: Other/Proprietary License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Dist: annoy (>=1.17.0)
Requires-Dist: click (>=8.0.0)
Requires-Dist: gensim (>=4.0.0)
Requires-Dist: numpy (>=1.21.0)
Requires-Dist: rich (>=13.0.0)
Requires-Dist: scikit-learn (>=1.0.0)
Requires-Dist: scipy (>=1.7.0)
Project-URL: Homepage, https://github.com/jiri-otoupal/tricoder
Project-URL: Repository, https://github.com/jiri-otoupal/tricoder
Description-Content-Type: text/markdown

# TriVector Embeddings for Smarter Code Search for AI Agents

[](https://pypi.org/project/tricoder/)
[](https://pypi.org/project/tricoder/)

[](https://travis-ci.com/github/jiri-otoupal/tricoder)
[](https://pepy.tech/project/tricoder)

TriCoder learns high-quality symbol-level embeddings from codebases using three complementary views:

1. **Graph View**: Structural relationships via PPMI and SVD
2. **Context View**: Semantic context via Node2Vec random walks and Word2Vec
3. **Typed View**: Type information via type-token co-occurrence (optional)
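How the three views are combined lives in `tricoder/fusion.py`, whose source is not shown in this diff. Purely as an illustration of the idea, a minimal fusion could L2-normalize each view and concatenate; tricoder's actual fusion logic may differ:

```python
import numpy as np

def naive_fuse(graph_emb: np.ndarray, context_emb: np.ndarray, typed_emb: np.ndarray) -> np.ndarray:
    """Hypothetical fusion sketch: L2-normalize each view, then concatenate."""
    def l2norm(m: np.ndarray) -> np.ndarray:
        return m / (np.linalg.norm(m, axis=1, keepdims=True) + 1e-10)
    return np.hstack([l2norm(graph_emb), l2norm(context_emb), l2norm(typed_emb)])
```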
## Features

- **Subtoken Semantic Graph**: Captures fine-grained semantic relationships through subtoken analysis
- **File & Module Hierarchy**: Leverages file/directory structure for better clustering
- **Static Call-Graph Expansion**: Propagates call relationships to depth 2-3
- **Type Semantic Expansion**: Expands composite types into constructors and primitives
- **Context Window Co-occurrence**: Captures lexical context within ±5 lines
- **Improved Negative Sampling**: Biased sampling for better temperature calibration
- **Hybrid Similarity Scoring**: Length-penalized cosine similarity (see the sketch after this list)
- **Iterative Embedding Smoothing**: Diffusion-based smoothing for better clustering
- **Query-Time Semantic Expansion**: Expands queries with subtokens and types
- **GPU Acceleration**: Supports CUDA (NVIDIA) and MPS (Mac) for faster training
- **Keyword Search**: Search symbols by keywords and type tokens
- **Graph Optimization**: Filter out low-value nodes and edges to improve training efficiency
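The exact length penalty tricoder applies is not visible in this diff. As a hedged illustration only, one common form damps cosine similarity by the ratio of vector norms, so that embeddings of very different magnitude score lower:

```python
import numpy as np

def length_penalized_cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Illustrative sketch, not tricoder's formula: cosine damped by a norm-ratio penalty."""
    na, nb = np.linalg.norm(a), np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    cosine = float(a @ b) / (na * nb)
    penalty = min(na, nb) / max(na, nb)  # 1.0 when norms match, < 1 otherwise
    return cosine * penalty
```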
## Installation

### Using Poetry (Recommended)

```bash
poetry install
```

### Using pip

```bash
pip install .
```

### GPU Support (Optional)

For NVIDIA GPUs (CUDA):
```bash
pip install cupy-cuda12x
```

For Mac GPUs (MPS):
```bash
pip install torch
```

## Usage

### 1. Extract Symbols from Codebase

```bash
# Basic extraction (Python files only)
tricoder extract --input-dir /path/to/codebase

# Extract specific file types
tricoder extract --input-dir /path/to/codebase --extensions "py,js,ts"

# Exclude specific keywords from extraction
tricoder extract --input-dir /path/to/codebase --exclude-keywords debug --exclude-keywords temp

# Custom output files
tricoder extract --input-dir /path/to/codebase --output-nodes my_nodes.jsonl --output-edges my_edges.jsonl
```

**Extraction Options:**
- `--input-dir`, `--root`, `-r`: Input directory to scan (default: current directory)
- `--extensions`, `--ext`: Comma-separated file extensions (default: `py`)
- `--include-dirs`, `-i`: Include only specific subdirectories (can specify multiple)
- `--exclude-dirs`, `-e`: Exclude directories (default: `.venv`, `__pycache__`, `.git`, `node_modules`, `.pytest_cache`)
- `--exclude-keywords`, `--exclude`: Exclude symbol names (appended to default excluded keywords)
- `--output-nodes`, `-n`: Output file for nodes (default: `nodes.jsonl`)
- `--output-edges`, `-d`: Output file for edges (default: `edges.jsonl`)
- `--output-types`, `-t`: Output file for types (default: `types.jsonl`)
- `--no-gitignore`: Disable `.gitignore` filtering (enabled by default)

### 2. Optimize Graph (Optional)

Reduce graph size by filtering low-value nodes and edges:

```bash
# Basic optimization (overwrites input files)
tricoder optimize

# Custom output files
tricoder optimize --output-nodes nodes_opt.jsonl --output-edges edges_opt.jsonl

# Customize thresholds
tricoder optimize --min-edge-weight 0.5 --remove-isolated --remove-generic

# Keep isolated nodes
tricoder optimize --keep-isolated
```

**Optimization Options:**
- `--nodes`, `-n`: Input nodes file (default: `nodes.jsonl`)
- `--edges`, `-e`: Input edges file (default: `edges.jsonl`)
- `--types`, `-t`: Input types file (default: `types.jsonl`, optional)
- `--output-nodes`, `-N`: Output nodes file (default: overwrites input)
- `--output-edges`, `-E`: Output edges file (default: overwrites input)
- `--output-types`, `-T`: Output types file (default: overwrites input)
- `--min-edge-weight`: Minimum edge weight to keep (default: `0.3`)
- `--remove-isolated`: Remove nodes with no edges (default: `True`)
- `--keep-isolated`: Keep isolated nodes (overrides `--remove-isolated`)
- `--remove-generic`: Remove generic names (default: `True`)
- `--keep-generic`: Keep generic names (overrides `--remove-generic`)
- `--exclude-keywords`, `--exclude`: Additional keywords to exclude (can specify multiple)

### 3. Train Model

```bash
# Basic training
tricoder train --out model_output

# With GPU acceleration
tricoder train --out model_output --use-gpu

# Fast mode (faster training, slightly lower quality)
tricoder train --out model_output --fast

# Custom dimensions
tricoder train --out model_output --graph-dim 128 --context-dim 128 --final-dim 256

# Custom training parameters
tricoder train --out model_output --num-walks 20 --walk-length 100 --train-ratio 0.9
```

**Training Options:**
- `--nodes`, `-n`: Path to nodes.jsonl (default: `nodes.jsonl`)
- `--edges`, `-e`: Path to edges.jsonl (default: `edges.jsonl`)
- `--types`, `-t`: Path to types.jsonl (default: `types.jsonl`, optional)
- `--out`, `-o`: Output directory (required)
- `--graph-dim`: Graph view dimensionality (default: auto-calculated)
- `--context-dim`: Context view dimensionality (default: auto-calculated)
- `--typed-dim`: Typed view dimensionality (default: auto-calculated)
- `--final-dim`: Final fused embedding dimensionality (default: auto-calculated)
- `--num-walks`: Number of random walks per node (default: `10`)
- `--walk-length`: Length of each random walk (default: `80`)
- `--train-ratio`: Fraction of edges for training (default: `0.8`)
- `--random-state`: Random seed for reproducibility (default: `42`)
- `--fast`: Enable fast mode (reduces parameters for faster training)
- `--use-gpu`: Enable GPU acceleration (CUDA or MPS, falls back to CPU if unavailable)

### 4. Query Model

```bash
# Query by symbol ID
tricoder query --model-dir model_output --symbol function_my_function_0001 --top-k 10

# Search by keywords
tricoder query --model-dir model_output --keywords "database connection" --top-k 10

# Multi-word phrases (use quotes)
tricoder query --model-dir model_output --keywords '"user authentication" login'

# Exclude specific keywords from results
tricoder query --model-dir model_output --keywords handler --exclude-keywords debug --exclude-keywords temp

# Interactive mode
tricoder query --model-dir model_output --interactive
```

**Query Options:**
- `--model-dir`, `-m`: Path to model directory (required)
- `--symbol`, `-s`: Symbol ID to query
- `--keywords`, `-w`: Keywords to search for (use quotes for multi-word: `"my function"`)
- `--top-k`, `-k`: Number of results to return (default: `10`)
- `--exclude-keywords`, `--exclude`: Additional keywords to exclude (appended to default excluded keywords)
- `--interactive`, `-i`: Interactive mode

### 5. Incremental Retraining

Retrain only on changed files since last training:

```bash
# Basic retraining (detects changed files automatically)
tricoder retrain --model-dir model_output --codebase-dir /path/to/codebase

# Force full retraining
tricoder retrain --model-dir model_output --codebase-dir /path/to/codebase --force

# Custom training parameters
tricoder retrain --model-dir model_output --codebase-dir /path/to/codebase --num-walks 20
```

**Retrain Options:**
- `--model-dir`, `-m`: Path to existing model directory (required)
- `--codebase-dir`, `-c`: Path to codebase root (default: current directory)
- `--output-nodes`, `-n`: Temporary nodes file (default: `nodes_retrain.jsonl`)
- `--output-edges`, `-d`: Temporary edges file (default: `edges_retrain.jsonl`)
- `--output-types`, `-t`: Temporary types file (default: `types_retrain.jsonl`)
- `--graph-dim`, `--context-dim`, `--typed-dim`, `--final-dim`: Override model dimensions
- `--num-walks`, `--walk-length`, `--train-ratio`, `--random-state`: Training parameters
- `--force`: Force full retraining even if no files changed

## Examples

### Complete Workflow

```bash
# 1. Extract symbols from codebase
tricoder extract --input-dir ./my_project --extensions "py,js"

# 2. (Optional) Optimize the graph
tricoder optimize --min-edge-weight 0.4

# 3. Train model with GPU acceleration
tricoder train --out ./models/my_project --use-gpu

# 4. Query for similar symbols
tricoder query --model-dir ./models/my_project --keywords "database" --top-k 5

# 5. After code changes, retrain incrementally
tricoder retrain --model-dir ./models/my_project --codebase-dir ./my_project
```

### Keyword Search Examples

```bash
# Search for authentication-related code
tricoder query --model-dir model_output --keywords "auth login password"

# Search for specific function name
tricoder query --model-dir model_output --keywords '"process_payment"'

# Search excluding common keywords
tricoder query --model-dir model_output --keywords handler --exclude-keywords temp --exclude-keywords debug
```

## Requirements

- Python 3.8+
- numpy >= 1.21.0
- scipy >= 1.7.0
- scikit-learn >= 1.0.0
- gensim >= 4.0.0
- annoy >= 1.17.0
- click >= 8.0.0
- rich >= 13.0.0

**Optional (for GPU acceleration):**
- cupy-cuda12x >= 12.0.0 (for NVIDIA GPUs)
- torch >= 2.0.0 (for Mac GPUs or CUDA fallback)

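To confirm that an optional GPU backend is actually visible before training with `--use-gpu`, a quick probe along these lines can help (an editor's sketch; tricoder falls back to CPU on its own):

```python
# Probe optional GPU backends without requiring either to be installed.
try:
    import cupy
    print("CUDA devices:", cupy.cuda.runtime.getDeviceCount())
except ImportError:
    print("cupy not installed (CUDA path unavailable)")

try:
    import torch
    print("CUDA available:", torch.cuda.is_available())
    print("MPS available:", torch.backends.mps.is_available())
except ImportError:
    print("torch not installed (MPS path unavailable)")
```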
## License

TriCoder is available under a **Non-Commercial License**.

- ✅ **Free for non-commercial use**: Personal projects, education, research, open-source
- ❌ **Commercial license required**: Paid products, SaaS, commercial consulting, enterprise use

For commercial licensing inquiries, please contact: **j.f.otoupal@gmail.com**

See [LICENSE](LICENSE) for full terms and [LICENSE_COMMERCIAL.md](LICENSE_COMMERCIAL.md) for commercial license information.

<hr>
Did I make your life less painful?
<br>
<br>
Support my coffee addiction ;)
<br>
<a href="https://www.buymeacoffee.com/jiriotoupal" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy me a Coffee" style="height: 41px !important;width: 174px !important;box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a>

tricoder-1.2.8.dist-info/RECORD
ADDED
@@ -0,0 +1,22 @@
tricoder/__about__.py,sha256=zxNuFkE-43tg4N6VDTNwDHbnUa7Paps028DvLJzUIQ8,212
tricoder/__init__.py,sha256=oBD9XnaSedZoC_9IjO03vBpU4edMRaQu06_82qPe1cg,625
tricoder/calibration.py,sha256=i19uCm_pWk0F3YCx8YrpakadKNe4o7pe9XflBaoc-0M,9887
tricoder/cli.py,sha256=FqsSv6H_Woz1GBwAZHZSxOJ4DUsn3en6rR2W89lkwI8,41548
tricoder/context_view.py,sha256=RuGliktScT_qVGXsrMKipEX3vFV__FBEM347PKb6zKk,8052
tricoder/data_loader.py,sha256=oktaQLz8rGtim9NcOB_Ewi14B4HVpAbmhOrqrgoBsPw,5044
tricoder/extract.py,sha256=G1iSNMz00N1keYl1HYHjNIYjrE8XWZmkrm4TMNUGecg,23777
tricoder/fusion.py,sha256=2TDuoKJ2CTyN4potX6XW2KTTmkiEA0ARQYiQeRj3ujY,7605
tricoder/git_tracker.py,sha256=bKRt2gGWSR7BzPkDZvvNjQf8XrREOeBQgZjSH-15hcg,7155
tricoder/gpu_utils.py,sha256=QfaKefQ2HDTDSxUMOi9jKgik9_q-N1WjjLTV5_v7tqo,16126
tricoder/graph_view.py,sha256=YZAPhUajSvRw52WdsTSYoBkED9cKLpC-L8mPEoT8IqE,22227
tricoder/model.py,sha256=2FZl1C9UPriFkH_iMEqbGeOMONEq4gcURrU9-oUhUXw,20459
tricoder/optimize.py,sha256=FgbWLL-x0q5fV5US8r_2nllfazbU6-aohCXB3hPt_uM,8765
tricoder/subtoken_utils.py,sha256=Z7eHH6pjWmG-sJ9VsTJeoL11l6AmGk3kM9VI0WzAzJM,5833
tricoder/train.py,sha256=cmj_y3P_htJkGi_2G2z7VhX63k8xO90jeDjQZSPqoS0,41948
tricoder/typed_view.py,sha256=MzC-QAgY_nahXx_BEk_YMjPMxZj_t43NKSHkQweQ4lY,11580
tricoder-1.2.8.dist-info/METADATA,sha256=uW2jd2-ELFqqtMguJ2Stl84B5s7LSJFSzr4NsZpW6JE,11883
tricoder-1.2.8.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
tricoder-1.2.8.dist-info/entry_points.txt,sha256=luwxM7BzbfdWNJvwRYVUhm3z5bDF3ZoK0M5lzGajgZs,45
tricoder-1.2.8.dist-info/licenses/LICENSE,sha256=POHObqU-qPg7DWoG6PVphyxu187YFhlhVp_MEBldcF8,2526
tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md,sha256=Omn6tI3kUiuBMvcrXhJw2sEH_WHjzaImgDIra-WyRik,2472
tricoder-1.2.8.dist-info/RECORD,,