tricoder-1.2.8-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- tricoder/__about__.py +6 -0
- tricoder/__init__.py +19 -0
- tricoder/calibration.py +276 -0
- tricoder/cli.py +890 -0
- tricoder/context_view.py +228 -0
- tricoder/data_loader.py +144 -0
- tricoder/extract.py +622 -0
- tricoder/fusion.py +203 -0
- tricoder/git_tracker.py +203 -0
- tricoder/gpu_utils.py +414 -0
- tricoder/graph_view.py +583 -0
- tricoder/model.py +476 -0
- tricoder/optimize.py +263 -0
- tricoder/subtoken_utils.py +196 -0
- tricoder/train.py +857 -0
- tricoder/typed_view.py +313 -0
- tricoder-1.2.8.dist-info/METADATA +306 -0
- tricoder-1.2.8.dist-info/RECORD +22 -0
- tricoder-1.2.8.dist-info/WHEEL +4 -0
- tricoder-1.2.8.dist-info/entry_points.txt +3 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE +56 -0
- tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md +68 -0
tricoder/fusion.py
ADDED
@@ -0,0 +1,203 @@
+"""Fusion pipeline: concatenate views, PCA, normalize."""
+from multiprocessing import cpu_count
+from typing import Tuple, List
+
+import numpy as np
+from scipy import sparse
+from sklearn.decomposition import PCA
+
+
+def _get_num_workers() -> int:
+    """Get number of workers (all cores - 1, minimum 1)."""
+    return max(1, cpu_count() - 1)
+
+
+def pad_to_same_rows(embeddings_list: List[np.ndarray], num_nodes: int) -> List[np.ndarray]:
+    """
+    Pad each embedding matrix to have num_nodes rows.
+
+    Args:
+        embeddings_list: list of embedding matrices
+        num_nodes: target number of rows
+
+    Returns:
+        List of padded matrices
+    """
+    padded = []
+    for emb in embeddings_list:
+        if emb.shape[0] < num_nodes:
+            # Pad with zeros
+            padding = np.zeros((num_nodes - emb.shape[0], emb.shape[1]))
+            emb_padded = np.vstack([emb, padding])
+        else:
+            emb_padded = emb[:num_nodes]
+        padded.append(emb_padded)
+    return padded
+
+
+def fuse_embeddings(embeddings_list: List[np.ndarray],
+                    num_nodes: int,
+                    final_dim: int,
+                    random_state: int = 42,
+                    n_jobs: int = -1,
+                    gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Fuse multiple embedding views using concatenation and PCA (GPU-accelerated if available).
+
+    Args:
+        embeddings_list: list of embedding matrices from different views
+        num_nodes: number of nodes
+        final_dim: final embedding dimensionality
+        random_state: random seed
+        n_jobs: number of parallel jobs (not used, kept for API consistency)
+        gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration
+
+    Returns:
+        fused_embeddings: final fused embeddings
+        pca_components: PCA transformation matrix
+        pca_mean: PCA mean vector
+    """
+    # Pad all matrices to same number of rows
+    padded = pad_to_same_rows(embeddings_list, num_nodes)
+
+    # Concatenate horizontally
+    concatenated = np.hstack(padded)
+
+    # Determine actual PCA dimension (can't exceed min(n_samples, n_features))
+    num_features = concatenated.shape[1]
+    actual_dim = min(final_dim, num_nodes, num_features)
+
+    # Try GPU acceleration if available
+    if gpu_accelerator and gpu_accelerator.use_gpu:
+        try:
+            fused, components, mean = gpu_accelerator.pca(concatenated, actual_dim, random_state)
+        except Exception:
+            # Fall back to CPU
+            pca = PCA(n_components=actual_dim, random_state=random_state)
+            fused = pca.fit_transform(concatenated)
+            components = pca.components_
+            mean = pca.mean_
+    else:
+        # CPU path
+        pca = PCA(n_components=actual_dim, random_state=random_state)
+        fused = pca.fit_transform(concatenated)
+        components = pca.components_
+        mean = pca.mean_
+
+    # Pad if needed to match requested dimension
+    if actual_dim < final_dim:
+        padding = np.zeros((fused.shape[0], final_dim - actual_dim))
+        fused = np.hstack([fused, padding])
+        # Pad components
+        component_padding = np.zeros((final_dim - actual_dim, components.shape[1]))
+        components = np.vstack([components, component_padding])
+
+    # Normalize each row to unit length (use GPU if available)
+    if gpu_accelerator and gpu_accelerator.use_gpu:
+        try:
+            norms = gpu_accelerator.norm(fused, axis=1, keepdims=True)
+            norms = gpu_accelerator.maximum(norms, 1e-10)
+            fused_normalized = fused / norms
+        except Exception:
+            norms = np.linalg.norm(fused, axis=1, keepdims=True)
+            norms = np.maximum(norms, 1e-10)
+            fused_normalized = fused / norms
+    else:
+        norms = np.linalg.norm(fused, axis=1, keepdims=True)
+        norms = np.maximum(norms, 1e-10)
+        fused_normalized = fused / norms
+
+    return fused_normalized, components, mean
+
+
+def build_neighbor_graph(edges: List[Tuple[int, int, str, float]], num_nodes: int) -> sparse.csr_matrix:
+    """
+    Build neighbor graph adjacency matrix for smoothing.
+
+    Args:
+        edges: list of (src_idx, dst_idx, relation, weight) tuples
+        num_nodes: number of nodes
+
+    Returns:
+        Sparse CSR adjacency matrix
+    """
+    rows = []
+    cols = []
+
+    for src_idx, dst_idx, rel, weight in edges:
+        rows.append(src_idx)
+        cols.append(dst_idx)
+
+    # Create symmetric matrix (undirected graph)
+    rows_sym = rows + cols
+    cols_sym = cols + rows
+
+    adj = sparse.csr_matrix((np.ones(len(rows_sym)), (rows_sym, cols_sym)), shape=(num_nodes, num_nodes))
+    return adj
+
+
+def iterative_embedding_smoothing(embeddings: np.ndarray,
+                                  edges: List[Tuple[int, int, str, float]],
+                                  num_nodes: int,
+                                  num_iterations: int = 2,
+                                  beta: float = 0.35,
+                                  random_state: int = 42,
+                                  gpu_accelerator=None) -> np.ndarray:
+    """
+    Apply iterative embedding smoothing (diffusion) to embeddings.
+    Optimized with vectorized operations.
+
+    Args:
+        embeddings: input embeddings (num_nodes, dim)
+        edges: list of edges for neighbor graph
+        num_nodes: number of nodes
+        num_iterations: number of smoothing iterations (default 2)
+        beta: smoothing factor (default 0.35)
+        random_state: random seed
+
+    Returns:
+        Smoothed embeddings
+    """
+    np.random.seed(random_state)
+
+    # Build neighbor graph
+    adj = build_neighbor_graph(edges, num_nodes)
+
+    # Iterative smoothing with vectorized operations
+    smoothed = embeddings.copy()
+
+    for iteration in range(num_iterations):
+        # Vectorized neighbor averaging using sparse matrix multiplication
+        # Use GPU acceleration if available
+        if gpu_accelerator and gpu_accelerator.use_gpu:
+            try:
+                neighbor_sums = gpu_accelerator.sparse_matmul(adj, smoothed)
+                degrees = gpu_accelerator.sum(adj, axis=1)
+                degrees = gpu_accelerator.maximum(degrees.flatten(), 1.0)
+                neighbor_avg = neighbor_sums / degrees[:, np.newaxis]
+                smoothed = beta * neighbor_avg + (1 - beta) * smoothed
+                norms = gpu_accelerator.norm(smoothed, axis=1, keepdims=True)
+                norms = gpu_accelerator.maximum(norms, 1e-10)
+                smoothed = smoothed / norms
+            except Exception:
+                # Fall back to CPU
+                neighbor_sums = adj.dot(smoothed)
+                degrees = np.array(adj.sum(axis=1)).flatten()
+                degrees = np.maximum(degrees, 1.0)
+                neighbor_avg = neighbor_sums / degrees[:, np.newaxis]
+                smoothed = beta * neighbor_avg + (1 - beta) * smoothed
+                norms = np.linalg.norm(smoothed, axis=1, keepdims=True)
+                norms = np.maximum(norms, 1e-10)
+                smoothed = smoothed / norms
+        else:
+            # CPU path
+            neighbor_sums = adj.dot(smoothed)
+            degrees = np.array(adj.sum(axis=1)).flatten()
+            degrees = np.maximum(degrees, 1.0)
+            neighbor_avg = neighbor_sums / degrees[:, np.newaxis]
+            smoothed = beta * neighbor_avg + (1 - beta) * smoothed
+            norms = np.linalg.norm(smoothed, axis=1, keepdims=True)
+            norms = np.maximum(norms, 1e-10)
+            smoothed = smoothed / norms
+
+    return smoothed
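In this file, fuse_embeddings pads every view to num_nodes rows, concatenates them, reduces to final_dim with PCA (clamped to min(n_samples, n_features), zero-padding back up when the clamp applies), and L2-normalizes each row; iterative_embedding_smoothing then blends each node with the mean of its graph neighbors. A minimal CPU-only sketch of how these functions appear intended to be called — the view shapes and edge tuples below are toy values, not data shipped with the package:

    import numpy as np
    from tricoder.fusion import fuse_embeddings, iterative_embedding_smoothing

    # Three hypothetical views over 100 nodes; the short view gets zero-padded.
    views = [np.random.rand(100, 32), np.random.rand(98, 64), np.random.rand(100, 16)]
    edges = [(0, 1, "calls", 1.0), (1, 2, "imports", 1.0)]  # (src, dst, relation, weight)

    # Concatenate to (100, 112), PCA down to 48 dims, L2-normalize rows.
    fused, components, mean = fuse_embeddings(views, num_nodes=100, final_dim=48)

    # Two diffusion rounds (defaults num_iterations=2, beta=0.35), renormalizing each round.
    smoothed = iterative_embedding_smoothing(fused, edges, num_nodes=100)
    assert smoothed.shape == (100, 48)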
tricoder/git_tracker.py
ADDED
@@ -0,0 +1,203 @@
+"""Git tracking utilities for incremental retraining."""
+import json
+import subprocess
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Set, Optional
+
+
+def get_git_commit_hash(repo_path: str = '.') -> Optional[str]:
+    """Get the current git commit hash."""
+    try:
+        result = subprocess.run(
+            ['git', 'rev-parse', 'HEAD'],
+            cwd=repo_path,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        return result.stdout.strip()
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None
+
+
+def get_git_commit_timestamp(repo_path: str = '.') -> Optional[datetime]:
+    """Get the timestamp of the current git commit."""
+    try:
+        result = subprocess.run(
+            ['git', 'log', '-1', '--format=%ct'],
+            cwd=repo_path,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        timestamp = int(result.stdout.strip())
+        return datetime.fromtimestamp(timestamp)
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+        return None
+
+
+def get_file_git_timestamp(file_path: str, repo_path: str = '.') -> Optional[datetime]:
+    """Get the last modification timestamp of a file from git."""
+    try:
+        # Get the last commit that modified this file
+        result = subprocess.run(
+            ['git', 'log', '-1', '--format=%ct', '--', file_path],
+            cwd=repo_path,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        if result.stdout.strip():
+            timestamp = int(result.stdout.strip())
+            return datetime.fromtimestamp(timestamp)
+        return None
+    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+        return None
+
+
+def get_changed_files_since_commit(commit_hash: str, repo_path: str = '.') -> Set[str]:
+    """Get all files that changed since a given commit."""
+    try:
+        result = subprocess.run(
+            ['git', 'diff', '--name-only', commit_hash, 'HEAD'],
+            cwd=repo_path,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        files = set()
+        for line in result.stdout.strip().split('\n'):
+            if line.strip():
+                files.add(line.strip())
+        return files
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return set()
+
+
+def get_all_python_files(repo_path: str = '.') -> Set[str]:
+    """Get all Python files tracked by git."""
+    try:
+        result = subprocess.run(
+            ['git', 'ls-files', '*.py'],
+            cwd=repo_path,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        files = set()
+        for line in result.stdout.strip().split('\n'):
+            if line.strip():
+                files.add(line.strip())
+        return files
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return set()
+
+
+def save_training_metadata(output_dir: str, commit_hash: Optional[str],
+                           commit_timestamp: Optional[datetime],
+                           files_trained: Set[str]):
+    """Save training metadata including git commit info."""
+    metadata_path = Path(output_dir) / 'training_metadata.json'
+
+    metadata = {
+        'commit_hash': commit_hash,
+        'commit_timestamp': commit_timestamp.isoformat() if commit_timestamp else None,
+        'training_timestamp': datetime.now().isoformat(),
+        'files_trained': sorted(list(files_trained))
+    }
+
+    with open(metadata_path, 'w') as f:
+        json.dump(metadata, f, indent=2)
+
+
+def load_training_metadata(output_dir: str) -> Optional[Dict]:
+    """Load training metadata."""
+    metadata_path = Path(output_dir) / 'training_metadata.json'
+
+    if not metadata_path.exists():
+        return None
+
+    try:
+        with open(metadata_path, 'r') as f:
+            metadata = json.load(f)
+        return metadata
+    except (json.JSONDecodeError, IOError):
+        return None
+
+
+def get_changed_files_for_retraining(output_dir: str, repo_path: str = '.') -> Set[str]:
+    """Get files that have changed since last training."""
+    metadata = load_training_metadata(output_dir)
+
+    if not metadata:
+        # No previous training, return all files
+        return get_all_python_files(repo_path)
+
+    commit_hash = metadata.get('commit_hash')
+    if not commit_hash:
+        # No commit hash stored, return all files
+        return get_all_python_files(repo_path)
+
+    # Get files changed since that commit
+    changed_files = get_changed_files_since_commit(commit_hash, repo_path)
+
+    # Previously trained files are recorded in metadata (in case they were deleted from git)
+    files_trained = set(metadata.get('files_trained', []))
+
+    # Keep only changed files that are still tracked Python files
+    all_python_files = get_all_python_files(repo_path)
+    return changed_files.intersection(all_python_files)
+
+
+def extract_files_from_jsonl(jsonl_path: str) -> Set[str]:
+    """Extract file paths from nodes.jsonl or edges.jsonl."""
+    files = set()
+    try:
+        with open(jsonl_path, 'r') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    data = json.loads(line)
+                    # Check for file path in metadata
+                    if 'meta' in data and isinstance(data['meta'], dict):
+                        file_path = data['meta'].get('file', '')
+                        if file_path:
+                            files.add(file_path)
+                except json.JSONDecodeError:
+                    continue
+    except IOError:
+        pass
+    return files
+
+
+def filter_jsonl_by_files(jsonl_path: str, allowed_files: Set[str],
+                          output_path: str) -> int:
+    """Filter a JSONL file to only include entries from allowed files."""
+    count = 0
+    with open(output_path, 'w') as out_f:
+        with open(jsonl_path, 'r') as in_f:
+            for line in in_f:
+                if not line.strip():
+                    continue
+                try:
+                    data = json.loads(line)
+                    # Check if this entry belongs to an allowed file
+                    file_path = ''
+                    if 'meta' in data and isinstance(data['meta'], dict):
+                        file_path = data['meta'].get('file', '')
+
+                    # For edges, we need to check both src and dst
+                    if 'src' in data or 'dst' in data:
+                        # This is an edge - we'll include it if either endpoint is in allowed files
+                        # We'll need to check this against the nodes
+                        out_f.write(line)
+                        count += 1
+                    elif file_path in allowed_files or not file_path:
+                        # Node or type entry - include if file matches or no file specified
+                        out_f.write(line)
+                        count += 1
+                except json.JSONDecodeError:
+                    continue
+    return count
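Taken together, these helpers implement a record-and-diff loop for incremental retraining: a training run records the commit it ran against in training_metadata.json, and the next run asks git which tracked .py files changed since that commit (falling back to all tracked .py files on a cold start). A minimal sketch of that loop under assumed names — model_out is an illustrative output directory and the retraining step is a placeholder, not a tricoder API:

    from pathlib import Path

    from tricoder.git_tracker import (
        get_git_commit_hash,
        get_git_commit_timestamp,
        get_changed_files_for_retraining,
        save_training_metadata,
    )

    output_dir = "model_out"  # hypothetical; any directory the trainer writes to
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # All tracked .py files on a cold start; otherwise only the files changed
    # since the commit recorded in training_metadata.json.
    changed = get_changed_files_for_retraining(output_dir, repo_path=".")

    if changed:
        # ... retrain on `changed` here (placeholder for the actual trainer) ...
        # Record the commit we trained against so the next run diffs from it.
        save_training_metadata(
            output_dir,
            commit_hash=get_git_commit_hash("."),
            commit_timestamp=get_git_commit_timestamp("."),
            files_trained=changed,
        )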