tricoder-1.2.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tricoder/fusion.py ADDED
@@ -0,0 +1,203 @@
+ """Fusion pipeline: concatenate views, PCA, normalize."""
+ from multiprocessing import cpu_count
+ from typing import Tuple, List
+
+ import numpy as np
+ from scipy import sparse
+ from sklearn.decomposition import PCA
+
+
+ def _get_num_workers() -> int:
+     """Get number of workers (all cores - 1, minimum 1)."""
+     return max(1, cpu_count() - 1)
+
+
+ def pad_to_same_rows(embeddings_list: List[np.ndarray], num_nodes: int) -> List[np.ndarray]:
+     """
+     Pad each embedding matrix to have num_nodes rows.
+
+     Args:
+         embeddings_list: list of embedding matrices
+         num_nodes: target number of rows
+
+     Returns:
+         List of padded matrices
+     """
+     padded = []
+     for emb in embeddings_list:
+         if emb.shape[0] < num_nodes:
+             # Pad with zeros
+             padding = np.zeros((num_nodes - emb.shape[0], emb.shape[1]))
+             emb_padded = np.vstack([emb, padding])
+         else:
+             emb_padded = emb[:num_nodes]
+         padded.append(emb_padded)
+     return padded
+
+
+ def fuse_embeddings(embeddings_list: List[np.ndarray],
+                     num_nodes: int,
+                     final_dim: int,
+                     random_state: int = 42,
+                     n_jobs: int = -1,
+                     gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+     """
+     Fuse multiple embedding views using concatenation and PCA (GPU-accelerated if available).
+
+     Args:
+         embeddings_list: list of embedding matrices from different views
+         num_nodes: number of nodes
+         final_dim: final embedding dimensionality
+         random_state: random seed
+         n_jobs: number of parallel jobs (not used, kept for API consistency)
+         gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration
+
+     Returns:
+         fused_embeddings: final fused embeddings
+         pca_components: PCA transformation matrix
+         pca_mean: PCA mean vector
+     """
+     # Pad all matrices to same number of rows
+     padded = pad_to_same_rows(embeddings_list, num_nodes)
+
+     # Concatenate horizontally
+     concatenated = np.hstack(padded)
+
+     # Determine actual PCA dimension (can't exceed min(n_samples, n_features))
+     num_features = concatenated.shape[1]
+     actual_dim = min(final_dim, num_nodes, num_features)
+
+     # Try GPU acceleration if available
+     if gpu_accelerator and gpu_accelerator.use_gpu:
+         try:
+             fused, components, mean = gpu_accelerator.pca(concatenated, actual_dim, random_state)
+         except Exception:
+             # Fall back to CPU
+             pca = PCA(n_components=actual_dim, random_state=random_state)
+             fused = pca.fit_transform(concatenated)
+             components = pca.components_
+             mean = pca.mean_
+     else:
+         # CPU path
+         pca = PCA(n_components=actual_dim, random_state=random_state)
+         fused = pca.fit_transform(concatenated)
+         components = pca.components_
+         mean = pca.mean_
+
+     # Pad if needed to match requested dimension
+     if actual_dim < final_dim:
+         padding = np.zeros((fused.shape[0], final_dim - actual_dim))
+         fused = np.hstack([fused, padding])
+         # Pad components
+         component_padding = np.zeros((final_dim - actual_dim, components.shape[1]))
+         components = np.vstack([components, component_padding])
+
+     # Normalize each row to unit length (use GPU if available)
+     if gpu_accelerator and gpu_accelerator.use_gpu:
+         try:
+             norms = gpu_accelerator.norm(fused, axis=1, keepdims=True)
+             norms = gpu_accelerator.maximum(norms, 1e-10)
+             fused_normalized = fused / norms
+         except Exception:
+             norms = np.linalg.norm(fused, axis=1, keepdims=True)
+             norms = np.maximum(norms, 1e-10)
+             fused_normalized = fused / norms
+     else:
+         norms = np.linalg.norm(fused, axis=1, keepdims=True)
+         norms = np.maximum(norms, 1e-10)
+         fused_normalized = fused / norms
+
+     return fused_normalized, components, mean
+
+
+ def build_neighbor_graph(edges: List[Tuple[int, int, str, float]], num_nodes: int) -> sparse.csr_matrix:
+     """
+     Build neighbor graph adjacency matrix for smoothing.
+
+     Args:
+         edges: list of (src_idx, dst_idx, relation, weight) tuples
+         num_nodes: number of nodes
+
+     Returns:
+         Sparse CSR adjacency matrix
+     """
+     rows = []
+     cols = []
+
+     for src_idx, dst_idx, rel, weight in edges:
+         rows.append(src_idx)
+         cols.append(dst_idx)
+
+     # Create symmetric matrix (undirected graph)
+     rows_sym = rows + cols
+     cols_sym = cols + rows
+
+     adj = sparse.csr_matrix((np.ones(len(rows_sym)), (rows_sym, cols_sym)), shape=(num_nodes, num_nodes))
+     return adj
+
+
+ def iterative_embedding_smoothing(embeddings: np.ndarray,
+                                   edges: List[Tuple[int, int, str, float]],
+                                   num_nodes: int,
+                                   num_iterations: int = 2,
+                                   beta: float = 0.35,
+                                   random_state: int = 42,
+                                   gpu_accelerator=None) -> np.ndarray:
+     """
+     Apply iterative embedding smoothing (diffusion) to embeddings.
+     Optimized with vectorized operations.
+
+     Args:
+         embeddings: input embeddings (num_nodes, dim)
+         edges: list of edges for neighbor graph
+         num_nodes: number of nodes
+         num_iterations: number of smoothing iterations (default 2)
+         beta: smoothing factor (default 0.35)
+         random_state: random seed
+         gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration
+
+     Returns:
+         Smoothed embeddings
+     """
+     np.random.seed(random_state)
+
+     # Build neighbor graph
+     adj = build_neighbor_graph(edges, num_nodes)
+
+     # Iterative smoothing with vectorized operations
+     smoothed = embeddings.copy()
+
+     for iteration in range(num_iterations):
+         # Vectorized neighbor averaging using sparse matrix multiplication
+         # Use GPU acceleration if available
+         if gpu_accelerator and gpu_accelerator.use_gpu:
+             try:
+                 neighbor_sums = gpu_accelerator.sparse_matmul(adj, smoothed)
+                 degrees = gpu_accelerator.sum(adj, axis=1)
+                 degrees = gpu_accelerator.maximum(degrees.flatten(), 1.0)
+                 neighbor_avg = neighbor_sums / degrees[:, np.newaxis]
+                 smoothed = beta * neighbor_avg + (1 - beta) * smoothed
+                 norms = gpu_accelerator.norm(smoothed, axis=1, keepdims=True)
+                 norms = gpu_accelerator.maximum(norms, 1e-10)
+                 smoothed = smoothed / norms
+             except Exception:
+                 # Fall back to CPU
+                 neighbor_sums = adj.dot(smoothed)
+                 degrees = np.array(adj.sum(axis=1)).flatten()
+                 degrees = np.maximum(degrees, 1.0)
+                 neighbor_avg = neighbor_sums / degrees[:, np.newaxis]
+                 smoothed = beta * neighbor_avg + (1 - beta) * smoothed
+                 norms = np.linalg.norm(smoothed, axis=1, keepdims=True)
+                 norms = np.maximum(norms, 1e-10)
+                 smoothed = smoothed / norms
+         else:
+             # CPU path
+             neighbor_sums = adj.dot(smoothed)
+             degrees = np.array(adj.sum(axis=1)).flatten()
+             degrees = np.maximum(degrees, 1.0)
+             neighbor_avg = neighbor_sums / degrees[:, np.newaxis]
+             smoothed = beta * neighbor_avg + (1 - beta) * smoothed
+             norms = np.linalg.norm(smoothed, axis=1, keepdims=True)
+             norms = np.maximum(norms, 1e-10)
+             smoothed = smoothed / norms
+
+     return smoothed
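
For orientation, here is a minimal sketch of how the two public entry points in fusion.py might be driven end to end. It is not part of the package: the view matrices, the edge list, and the dimensions are made-up placeholders, and the optional gpu_accelerator argument is simply left at its default of None.

import numpy as np
from tricoder.fusion import fuse_embeddings, iterative_embedding_smoothing

num_nodes, final_dim = 100, 32
# Two hypothetical embedding views of the same 100 nodes; the second is shorter
# and is zero-padded to 100 rows inside fuse_embeddings
views = [np.random.rand(100, 64), np.random.rand(90, 48)]
# (src_idx, dst_idx, relation, weight) tuples; weights are not used by build_neighbor_graph
edges = [(0, 1, "calls", 1.0), (1, 2, "imports", 1.0)]

fused, components, mean = fuse_embeddings(views, num_nodes, final_dim)
smoothed = iterative_embedding_smoothing(fused, edges, num_nodes, num_iterations=2, beta=0.35)
print(smoothed.shape)  # (100, 32), rows normalized to unit length
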
@@ -0,0 +1,203 @@
+ """Git tracking utilities for incremental retraining."""
+ import json
+ import subprocess
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, Set, Optional
+
+
+ def get_git_commit_hash(repo_path: str = '.') -> Optional[str]:
+     """Get the current git commit hash."""
+     try:
+         result = subprocess.run(
+             ['git', 'rev-parse', 'HEAD'],
+             cwd=repo_path,
+             capture_output=True,
+             text=True,
+             check=True
+         )
+         return result.stdout.strip()
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return None
+
+
+ def get_git_commit_timestamp(repo_path: str = '.') -> Optional[datetime]:
+     """Get the timestamp of the current git commit."""
+     try:
+         result = subprocess.run(
+             ['git', 'log', '-1', '--format=%ct'],
+             cwd=repo_path,
+             capture_output=True,
+             text=True,
+             check=True
+         )
+         timestamp = int(result.stdout.strip())
+         return datetime.fromtimestamp(timestamp)
+     except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+         return None
+
+
+ def get_file_git_timestamp(file_path: str, repo_path: str = '.') -> Optional[datetime]:
+     """Get the last modification timestamp of a file from git."""
+     try:
+         # Get the last commit that modified this file
+         result = subprocess.run(
+             ['git', 'log', '-1', '--format=%ct', '--', file_path],
+             cwd=repo_path,
+             capture_output=True,
+             text=True,
+             check=True
+         )
+         if result.stdout.strip():
+             timestamp = int(result.stdout.strip())
+             return datetime.fromtimestamp(timestamp)
+         return None
+     except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
+         return None
+
+
+ def get_changed_files_since_commit(commit_hash: str, repo_path: str = '.') -> Set[str]:
+     """Get all files that changed since a given commit."""
+     try:
+         result = subprocess.run(
+             ['git', 'diff', '--name-only', commit_hash, 'HEAD'],
+             cwd=repo_path,
+             capture_output=True,
+             text=True,
+             check=True
+         )
+         files = set()
+         for line in result.stdout.strip().split('\n'):
+             if line.strip():
+                 files.add(line.strip())
+         return files
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return set()
+
+
+ def get_all_python_files(repo_path: str = '.') -> Set[str]:
+     """Get all Python files tracked by git."""
+     try:
+         result = subprocess.run(
+             ['git', 'ls-files', '*.py'],
+             cwd=repo_path,
+             capture_output=True,
+             text=True,
+             check=True
+         )
+         files = set()
+         for line in result.stdout.strip().split('\n'):
+             if line.strip():
+                 files.add(line.strip())
+         return files
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         return set()
+
+
+ def save_training_metadata(output_dir: str, commit_hash: Optional[str],
+                            commit_timestamp: Optional[datetime],
+                            files_trained: Set[str]):
+     """Save training metadata including git commit info."""
+     metadata_path = Path(output_dir) / 'training_metadata.json'
+
+     metadata = {
+         'commit_hash': commit_hash,
+         'commit_timestamp': commit_timestamp.isoformat() if commit_timestamp else None,
+         'training_timestamp': datetime.now().isoformat(),
+         'files_trained': sorted(list(files_trained))
+     }
+
+     with open(metadata_path, 'w') as f:
+         json.dump(metadata, f, indent=2)
+
+
+ def load_training_metadata(output_dir: str) -> Optional[Dict]:
+     """Load training metadata."""
+     metadata_path = Path(output_dir) / 'training_metadata.json'
+
+     if not metadata_path.exists():
+         return None
+
+     try:
+         with open(metadata_path, 'r') as f:
+             metadata = json.load(f)
+         return metadata
+     except (json.JSONDecodeError, IOError):
+         return None
+
+
+ def get_changed_files_for_retraining(output_dir: str, repo_path: str = '.') -> Set[str]:
+     """Get files that have changed since last training."""
+     metadata = load_training_metadata(output_dir)
+
+     if not metadata:
+         # No previous training, return all files
+         return get_all_python_files(repo_path)
+
+     commit_hash = metadata.get('commit_hash')
+     if not commit_hash:
+         # No commit hash stored, return all files
+         return get_all_python_files(repo_path)
+
+     # Get files changed since that commit
+     changed_files = get_changed_files_since_commit(commit_hash, repo_path)
+
+     # Files recorded by the previous training run (currently unused; deletions drop out below)
+     files_trained = set(metadata.get('files_trained', []))
+
+     # Restrict the changed files to the Python files git still tracks
+     all_python_files = get_all_python_files(repo_path)
+     return changed_files.intersection(all_python_files)
+
+
+ def extract_files_from_jsonl(jsonl_path: str) -> Set[str]:
+     """Extract file paths from nodes.jsonl or edges.jsonl."""
+     files = set()
+     try:
+         with open(jsonl_path, 'r') as f:
+             for line in f:
+                 if not line.strip():
+                     continue
+                 try:
+                     data = json.loads(line)
+                     # Check for file path in metadata
+                     if 'meta' in data and isinstance(data['meta'], dict):
+                         file_path = data['meta'].get('file', '')
+                         if file_path:
+                             files.add(file_path)
+                 except json.JSONDecodeError:
+                     continue
+     except IOError:
+         pass
+     return files
+
+
+ def filter_jsonl_by_files(jsonl_path: str, allowed_files: Set[str],
+                           output_path: str) -> int:
+     """Filter a JSONL file to only include entries from allowed files."""
+     count = 0
+     with open(output_path, 'w') as out_f:
+         with open(jsonl_path, 'r') as in_f:
+             for line in in_f:
+                 if not line.strip():
+                     continue
+                 try:
+                     data = json.loads(line)
+                     # Check if this entry belongs to an allowed file
+                     file_path = ''
+                     if 'meta' in data and isinstance(data['meta'], dict):
+                         file_path = data['meta'].get('file', '')
+
+                     # Edges carry src/dst references rather than a file path
+                     if 'src' in data or 'dst' in data:
+                         # Edge entry: kept unconditionally, since checking whether either
+                         # endpoint lies in an allowed file would require the node set
+                         out_f.write(line)
+                         count += 1
+                     elif file_path in allowed_files or not file_path:
+                         # Node or type entry - include if file matches or no file specified
+                         out_f.write(line)
+                         count += 1
+                 except json.JSONDecodeError:
+                     continue
+     return count
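
A rough sketch of how these git helpers could drive an incremental retrain. The module path for this file is not shown in the diff, and the directory and JSONL names ('model_out', 'nodes.jsonl', 'nodes_changed.jsonl') are illustrative placeholders rather than paths the package prescribes.

# Import path omitted on purpose: the diff does not show this module's filename.
# from tricoder.<module> import (get_git_commit_hash, get_git_commit_timestamp,
#                                get_changed_files_for_retraining,
#                                filter_jsonl_by_files, save_training_metadata)

output_dir = 'model_out'  # placeholder directory holding training_metadata.json

# Python files changed since the commit recorded at the last training run
# (falls back to every tracked *.py file when no metadata exists yet)
changed = get_changed_files_for_retraining(output_dir)

if changed:
    # Write the subset of nodes.jsonl that came from changed files
    # (edge entries pass through unconditionally), then retrain on it
    kept = filter_jsonl_by_files('nodes.jsonl', changed, 'nodes_changed.jsonl')
    print(f'{kept} entries kept for retraining')

# After retraining, record the commit the refreshed model corresponds to
save_training_metadata(output_dir,
                       get_git_commit_hash(),
                       get_git_commit_timestamp(),
                       files_trained=changed)
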