tricoder-1.2.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tricoder/typed_view.py ADDED
@@ -0,0 +1,313 @@
+ """Typed view: symbol × type-token matrix, PPMI, and SVD."""
+ import re
+ from multiprocessing import cpu_count
+ from typing import Dict, Tuple, List
+
+ import numpy as np
+ from scipy import sparse
+ from sklearn.decomposition import TruncatedSVD
+
+
+ def _get_num_workers() -> int:
+     """Get number of workers (all cores - 1, minimum 1)."""
+     return max(1, cpu_count() - 1)
+
+
+ def parse_composite_type(type_token: str) -> Tuple[List[str], List[str]]:
+     """
+     Parse composite type into constructor tokens and primitive tokens.
+
+     Examples:
+         List[int] -> (['List'], ['int'])
+         Dict[str, int] -> (['Dict'], ['str', 'int'])
+         Optional[T] -> (['Optional'], ['T'])
+         List[Dict[str, int]] -> (['List', 'Dict'], ['str', 'int'])
+
+     Args:
+         type_token: type token string
+
+     Returns:
+         Tuple of (constructor_tokens, primitive_tokens)
+     """
+     constructors: List[str] = []
+     primitives: List[str] = []
+     known_primitives = {'int', 'str', 'float', 'bool', 'None'}
+
+     # Walk identifiers left to right. An identifier immediately followed by
+     # '[' is a generic constructor (List, Dict, Optional, ...); this also
+     # handles nested generics such as List[Dict[str, int]]. Anything else
+     # is a primitive (lowercase names, known primitives, single-letter type
+     # variables such as T) or a plain constructor (other capitalized names).
+     for match in re.finditer(r'([A-Za-z_][A-Za-z0-9_]*)\s*(\[?)', type_token):
+         name, bracket = match.group(1), match.group(2)
+         if bracket == '[':
+             constructors.append(name)
+         elif name in known_primitives or name[0].islower() or len(name) == 1:
+             primitives.append(name)
+         else:
+             constructors.append(name)
+
+     return constructors, primitives
+
+
+ def expand_type_semantics(node_types: Dict[int, Dict[str, int]],
+                           type_to_idx: Dict[str, int],
+                           num_nodes: int) -> Tuple[Dict[int, Dict[str, int]], Dict[str, int]]:
+     """
+     Expand type tokens into constructors and primitives.
+
+     Args:
+         node_types: mapping from node_idx to {type_token: count}
+         type_to_idx: mapping from type token to index
+         num_nodes: number of nodes
+
+     Returns:
+         Tuple of (expanded_node_types, expanded_type_to_idx)
+     """
+     expanded_node_types = {}
+     expanded_type_to_idx = dict(type_to_idx)
+
+     # Start with existing type indices
+     current_idx = len(expanded_type_to_idx)
+
+     # Process each node's types
+     for node_idx, types_dict in node_types.items():
+         expanded_types = dict(types_dict)  # Start with original types
+
+         for type_token, count in types_dict.items():
+             # Parse composite type
+             constructors, primitives = parse_composite_type(type_token)
+
+             # Add constructor tokens
+             for constructor in constructors:
+                 if constructor not in expanded_type_to_idx:
+                     expanded_type_to_idx[constructor] = current_idx
+                     current_idx += 1
+
+                 expanded_types[constructor] = expanded_types.get(constructor, 0) + int(count * 0.5)
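+                 # NOTE: int() truncation means a count of 1 contributes 0 here
+                 # (and for primitives below); the fractional weights down-weight
+                 # derived tokens relative to the original composite token.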
+
+             # Add primitive tokens
+             for primitive in primitives:
+                 if primitive not in expanded_type_to_idx:
+                     expanded_type_to_idx[primitive] = current_idx
+                     current_idx += 1
+
+                 expanded_types[primitive] = expanded_types.get(primitive, 0) + int(count * 0.25)
+
+         expanded_node_types[node_idx] = expanded_types
+
+     return expanded_node_types, expanded_type_to_idx
+
+
+ def build_type_matrix(node_types: Dict[int, Dict[str, int]],
+                       type_to_idx: Dict[str, int],
+                       num_nodes: int,
+                       expand_types: bool = True) -> Tuple[sparse.csr_matrix, Dict[str, int]]:
+     """
+     Build sparse symbol × type-token matrix with optional type expansion.
+
+     Args:
+         node_types: mapping from node_idx to {type_token: count}
+         type_to_idx: mapping from type token to index
+         num_nodes: number of nodes (may include subtokens, but node_types only has original nodes)
+         expand_types: whether to expand composite types
+
+     Returns:
+         Tuple of (type_matrix, final_type_to_idx)
+     """
+     final_node_types = node_types
+     final_type_to_idx = type_to_idx
+
+     if expand_types:
+         # Use original num_nodes (before subtokens) for expansion
+         original_num_nodes = max(node_types.keys()) + 1 if node_types else 0
+         final_node_types, final_type_to_idx = expand_type_semantics(
+             node_types, type_to_idx, original_num_nodes
+         )
+
+     num_types = len(final_type_to_idx)
+     rows = []
+     cols = []
+     data = []
+
+     for node_idx, types_dict in final_node_types.items():
+         for type_token, count in types_dict.items():
+             if type_token in final_type_to_idx:
+                 type_idx = final_type_to_idx[type_token]
+                 rows.append(node_idx)
+                 cols.append(type_idx)
+                 data.append(float(count))
+
+     # Pad matrix to num_nodes rows (subtokens will have zero rows)
+     type_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(num_nodes, num_types))
+     return type_matrix, final_type_to_idx
+
+
+ def compute_ppmi_types(type_matrix: sparse.csr_matrix, k: float = 1.0) -> sparse.csr_matrix:
+     """
+     Compute PPMI for type matrix.
+
+     Args:
+         type_matrix: sparse symbol × type matrix
+         k: shift parameter
+
+     Returns:
+         PPMI matrix (sparse)
+     """
+     cooc = type_matrix.copy()
+     cooc.data = np.maximum(cooc.data, 0)
+     # Drop explicit zeros so cooc.data stays aligned with cooc.nonzero()
+     cooc.eliminate_zeros()
+
+     # Compute marginals
+     row_sums = np.array(cooc.sum(axis=1)).flatten()
+     col_sums = np.array(cooc.sum(axis=0)).flatten()
+     total = float(cooc.sum())
+
+     # Avoid division by zero
+     row_sums = np.maximum(row_sums, 1e-10)
+     col_sums = np.maximum(col_sums, 1e-10)
+     total = max(total, 1e-10)
+
+     # Compute PMI over the stored (row, col) pairs
+     rows, cols = cooc.nonzero()
+     values = cooc.data
+
+     p_ij = values / total
+     p_i = row_sums[rows] / total
+     p_j = col_sums[cols] / total
+
+     pmi = np.log(p_ij / (p_i * p_j + 1e-10) + 1e-10)
+     ppmi = np.maximum(pmi, 0.0)
+
+     ppmi_matrix = sparse.csr_matrix((ppmi, (rows, cols)), shape=cooc.shape)
+     return ppmi_matrix
+
+
+ def compute_typed_view(node_types: Dict[int, Dict[str, int]],
+                        type_to_idx: Dict[str, int],
+                        num_nodes: int,
+                        dim: int,
+                        random_state: int = 42,
+                        n_jobs: int = -1,
+                        expand_types: bool = True,
+                        gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray, Dict[str, int]]:
+     """
+     Compute typed view embeddings with optional type expansion.
+
+     Returns:
+         embeddings: node embeddings from typed view
+         svd_components: SVD components for reconstruction
+         final_type_to_idx: expanded type token mapping
+     """
+     type_matrix, final_type_to_idx = build_type_matrix(
+         node_types, type_to_idx, num_nodes, expand_types=expand_types
+     )
+     ppmi = compute_ppmi_types(type_matrix)
+     embeddings, svd_components = reduce_dimensions_ppmi(ppmi, dim, random_state, n_jobs, gpu_accelerator)
+     return embeddings, svd_components, final_type_to_idx
+
+
+ def reduce_dimensions_ppmi(ppmi: sparse.csr_matrix, dim: int, random_state: int = 42,
+                            n_jobs: int = -1, gpu_accelerator=None) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Reduce PPMI matrix dimensionality using Truncated SVD (GPU-accelerated if available).
+
+     Args:
+         ppmi: PPMI matrix
+         dim: target dimensionality
+         random_state: random seed
+         n_jobs: number of parallel jobs (not used, kept for API consistency)
+         gpu_accelerator: Optional GPUAccelerator instance for GPU acceleration
+
+     Returns:
+         Reduced embeddings matrix and SVD components
+     """
+     num_features = ppmi.shape[1]
+     actual_dim = min(dim, num_features)
+
+     # Try GPU acceleration if available
+     if gpu_accelerator and gpu_accelerator.use_gpu:
+         try:
+             # Convert sparse matrix to dense for GPU SVD
+             if ppmi.shape[0] * ppmi.shape[1] < 50_000_000:  # ~50M elements threshold
+                 ppmi_dense = ppmi.toarray()
+                 U, S, Vt = gpu_accelerator.svd(ppmi_dense, actual_dim, random_state)
+                 embeddings = U @ np.diag(S)
+                 components = Vt
+             else:
+                 raise ValueError("Matrix too large for GPU dense SVD")
+         except Exception:
+             # Fall back to CPU
+             svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
+             embeddings = svd.fit_transform(ppmi)
+             components = svd.components_
+     else:
+         # CPU path
+         svd = TruncatedSVD(n_components=actual_dim, random_state=random_state, n_iter=5)
+         embeddings = svd.fit_transform(ppmi)
+         components = svd.components_
+
+     # Pad embeddings if needed to match requested dimension
+     if actual_dim < dim:
+         padding = np.zeros((embeddings.shape[0], dim - actual_dim))
+         embeddings = np.hstack([embeddings, padding])
+         # Pad components similarly
+         component_padding = np.zeros((dim - actual_dim, components.shape[1]))
+         components = np.vstack([components, component_padding])
+
+     return embeddings, components
tricoder-1.2.8.dist-info/METADATA ADDED
@@ -0,0 +1,306 @@
+ Metadata-Version: 2.4
+ Name: tricoder
+ Version: 1.2.8
+ Summary: TriVector Code Intelligence - Multi-view code relationship model with advanced semantic embeddings
+ License: Non-Commercial
+ License-File: LICENSE
+ License-File: LICENSE_COMMERCIAL.md
+ Keywords: code-intelligence,embeddings,semantic-analysis,code-search
+ Author: Jiri Otoupal
+ Author-email: j.f.otoupal@gmail.com
+ Requires-Python: >=3.8.1,<4.0.0
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: Other/Proprietary License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Dist: annoy (>=1.17.0)
+ Requires-Dist: click (>=8.0.0)
+ Requires-Dist: gensim (>=4.0.0)
+ Requires-Dist: numpy (>=1.21.0)
+ Requires-Dist: rich (>=13.0.0)
+ Requires-Dist: scikit-learn (>=1.0.0)
+ Requires-Dist: scipy (>=1.7.0)
+ Project-URL: Homepage, https://github.com/jiri-otoupal/tricoder
+ Project-URL: Repository, https://github.com/jiri-otoupal/tricoder
+ Description-Content-Type: text/markdown
+
+ # TriVector Embeddings for Smarter Code Search for AI Agents
+
+ [![image](https://img.shields.io/pypi/v/tricoder.svg)](https://pypi.org/project/tricoder/)
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/tricoder)](https://pypi.org/project/tricoder/)
+
+ [![Build Status](https://travis-ci.com/jiri-otoupal/pycrosskit.svg?branch=master)](https://travis-ci.com/github/jiri-otoupal/tricoder)
+ [![Downloads](https://pepy.tech/badge/tricoder)](https://pepy.tech/project/tricoder)
+
+ ## TriCoder learns high-quality symbol-level embeddings from codebases using three complementary views:
+
+ 1. **Graph View**: Structural relationships via PPMI and SVD
+ 2. **Context View**: Semantic context via Node2Vec random walks and Word2Vec
+ 3. **Typed View**: Type information via type-token co-occurrence (optional; see the sketch below)
+
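+ The typed view is produced by `tricoder/typed_view.py`: it builds a sparse symbol × type-token matrix, re-weights it with PPMI, and reduces it with truncated SVD. Below is a minimal sketch of that module-level API (the toy `node_types` data is illustrative only):
+
+ ```python
+ from tricoder.typed_view import compute_typed_view
+
+ # symbol index -> {type token: occurrence count}
+ node_types = {0: {"List[int]": 4}, 1: {"Dict[str, int]": 4}}
+ type_to_idx = {"List[int]": 0, "Dict[str, int]": 1}
+
+ # Expands composite types into constructor/primitive tokens, applies
+ # PPMI = max(0, log(p_ij / (p_i * p_j))), then reduces to `dim` columns
+ # with TruncatedSVD.
+ embeddings, svd_components, final_type_to_idx = compute_typed_view(
+     node_types, type_to_idx, num_nodes=2, dim=2
+ )
+ print(embeddings.shape)  # (2, 2)
+ ```
+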
+ ## Features
+
+ - **Subtoken Semantic Graph**: Captures fine-grained semantic relationships through subtoken analysis
+ - **File & Module Hierarchy**: Leverages file/directory structure for better clustering
+ - **Static Call-Graph Expansion**: Propagates call relationships to depth 2-3
+ - **Type Semantic Expansion**: Expands composite types into constructors and primitives (see the sketch after this list)
+ - **Context Window Co-occurrence**: Captures lexical context within ±5 lines
+ - **Improved Negative Sampling**: Biased sampling for better temperature calibration
+ - **Hybrid Similarity Scoring**: Length-penalized cosine similarity
+ - **Iterative Embedding Smoothing**: Diffusion-based smoothing for better clustering
+ - **Query-Time Semantic Expansion**: Expands queries with subtokens and types
+ - **GPU Acceleration**: Supports CUDA (NVIDIA) and MPS (Mac) for faster training
+ - **Keyword Search**: Search symbols by keywords and type tokens
+ - **Graph Optimization**: Filter out low-value nodes and edges to improve training efficiency
+
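+ Type semantic expansion is handled by `parse_composite_type` in `tricoder/typed_view.py`. A minimal sketch; the expected decompositions are the ones documented in that function's docstring:
+
+ ```python
+ from tricoder.typed_view import parse_composite_type
+
+ print(parse_composite_type("List[int]"))             # (['List'], ['int'])
+ print(parse_composite_type("Dict[str, int]"))        # (['Dict'], ['str', 'int'])
+ print(parse_composite_type("Optional[T]"))           # (['Optional'], ['T'])
+ print(parse_composite_type("List[Dict[str, int]]"))  # (['List', 'Dict'], ['str', 'int'])
+ ```
+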
+ ## Installation
+
+ ### Using Poetry (Recommended)
+
+ ```bash
+ poetry install
+ ```
+
+ ### Using pip
+
+ ```bash
+ pip install .
+ ```
+
+ ### GPU Support (Optional)
+
+ For NVIDIA GPUs (CUDA):
+ ```bash
+ pip install cupy-cuda12x
+ ```
+
+ For Mac GPUs (MPS):
+ ```bash
+ pip install torch
+ ```
+
+ ## Usage
+
+ ### 1. Extract Symbols from Codebase
+
+ ```bash
+ # Basic extraction (Python files only)
+ tricoder extract --input-dir /path/to/codebase
+
+ # Extract specific file types
+ tricoder extract --input-dir /path/to/codebase --extensions "py,js,ts"
+
+ # Exclude specific keywords from extraction
+ tricoder extract --input-dir /path/to/codebase --exclude-keywords debug --exclude-keywords temp
+
+ # Custom output files
+ tricoder extract --input-dir /path/to/codebase --output-nodes my_nodes.jsonl --output-edges my_edges.jsonl
+ ```
+
+ **Extraction Options:**
+ - `--input-dir`, `--root`, `-r`: Input directory to scan (default: current directory)
+ - `--extensions`, `--ext`: Comma-separated file extensions (default: `py`)
+ - `--include-dirs`, `-i`: Include only specific subdirectories (can specify multiple)
+ - `--exclude-dirs`, `-e`: Exclude directories (default: `.venv`, `__pycache__`, `.git`, `node_modules`, `.pytest_cache`)
+ - `--exclude-keywords`, `--exclude`: Exclude symbol names (appended to default excluded keywords)
+ - `--output-nodes`, `-n`: Output file for nodes (default: `nodes.jsonl`)
+ - `--output-edges`, `-d`: Output file for edges (default: `edges.jsonl`)
+ - `--output-types`, `-t`: Output file for types (default: `types.jsonl`)
+ - `--no-gitignore`: Disable `.gitignore` filtering (enabled by default)
+
+ ### 2. Optimize Graph (Optional)
+
+ Reduce graph size by filtering low-value nodes and edges:
+
+ ```bash
+ # Basic optimization (overwrites input files)
+ tricoder optimize
+
+ # Custom output files
+ tricoder optimize --output-nodes nodes_opt.jsonl --output-edges edges_opt.jsonl
+
+ # Customize thresholds
+ tricoder optimize --min-edge-weight 0.5 --remove-isolated --remove-generic
+
+ # Keep isolated nodes
+ tricoder optimize --keep-isolated
+ ```
+
+ **Optimization Options:**
+ - `--nodes`, `-n`: Input nodes file (default: `nodes.jsonl`)
+ - `--edges`, `-e`: Input edges file (default: `edges.jsonl`)
+ - `--types`, `-t`: Input types file (default: `types.jsonl`, optional)
+ - `--output-nodes`, `-N`: Output nodes file (default: overwrites input)
+ - `--output-edges`, `-E`: Output edges file (default: overwrites input)
+ - `--output-types`, `-T`: Output types file (default: overwrites input)
+ - `--min-edge-weight`: Minimum edge weight to keep (default: `0.3`)
+ - `--remove-isolated`: Remove nodes with no edges (default: `True`)
+ - `--keep-isolated`: Keep isolated nodes (overrides `--remove-isolated`)
+ - `--remove-generic`: Remove generic names (default: `True`)
+ - `--keep-generic`: Keep generic names (overrides `--remove-generic`)
+ - `--exclude-keywords`, `--exclude`: Additional keywords to exclude (can specify multiple)
+
+ ### 3. Train Model
+
+ ```bash
+ # Basic training
+ tricoder train --out model_output
+
+ # With GPU acceleration
+ tricoder train --out model_output --use-gpu
+
+ # Fast mode (faster training, slightly lower quality)
+ tricoder train --out model_output --fast
+
+ # Custom dimensions
+ tricoder train --out model_output --graph-dim 128 --context-dim 128 --final-dim 256
+
+ # Custom training parameters
+ tricoder train --out model_output --num-walks 20 --walk-length 100 --train-ratio 0.9
+ ```
+
+ **Training Options:**
+ - `--nodes`, `-n`: Path to nodes.jsonl (default: `nodes.jsonl`)
+ - `--edges`, `-e`: Path to edges.jsonl (default: `edges.jsonl`)
+ - `--types`, `-t`: Path to types.jsonl (default: `types.jsonl`, optional)
+ - `--out`, `-o`: Output directory (required)
+ - `--graph-dim`: Graph view dimensionality (default: auto-calculated)
+ - `--context-dim`: Context view dimensionality (default: auto-calculated)
+ - `--typed-dim`: Typed view dimensionality (default: auto-calculated)
+ - `--final-dim`: Final fused embedding dimensionality (default: auto-calculated)
+ - `--num-walks`: Number of random walks per node (default: `10`)
+ - `--walk-length`: Length of each random walk (default: `80`)
+ - `--train-ratio`: Fraction of edges for training (default: `0.8`)
+ - `--random-state`: Random seed for reproducibility (default: `42`)
+ - `--fast`: Enable fast mode (reduces parameters for faster training)
+ - `--use-gpu`: Enable GPU acceleration (CUDA or MPS, falls back to CPU if unavailable)
+
+ ### 4. Query Model
+
+ ```bash
+ # Query by symbol ID
+ tricoder query --model-dir model_output --symbol function_my_function_0001 --top-k 10
+
+ # Search by keywords
+ tricoder query --model-dir model_output --keywords "database connection" --top-k 10
+
+ # Multi-word phrases (use quotes)
+ tricoder query --model-dir model_output --keywords '"user authentication" login'
+
+ # Exclude specific keywords from results
+ tricoder query --model-dir model_output --keywords handler --exclude-keywords debug --exclude-keywords temp
+
+ # Interactive mode
+ tricoder query --model-dir model_output --interactive
+ ```
+
+ **Query Options:**
+ - `--model-dir`, `-m`: Path to model directory (required)
+ - `--symbol`, `-s`: Symbol ID to query
+ - `--keywords`, `-w`: Keywords to search for (use quotes for multi-word: `"my function"`)
+ - `--top-k`, `-k`: Number of results to return (default: `10`)
+ - `--exclude-keywords`, `--exclude`: Additional keywords to exclude (appended to default excluded keywords)
+ - `--interactive`, `-i`: Interactive mode
+
+ ### 5. Incremental Retraining
+
+ Retrain only on changed files since last training:
+
+ ```bash
+ # Basic retraining (detects changed files automatically)
+ tricoder retrain --model-dir model_output --codebase-dir /path/to/codebase
+
+ # Force full retraining
+ tricoder retrain --model-dir model_output --codebase-dir /path/to/codebase --force
+
+ # Custom training parameters
+ tricoder retrain --model-dir model_output --codebase-dir /path/to/codebase --num-walks 20
+ ```
+
+ **Retrain Options:**
+ - `--model-dir`, `-m`: Path to existing model directory (required)
+ - `--codebase-dir`, `-c`: Path to codebase root (default: current directory)
+ - `--output-nodes`, `-n`: Temporary nodes file (default: `nodes_retrain.jsonl`)
+ - `--output-edges`, `-d`: Temporary edges file (default: `edges_retrain.jsonl`)
+ - `--output-types`, `-t`: Temporary types file (default: `types_retrain.jsonl`)
+ - `--graph-dim`, `--context-dim`, `--typed-dim`, `--final-dim`: Override model dimensions
+ - `--num-walks`, `--walk-length`, `--train-ratio`, `--random-state`: Training parameters
+ - `--force`: Force full retraining even if no files changed
+
+ ## Examples
+
+ ### Complete Workflow
+
+ ```bash
+ # 1. Extract symbols from codebase
+ tricoder extract --input-dir ./my_project --extensions "py,js"
+
+ # 2. (Optional) Optimize the graph
+ tricoder optimize --min-edge-weight 0.4
+
+ # 3. Train model with GPU acceleration
+ tricoder train --out ./models/my_project --use-gpu
+
+ # 4. Query for similar symbols
+ tricoder query --model-dir ./models/my_project --keywords "database" --top-k 5
+
+ # 5. After code changes, retrain incrementally
+ tricoder retrain --model-dir ./models/my_project --codebase-dir ./my_project
+ ```
+
+ ### Keyword Search Examples
+
+ ```bash
+ # Search for authentication-related code
+ tricoder query --model-dir model_output --keywords "auth login password"
+
+ # Search for specific function name
+ tricoder query --model-dir model_output --keywords '"process_payment"'
+
+ # Search excluding common keywords
+ tricoder query --model-dir model_output --keywords handler --exclude-keywords temp --exclude-keywords debug
+ ```
+
+ ## Requirements
+
+ - Python 3.8+
+ - numpy >= 1.21.0
+ - scipy >= 1.7.0
+ - scikit-learn >= 1.0.0
+ - gensim >= 4.0.0
+ - annoy >= 1.17.0
+ - click >= 8.0.0
+ - rich >= 13.0.0
+
+ **Optional (for GPU acceleration):**
+ - cupy-cuda12x >= 12.0.0 (for NVIDIA GPUs)
+ - torch >= 2.0.0 (for Mac GPUs or CUDA fallback)
+
+ ## License
+
+ TriCoder is available under a **Non-Commercial License**.
+
+ - ✅ **Free for non-commercial use**: Personal projects, education, research, open-source
+ - ❌ **Commercial license required**: Paid products, SaaS, commercial consulting, enterprise use
+
+ For commercial licensing inquiries, please contact: **j.f.otoupal@gmail.com**
+
+ See [LICENSE](LICENSE) for full terms and [LICENSE_COMMERCIAL.md](LICENSE_COMMERCIAL.md) for commercial license information.
+
+ <hr>
+ Did I make your life less painful?
+ <br>
+ <br>
+ Support my coffee addiction ;)
+ <br>
+ <a href="https://www.buymeacoffee.com/jiriotoupal" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy me a Coffee" style="height: 41px !important;width: 174px !important;box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a>
+
tricoder-1.2.8.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
+ tricoder/__about__.py,sha256=zxNuFkE-43tg4N6VDTNwDHbnUa7Paps028DvLJzUIQ8,212
+ tricoder/__init__.py,sha256=oBD9XnaSedZoC_9IjO03vBpU4edMRaQu06_82qPe1cg,625
+ tricoder/calibration.py,sha256=i19uCm_pWk0F3YCx8YrpakadKNe4o7pe9XflBaoc-0M,9887
+ tricoder/cli.py,sha256=FqsSv6H_Woz1GBwAZHZSxOJ4DUsn3en6rR2W89lkwI8,41548
+ tricoder/context_view.py,sha256=RuGliktScT_qVGXsrMKipEX3vFV__FBEM347PKb6zKk,8052
+ tricoder/data_loader.py,sha256=oktaQLz8rGtim9NcOB_Ewi14B4HVpAbmhOrqrgoBsPw,5044
+ tricoder/extract.py,sha256=G1iSNMz00N1keYl1HYHjNIYjrE8XWZmkrm4TMNUGecg,23777
+ tricoder/fusion.py,sha256=2TDuoKJ2CTyN4potX6XW2KTTmkiEA0ARQYiQeRj3ujY,7605
+ tricoder/git_tracker.py,sha256=bKRt2gGWSR7BzPkDZvvNjQf8XrREOeBQgZjSH-15hcg,7155
+ tricoder/gpu_utils.py,sha256=QfaKefQ2HDTDSxUMOi9jKgik9_q-N1WjjLTV5_v7tqo,16126
+ tricoder/graph_view.py,sha256=YZAPhUajSvRw52WdsTSYoBkED9cKLpC-L8mPEoT8IqE,22227
+ tricoder/model.py,sha256=2FZl1C9UPriFkH_iMEqbGeOMONEq4gcURrU9-oUhUXw,20459
+ tricoder/optimize.py,sha256=FgbWLL-x0q5fV5US8r_2nllfazbU6-aohCXB3hPt_uM,8765
+ tricoder/subtoken_utils.py,sha256=Z7eHH6pjWmG-sJ9VsTJeoL11l6AmGk3kM9VI0WzAzJM,5833
+ tricoder/train.py,sha256=cmj_y3P_htJkGi_2G2z7VhX63k8xO90jeDjQZSPqoS0,41948
+ tricoder/typed_view.py,sha256=MzC-QAgY_nahXx_BEk_YMjPMxZj_t43NKSHkQweQ4lY,11580
+ tricoder-1.2.8.dist-info/METADATA,sha256=uW2jd2-ELFqqtMguJ2Stl84B5s7LSJFSzr4NsZpW6JE,11883
+ tricoder-1.2.8.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ tricoder-1.2.8.dist-info/entry_points.txt,sha256=luwxM7BzbfdWNJvwRYVUhm3z5bDF3ZoK0M5lzGajgZs,45
+ tricoder-1.2.8.dist-info/licenses/LICENSE,sha256=POHObqU-qPg7DWoG6PVphyxu187YFhlhVp_MEBldcF8,2526
+ tricoder-1.2.8.dist-info/licenses/LICENSE_COMMERCIAL.md,sha256=Omn6tI3kUiuBMvcrXhJw2sEH_WHjzaImgDIra-WyRik,2472
+ tricoder-1.2.8.dist-info/RECORD,,
tricoder-1.2.8.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.2.1
+ Root-Is-Purelib: true
+ Tag: py3-none-any
tricoder-1.2.8.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ tricoder=tricoder.cli:cli
+