tricoder-1.2.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tricoder/optimize.py ADDED
@@ -0,0 +1,263 @@
+ """Optimization utilities for filtering nodes and edges."""
+ import json
+ from collections import defaultdict
+ from typing import Dict, Optional, Set, Tuple
+ from .model import DEFAULT_EXCLUDED_KEYWORDS
+
+
+ def is_generic_name(name: str, kind: str) -> bool:
+     """Check if a symbol name is too generic to be useful."""
+     name_lower = name.lower().strip()
+
+     # Single-character names (classes/functions may be intentionally short)
+     if len(name_lower) <= 1 and kind in ['var', 'import']:
+         return True
+
+     # Very short names for variables
+     if len(name_lower) <= 2 and kind == 'var':
+         return True
+
+     # Common generic names
+     generic_names = {
+         'var', 'variable', 'val', 'value', 'item', 'obj', 'object', 'data', 'result',
+         'temp', 'tmp', 'arg', 'args', 'kwarg', 'kwargs', 'param', 'params', 'elem',
+         'element', 'entry', 'record', 'row', 'col', 'column', 'idx', 'index', 'i', 'j', 'k',
+         'x', 'y', 'z', 'n', 'm', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'a', 'b', 'c', 'd',
+         'e', 'f', 'g', 'h', 'l', 'o', 'helper', 'util', 'utils', 'func', 'fn', 'cb', 'callback'
+     }
+
+     return name_lower in generic_names
+
+
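+ # Illustrative behaviour of the heuristic above (editor's sketch, not part
+ # of the packaged source):
+ #     is_generic_name('tmp', 'var')                -> True   (in generic_names)
+ #     is_generic_name('x', 'function')             -> True   (single letters are listed)
+ #     is_generic_name('ab', 'var')                 -> True   (<= 2 chars for variables)
+ #     is_generic_name('parse_config', 'function')  -> False
+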
+ def optimize_nodes_and_edges(
+     nodes_path: str,
+     edges_path: str,
+     types_path: Optional[str] = None,
+     output_nodes: Optional[str] = None,
+     output_edges: Optional[str] = None,
+     output_types: Optional[str] = None,
+     min_edge_weight: float = 0.3,
+     remove_isolated: bool = True,
+     remove_generic_names: bool = True,
+     excluded_keywords: Optional[Set[str]] = None
+ ) -> Tuple[int, int, int, Dict]:
+     """
+     Optimize nodes and edges by filtering out low-value entries.
+
+     Args:
+         nodes_path: Path to input nodes.jsonl
+         edges_path: Path to input edges.jsonl
+         types_path: Path to input types.jsonl (optional)
+         output_nodes: Path to output nodes.jsonl (default: overwrites input)
+         output_edges: Path to output edges.jsonl (default: overwrites input)
+         output_types: Path to output types.jsonl (default: overwrites input)
+         min_edge_weight: Minimum edge weight to keep
+         remove_isolated: Whether to remove nodes with no edges
+         remove_generic_names: Whether to remove nodes with generic names
+         excluded_keywords: Additional keywords to exclude
+
+     Returns:
+         Tuple of (nodes_removed, edges_removed, types_removed, stats)
+     """
+     if excluded_keywords is None:
+         excluded_keywords = DEFAULT_EXCLUDED_KEYWORDS
+
+     # Load all data
+     nodes = []
+     node_ids = set()
+
+     with open(nodes_path, 'r') as f:
+         for line in f:
+             if not line.strip():
+                 continue
+             node = json.loads(line)
+             nodes.append(node)
+             node_ids.add(node['id'])
+
+     edges = []
+     with open(edges_path, 'r') as f:
+         for line in f:
+             if not line.strip():
+                 continue
+             edge = json.loads(line)
+             edges.append(edge)
+
+     types = []
+     if types_path:
+         try:
+             with open(types_path, 'r') as f:
+                 for line in f:
+                     if not line.strip():
+                         continue
+                     type_entry = json.loads(line)
+                     types.append(type_entry)
+         except FileNotFoundError:
+             types = []
+
+     original_node_count = len(nodes)
+     original_edge_count = len(edges)
+     original_type_count = len(types)
+
+     # Build edge statistics (degree counts for each node)
+     node_edge_count = defaultdict(int)
+     node_in_degree = defaultdict(int)
+     node_out_degree = defaultdict(int)
+
+     for edge in edges:
+         src = edge['src']
+         dst = edge['dst']
+
+         if src in node_ids and dst in node_ids:
+             node_edge_count[src] += 1
+             node_edge_count[dst] += 1
+             node_out_degree[src] += 1
+             node_in_degree[dst] += 1
+
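+     # For reference, each nodes.jsonl / edges.jsonl line is assumed to look
+     # roughly like this (shape inferred from the fields read above, not from
+     # a documented schema; the ids are hypothetical):
+     #     {"id": "pkg/mod.py::parse", "name": "parse", "kind": "function"}
+     #     {"src": "pkg/mod.py::parse", "dst": "pkg/mod.py::load", "weight": 0.8}
+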
+     # Filter nodes
+     nodes_to_keep = set()
+     nodes_to_remove = set()
+
+     for node in nodes:
+         node_id = node['id']
+         name = node.get('name', '')
+         kind = node.get('kind', '')
+
+         # Decide whether this node should be removed
+         should_remove = False
+
+         # Remove if the name is an excluded keyword
+         if name.lower() in excluded_keywords:
+             should_remove = True
+
+         # Remove generic names
+         if remove_generic_names and is_generic_name(name, kind):
+             should_remove = True
+
+         # Remove isolated nodes (no edges), but keep structural kinds
+         if remove_isolated and node_edge_count[node_id] == 0:
+             if kind not in ['file', 'class', 'function']:
+                 should_remove = True
+
+         if should_remove:
+             nodes_to_remove.add(node_id)
+         else:
+             nodes_to_keep.add(node_id)
+
+     # Filter edges
+     edges_to_keep = []
+     for edge in edges:
+         src = edge['src']
+         dst = edge['dst']
+         weight = float(edge.get('weight', 1.0))
+
+         # Drop edges whose endpoints were removed
+         if src in nodes_to_remove or dst in nodes_to_remove:
+             continue
+
+         # Drop low-weight edges
+         if weight < min_edge_weight:
+             continue
+
+         edges_to_keep.append(edge)
+
+     # Keep only types attached to surviving nodes
+     types_to_keep = []
+     for type_entry in types:
+         symbol_id = type_entry.get('symbol', '')
+         if symbol_id in nodes_to_keep:
+             types_to_keep.append(type_entry)
+
+     # Keep only surviving nodes
+     filtered_nodes = [node for node in nodes if node['id'] in nodes_to_keep]
+
+     # Write optimized files
+     output_nodes_path = output_nodes or nodes_path
+     output_edges_path = output_edges or edges_path
+     output_types_path = output_types or types_path
+
+     with open(output_nodes_path, 'w') as f:
+         for node in filtered_nodes:
+             f.write(json.dumps(node) + '\n')
+
+     with open(output_edges_path, 'w') as f:
+         for edge in edges_to_keep:
+             f.write(json.dumps(edge) + '\n')
+
+     # Rewrite the types file whenever types were loaded, even if every entry
+     # was filtered out; otherwise stale entries would survive in place
+     if output_types_path and types:
+         with open(output_types_path, 'w') as f:
+             for type_entry in types_to_keep:
+                 f.write(json.dumps(type_entry) + '\n')
+
+     nodes_removed = original_node_count - len(filtered_nodes)
+     edges_removed = original_edge_count - len(edges_to_keep)
+     types_removed = original_type_count - len(types_to_keep)
+
+     # Calculate detailed statistics
+     stats = {
+         'original': {
+             'nodes': original_node_count,
+             'edges': original_edge_count,
+             'types': original_type_count
+         },
+         'final': {
+             'nodes': len(filtered_nodes),
+             'edges': len(edges_to_keep),
+             'types': len(types_to_keep)
+         },
+         'removed': {
+             'nodes': nodes_removed,
+             'edges': edges_removed,
+             'types': types_removed
+         },
+         'removal_reasons': {
+             'excluded_keywords': 0,
+             'generic_names': 0,
+             'isolated': 0,
+             'low_weight_edges': 0,
+             'orphaned_edges': 0  # Edges dropped because an endpoint was removed
+         },
+         'by_kind': defaultdict(lambda: {'original': 0, 'removed': 0, 'final': 0})
+     }
+
+     # Count removal reasons (a node may be counted under several reasons)
+     for node in nodes:
+         node_id = node['id']
+         name = node.get('name', '')
+         kind = node.get('kind', '')
+
+         stats['by_kind'][kind]['original'] += 1
+
+         if node_id in nodes_to_remove:
+             stats['by_kind'][kind]['removed'] += 1
+             if name.lower() in excluded_keywords:
+                 stats['removal_reasons']['excluded_keywords'] += 1
+             if remove_generic_names and is_generic_name(name, kind):
+                 stats['removal_reasons']['generic_names'] += 1
+             # Mirror the filter above: structural kinds are never removed
+             # for being isolated, so don't attribute that reason to them
+             if remove_isolated and node_edge_count[node_id] == 0 and kind not in ['file', 'class', 'function']:
+                 stats['removal_reasons']['isolated'] += 1
+         else:
+             stats['by_kind'][kind]['final'] += 1
+
+     # Count edge removal reasons
+     orphaned_edges = 0
+     low_weight_edges = 0
+     for edge in edges:
+         src = edge['src']
+         dst = edge['dst']
+         weight = float(edge.get('weight', 1.0))
+
+         if src in nodes_to_remove or dst in nodes_to_remove:
+             orphaned_edges += 1
+         elif weight < min_edge_weight:
+             low_weight_edges += 1
+
+     stats['removal_reasons']['orphaned_edges'] = orphaned_edges
+     stats['removal_reasons']['low_weight_edges'] = low_weight_edges
+
+     return nodes_removed, edges_removed, types_removed, stats
+
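+
+ # Minimal usage sketch (editor's example; the file paths are hypothetical):
+ #
+ #     from tricoder.optimize import optimize_nodes_and_edges
+ #
+ #     nodes_removed, edges_removed, types_removed, stats = optimize_nodes_and_edges(
+ #         'out/nodes.jsonl',
+ #         'out/edges.jsonl',
+ #         types_path='out/types.jsonl',
+ #         min_edge_weight=0.5,
+ #     )
+ #     print(stats['removal_reasons'])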
@@ -0,0 +1,196 @@
+ """Subtoken extraction and normalization utilities."""
+ import re
+ import string
+ from typing import List, Tuple
+
+
+ class SimpleStemmer:
+     """Lightweight Porter-like stemmer for code subtokens."""
+
+     def __init__(self):
+         # Common suffixes to strip (each maps to its replacement)
+         self.suffixes = [
+             ('ing', ''),
+             ('ed', ''),
+             ('er', ''),
+             ('est', ''),
+             ('ly', ''),
+             ('tion', ''),
+             ('sion', ''),
+             ('ness', ''),
+             ('ment', ''),
+             ('able', ''),
+             ('ible', ''),
+             ('ful', ''),
+             ('less', ''),
+             ('ize', ''),
+             ('ise', ''),
+         ]
+
+     def stem(self, word: str) -> str:
+         """Apply stemming to a word."""
+         if len(word) <= 3:
+             return word
+
+         word_lower = word.lower()
+
+         # Strip the first matching suffix, keeping a stem of at least three characters
+         for suffix, replacement in self.suffixes:
+             if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 2:
+                 return word_lower[:-len(suffix)] + replacement
+
+         return word_lower
+
+
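+ # Expected behaviour of the stemmer above (editor's sketch): words of three
+ # characters or fewer pass through; longer ones lose the first matching suffix:
+ #     SimpleStemmer().stem('parsing') -> 'pars'
+ #     SimpleStemmer().stem('parser')  -> 'pars'
+ #     SimpleStemmer().stem('get')     -> 'get'
+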
+ def split_camel_case(name: str) -> List[str]:
+     """Split a camelCase or PascalCase identifier into tokens."""
+     # Scan character by character, closing a token at lowercase->uppercase
+     # boundaries and keeping runs of uppercase letters together
+     # (e.g. "XMLParser" -> ["XML", "Parser"])
+     tokens = []
+     current_token = ""
+
+     for char in name:
+         if char.isupper():
+             if current_token and not current_token[-1].isupper():
+                 # End of a lowercase token; start a new uppercase token
+                 tokens.append(current_token)
+                 current_token = char
+             else:
+                 # Continue an uppercase run
+                 current_token += char
+         elif char.islower() or char.isdigit():
+             if current_token and current_token[-1].isupper() and len(current_token) > 1:
+                 # An uppercase run ends here; its last letter starts the next token
+                 tokens.append(current_token[:-1])
+                 current_token = current_token[-1] + char
+             else:
+                 current_token += char
+         else:
+             # Non-alphanumeric character: token boundary
+             if current_token:
+                 tokens.append(current_token)
+                 current_token = ""
+
+     if current_token:
+         tokens.append(current_token)
+
+     return [t for t in tokens if t]
+
+
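+ # Examples for the scanner above (editor's sketch):
+ #     split_camel_case('camelCase') -> ['camel', 'Case']
+ #     split_camel_case('XMLParser') -> ['XML', 'Parser']
+ #     split_camel_case('foo_bar')   -> ['foo', 'bar']
+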
+ def split_snake_case(name: str) -> List[str]:
+     """Split a snake_case identifier into tokens."""
+     return [t for t in name.split('_') if t]
+
+
+ def split_kebab_case(name: str) -> List[str]:
+     """Split a kebab-case identifier into tokens."""
+     return [t for t in name.split('-') if t]
+
+
+ def split_numeric(name: str) -> List[str]:
+     """Split an identifier at numeric boundaries."""
+     # Split on transitions between digits and non-digits
+     parts = re.split(r'(\d+)', name)
+     tokens = []
+     for part in parts:
+         if part:
+             if part.isdigit():
+                 tokens.append(part)
+             else:
+                 # Further split non-numeric fragments by case changes
+                 camel_tokens = split_camel_case(part)
+                 tokens.extend(camel_tokens)
+     return [t for t in tokens if t]
+
+
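+ # Example for split_numeric (editor's sketch): digits become their own tokens
+ # and the remaining fragments are re-split by case:
+ #     split_numeric('utf8Decoder') -> ['utf', '8', 'Decoder']
+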
+ def extract_subtokens(symbol_name: str, normalize: bool = True) -> Tuple[List[str], List[str]]:
+     """
+     Extract subtokens from a symbol name.
+
+     Args:
+         symbol_name: name of the symbol
+         normalize: whether to normalize subtokens (lowercase + stem)
+
+     Returns:
+         Tuple of (raw_subtokens, normalized_subtokens)
+     """
+     if not symbol_name:
+         return [], []
+
+     # Collect candidates from several splitting strategies; a set is used,
+     # so token order is unspecified
+     tokens = set()
+
+     # Split on common delimiters first
+     parts = re.split(r'[_\-\s\.]+', symbol_name)
+     for part in parts:
+         if not part:
+             continue
+
+         # camelCase splitting
+         camel_tokens = split_camel_case(part)
+         tokens.update(camel_tokens)
+
+         # Numeric-boundary splitting
+         numeric_tokens = split_numeric(part)
+         tokens.update(numeric_tokens)
+
+     # Also split the full name as snake_case and kebab-case
+     snake_tokens = split_snake_case(symbol_name)
+     tokens.update(snake_tokens)
+
+     kebab_tokens = split_kebab_case(symbol_name)
+     tokens.update(kebab_tokens)
+
+     # Drop empty tokens
+     raw_tokens = [t for t in tokens if t]
+
+     # Normalize if requested
+     normalized_tokens = []
+     if normalize:
+         stemmer = SimpleStemmer()
+         for token in raw_tokens:
+             # Strip surrounding punctuation
+             cleaned = token.strip(string.punctuation)
+             if cleaned:
+                 # Lowercase and stem
+                 normalized = stemmer.stem(cleaned.lower())
+                 if normalized:
+                     normalized_tokens.append(normalized)
+     else:
+         normalized_tokens = raw_tokens
+
+     return raw_tokens, normalized_tokens
+
+
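+ # Rough illustration (editor's sketch; tokens come from a set, so the order
+ # shown is not guaranteed):
+ #     raw, norm = extract_subtokens('getUserName')
+ #     # raw  contains 'get', 'User', 'Name' (plus the unsplit 'getUserName')
+ #     # norm contains 'get', 'user', 'name' (plus 'getusername')
+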
+ def get_file_hierarchy(file_path: str) -> Tuple[str, str, str]:
+     """
+     Extract file hierarchy information.
+
+     Args:
+         file_path: path to the file
+
+     Returns:
+         Tuple of (file_name, directory_path, top_level_package)
+     """
+     if not file_path:
+         return "", "", ""
+
+     # Normalize path separators
+     normalized = file_path.replace('\\', '/')
+
+     # Extract components
+     parts = normalized.split('/')
+     file_name = parts[-1] if parts else ""
+
+     # Directory is everything except the filename
+     directory_path = '/'.join(parts[:-1]) if len(parts) > 1 else ""
+
+     # Top-level package is the first non-empty, non-relative component
+     top_level = ""
+     for part in parts:
+         if part and part not in ['.', '..']:
+             top_level = part
+             break
+
+     return file_name, directory_path, top_level
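+
+ # Example (editor's sketch):
+ #     get_file_hierarchy('src/pkg/mod.py') -> ('mod.py', 'src/pkg', 'src')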