mcp-ticketer 0.12.0__py3-none-any.whl → 2.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-ticketer might be problematic. Click here for more details.
- mcp_ticketer/__init__.py +10 -10
- mcp_ticketer/__version__.py +1 -1
- mcp_ticketer/_version_scm.py +1 -0
- mcp_ticketer/adapters/aitrackdown.py +507 -6
- mcp_ticketer/adapters/asana/adapter.py +229 -0
- mcp_ticketer/adapters/asana/mappers.py +14 -0
- mcp_ticketer/adapters/github/__init__.py +26 -0
- mcp_ticketer/adapters/github/adapter.py +3229 -0
- mcp_ticketer/adapters/github/client.py +335 -0
- mcp_ticketer/adapters/github/mappers.py +797 -0
- mcp_ticketer/adapters/github/queries.py +692 -0
- mcp_ticketer/adapters/github/types.py +460 -0
- mcp_ticketer/adapters/hybrid.py +47 -5
- mcp_ticketer/adapters/jira/__init__.py +35 -0
- mcp_ticketer/adapters/jira/adapter.py +1351 -0
- mcp_ticketer/adapters/jira/client.py +271 -0
- mcp_ticketer/adapters/jira/mappers.py +246 -0
- mcp_ticketer/adapters/jira/queries.py +216 -0
- mcp_ticketer/adapters/jira/types.py +304 -0
- mcp_ticketer/adapters/linear/adapter.py +2730 -139
- mcp_ticketer/adapters/linear/client.py +175 -3
- mcp_ticketer/adapters/linear/mappers.py +203 -8
- mcp_ticketer/adapters/linear/queries.py +280 -3
- mcp_ticketer/adapters/linear/types.py +120 -4
- mcp_ticketer/analysis/__init__.py +56 -0
- mcp_ticketer/analysis/dependency_graph.py +255 -0
- mcp_ticketer/analysis/health_assessment.py +304 -0
- mcp_ticketer/analysis/orphaned.py +218 -0
- mcp_ticketer/analysis/project_status.py +594 -0
- mcp_ticketer/analysis/similarity.py +224 -0
- mcp_ticketer/analysis/staleness.py +266 -0
- mcp_ticketer/automation/__init__.py +11 -0
- mcp_ticketer/automation/project_updates.py +378 -0
- mcp_ticketer/cli/adapter_diagnostics.py +3 -1
- mcp_ticketer/cli/auggie_configure.py +17 -5
- mcp_ticketer/cli/codex_configure.py +97 -61
- mcp_ticketer/cli/configure.py +1288 -105
- mcp_ticketer/cli/cursor_configure.py +314 -0
- mcp_ticketer/cli/diagnostics.py +13 -12
- mcp_ticketer/cli/discover.py +5 -0
- mcp_ticketer/cli/gemini_configure.py +17 -5
- mcp_ticketer/cli/init_command.py +880 -0
- mcp_ticketer/cli/install_mcp_server.py +418 -0
- mcp_ticketer/cli/instruction_commands.py +6 -0
- mcp_ticketer/cli/main.py +267 -3175
- mcp_ticketer/cli/mcp_configure.py +821 -119
- mcp_ticketer/cli/mcp_server_commands.py +415 -0
- mcp_ticketer/cli/platform_detection.py +77 -12
- mcp_ticketer/cli/platform_installer.py +545 -0
- mcp_ticketer/cli/project_update_commands.py +350 -0
- mcp_ticketer/cli/setup_command.py +795 -0
- mcp_ticketer/cli/simple_health.py +12 -10
- mcp_ticketer/cli/ticket_commands.py +705 -103
- mcp_ticketer/cli/utils.py +113 -0
- mcp_ticketer/core/__init__.py +56 -6
- mcp_ticketer/core/adapter.py +533 -2
- mcp_ticketer/core/config.py +21 -21
- mcp_ticketer/core/exceptions.py +7 -1
- mcp_ticketer/core/label_manager.py +732 -0
- mcp_ticketer/core/mappers.py +31 -19
- mcp_ticketer/core/milestone_manager.py +252 -0
- mcp_ticketer/core/models.py +480 -0
- mcp_ticketer/core/onepassword_secrets.py +1 -1
- mcp_ticketer/core/priority_matcher.py +463 -0
- mcp_ticketer/core/project_config.py +132 -14
- mcp_ticketer/core/project_utils.py +281 -0
- mcp_ticketer/core/project_validator.py +376 -0
- mcp_ticketer/core/session_state.py +176 -0
- mcp_ticketer/core/state_matcher.py +625 -0
- mcp_ticketer/core/url_parser.py +425 -0
- mcp_ticketer/core/validators.py +69 -0
- mcp_ticketer/mcp/server/__main__.py +2 -1
- mcp_ticketer/mcp/server/diagnostic_helper.py +175 -0
- mcp_ticketer/mcp/server/main.py +106 -25
- mcp_ticketer/mcp/server/routing.py +723 -0
- mcp_ticketer/mcp/server/server_sdk.py +58 -0
- mcp_ticketer/mcp/server/tools/__init__.py +33 -11
- mcp_ticketer/mcp/server/tools/analysis_tools.py +854 -0
- mcp_ticketer/mcp/server/tools/attachment_tools.py +5 -5
- mcp_ticketer/mcp/server/tools/bulk_tools.py +259 -202
- mcp_ticketer/mcp/server/tools/comment_tools.py +74 -12
- mcp_ticketer/mcp/server/tools/config_tools.py +1391 -145
- mcp_ticketer/mcp/server/tools/diagnostic_tools.py +211 -0
- mcp_ticketer/mcp/server/tools/hierarchy_tools.py +870 -460
- mcp_ticketer/mcp/server/tools/instruction_tools.py +7 -5
- mcp_ticketer/mcp/server/tools/label_tools.py +942 -0
- mcp_ticketer/mcp/server/tools/milestone_tools.py +338 -0
- mcp_ticketer/mcp/server/tools/pr_tools.py +3 -7
- mcp_ticketer/mcp/server/tools/project_status_tools.py +158 -0
- mcp_ticketer/mcp/server/tools/project_update_tools.py +473 -0
- mcp_ticketer/mcp/server/tools/search_tools.py +209 -97
- mcp_ticketer/mcp/server/tools/session_tools.py +308 -0
- mcp_ticketer/mcp/server/tools/ticket_tools.py +1107 -124
- mcp_ticketer/mcp/server/tools/user_ticket_tools.py +218 -236
- mcp_ticketer/queue/queue.py +68 -0
- mcp_ticketer/queue/worker.py +1 -1
- mcp_ticketer/utils/__init__.py +5 -0
- mcp_ticketer/utils/token_utils.py +246 -0
- mcp_ticketer-2.2.13.dist-info/METADATA +1396 -0
- mcp_ticketer-2.2.13.dist-info/RECORD +158 -0
- mcp_ticketer-2.2.13.dist-info/top_level.txt +2 -0
- py_mcp_installer/examples/phase3_demo.py +178 -0
- py_mcp_installer/scripts/manage_version.py +54 -0
- py_mcp_installer/setup.py +6 -0
- py_mcp_installer/src/py_mcp_installer/__init__.py +153 -0
- py_mcp_installer/src/py_mcp_installer/command_builder.py +445 -0
- py_mcp_installer/src/py_mcp_installer/config_manager.py +541 -0
- py_mcp_installer/src/py_mcp_installer/exceptions.py +243 -0
- py_mcp_installer/src/py_mcp_installer/installation_strategy.py +617 -0
- py_mcp_installer/src/py_mcp_installer/installer.py +656 -0
- py_mcp_installer/src/py_mcp_installer/mcp_inspector.py +750 -0
- py_mcp_installer/src/py_mcp_installer/platform_detector.py +451 -0
- py_mcp_installer/src/py_mcp_installer/platforms/__init__.py +26 -0
- py_mcp_installer/src/py_mcp_installer/platforms/claude_code.py +225 -0
- py_mcp_installer/src/py_mcp_installer/platforms/codex.py +181 -0
- py_mcp_installer/src/py_mcp_installer/platforms/cursor.py +191 -0
- py_mcp_installer/src/py_mcp_installer/types.py +222 -0
- py_mcp_installer/src/py_mcp_installer/utils.py +463 -0
- py_mcp_installer/tests/__init__.py +0 -0
- py_mcp_installer/tests/platforms/__init__.py +0 -0
- py_mcp_installer/tests/test_platform_detector.py +17 -0
- mcp_ticketer/adapters/github.py +0 -1574
- mcp_ticketer/adapters/jira.py +0 -1258
- mcp_ticketer-0.12.0.dist-info/METADATA +0 -550
- mcp_ticketer-0.12.0.dist-info/RECORD +0 -91
- mcp_ticketer-0.12.0.dist-info/top_level.txt +0 -1
- {mcp_ticketer-0.12.0.dist-info → mcp_ticketer-2.2.13.dist-info}/WHEEL +0 -0
- {mcp_ticketer-0.12.0.dist-info → mcp_ticketer-2.2.13.dist-info}/entry_points.txt +0 -0
- {mcp_ticketer-0.12.0.dist-info → mcp_ticketer-2.2.13.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,732 @@
|
|
|
1
|
+
"""Label management and normalization for ticket systems.
|
|
2
|
+
|
|
3
|
+
This module provides intelligent label matching, normalization, and deduplication
|
|
4
|
+
to maintain consistent labeling across different ticket management platforms.
|
|
5
|
+
|
|
6
|
+
Features:
|
|
7
|
+
- Multi-stage label matching (exact → fuzzy → spelling correction)
|
|
8
|
+
- Configurable casing strategies (lowercase, titlecase, uppercase, kebab-case, snake_case)
|
|
9
|
+
- Spelling dictionary for common typos and variations
|
|
10
|
+
- Fuzzy matching with confidence scoring
|
|
11
|
+
- Duplicate detection with similarity thresholds
|
|
12
|
+
- Consolidation suggestions for similar labels
|
|
13
|
+
|
|
14
|
+
Design Decision: Three-Stage Matching Pipeline
|
|
15
|
+
----------------------------------------------
|
|
16
|
+
The label matcher uses a cascading approach to maximize accuracy:
|
|
17
|
+
|
|
18
|
+
1. Exact Match: Direct label match with normalized casing (confidence: 1.0)
|
|
19
|
+
2. Spelling Correction: Check against common misspellings (confidence: 0.95)
|
|
20
|
+
3. Fuzzy Match: Levenshtein distance with thresholds (confidence: 0.70-0.95)
|
|
21
|
+
|
|
22
|
+
This approach ensures high confidence for common labels while gracefully handling
|
|
23
|
+
typos and variations.
|
|
24
|
+
|
|
25
|
+
Performance Considerations:
|
|
26
|
+
- Average match time: <5ms (target: <10ms)
|
|
27
|
+
- Exact match: O(1) with dict/set lookup
|
|
28
|
+
- Fuzzy matching: O(n) where n = number of available labels
|
|
29
|
+
- Memory footprint: <2MB for normalizer instance with 1000 labels
|
|
30
|
+
|
|
31
|
+
Trade-offs:
|
|
32
|
+
- Performance vs. Flexibility: Chose fuzzy matching over ML embeddings for speed
|
|
33
|
+
- Memory vs. Accuracy: Spelling dictionary trades memory for correction quality
|
|
34
|
+
- Simplicity vs. Intelligence: Three-stage pipeline balances both
|
|
35
|
+
|
|
36
|
+
Example:
|
|
37
|
+
>>> normalizer = LabelNormalizer(casing="lowercase")
|
|
38
|
+
>>> result = normalizer.normalize("Bug-Report")
|
|
39
|
+
>>> print(result)
|
|
40
|
+
bug-report
|
|
41
|
+
|
|
42
|
+
>>> matches = normalizer.find_similar("perfomance", available_labels, threshold=0.8)
|
|
43
|
+
>>> for match in matches:
|
|
44
|
+
... print(f"{match.label}: {match.confidence}")
|
|
45
|
+
performance: 0.95
|
|
46
|
+
|
|
47
|
+
>>> deduplicator = LabelDeduplicator()
|
|
48
|
+
>>> duplicates = deduplicator.find_duplicates(labels, threshold=0.85)
|
|
49
|
+
>>> for label1, label2, score in duplicates:
|
|
50
|
+
... print(f"{label1} ≈ {label2} (similarity: {score:.2f})")
|
|
51
|
+
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
from __future__ import annotations
|
|
55
|
+
|
|
56
|
+
from dataclasses import dataclass
|
|
57
|
+
from enum import Enum
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
from rapidfuzz import fuzz
|
|
61
|
+
|
|
62
|
+
FUZZY_AVAILABLE = True
|
|
63
|
+
except ImportError:
|
|
64
|
+
FUZZY_AVAILABLE = False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class CasingStrategy(str, Enum):
|
|
68
|
+
"""Supported casing strategies for label normalization.
|
|
69
|
+
|
|
70
|
+
Attributes:
|
|
71
|
+
LOWERCASE: Convert to lowercase (e.g., "bug report")
|
|
72
|
+
TITLECASE: Convert to title case (e.g., "Bug Report")
|
|
73
|
+
UPPERCASE: Convert to uppercase (e.g., "BUG REPORT")
|
|
74
|
+
KEBAB_CASE: Convert to kebab-case (e.g., "bug-report")
|
|
75
|
+
SNAKE_CASE: Convert to snake_case (e.g., "bug_report")
|
|
76
|
+
|
|
77
|
+
"""
|
|
78
|
+
|
|
79
|
+
LOWERCASE = "lowercase"
|
|
80
|
+
TITLECASE = "titlecase"
|
|
81
|
+
UPPERCASE = "uppercase"
|
|
82
|
+
KEBAB_CASE = "kebab-case"
|
|
83
|
+
SNAKE_CASE = "snake_case"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
|
|
87
|
+
class LabelMatch:
|
|
88
|
+
"""Result of a label matching operation.
|
|
89
|
+
|
|
90
|
+
Attributes:
|
|
91
|
+
label: Matched label string
|
|
92
|
+
confidence: Confidence score (0.0-1.0)
|
|
93
|
+
match_type: Type of match used (exact, spelling, fuzzy)
|
|
94
|
+
original_input: Original user input string
|
|
95
|
+
suggestions: Alternative matches for ambiguous inputs
|
|
96
|
+
|
|
97
|
+
"""
|
|
98
|
+
|
|
99
|
+
label: str
|
|
100
|
+
confidence: float
|
|
101
|
+
match_type: str
|
|
102
|
+
original_input: str
|
|
103
|
+
suggestions: list[LabelMatch] | None = None
|
|
104
|
+
|
|
105
|
+
def is_high_confidence(self) -> bool:
|
|
106
|
+
"""Check if confidence is high enough for auto-apply."""
|
|
107
|
+
return self.confidence >= 0.90
|
|
108
|
+
|
|
109
|
+
def is_medium_confidence(self) -> bool:
|
|
110
|
+
"""Check if confidence is medium (needs confirmation)."""
|
|
111
|
+
return 0.70 <= self.confidence < 0.90
|
|
112
|
+
|
|
113
|
+
def is_low_confidence(self) -> bool:
|
|
114
|
+
"""Check if confidence is too low (ambiguous)."""
|
|
115
|
+
return self.confidence < 0.70
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class LabelNormalizer:
|
|
119
|
+
"""Label normalizer with configurable casing and spelling correction.
|
|
120
|
+
|
|
121
|
+
Normalizes label strings to a consistent format and provides fuzzy matching
|
|
122
|
+
capabilities with confidence scoring.
|
|
123
|
+
|
|
124
|
+
The normalizer supports multiple casing strategies and includes a spelling
|
|
125
|
+
dictionary for common typos and variations.
|
|
126
|
+
|
|
127
|
+
Example:
|
|
128
|
+
>>> normalizer = LabelNormalizer(casing="kebab-case")
|
|
129
|
+
>>> print(normalizer.normalize("Bug Report"))
|
|
130
|
+
bug-report
|
|
131
|
+
|
|
132
|
+
>>> available = ["bug", "feature", "performance"]
|
|
133
|
+
>>> matches = normalizer.find_similar("perfomance", available, threshold=0.8)
|
|
134
|
+
>>> print(matches[0].label)
|
|
135
|
+
performance
|
|
136
|
+
|
|
137
|
+
"""
|
|
138
|
+
|
|
139
|
+
# Spelling dictionary: common misspellings → correct spelling
|
|
140
|
+
SPELLING_CORRECTIONS: dict[str, str] = {
|
|
141
|
+
# Common typos
|
|
142
|
+
"feture": "feature",
|
|
143
|
+
"featrue": "feature",
|
|
144
|
+
"feautre": "feature",
|
|
145
|
+
"perfomance": "performance",
|
|
146
|
+
"peformance": "performance",
|
|
147
|
+
"performace": "performance",
|
|
148
|
+
"documention": "documentation",
|
|
149
|
+
"documentaion": "documentation",
|
|
150
|
+
"bugfix": "bug-fix",
|
|
151
|
+
"hotfix": "hot-fix",
|
|
152
|
+
"enhancment": "enhancement",
|
|
153
|
+
"improvment": "improvement",
|
|
154
|
+
"refacor": "refactor",
|
|
155
|
+
"refactro": "refactor",
|
|
156
|
+
"secuirty": "security",
|
|
157
|
+
"securty": "security",
|
|
158
|
+
"authenciation": "authentication",
|
|
159
|
+
"authentcation": "authentication",
|
|
160
|
+
"authorisation": "authorization",
|
|
161
|
+
"databse": "database",
|
|
162
|
+
"databae": "database",
|
|
163
|
+
"backend": "back-end",
|
|
164
|
+
"frontend": "front-end",
|
|
165
|
+
"fullstack": "full-stack",
|
|
166
|
+
# Plural variations (singular → plural)
|
|
167
|
+
"bugs": "bug",
|
|
168
|
+
"features": "feature",
|
|
169
|
+
"enhancements": "enhancement",
|
|
170
|
+
"improvements": "improvement",
|
|
171
|
+
"issues": "issue",
|
|
172
|
+
"tasks": "task",
|
|
173
|
+
"stories": "story",
|
|
174
|
+
"epics": "epic",
|
|
175
|
+
# Common variations
|
|
176
|
+
"api-endpoint": "api",
|
|
177
|
+
"ui-bug": "ui",
|
|
178
|
+
"ux-issue": "ux",
|
|
179
|
+
"db-migration": "database",
|
|
180
|
+
"sql-query": "database",
|
|
181
|
+
"test-case": "testing",
|
|
182
|
+
"unit-test": "testing",
|
|
183
|
+
"integration-test": "testing",
|
|
184
|
+
"e2e-test": "testing",
|
|
185
|
+
"code-review": "review",
|
|
186
|
+
"pr-review": "review",
|
|
187
|
+
"needs-review": "review",
|
|
188
|
+
# Priority-like labels
|
|
189
|
+
"urgent": "critical",
|
|
190
|
+
"high-priority": "high",
|
|
191
|
+
"low-priority": "low",
|
|
192
|
+
"blocker": "blocked",
|
|
193
|
+
"blocking": "blocked",
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
# Confidence thresholds
|
|
197
|
+
CONFIDENCE_HIGH = 0.90
|
|
198
|
+
CONFIDENCE_MEDIUM = 0.70
|
|
199
|
+
FUZZY_THRESHOLD_HIGH = 90
|
|
200
|
+
FUZZY_THRESHOLD_MEDIUM = 70
|
|
201
|
+
|
|
202
|
+
def __init__(self, casing: str = "lowercase") -> None:
|
|
203
|
+
"""Initialize label normalizer with casing strategy.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
casing: Casing strategy - one of: lowercase, titlecase, uppercase,
|
|
207
|
+
kebab-case, snake_case (default: lowercase)
|
|
208
|
+
|
|
209
|
+
Raises:
|
|
210
|
+
ValueError: If casing strategy is not supported
|
|
211
|
+
|
|
212
|
+
"""
|
|
213
|
+
try:
|
|
214
|
+
self.casing = CasingStrategy(casing)
|
|
215
|
+
except ValueError as e:
|
|
216
|
+
valid_options = ", ".join(c.value for c in CasingStrategy)
|
|
217
|
+
raise ValueError(
|
|
218
|
+
f"Invalid casing strategy '{casing}'. "
|
|
219
|
+
f"Valid options: {valid_options}"
|
|
220
|
+
) from e
|
|
221
|
+
|
|
222
|
+
# Build reverse spelling lookup for O(1) correction
|
|
223
|
+
self._spelling_map: dict[str, str] = {}
|
|
224
|
+
for wrong, correct in self.SPELLING_CORRECTIONS.items():
|
|
225
|
+
normalized_wrong = self._normalize_case(wrong)
|
|
226
|
+
normalized_correct = self._normalize_case(correct)
|
|
227
|
+
self._spelling_map[normalized_wrong] = normalized_correct
|
|
228
|
+
|
|
229
|
+
def normalize(self, label: str) -> str:
|
|
230
|
+
"""Normalize label to configured casing strategy.
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
label: Raw label string to normalize
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
Normalized label string with consistent casing
|
|
237
|
+
|
|
238
|
+
Example:
|
|
239
|
+
>>> normalizer = LabelNormalizer(casing="kebab-case")
|
|
240
|
+
>>> normalizer.normalize("Bug Report")
|
|
241
|
+
'bug-report'
|
|
242
|
+
|
|
243
|
+
>>> normalizer = LabelNormalizer(casing="snake_case")
|
|
244
|
+
>>> normalizer.normalize("Bug Report")
|
|
245
|
+
'bug_report'
|
|
246
|
+
|
|
247
|
+
"""
|
|
248
|
+
if not label:
|
|
249
|
+
return ""
|
|
250
|
+
|
|
251
|
+
# Just apply casing strategy (spelling correction only in find_similar)
|
|
252
|
+
return self._normalize_case(label)
|
|
253
|
+
|
|
254
|
+
def find_similar(
|
|
255
|
+
self,
|
|
256
|
+
label: str,
|
|
257
|
+
available_labels: list[str] | set[str],
|
|
258
|
+
threshold: float = 0.80,
|
|
259
|
+
) -> list[LabelMatch]:
|
|
260
|
+
"""Find similar labels from available options using fuzzy matching.
|
|
261
|
+
|
|
262
|
+
Uses three-stage matching pipeline:
|
|
263
|
+
1. Exact match (case-insensitive)
|
|
264
|
+
2. Spelling correction
|
|
265
|
+
3. Fuzzy matching with Levenshtein distance
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
label: Input label to match
|
|
269
|
+
available_labels: List of available labels to match against
|
|
270
|
+
threshold: Minimum similarity threshold (0.0-1.0, default: 0.80)
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
List of LabelMatch objects sorted by confidence (highest first)
|
|
274
|
+
|
|
275
|
+
Example:
|
|
276
|
+
>>> normalizer = LabelNormalizer()
|
|
277
|
+
>>> available = ["bug", "feature", "performance", "documentation"]
|
|
278
|
+
>>> matches = normalizer.find_similar("perfomance", available, threshold=0.8)
|
|
279
|
+
>>> print(matches[0].label, matches[0].confidence)
|
|
280
|
+
performance 0.95
|
|
281
|
+
|
|
282
|
+
"""
|
|
283
|
+
if not label or not available_labels:
|
|
284
|
+
return []
|
|
285
|
+
|
|
286
|
+
normalized_input = self.normalize(label)
|
|
287
|
+
normalized_available = {self.normalize(lbl): lbl for lbl in available_labels}
|
|
288
|
+
|
|
289
|
+
results: list[LabelMatch] = []
|
|
290
|
+
|
|
291
|
+
# Stage 1: Exact match (case-insensitive)
|
|
292
|
+
if normalized_input in normalized_available:
|
|
293
|
+
results.append(
|
|
294
|
+
LabelMatch(
|
|
295
|
+
label=normalized_available[normalized_input],
|
|
296
|
+
confidence=1.0,
|
|
297
|
+
match_type="exact",
|
|
298
|
+
original_input=label,
|
|
299
|
+
)
|
|
300
|
+
)
|
|
301
|
+
return results
|
|
302
|
+
|
|
303
|
+
# Stage 2: Spelling correction
|
|
304
|
+
corrected = self._apply_spelling_correction(normalized_input)
|
|
305
|
+
if corrected != normalized_input and corrected in normalized_available:
|
|
306
|
+
results.append(
|
|
307
|
+
LabelMatch(
|
|
308
|
+
label=normalized_available[corrected],
|
|
309
|
+
confidence=0.95,
|
|
310
|
+
match_type="spelling",
|
|
311
|
+
original_input=label,
|
|
312
|
+
)
|
|
313
|
+
)
|
|
314
|
+
return results
|
|
315
|
+
|
|
316
|
+
# Stage 3: Fuzzy matching
|
|
317
|
+
if FUZZY_AVAILABLE:
|
|
318
|
+
results = self._fuzzy_match(
|
|
319
|
+
normalized_input, normalized_available, threshold, label
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
return results
|
|
323
|
+
|
|
324
|
+
def _normalize_case(self, text: str) -> str:
|
|
325
|
+
"""Apply casing strategy to text.
|
|
326
|
+
|
|
327
|
+
Args:
|
|
328
|
+
text: Text to normalize
|
|
329
|
+
|
|
330
|
+
Returns:
|
|
331
|
+
Text with applied casing strategy
|
|
332
|
+
|
|
333
|
+
"""
|
|
334
|
+
text = text.strip()
|
|
335
|
+
|
|
336
|
+
if self.casing == CasingStrategy.LOWERCASE:
|
|
337
|
+
return text.lower()
|
|
338
|
+
elif self.casing == CasingStrategy.UPPERCASE:
|
|
339
|
+
return text.upper()
|
|
340
|
+
elif self.casing == CasingStrategy.TITLECASE:
|
|
341
|
+
return text.title()
|
|
342
|
+
elif self.casing == CasingStrategy.KEBAB_CASE:
|
|
343
|
+
# Replace spaces and underscores with hyphens
|
|
344
|
+
result = text.lower().replace(" ", "-").replace("_", "-")
|
|
345
|
+
# Remove duplicate hyphens
|
|
346
|
+
while "--" in result:
|
|
347
|
+
result = result.replace("--", "-")
|
|
348
|
+
return result
|
|
349
|
+
elif self.casing == CasingStrategy.SNAKE_CASE:
|
|
350
|
+
# Replace spaces and hyphens with underscores
|
|
351
|
+
result = text.lower().replace(" ", "_").replace("-", "_")
|
|
352
|
+
# Remove duplicate underscores
|
|
353
|
+
while "__" in result:
|
|
354
|
+
result = result.replace("__", "_")
|
|
355
|
+
return result
|
|
356
|
+
else:
|
|
357
|
+
return text.lower() # Default to lowercase
|
|
358
|
+
|
|
359
|
+
def _apply_spelling_correction(self, label: str) -> str:
|
|
360
|
+
"""Apply spelling corrections from dictionary.
|
|
361
|
+
|
|
362
|
+
Only corrects if the entire label matches a known misspelling.
|
|
363
|
+
Does not correct partial matches or compound labels.
|
|
364
|
+
|
|
365
|
+
Args:
|
|
366
|
+
label: Label to correct (should be normalized)
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
Corrected label if found in dictionary, otherwise original
|
|
370
|
+
|
|
371
|
+
"""
|
|
372
|
+
# Only apply correction if exact match in spelling map
|
|
373
|
+
return self._spelling_map.get(label, label)
|
|
374
|
+
|
|
375
|
+
def _fuzzy_match(
|
|
376
|
+
self,
|
|
377
|
+
normalized_input: str,
|
|
378
|
+
normalized_available: dict[str, str],
|
|
379
|
+
threshold: float,
|
|
380
|
+
original_input: str,
|
|
381
|
+
) -> list[LabelMatch]:
|
|
382
|
+
"""Perform fuzzy matching using Levenshtein distance.
|
|
383
|
+
|
|
384
|
+
Args:
|
|
385
|
+
normalized_input: Normalized input label
|
|
386
|
+
normalized_available: Dict of normalized → original labels
|
|
387
|
+
threshold: Similarity threshold (0.0-1.0)
|
|
388
|
+
original_input: Original user input
|
|
389
|
+
|
|
390
|
+
Returns:
|
|
391
|
+
List of LabelMatch objects sorted by confidence
|
|
392
|
+
|
|
393
|
+
"""
|
|
394
|
+
matches: list[tuple[str, float]] = []
|
|
395
|
+
|
|
396
|
+
for normalized_label, original_label in normalized_available.items():
|
|
397
|
+
similarity = fuzz.ratio(normalized_input, normalized_label)
|
|
398
|
+
|
|
399
|
+
# Convert similarity (0-100) to confidence (0.0-1.0)
|
|
400
|
+
confidence = similarity / 100.0
|
|
401
|
+
|
|
402
|
+
if confidence >= threshold:
|
|
403
|
+
matches.append((original_label, confidence))
|
|
404
|
+
|
|
405
|
+
# Sort by confidence descending
|
|
406
|
+
matches.sort(key=lambda x: x[1], reverse=True)
|
|
407
|
+
|
|
408
|
+
# Convert to LabelMatch objects
|
|
409
|
+
return [
|
|
410
|
+
LabelMatch(
|
|
411
|
+
label=lbl,
|
|
412
|
+
confidence=conf,
|
|
413
|
+
match_type="fuzzy",
|
|
414
|
+
original_input=original_input,
|
|
415
|
+
)
|
|
416
|
+
for lbl, conf in matches
|
|
417
|
+
]
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
class LabelDeduplicator:
|
|
421
|
+
"""Label deduplicator for finding and consolidating similar labels.
|
|
422
|
+
|
|
423
|
+
Identifies duplicate labels using multiple strategies:
|
|
424
|
+
- Exact duplicates (case-insensitive)
|
|
425
|
+
- Fuzzy duplicates (Levenshtein similarity)
|
|
426
|
+
- Plural variations (e.g., "bug" vs "bugs")
|
|
427
|
+
- Common synonyms (e.g., "bug" vs "issue")
|
|
428
|
+
|
|
429
|
+
Example:
|
|
430
|
+
>>> deduplicator = LabelDeduplicator()
|
|
431
|
+
>>> labels = ["bug", "Bug", "bugs", "feature", "Feature Request"]
|
|
432
|
+
>>> duplicates = deduplicator.find_duplicates(labels, threshold=0.85)
|
|
433
|
+
>>> for label1, label2, score in duplicates:
|
|
434
|
+
... print(f"{label1} ≈ {label2} (similarity: {score:.2f})")
|
|
435
|
+
bug ≈ Bug (similarity: 1.00)
|
|
436
|
+
bug ≈ bugs (similarity: 0.93)
|
|
437
|
+
|
|
438
|
+
>>> suggestions = deduplicator.suggest_consolidation(labels)
|
|
439
|
+
>>> for canonical, variants in suggestions.items():
|
|
440
|
+
... print(f"{canonical}: {', '.join(variants)}")
|
|
441
|
+
bug: Bug, bugs
|
|
442
|
+
|
|
443
|
+
"""
|
|
444
|
+
|
|
445
|
+
# Similarity threshold for considering labels as duplicates
|
|
446
|
+
DEFAULT_THRESHOLD = 0.85
|
|
447
|
+
|
|
448
|
+
# Common label synonyms
|
|
449
|
+
LABEL_SYNONYMS: dict[str, set[str]] = {
|
|
450
|
+
"bug": {"issue", "defect", "problem", "error"},
|
|
451
|
+
"feature": {"enhancement", "improvement", "new feature"},
|
|
452
|
+
"documentation": {"docs", "doc", "readme"},
|
|
453
|
+
"testing": {"test", "qa", "quality assurance"},
|
|
454
|
+
"security": {"vulnerability", "cve", "exploit"},
|
|
455
|
+
"performance": {"optimization", "speed", "efficiency"},
|
|
456
|
+
"ui": {"ux", "user interface", "frontend"},
|
|
457
|
+
"backend": {"back-end", "server", "api"},
|
|
458
|
+
"database": {"db", "sql", "data"},
|
|
459
|
+
"refactor": {"refactoring", "cleanup", "tech debt"},
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
def find_duplicates(
|
|
463
|
+
self,
|
|
464
|
+
labels: list[str],
|
|
465
|
+
threshold: float | None = None,
|
|
466
|
+
) -> list[tuple[str, str, float]]:
|
|
467
|
+
"""Find duplicate labels with similarity scores.
|
|
468
|
+
|
|
469
|
+
Compares all labels pairwise and returns those exceeding the similarity
|
|
470
|
+
threshold. Results are sorted by similarity score descending.
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
labels: List of labels to check for duplicates
|
|
474
|
+
threshold: Similarity threshold (0.0-1.0, default: 0.85)
|
|
475
|
+
|
|
476
|
+
Returns:
|
|
477
|
+
List of (label1, label2, similarity_score) tuples sorted by score
|
|
478
|
+
|
|
479
|
+
Example:
|
|
480
|
+
>>> deduplicator = LabelDeduplicator()
|
|
481
|
+
>>> labels = ["bug", "Bug", "bugs", "feature", "feture"]
|
|
482
|
+
>>> duplicates = deduplicator.find_duplicates(labels)
|
|
483
|
+
>>> for l1, l2, score in duplicates:
|
|
484
|
+
... print(f"{l1} ≈ {l2}: {score:.2f}")
|
|
485
|
+
bug ≈ Bug: 1.00
|
|
486
|
+
bug ≈ bugs: 0.93
|
|
487
|
+
feature ≈ feture: 0.92
|
|
488
|
+
|
|
489
|
+
"""
|
|
490
|
+
if not labels:
|
|
491
|
+
return []
|
|
492
|
+
|
|
493
|
+
threshold = threshold or self.DEFAULT_THRESHOLD
|
|
494
|
+
duplicates: list[tuple[str, str, float]] = []
|
|
495
|
+
|
|
496
|
+
# Compare all pairs
|
|
497
|
+
for i, label1 in enumerate(labels):
|
|
498
|
+
for label2 in labels[i + 1 :]:
|
|
499
|
+
similarity = self._calculate_similarity(label1, label2)
|
|
500
|
+
if similarity >= threshold:
|
|
501
|
+
duplicates.append((label1, label2, similarity))
|
|
502
|
+
|
|
503
|
+
# Sort by similarity descending
|
|
504
|
+
duplicates.sort(key=lambda x: x[2], reverse=True)
|
|
505
|
+
|
|
506
|
+
return duplicates
|
|
507
|
+
|
|
508
|
+
def suggest_consolidation(
|
|
509
|
+
self,
|
|
510
|
+
labels: list[str],
|
|
511
|
+
threshold: float | None = None,
|
|
512
|
+
) -> dict[str, list[str]]:
|
|
513
|
+
"""Suggest label consolidations for similar labels.
|
|
514
|
+
|
|
515
|
+
Groups similar labels together and suggests a canonical label for each group.
|
|
516
|
+
The canonical label is typically the most common or shortest variant.
|
|
517
|
+
|
|
518
|
+
Args:
|
|
519
|
+
labels: List of labels to consolidate
|
|
520
|
+
threshold: Similarity threshold (0.0-1.0, default: 0.85)
|
|
521
|
+
|
|
522
|
+
Returns:
|
|
523
|
+
Dictionary mapping canonical label → list of similar variants
|
|
524
|
+
|
|
525
|
+
Example:
|
|
526
|
+
>>> deduplicator = LabelDeduplicator()
|
|
527
|
+
>>> labels = ["bug", "Bug", "bugs", "feature", "feture", "features"]
|
|
528
|
+
>>> suggestions = deduplicator.suggest_consolidation(labels)
|
|
529
|
+
>>> for canonical, variants in suggestions.items():
|
|
530
|
+
... print(f"Use '{canonical}' instead of: {', '.join(variants)}")
|
|
531
|
+
Use 'bug' instead of: Bug, bugs
|
|
532
|
+
Use 'feature' instead of: feture, features
|
|
533
|
+
|
|
534
|
+
"""
|
|
535
|
+
if not labels:
|
|
536
|
+
return {}
|
|
537
|
+
|
|
538
|
+
threshold = threshold or self.DEFAULT_THRESHOLD
|
|
539
|
+
duplicates = self.find_duplicates(labels, threshold)
|
|
540
|
+
|
|
541
|
+
# Build graph of similar labels
|
|
542
|
+
similarity_graph: dict[str, set[str]] = {label: set() for label in labels}
|
|
543
|
+
|
|
544
|
+
for label1, label2, _ in duplicates:
|
|
545
|
+
similarity_graph[label1].add(label2)
|
|
546
|
+
similarity_graph[label2].add(label1)
|
|
547
|
+
|
|
548
|
+
# Find connected components (groups of similar labels)
|
|
549
|
+
visited: set[str] = set()
|
|
550
|
+
groups: list[set[str]] = []
|
|
551
|
+
|
|
552
|
+
for label in labels:
|
|
553
|
+
if label in visited:
|
|
554
|
+
continue
|
|
555
|
+
|
|
556
|
+
# BFS to find connected component
|
|
557
|
+
group = self._find_connected_component(label, similarity_graph)
|
|
558
|
+
groups.append(group)
|
|
559
|
+
visited.update(group)
|
|
560
|
+
|
|
561
|
+
# Select canonical label for each group
|
|
562
|
+
consolidations: dict[str, list[str]] = {}
|
|
563
|
+
|
|
564
|
+
for group in groups:
|
|
565
|
+
if len(group) <= 1:
|
|
566
|
+
continue # No duplicates
|
|
567
|
+
|
|
568
|
+
# Choose canonical label (prefer lowercase, then shortest)
|
|
569
|
+
canonical = min(group, key=lambda x: (not x.islower(), len(x), x))
|
|
570
|
+
|
|
571
|
+
variants = [lbl for lbl in group if lbl != canonical]
|
|
572
|
+
if variants:
|
|
573
|
+
consolidations[canonical] = variants
|
|
574
|
+
|
|
575
|
+
return consolidations
|
|
576
|
+
|
|
577
|
+
def _calculate_similarity(self, label1: str, label2: str) -> float:
|
|
578
|
+
"""Calculate similarity score between two labels.
|
|
579
|
+
|
|
580
|
+
Uses multiple similarity checks:
|
|
581
|
+
1. Case-insensitive exact match → 1.0
|
|
582
|
+
2. Synonym match → 0.95
|
|
583
|
+
3. Fuzzy matching (if available) → 0.0-1.0
|
|
584
|
+
|
|
585
|
+
Args:
|
|
586
|
+
label1: First label
|
|
587
|
+
label2: Second label
|
|
588
|
+
|
|
589
|
+
Returns:
|
|
590
|
+
Similarity score (0.0-1.0)
|
|
591
|
+
|
|
592
|
+
"""
|
|
593
|
+
# Normalize for comparison
|
|
594
|
+
norm1 = label1.lower().strip()
|
|
595
|
+
norm2 = label2.lower().strip()
|
|
596
|
+
|
|
597
|
+
# Exact match (case-insensitive)
|
|
598
|
+
if norm1 == norm2:
|
|
599
|
+
return 1.0
|
|
600
|
+
|
|
601
|
+
# Check synonyms
|
|
602
|
+
if self._are_synonyms(norm1, norm2):
|
|
603
|
+
return 0.95
|
|
604
|
+
|
|
605
|
+
# Fuzzy matching
|
|
606
|
+
if FUZZY_AVAILABLE:
|
|
607
|
+
similarity = fuzz.ratio(norm1, norm2)
|
|
608
|
+
return similarity / 100.0
|
|
609
|
+
|
|
610
|
+
# Fallback: simple string comparison
|
|
611
|
+
return 1.0 if norm1 == norm2 else 0.0
|
|
612
|
+
|
|
613
|
+
def _are_synonyms(self, label1: str, label2: str) -> bool:
|
|
614
|
+
"""Check if two labels are synonyms.
|
|
615
|
+
|
|
616
|
+
Args:
|
|
617
|
+
label1: First label (normalized)
|
|
618
|
+
label2: Second label (normalized)
|
|
619
|
+
|
|
620
|
+
Returns:
|
|
621
|
+
True if labels are synonyms, False otherwise
|
|
622
|
+
|
|
623
|
+
"""
|
|
624
|
+
for canonical, synonyms in self.LABEL_SYNONYMS.items():
|
|
625
|
+
if label1 == canonical and label2 in synonyms:
|
|
626
|
+
return True
|
|
627
|
+
if label2 == canonical and label1 in synonyms:
|
|
628
|
+
return True
|
|
629
|
+
if label1 in synonyms and label2 in synonyms:
|
|
630
|
+
return True
|
|
631
|
+
|
|
632
|
+
return False
|
|
633
|
+
|
|
634
|
+
def _find_connected_component(
|
|
635
|
+
self,
|
|
636
|
+
start: str,
|
|
637
|
+
graph: dict[str, set[str]],
|
|
638
|
+
) -> set[str]:
|
|
639
|
+
"""Find connected component in similarity graph using BFS.
|
|
640
|
+
|
|
641
|
+
Args:
|
|
642
|
+
start: Starting label
|
|
643
|
+
graph: Adjacency list of label similarities
|
|
644
|
+
|
|
645
|
+
Returns:
|
|
646
|
+
Set of labels in the connected component
|
|
647
|
+
|
|
648
|
+
"""
|
|
649
|
+
visited = {start}
|
|
650
|
+
queue = [start]
|
|
651
|
+
|
|
652
|
+
while queue:
|
|
653
|
+
label = queue.pop(0)
|
|
654
|
+
|
|
655
|
+
for neighbor in graph[label]:
|
|
656
|
+
if neighbor not in visited:
|
|
657
|
+
visited.add(neighbor)
|
|
658
|
+
queue.append(neighbor)
|
|
659
|
+
|
|
660
|
+
return visited
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
# Convenience functions for common operations
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def normalize_label(label: str, casing: str = "lowercase") -> str:
|
|
667
|
+
"""Normalize a single label with specified casing strategy.
|
|
668
|
+
|
|
669
|
+
Convenience function that creates a LabelNormalizer instance.
|
|
670
|
+
|
|
671
|
+
Args:
|
|
672
|
+
label: Label to normalize
|
|
673
|
+
casing: Casing strategy (default: lowercase)
|
|
674
|
+
|
|
675
|
+
Returns:
|
|
676
|
+
Normalized label string
|
|
677
|
+
|
|
678
|
+
Example:
|
|
679
|
+
>>> normalize_label("Bug Report", casing="kebab-case")
|
|
680
|
+
'bug-report'
|
|
681
|
+
|
|
682
|
+
"""
|
|
683
|
+
normalizer = LabelNormalizer(casing=casing)
|
|
684
|
+
return normalizer.normalize(label)
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def find_duplicate_labels(
|
|
688
|
+
labels: list[str],
|
|
689
|
+
threshold: float = 0.85,
|
|
690
|
+
) -> list[tuple[str, str, float]]:
|
|
691
|
+
"""Find duplicate labels in a list.
|
|
692
|
+
|
|
693
|
+
Convenience function that creates a LabelDeduplicator instance.
|
|
694
|
+
|
|
695
|
+
Args:
|
|
696
|
+
labels: List of labels to check
|
|
697
|
+
threshold: Similarity threshold (default: 0.85)
|
|
698
|
+
|
|
699
|
+
Returns:
|
|
700
|
+
List of (label1, label2, similarity_score) tuples
|
|
701
|
+
|
|
702
|
+
Example:
|
|
703
|
+
>>> labels = ["bug", "Bug", "bugs", "feature"]
|
|
704
|
+
>>> duplicates = find_duplicate_labels(labels)
|
|
705
|
+
>>> for l1, l2, score in duplicates:
|
|
706
|
+
... print(f"{l1} ≈ {l2}: {score:.2f}")
|
|
707
|
+
|
|
708
|
+
"""
|
|
709
|
+
deduplicator = LabelDeduplicator()
|
|
710
|
+
return deduplicator.find_duplicates(labels, threshold)
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
# Singleton instance for convenience
|
|
714
|
+
_default_normalizer: LabelNormalizer | None = None
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
def get_label_normalizer(casing: str = "lowercase") -> LabelNormalizer:
|
|
718
|
+
"""Get default label normalizer instance.
|
|
719
|
+
|
|
720
|
+
Creates or returns cached normalizer with specified casing.
|
|
721
|
+
|
|
722
|
+
Args:
|
|
723
|
+
casing: Casing strategy (default: lowercase)
|
|
724
|
+
|
|
725
|
+
Returns:
|
|
726
|
+
LabelNormalizer instance
|
|
727
|
+
|
|
728
|
+
"""
|
|
729
|
+
global _default_normalizer
|
|
730
|
+
if _default_normalizer is None or _default_normalizer.casing.value != casing:
|
|
731
|
+
_default_normalizer = LabelNormalizer(casing=casing)
|
|
732
|
+
return _default_normalizer
|