gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1100 @@
|
|
|
1
|
+
"""ML-enhanced ticket reference extraction with sophisticated commit categorization.
|
|
2
|
+
|
|
3
|
+
This module extends the basic TicketExtractor with machine learning capabilities for
|
|
4
|
+
better commit categorization. It integrates with the existing qualitative analysis
|
|
5
|
+
infrastructure to provide hybrid rule-based + ML classification.
|
|
6
|
+
|
|
7
|
+
WHY: Traditional regex-based categorization has limitations in understanding context
|
|
8
|
+
and nuanced commit messages. This ML-enhanced version provides better accuracy while
|
|
9
|
+
maintaining backward compatibility and performance through intelligent caching.
|
|
10
|
+
|
|
11
|
+
DESIGN DECISIONS:
|
|
12
|
+
- Hybrid approach: Falls back to rule-based when ML confidence is low
|
|
13
|
+
- Confidence scoring: All classifications include confidence scores for reliability
|
|
14
|
+
- Caching strategy: ML predictions are cached to maintain performance
|
|
15
|
+
- Feature extraction: Uses both message content and file patterns for better accuracy
|
|
16
|
+
- Integration: Leverages existing ChangeTypeClassifier from qualitative analysis
|
|
17
|
+
|
|
18
|
+
PERFORMANCE: Designed to handle large repositories efficiently with:
|
|
19
|
+
- Batch processing for ML predictions
|
|
20
|
+
- Intelligent caching of ML results
|
|
21
|
+
- Fallback to fast rule-based classification when appropriate
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import sqlite3
|
|
26
|
+
import time
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Any, Optional
|
|
30
|
+
|
|
31
|
+
from ..qualitative.classifiers.change_type import ChangeTypeClassifier
|
|
32
|
+
from ..qualitative.classifiers.llm_commit_classifier import LLMCommitClassifier, LLMConfig
|
|
33
|
+
from ..qualitative.models.schemas import ChangeTypeConfig
|
|
34
|
+
from .tickets import TicketExtractor, filter_git_artifacts
|
|
35
|
+
|
|
36
|
+
# Import training model loader with fallback
|
|
37
|
+
try:
|
|
38
|
+
from ..training.model_loader import TrainingModelLoader
|
|
39
|
+
|
|
40
|
+
TRAINING_LOADER_AVAILABLE = True
|
|
41
|
+
except ImportError:
|
|
42
|
+
TRAINING_LOADER_AVAILABLE = False
|
|
43
|
+
|
|
44
|
+
try:
|
|
45
|
+
import spacy
|
|
46
|
+
from spacy.tokens import Doc
|
|
47
|
+
|
|
48
|
+
SPACY_AVAILABLE = True
|
|
49
|
+
except ImportError:
|
|
50
|
+
SPACY_AVAILABLE = False
|
|
51
|
+
Doc = Any
|
|
52
|
+
|
|
53
|
+
logger = logging.getLogger(__name__)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class MLTicketExtractor(TicketExtractor):
|
|
57
|
+
"""ML-enhanced ticket extractor with sophisticated commit categorization.
|
|
58
|
+
|
|
59
|
+
This extractor extends the basic TicketExtractor with machine learning capabilities
|
|
60
|
+
while maintaining full backward compatibility. It uses a hybrid approach combining
|
|
61
|
+
rule-based patterns with ML-based semantic analysis for improved accuracy.
|
|
62
|
+
|
|
63
|
+
Key features:
|
|
64
|
+
- Hybrid categorization (ML + rule-based fallback)
|
|
65
|
+
- Confidence scoring for all predictions
|
|
66
|
+
- Intelligent caching for performance
|
|
67
|
+
- Feature extraction from commit message and file patterns
|
|
68
|
+
- Integration with existing qualitative analysis infrastructure
|
|
69
|
+
"""
|
|
70
|
+
|
|
71
|
+
def __init__(
    self,
    allowed_platforms: Optional[list[str]] = None,
    untracked_file_threshold: int = 1,
    ml_config: Optional[dict[str, Any]] = None,
    llm_config: Optional[dict[str, Any]] = None,
    cache_dir: Optional[Path] = None,
    enable_ml: bool = True,
    enable_llm: bool = False,
) -> None:
    """Initialize ML-enhanced ticket extractor.

    Args:
        allowed_platforms: List of platforms to extract tickets from
        untracked_file_threshold: Minimum files changed for significant commits
        ml_config: Configuration for ML categorization (optional)
        llm_config: Configuration for LLM classification (optional)
        cache_dir: Directory for caching ML predictions
        enable_ml: Whether to enable ML features (fallback to rule-based if False)
        enable_llm: Whether to enable LLM classification (fallback to ML/rules if False)
    """
    # Initialize parent class
    super().__init__(allowed_platforms, untracked_file_threshold)

    # ML requires spaCy; silently degrade to rule-based if it is not importable.
    self.enable_ml = enable_ml and SPACY_AVAILABLE
    self.enable_llm = enable_llm
    self.cache_dir = cache_dir or Path(".gitflow-cache")
    # BUGFIX: parents=True so a nested cache path (e.g. "out/.gitflow-cache")
    # does not raise FileNotFoundError when intermediate directories are missing.
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # ML configuration with sensible defaults
    default_ml_config = {
        "min_confidence": 0.6,
        "semantic_weight": 0.7,
        "file_pattern_weight": 0.3,
        "hybrid_threshold": 0.5,  # Confidence threshold for using ML vs rule-based
        "cache_duration_days": 30,
        "batch_size": 100,
        "enable_caching": True,
    }

    self.ml_config = {**default_ml_config, **(ml_config or {})}

    # LLM configuration with sensible defaults
    default_llm_config = {
        "api_key": None,
        "model": "mistralai/mistral-7b-instruct",
        "confidence_threshold": 0.7,
        "max_tokens": 50,
        "temperature": 0.1,
        "timeout_seconds": 30.0,
        "cache_duration_days": 90,
        "enable_caching": True,
        "max_daily_requests": 1000,
        "domain_terms": {},
    }

    self.llm_config_dict = {**default_llm_config, **(llm_config or {})}

    # ML/LLM components start unset; each is populated by its initializer below
    # and stays None on failure so the extractor degrades gracefully.
    self.change_type_classifier = None
    self.nlp_model = None
    self.ml_cache = None
    self.trained_model_loader = None
    self.llm_classifier = None

    if self.enable_ml:
        self._initialize_ml_components()

    # Initialize LLM classifier if enabled
    if self.enable_llm:
        self._initialize_llm_classifier()

    # Initialize trained model loader if available
    if TRAINING_LOADER_AVAILABLE and self.enable_ml:
        try:
            self.trained_model_loader = TrainingModelLoader(self.cache_dir)
            logger.info("Trained model loader initialized")
        except Exception as e:
            logger.warning(f"Failed to initialize trained model loader: {e}")
            self.trained_model_loader = None

    logger.info(
        f"MLTicketExtractor initialized with ML {'enabled' if self.enable_ml else 'disabled'}, LLM {'enabled' if self.enable_llm else 'disabled'}"
    )
def _initialize_ml_components(self) -> None:
    """Set up the ChangeTypeClassifier, a spaCy pipeline, and the ML cache.

    WHY: Keeping this separate from __init__ lets any failure degrade
    gracefully — on error the extractor simply disables ML and continues
    with rule-based classification.
    """
    try:
        # Build the rule/semantic hybrid classifier from our ML config.
        self.change_type_classifier = ChangeTypeClassifier(
            ChangeTypeConfig(
                min_confidence=self.ml_config["min_confidence"],
                semantic_weight=self.ml_config["semantic_weight"],
                file_pattern_weight=self.ml_config["file_pattern_weight"],
            )
        )

        # Load the first available spaCy model; prefer the small English one.
        self.nlp_model = None
        for model_name in ("en_core_web_sm", "en_core_web_md"):
            try:
                self.nlp_model = spacy.load(model_name)
                logger.info(f"spaCy model '{model_name}' loaded successfully")
                break
            except OSError:
                if model_name == "en_core_web_sm":
                    logger.warning(
                        "spaCy model 'en_core_web_sm' not found. Trying alternative model..."
                    )
                else:
                    logger.warning(
                        "No spaCy models found. ML categorization will gracefully fall back to rule-based classification. "
                        "To enable ML features, install a spaCy model: python -m spacy download en_core_web_sm"
                    )

        # Initialize ML cache
        if self.ml_config["enable_caching"]:
            self._initialize_ml_cache()

        logger.info("ML components initialized successfully")

    except Exception as e:
        logger.warning(f"Failed to initialize ML components: {e}")
        logger.info("Analysis will continue with rule-based classification only")
        self.enable_ml = False
def _initialize_llm_classifier(self) -> None:
    """Build the LLMCommitClassifier from the merged LLM config dict.

    WHY: LLM-based classification can read commit messages with more nuance
    than rules or classic ML. Any setup failure is caught here so the
    extractor falls back to the ML/rule pipeline instead of crashing.
    """
    try:
        cfg = self.llm_config_dict

        # Translate the plain dict into the typed LLMConfig object.
        config = LLMConfig(
            api_key=cfg.get("api_key"),
            model=cfg.get("model", "mistralai/mistral-7b-instruct"),
            confidence_threshold=cfg.get("confidence_threshold", 0.7),
            max_tokens=cfg.get("max_tokens", 50),
            temperature=cfg.get("temperature", 0.1),
            timeout_seconds=cfg.get("timeout_seconds", 30.0),
            cache_duration_days=cfg.get("cache_duration_days", 90),
            enable_caching=cfg.get("enable_caching", True),
            max_daily_requests=cfg.get("max_daily_requests", 1000),
            domain_terms=cfg.get("domain_terms", {}),
        )

        self.llm_classifier = LLMCommitClassifier(config, self.cache_dir)
        logger.info(f"LLM classifier initialized with model: {config.model}")

    except Exception as e:
        logger.warning(f"Failed to initialize LLM classifier: {e}")
        logger.info("Analysis will continue without LLM classification")
        self.enable_llm = False
        self.llm_classifier = None
def _initialize_ml_cache(self) -> None:
    """Open (or create) the SQLite-backed cache for ML predictions.

    WHY: ML predictions are expensive, so results are persisted between
    runs. On failure the cache is simply left as None — prediction code
    checks for that and works uncached.
    """
    try:
        db_path = self.cache_dir / "ml_predictions.db"
        self.ml_cache = MLPredictionCache(db_path, self.ml_config["cache_duration_days"])
        logger.debug("ML prediction cache initialized")
    except Exception as e:
        logger.warning(f"Failed to initialize ML cache: {e}")
        self.ml_cache = None
def categorize_commit(self, message: str, files_changed: Optional[list[str]] = None) -> str:
    """Categorize a commit, trying LLM, then ML, then rule-based patterns.

    Drop-in replacement for the parent's categorize_commit: it returns the
    same category strings, just with smarter classifiers tried first.

    Classification priority:
        1. LLM-based classification (if enabled and confident)
        2. ML-based classification (if enabled and confident)
        3. Rule-based classification (always available)

    Args:
        message: The commit message to categorize
        files_changed: Optional list of changed files for additional context

    Returns:
        String category (bug_fix, feature, refactor, documentation,
        maintenance, test, style, build, or other)
    """
    if not message:
        return "other"

    # Strip git noise (merge markers, co-author lines, etc.) before classifying.
    cleaned = filter_git_artifacts(message)
    if not cleaned:
        return "other"

    files = files_changed or []

    # 1) LLM pass — accepted only above the configured confidence threshold.
    if self.enable_llm and self.llm_classifier:
        threshold = self.llm_config_dict.get("confidence_threshold", 0.7)
        verdict = self._llm_categorize_commit(cleaned, files)
        if verdict and verdict["confidence"] >= threshold:
            return self._map_llm_to_parent_category(verdict["category"])

    # 2) ML pass — accepted only above the hybrid threshold.
    if self.enable_ml:
        verdict = self._ml_categorize_commit(cleaned, files)
        if verdict and verdict["confidence"] >= self.ml_config["hybrid_threshold"]:
            return self._map_ml_to_parent_category(verdict["category"])

    # 3) Rules always produce an answer.
    return super().categorize_commit(cleaned)
def categorize_commit_with_confidence(
    self, message: str, files_changed: Optional[list[str]] = None
) -> dict[str, Any]:
    """Categorize commit with detailed confidence information.

    This is the main entry point for getting detailed categorization results
    including confidence scores, alternative predictions, and processing metadata.

    Args:
        message: The commit message to categorize
        files_changed: Optional list of changed files for additional context

    Returns:
        Dictionary with categorization results:
        {
            'category': str,
            'confidence': float,
            'method': str ('ml', 'rules', 'cached'),
            'alternatives': List[Dict],
            'features': Dict,
            'processing_time_ms': float
        }
    """
    start_time = time.time()

    if not message:
        return {
            "category": "other",
            "confidence": 1.0,
            "method": "default",
            "alternatives": [],
            "features": {},
            "processing_time_ms": 0.0,
        }

    # Filter git artifacts for cleaner classification
    cleaned_message = filter_git_artifacts(message)
    if not cleaned_message:
        return {
            "category": "other",
            "confidence": 0.3,
            "method": "filtered_empty",
            "alternatives": [],
            "features": {},
            "processing_time_ms": (time.time() - start_time) * 1000,
        }

    files_changed = files_changed or []

    # Cache lookup is keyed on the cleaned message; all stores below must
    # use the same key or entries can never be retrieved.
    if self.ml_cache and self.ml_config["enable_caching"]:
        cached_result = self.ml_cache.get_prediction(cleaned_message, files_changed)
        if cached_result:
            cached_result["processing_time_ms"] = (time.time() - start_time) * 1000
            return cached_result

    # Try LLM categorization first if enabled
    if self.enable_llm and self.llm_classifier:
        llm_result = self._llm_categorize_commit_detailed(cleaned_message, files_changed)
        if llm_result and llm_result["confidence"] >= self.llm_config_dict.get(
            "confidence_threshold", 0.7
        ):
            # Map to parent categories and cache result
            llm_result["category"] = self._map_llm_to_parent_category(llm_result["category"])
            llm_result["processing_time_ms"] = (time.time() - start_time) * 1000

            if self.ml_cache and self.ml_config["enable_caching"]:
                self.ml_cache.store_prediction(cleaned_message, files_changed, llm_result)

            return llm_result

    # Fall back to ML categorization
    if self.enable_ml:
        ml_result = self._ml_categorize_commit_detailed(cleaned_message, files_changed)
        if ml_result and ml_result["confidence"] >= self.ml_config["hybrid_threshold"]:
            # Map to parent categories and cache result
            ml_result["category"] = self._map_ml_to_parent_category(ml_result["category"])
            ml_result["processing_time_ms"] = (time.time() - start_time) * 1000

            if self.ml_cache and self.ml_config["enable_caching"]:
                self.ml_cache.store_prediction(cleaned_message, files_changed, ml_result)

            return ml_result

    # Fall back to rule-based categorization
    rule_category = super().categorize_commit(cleaned_message)
    rule_result = {
        "category": rule_category,
        "confidence": 0.8 if rule_category != "other" else 0.3,
        "method": "rules",
        "alternatives": [],
        "features": {"rule_based": True},
        "processing_time_ms": (time.time() - start_time) * 1000,
    }

    # BUGFIX: store under cleaned_message (the lookup key above), not the raw
    # message — otherwise rule-based results are cached under a key that
    # get_prediction() never queries, causing permanent cache misses.
    if self.ml_cache and self.ml_config["enable_caching"]:
        self.ml_cache.store_prediction(cleaned_message, files_changed, rule_result)

    return rule_result
def _ml_categorize_commit(
    self, message: str, files_changed: list[str]
) -> Optional[dict[str, Any]]:
    """Run the built-in ML classifier and return a minimal verdict.

    Args:
        message: Commit message
        files_changed: List of changed files

    Returns:
        Dictionary with category and confidence, or None if ML unavailable
    """
    if not (message and self.change_type_classifier):
        return None

    try:
        # Only run spaCy when a model was actually loaded.
        doc = self.nlp_model(message) if self.nlp_model else None

        label, score = self.change_type_classifier.classify(message, doc, files_changed)
        if label and label != "unknown":
            return {"category": label, "confidence": score}
    except Exception as e:
        logger.warning(f"ML categorization failed: {e}")

    return None
def _ml_categorize_commit_detailed(
    self, message: str, files_changed: list[str]
) -> Optional[dict[str, Any]]:
    """Run ML categorization and return a result with full metadata.

    A user-trained model (if loaded) takes precedence; otherwise the
    built-in ChangeTypeClassifier is used.

    Args:
        message: Commit message
        files_changed: List of changed files

    Returns:
        Detailed categorization result dictionary or None if ML unavailable
    """
    if not message:
        return None

    # Prefer the user-trained model when one was successfully loaded.
    if self.trained_model_loader:
        try:
            verdict = self.trained_model_loader.predict_commit_category(
                message, files_changed
            )
            accepted = (
                verdict["method"] != "failed"
                and verdict["confidence"] >= self.ml_config["hybrid_threshold"]
            )
            if accepted:
                return verdict
        except Exception as e:
            logger.debug(f"Trained model prediction failed, falling back to built-in ML: {e}")

    # Built-in classifier is the fallback; bail out if it never initialized.
    if not self.change_type_classifier:
        return None

    try:
        doc = None
        features: dict[str, Any] = {}
        if self.nlp_model:
            doc = self.nlp_model(message)
            features = self._extract_features(message, doc, files_changed)

        label, score = self.change_type_classifier.classify(message, doc, files_changed)

        if label and label != "unknown":
            return {
                "category": label,
                "confidence": score,
                "method": "builtin_ml",
                "alternatives": self._get_alternative_predictions(message, doc, files_changed),
                "features": features,
            }
    except Exception as e:
        logger.warning(f"Built-in ML categorization failed: {e}")

    return None
def _extract_features(
    self, message: str, doc: Optional[Doc], files_changed: list[str]
) -> dict[str, Any]:
    """Extract features used for ML classification.

    Args:
        message: Commit message
        doc: spaCy processed document (None when no model is loaded)
        files_changed: List of changed files

    Returns:
        Dictionary of extracted features
    """
    features: dict[str, Any] = {
        "message_length": len(message),
        "word_count": len(message.split()),
        "files_count": len(files_changed),
        # BUGFIX: sorted() instead of list(set(...)) — set iteration order is
        # nondeterministic across runs, which made this feature (and anything
        # cached or diffed against it) unstable.
        "file_extensions": sorted(
            {Path(f).suffix.lower() for f in files_changed if Path(f).suffix}
        ),
    }

    if doc:
        features.update(
            {
                "has_verbs": any(token.pos_ == "VERB" for token in doc),
                "has_entities": len(doc.ents) > 0,
                "sentiment_polarity": 0.0,  # Placeholder - could add sentiment analysis
            }
        )

    return features
def _get_alternative_predictions(
    self, message: str, doc: Optional[Doc], files_changed: list[str]
) -> list[dict[str, Any]]:
    """Return lower-confidence alternative predictions.

    Simplified placeholder: a full implementation would rank all classifier
    scores and return the top N. Currently only the rule-based verdict is
    offered as an alternative.

    Args:
        message: Commit message
        doc: spaCy processed document (unused in this simplified version)
        files_changed: List of changed files (unused in this simplified version)

    Returns:
        List of alternative predictions (at most 3)
    """
    candidates: list[dict[str, Any]] = []

    fallback = super().categorize_commit(message)
    if fallback != "other":
        candidates.append({"category": fallback, "confidence": 0.6, "method": "rules"})

    return candidates[:3]  # Top 3 alternatives
def _llm_categorize_commit(
    self, message: str, files_changed: list[str]
) -> Optional[dict[str, Any]]:
    """Run the LLM classifier and return a minimal verdict.

    A "maintenance" verdict is treated as a weak signal and is only
    accepted when the LLM reports confidence >= 0.8; every other category
    is accepted at any confidence (the caller applies its own threshold).

    Args:
        message: Cleaned commit message (git artifacts already filtered)
        files_changed: List of changed files

    Returns:
        Dictionary with category and confidence, or None if LLM unavailable
    """
    if not (self.llm_classifier and message):
        return None

    try:
        verdict = self.llm_classifier.classify_commit(message, files_changed)

        if verdict:
            label = verdict.get("category")
            # Single predicate equivalent to the original two branches:
            # non-maintenance labels pass through; maintenance needs >= 0.8.
            if label and (label != "maintenance" or verdict["confidence"] >= 0.8):
                return {"category": label, "confidence": verdict["confidence"]}
    except Exception as e:
        logger.warning(f"LLM categorization failed: {e}")

    return None
def _llm_categorize_commit_detailed(
|
|
589
|
+
self, message: str, files_changed: list[str]
|
|
590
|
+
) -> Optional[dict[str, Any]]:
|
|
591
|
+
"""Detailed LLM categorization with comprehensive metadata.
|
|
592
|
+
|
|
593
|
+
Args:
|
|
594
|
+
message: Cleaned commit message (git artifacts already filtered)
|
|
595
|
+
files_changed: List of changed files
|
|
596
|
+
|
|
597
|
+
Returns:
|
|
598
|
+
Detailed categorization result dictionary or None if LLM unavailable
|
|
599
|
+
"""
|
|
600
|
+
if not self.llm_classifier or not message:
|
|
601
|
+
return None
|
|
602
|
+
|
|
603
|
+
try:
|
|
604
|
+
# Get detailed LLM classification
|
|
605
|
+
llm_result = self.llm_classifier.classify_commit(message, files_changed)
|
|
606
|
+
|
|
607
|
+
if llm_result and llm_result.get("category"):
|
|
608
|
+
return {
|
|
609
|
+
"category": llm_result["category"],
|
|
610
|
+
"confidence": llm_result["confidence"],
|
|
611
|
+
"method": "llm",
|
|
612
|
+
"reasoning": llm_result.get("reasoning", "LLM-based classification"),
|
|
613
|
+
"model": llm_result.get("model", "unknown"),
|
|
614
|
+
"alternatives": llm_result.get("alternatives", []),
|
|
615
|
+
"features": {"llm_classification": True},
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
except Exception as e:
|
|
619
|
+
logger.warning(f"Detailed LLM categorization failed: {e}")
|
|
620
|
+
|
|
621
|
+
return None
|
|
622
|
+
|
|
623
|
+
def _map_llm_to_parent_category(self, llm_category: str) -> str:
|
|
624
|
+
"""Map LLM categories to parent class categories.
|
|
625
|
+
|
|
626
|
+
WHY: The LLM classifier uses streamlined 7-category system while the parent
|
|
627
|
+
TicketExtractor uses different category names. This mapping ensures
|
|
628
|
+
backward compatibility with existing reports and analysis.
|
|
629
|
+
|
|
630
|
+
Args:
|
|
631
|
+
llm_category: Category from LLM classifier
|
|
632
|
+
|
|
633
|
+
Returns:
|
|
634
|
+
Category compatible with parent class
|
|
635
|
+
"""
|
|
636
|
+
# Map from LLM's 7 streamlined categories to parent categories
|
|
637
|
+
mapping = {
|
|
638
|
+
"feature": "feature", # New functionality -> feature
|
|
639
|
+
"bugfix": "bug_fix", # Bug fixes -> bug_fix (parent uses underscore)
|
|
640
|
+
"maintenance": "maintenance", # Maintenance -> maintenance
|
|
641
|
+
"integration": "build", # Integration -> build (closest parent category)
|
|
642
|
+
"content": "documentation", # Content -> documentation
|
|
643
|
+
"media": "other", # Media -> other (no direct parent equivalent)
|
|
644
|
+
"localization": "other", # Localization -> other (no direct parent equivalent)
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
return mapping.get(llm_category, "other")
|
|
648
|
+
|
|
649
|
+
def _map_ml_to_parent_category(self, ml_category: str) -> str:
|
|
650
|
+
"""Map ML categories to parent class categories.
|
|
651
|
+
|
|
652
|
+
WHY: The ChangeTypeClassifier uses different category names than the parent
|
|
653
|
+
TicketExtractor. This mapping ensures backward compatibility.
|
|
654
|
+
|
|
655
|
+
Args:
|
|
656
|
+
ml_category: Category from ML classifier
|
|
657
|
+
|
|
658
|
+
Returns:
|
|
659
|
+
Category compatible with parent class
|
|
660
|
+
"""
|
|
661
|
+
mapping = {
|
|
662
|
+
"feature": "feature",
|
|
663
|
+
"bugfix": "bug_fix",
|
|
664
|
+
"refactor": "refactor",
|
|
665
|
+
"docs": "documentation",
|
|
666
|
+
"test": "test",
|
|
667
|
+
"chore": "maintenance",
|
|
668
|
+
"security": "bug_fix", # Security fixes are a type of bug fix
|
|
669
|
+
"hotfix": "bug_fix", # Hotfixes are urgent bug fixes
|
|
670
|
+
"config": "maintenance", # Configuration changes are maintenance
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
return mapping.get(ml_category, "other")
|
|
674
|
+
|
|
675
|
+
def analyze_ticket_coverage(
    self, commits: list[dict[str, Any]], prs: list[dict[str, Any]]
) -> dict[str, Any]:
    """Enhanced ticket coverage analysis with ML categorization insights.

    Extends the parent's analysis with ML-specific insights: confidence
    distributions, method breakdowns, and prediction quality metrics.

    Args:
        commits: List of commit data
        prs: List of PR data

    Returns:
        Enhanced analysis results with ML insights
    """
    coverage = super().analyze_ticket_coverage(commits, prs)

    # Without ML components, still record why the ML section is absent so
    # report consumers can distinguish "disabled" from "failed".
    if not self.enable_ml:
        coverage["ml_analysis"] = {
            "enabled": False,
            "reason": "ML components not available or disabled",
        }
        return coverage

    coverage["ml_analysis"] = self._analyze_ml_categorization_quality(commits)

    # Annotate untracked commits in-place with per-commit ML confidence data.
    if "untracked_commits" in coverage:
        self._enhance_untracked_commits(coverage["untracked_commits"])

    return coverage
|
|
710
|
+
|
|
711
|
+
def _analyze_ml_categorization_quality(self, commits: list[dict[str, Any]]) -> dict[str, Any]:
    """Analyze the quality and distribution of ML categorizations.

    Args:
        commits: List of commit data

    Returns:
        ML analysis results including confidence distributions and method usage
    """
    # Aggregate counters. The two defaultdicts are converted to plain dicts
    # at the end so the result is JSON-serializable.
    ml_stats = {
        "enabled": True,
        "total_ml_predictions": 0,
        "total_rule_predictions": 0,
        "total_cached_predictions": 0,
        "avg_confidence": 0.0,
        "confidence_distribution": {"high": 0, "medium": 0, "low": 0},
        "method_breakdown": defaultdict(int),
        "category_confidence": defaultdict(list),
        "processing_time_stats": {"total_ms": 0.0, "avg_ms": 0.0},
    }

    total_confidence = 0.0
    total_processing_time = 0.0
    processed_commits = 0

    for commit in commits:
        # Get files_changed count efficiently with proper type handling:
        # "files_changed" may arrive as a precomputed int or a list of paths.
        files_count = commit.get("files_changed_count")
        if files_count is None:
            files_changed = commit.get("files_changed", 0)
            if isinstance(files_changed, int):
                files_count = files_changed
            elif isinstance(files_changed, list):
                files_count = len(files_changed)
            else:
                logger.warning(
                    f"Unexpected files_changed type: {type(files_changed)}, defaulting to 0"
                )
                files_count = 0

        # Skip merge commits and commits below the size threshold — they are
        # not meaningful for categorization quality measurement.
        if commit.get("is_merge") or files_count < self.untracked_file_threshold:
            continue

        # Get detailed categorization for analysis
        message = commit.get("message", "")
        # Normalize files_changed to ensure it's always a list
        files_changed_raw = commit.get("files_changed", [])
        if isinstance(files_changed_raw, int):
            # If files_changed is an integer count, we can't provide file names
            files_changed = []
        elif isinstance(files_changed_raw, list):
            files_changed = files_changed_raw
        else:
            files_changed = []

        result = self.categorize_commit_with_confidence(message, files_changed)

        # Update statistics
        confidence = result["confidence"]
        method = result["method"]
        category = result["category"]
        processing_time = result.get("processing_time_ms", 0.0)

        total_confidence += confidence
        total_processing_time += processing_time
        processed_commits += 1

        # Method breakdown: count every method, plus dedicated totals for
        # the three known prediction sources.
        ml_stats["method_breakdown"][method] += 1
        if method == "ml":
            ml_stats["total_ml_predictions"] += 1
        elif method == "rules":
            ml_stats["total_rule_predictions"] += 1
        elif method == "cached":
            ml_stats["total_cached_predictions"] += 1

        # Confidence distribution buckets: high >= 0.8, medium >= 0.6, else low.
        if confidence >= 0.8:
            ml_stats["confidence_distribution"]["high"] += 1
        elif confidence >= 0.6:
            ml_stats["confidence_distribution"]["medium"] += 1
        else:
            ml_stats["confidence_distribution"]["low"] += 1

        # Category confidence tracking (raw samples; summarized below)
        ml_stats["category_confidence"][category].append(confidence)

    # Calculate averages (guard against division by zero when no commits
    # passed the filters above).
    if processed_commits > 0:
        ml_stats["avg_confidence"] = total_confidence / processed_commits
        ml_stats["processing_time_stats"] = {
            "total_ms": total_processing_time,
            "avg_ms": total_processing_time / processed_commits,
        }

    # Convert defaultdicts to regular dicts for JSON serialization
    ml_stats["method_breakdown"] = dict(ml_stats["method_breakdown"])
    ml_stats["category_confidence"] = {
        cat: {"avg": sum(confidences) / len(confidences), "count": len(confidences)}
        for cat, confidences in ml_stats["category_confidence"].items()
    }

    return ml_stats
|
|
814
|
+
|
|
815
|
+
def _enhance_untracked_commits(self, untracked_commits: list[dict[str, Any]]) -> None:
|
|
816
|
+
"""Enhance untracked commits with ML confidence scores and metadata.
|
|
817
|
+
|
|
818
|
+
Args:
|
|
819
|
+
untracked_commits: List of untracked commit data to enhance in-place
|
|
820
|
+
"""
|
|
821
|
+
for commit in untracked_commits:
|
|
822
|
+
message = commit.get("full_message", commit.get("message", ""))
|
|
823
|
+
files_changed = [] # Would need to extract from commit data
|
|
824
|
+
|
|
825
|
+
# Get detailed categorization
|
|
826
|
+
result = self.categorize_commit_with_confidence(message, files_changed)
|
|
827
|
+
|
|
828
|
+
# Add ML-specific fields
|
|
829
|
+
commit["ml_confidence"] = result["confidence"]
|
|
830
|
+
commit["ml_method"] = result["method"]
|
|
831
|
+
commit["ml_alternatives"] = result.get("alternatives", [])
|
|
832
|
+
commit["ml_processing_time_ms"] = result.get("processing_time_ms", 0.0)
|
|
833
|
+
|
|
834
|
+
def get_ml_statistics(self) -> dict[str, Any]:
    """Get comprehensive ML and LLM usage and performance statistics.

    Returns:
        Dictionary with ML/LLM performance metrics and usage statistics
    """
    # Which optional components actually got constructed at init time.
    loaded_components = {
        "change_type_classifier": self.change_type_classifier is not None,
        "nlp_model": self.nlp_model is not None,
        "ml_cache": self.ml_cache is not None,
        "trained_model_loader": self.trained_model_loader is not None,
        "llm_classifier": self.llm_classifier is not None,
    }
    # Copies so callers cannot mutate live configuration through the report.
    configuration = {
        "ml_config": self.ml_config.copy(),
        "llm_config": self.llm_config_dict.copy(),
    }

    stats: dict[str, Any] = {
        "ml_enabled": self.enable_ml,
        "llm_enabled": self.enable_llm,
        "spacy_available": SPACY_AVAILABLE,
        "training_loader_available": TRAINING_LOADER_AVAILABLE,
        "components_loaded": loaded_components,
        "configuration": configuration,
    }

    # Cache statistics, when a prediction cache is active.
    if self.ml_cache:
        stats["cache_statistics"] = self.ml_cache.get_statistics()

    # Sub-component statistics are best-effort: a failure is recorded in the
    # report rather than aborting the whole statistics call.
    if self.trained_model_loader:
        try:
            stats["trained_model_statistics"] = self.trained_model_loader.get_model_statistics()
        except Exception as e:
            logger.warning(f"Failed to get trained model statistics: {e}")
            stats["trained_model_statistics"] = {"error": str(e)}

    if self.llm_classifier:
        try:
            stats["llm_statistics"] = self.llm_classifier.get_statistics()
        except Exception as e:
            logger.warning(f"Failed to get LLM statistics: {e}")
            stats["llm_statistics"] = {"error": str(e)}

    return stats
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
class MLPredictionCache:
|
|
882
|
+
"""SQLite-based cache for ML predictions with expiration support.
|
|
883
|
+
|
|
884
|
+
WHY: ML predictions can be expensive, especially for large repositories.
|
|
885
|
+
This cache stores predictions with metadata to avoid re-processing identical
|
|
886
|
+
commit messages and file patterns.
|
|
887
|
+
|
|
888
|
+
DESIGN: Uses SQLite for persistence across runs with:
|
|
889
|
+
- Expiration based on configurable time periods
|
|
890
|
+
- Hash-based keys for efficient lookup
|
|
891
|
+
- Metadata storage for cache invalidation
|
|
892
|
+
"""
|
|
893
|
+
|
|
894
|
+
def __init__(self, cache_path: Path, expiration_days: int = 30):
    """Initialize ML prediction cache.

    Args:
        cache_path: Path to SQLite cache database
        expiration_days: Number of days to keep predictions
    """
    # Path to the backing SQLite file (created on first connect if missing).
    self.cache_path = cache_path
    # Retention window applied to every stored prediction.
    self.expiration_days = expiration_days
    # Create table/index up front so later reads/writes never race schema setup.
    self._init_database()
|
|
904
|
+
|
|
905
|
+
def _init_database(self) -> None:
|
|
906
|
+
"""Initialize SQLite database with prediction cache table."""
|
|
907
|
+
with sqlite3.connect(self.cache_path) as conn:
|
|
908
|
+
conn.execute(
|
|
909
|
+
"""
|
|
910
|
+
CREATE TABLE IF NOT EXISTS ml_predictions (
|
|
911
|
+
key TEXT PRIMARY KEY,
|
|
912
|
+
message_hash TEXT NOT NULL,
|
|
913
|
+
files_hash TEXT NOT NULL,
|
|
914
|
+
category TEXT NOT NULL,
|
|
915
|
+
confidence REAL NOT NULL,
|
|
916
|
+
method TEXT NOT NULL,
|
|
917
|
+
features TEXT, -- JSON encoded
|
|
918
|
+
alternatives TEXT, -- JSON encoded
|
|
919
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
920
|
+
expires_at TIMESTAMP NOT NULL
|
|
921
|
+
)
|
|
922
|
+
"""
|
|
923
|
+
)
|
|
924
|
+
|
|
925
|
+
# Create index for efficient cleanup
|
|
926
|
+
conn.execute(
|
|
927
|
+
"""
|
|
928
|
+
CREATE INDEX IF NOT EXISTS idx_expires_at ON ml_predictions(expires_at)
|
|
929
|
+
"""
|
|
930
|
+
)
|
|
931
|
+
|
|
932
|
+
conn.commit()
|
|
933
|
+
|
|
934
|
+
def _generate_cache_key(self, message: str, files_changed: list[str]) -> tuple[str, str, str]:
|
|
935
|
+
"""Generate cache key components.
|
|
936
|
+
|
|
937
|
+
Args:
|
|
938
|
+
message: Commit message
|
|
939
|
+
files_changed: List of changed files
|
|
940
|
+
|
|
941
|
+
Returns:
|
|
942
|
+
Tuple of (cache_key, message_hash, files_hash)
|
|
943
|
+
"""
|
|
944
|
+
import hashlib
|
|
945
|
+
|
|
946
|
+
message_hash = hashlib.md5(message.encode("utf-8")).hexdigest()
|
|
947
|
+
files_hash = hashlib.md5("|".join(sorted(files_changed)).encode("utf-8")).hexdigest()
|
|
948
|
+
cache_key = f"{message_hash}:{files_hash}"
|
|
949
|
+
|
|
950
|
+
return cache_key, message_hash, files_hash
|
|
951
|
+
|
|
952
|
+
def get_prediction(self, message: str, files_changed: list[str]) -> Optional[dict[str, Any]]:
    """Get cached prediction if available and not expired.

    Args:
        message: Commit message
        files_changed: List of changed files

    Returns:
        Cached prediction dictionary or None if not found/expired
    """
    lookup_key = self._generate_cache_key(message, files_changed)[0]

    try:
        with sqlite3.connect(self.cache_path) as conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                """
                SELECT category, confidence, method, features, alternatives
                FROM ml_predictions
                WHERE key = ? AND expires_at > datetime('now')
                """,
                (lookup_key,),
            ).fetchone()

            if row is None:
                return None

            import json

            return {
                "category": row["category"],
                "confidence": row["confidence"],
                # Override method so callers can tell this was a cache hit.
                "method": "cached",
                "features": json.loads(row["features"]) if row["features"] else {},
                "alternatives": (
                    json.loads(row["alternatives"]) if row["alternatives"] else []
                ),
            }
    except Exception as e:
        # Cache failures are non-fatal: a miss just means re-computation.
        logger.warning(f"Cache lookup failed: {e}")
        return None
|
|
994
|
+
|
|
995
|
+
def store_prediction(
    self, message: str, files_changed: list[str], result: dict[str, Any]
) -> None:
    """Store prediction in cache with expiration.

    Args:
        message: Commit message
        files_changed: List of changed files
        result: Prediction result to cache
    """
    cache_key, message_hash, files_hash = self._generate_cache_key(message, files_changed)

    try:
        import json
        from datetime import datetime, timedelta

        # Every stored row gets a hard expiry; reads filter on it and
        # cleanup_expired() purges old rows.
        expiry = datetime.now() + timedelta(days=self.expiration_days)

        record = (
            cache_key,
            message_hash,
            files_hash,
            result["category"],
            result["confidence"],
            result["method"],
            json.dumps(result.get("features", {})),
            json.dumps(result.get("alternatives", [])),
            expiry,
        )

        with sqlite3.connect(self.cache_path) as conn:
            # INSERT OR REPLACE so re-classifying the same commit refreshes
            # the row (and its expiry) instead of failing on the primary key.
            conn.execute(
                """
                INSERT OR REPLACE INTO ml_predictions
                (key, message_hash, files_hash, category, confidence, method,
                 features, alternatives, expires_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                record,
            )
            conn.commit()

    except Exception as e:
        # Best-effort cache: failures are logged, never raised to callers.
        logger.warning(f"Cache storage failed: {e}")
|
|
1037
|
+
|
|
1038
|
+
def cleanup_expired(self) -> int:
    """Remove expired predictions from cache.

    Returns:
        Number of expired entries removed
    """
    removed = 0
    try:
        with sqlite3.connect(self.cache_path) as conn:
            cursor = conn.execute(
                """
                DELETE FROM ml_predictions WHERE expires_at <= datetime('now')
                """
            )
            conn.commit()
            # rowcount reflects how many rows the DELETE touched.
            removed = cursor.rowcount
    except Exception as e:
        # A failed cleanup is harmless: expired rows are also filtered at
        # read time, so report zero and move on.
        logger.warning(f"Cache cleanup failed: {e}")
    return removed
|
|
1057
|
+
|
|
1058
|
+
def get_statistics(self) -> dict[str, Any]:
    """Get cache usage statistics.

    Returns:
        Dictionary with cache statistics
    """
    # Returned when the query fails or yields nothing, so callers always
    # get a complete, uniformly-shaped report.
    empty_report = {
        "total_entries": 0,
        "active_entries": 0,
        "expired_entries": 0,
        "unique_methods": 0,
        "cache_file_size_mb": 0,
    }

    try:
        with sqlite3.connect(self.cache_path) as conn:
            row = conn.execute(
                """
                SELECT
                    COUNT(*) as total_entries,
                    COUNT(CASE WHEN expires_at > datetime('now') THEN 1 END) as active_entries,
                    COUNT(CASE WHEN expires_at <= datetime('now') THEN 1 END) as expired_entries,
                    COUNT(DISTINCT method) as unique_methods
                FROM ml_predictions
                """
            ).fetchone()

        if row:
            file_size_mb = (
                self.cache_path.stat().st_size / (1024 * 1024)
                if self.cache_path.exists()
                else 0
            )
            return {
                "total_entries": row[0],
                "active_entries": row[1],
                "expired_entries": row[2],
                "unique_methods": row[3],
                "cache_file_size_mb": file_size_mb,
            }

    except Exception as e:
        logger.warning(f"Cache statistics failed: {e}")

    return empty_report