gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the versions exactly as they appear in their public registry.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4108 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
Of the files listed above, the diff body below shows one of the new modules in 1.3.6, the classification model:

gitflow_analytics/classification/model.py (new file)
@@ -0,0 +1,455 @@
"""Machine learning model for commit classification.

This module implements a Random Forest-based commit classification model with
comprehensive training, validation, and prediction capabilities. The model is
designed for production use with robust error handling, model persistence,
and performance monitoring.
"""

import logging
import pickle
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Optional

import joblib
import numpy as np

try:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.preprocessing import LabelEncoder

    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    RandomForestClassifier = None
    LabelEncoder = None

from .feature_extractor import FeatureExtractor

logger = logging.getLogger(__name__)


class CommitClassificationModel:
    """Random Forest-based commit classification model.

    This model provides comprehensive commit classification using a Random Forest
    classifier trained on 68-dimensional feature vectors. It includes:

    - Robust training pipeline with cross-validation
    - Model persistence and versioning
    - Batch prediction capabilities
    - Performance monitoring and metrics
    - Graceful fallback when scikit-learn is unavailable

    The model is designed to classify commits into categories such as:
    - feature: New functionality
    - bugfix: Bug fixes and corrections
    - refactor: Code restructuring
    - docs: Documentation changes
    - test: Testing-related changes
    - config: Configuration changes
    - chore: Maintenance tasks
    - security: Security-related changes
    - hotfix: Emergency fixes
    """

    def __init__(self, model_path: Optional[Path] = None, config: Optional[dict[str, Any]] = None):
        """Initialize the commit classification model.

        Args:
            model_path: Path to save/load model files
            config: Configuration dictionary with model parameters
        """
        if not SKLEARN_AVAILABLE:
            logger.warning("scikit-learn not available. Model functionality will be limited.")
            self.model = None
            self.label_encoder = None
            self.feature_extractor = None
            # Initialize metadata anyway so get_model_info(), retrain_needed(),
            # and the rule-based fallback path stay usable without scikit-learn.
            self.model_path = model_path or Path(".gitflow-cache/classification")
            self.config = config or {}
            self.is_trained = False
            self.training_timestamp = None
            self.feature_importance = None
            self.class_names = None
            self.training_metrics = {}
            return

        self.model_path = model_path or Path(".gitflow-cache/classification")
        self.model_path.mkdir(parents=True, exist_ok=True)

        # Configuration with defaults
        self.config = config or {}
        self.n_estimators = self.config.get("n_estimators", 100)
        self.max_depth = self.config.get("max_depth", 20)
        self.min_samples_split = self.config.get("min_samples_split", 5)
        self.min_samples_leaf = self.config.get("min_samples_leaf", 2)
        self.random_state = self.config.get("random_state", 42)
        self.n_jobs = self.config.get("n_jobs", -1)  # Use all available cores

        # Initialize components
        self.model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            class_weight="balanced",  # Handle class imbalance
        )
        self.label_encoder = LabelEncoder()
        self.feature_extractor = FeatureExtractor()

        # Model metadata
        self.is_trained = False
        self.training_timestamp = None
        self.feature_importance = None
        self.class_names = None
        self.training_metrics = {}

        # Load existing model if available
        self._load_model()
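
    # Configuration sketch (illustrative values, not the defaults above): any of
    # the hyperparameters read in __init__ can be overridden via the config dict,
    # e.g. CommitClassificationModel(config={"n_estimators": 200, "max_depth": 10}).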

    def train(
        self, commits: list[dict[str, Any]], labels: list[str], validation_split: float = 0.2
    ) -> dict[str, Any]:
        """Train the classification model on labeled commit data.

        Args:
            commits: List of commit data dictionaries
            labels: List of corresponding classification labels
            validation_split: Fraction of data to use for validation

        Returns:
            Dictionary containing training metrics and results
        """
        if not SKLEARN_AVAILABLE:
            raise RuntimeError("scikit-learn is required for model training")

        if len(commits) != len(labels):
            raise ValueError("Number of commits must match number of labels")

        if len(commits) < 10:
            raise ValueError("Need at least 10 samples for training")

        logger.info(f"Training classification model on {len(commits)} commits")

        # Extract features from commits
        logger.info("Extracting features from commits...")
        features = self.feature_extractor.extract_batch_features(commits)

        # Encode labels
        encoded_labels = self.label_encoder.fit_transform(labels)
        self.class_names = self.label_encoder.classes_.tolist()

        # Split data for validation
        if validation_split > 0:
            X_train, X_val, y_train, y_val = train_test_split(
                features,
                encoded_labels,
                test_size=validation_split,
                random_state=self.random_state,
                stratify=encoded_labels,
            )
        else:
            X_train, y_train = features, encoded_labels
            X_val, y_val = None, None

        # Train the model
        logger.info("Training Random Forest classifier...")
        self.model.fit(X_train, y_train)
        self.is_trained = True
        self.training_timestamp = datetime.now()

        # Calculate feature importance
        self.feature_importance = self.model.feature_importances_

        # Evaluate the model
        training_metrics = self._evaluate_model(X_train, y_train, X_val, y_val)
        self.training_metrics = training_metrics

        # Save the trained model
        self._save_model()

        logger.info(f"Model training completed. Accuracy: {training_metrics['accuracy']:.3f}")
        return training_metrics
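
    # Training sketch (hypothetical pre-labeled data; train() requires at least
    # 10 samples and exactly one label per commit):
    #   metrics = model.train(commits, labels, validation_split=0.2)
    #   print(metrics["accuracy"], metrics["cv_accuracy_mean"])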

    def predict(self, commits: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Predict classifications for a batch of commits.

        Args:
            commits: List of commit data dictionaries

        Returns:
            List of prediction dictionaries containing:
            - predicted_class: Predicted classification
            - confidence: Prediction confidence (0-1)
            - class_probabilities: Probabilities for all classes
        """
        if not SKLEARN_AVAILABLE or not self.is_trained:
            logger.warning("Model not available or not trained. Using fallback classification.")
            return self._fallback_predictions(commits)

        if not commits:
            return []

        # Extract features
        features = self.feature_extractor.extract_batch_features(commits)

        # Make predictions
        predictions = self.model.predict(features)
        probabilities = self.model.predict_proba(features)

        # Format results
        results = []
        for i, commit in enumerate(commits):
            predicted_label = self.label_encoder.inverse_transform([predictions[i]])[0]
            max_prob = np.max(probabilities[i])

            # Create probability dictionary for all classes
            class_probs = dict(zip(self.class_names, probabilities[i]))

            results.append(
                {
                    "commit_hash": commit.get("hash", ""),
                    "predicted_class": predicted_label,
                    "confidence": float(max_prob),
                    "class_probabilities": class_probs,
                }
            )

        return results

    def predict_single(self, commit: dict[str, Any]) -> dict[str, Any]:
        """Predict classification for a single commit.

        Args:
            commit: Commit data dictionary

        Returns:
            Prediction dictionary with class and confidence
        """
        results = self.predict([commit])
        return results[0] if results else {"predicted_class": "unknown", "confidence": 0.0}
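
    # Prediction sketch (hypothetical commit dict): each result carries the class,
    # the winning probability as confidence, and the full per-class distribution:
    #   result = model.predict_single({"hash": "abc123", "message": "add login page"})
    #   result["predicted_class"]      e.g. "feature"
    #   result["confidence"]           a float in [0, 1]
    #   result["class_probabilities"]  e.g. {"feature": 0.71, "bugfix": 0.12, ...}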

    def _evaluate_model(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None,
    ) -> dict[str, Any]:
        """Evaluate model performance with comprehensive metrics.

        Args:
            X_train: Training features
            y_train: Training labels
            X_val: Validation features (optional)
            y_val: Validation labels (optional)

        Returns:
            Dictionary with evaluation metrics
        """
        metrics = {}

        # Cross-validation on training data
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=5, scoring="accuracy")
        metrics["cv_accuracy_mean"] = float(np.mean(cv_scores))
        metrics["cv_accuracy_std"] = float(np.std(cv_scores))

        # Training accuracy
        train_pred = self.model.predict(X_train)
        metrics["train_accuracy"] = float(accuracy_score(y_train, train_pred))

        # Validation metrics if validation data provided
        if X_val is not None and y_val is not None:
            val_pred = self.model.predict(X_val)
            metrics["val_accuracy"] = float(accuracy_score(y_val, val_pred))

            # Detailed classification report
            class_names = [
                self.label_encoder.inverse_transform([i])[0]
                for i in range(len(self.label_encoder.classes_))
            ]

            val_report = classification_report(
                y_val, val_pred, target_names=class_names, output_dict=True
            )
            metrics["classification_report"] = val_report

            # Confusion matrix
            conf_matrix = confusion_matrix(y_val, val_pred)
            metrics["confusion_matrix"] = conf_matrix.tolist()

        # Overall accuracy for reporting
        metrics["accuracy"] = metrics.get("val_accuracy", metrics["train_accuracy"])

        return metrics
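
    # Note: cross_val_score fits fresh clones of the estimator on each of the
    # 5 folds, so cv_accuracy_mean estimates out-of-sample performance, whereas
    # train_accuracy is measured on data the fitted model has already seen and
    # is therefore optimistic.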

    def get_feature_importance(self, top_n: int = 20) -> list[tuple[str, float]]:
        """Get top feature importances from the trained model.

        Args:
            top_n: Number of top features to return

        Returns:
            List of (feature_name, importance) tuples, sorted by importance
        """
        if not self.is_trained or self.feature_importance is None:
            return []

        feature_names = self.feature_extractor.get_feature_names()
        importance_pairs = list(zip(feature_names, self.feature_importance))

        # Sort by importance descending
        importance_pairs.sort(key=lambda x: x[1], reverse=True)

        return importance_pairs[:top_n]
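
    # Inspection sketch: list the most informative of the 68 feature dimensions:
    #   for name, importance in model.get_feature_importance(top_n=10):
    #       print(f"{name}: {importance:.4f}")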

    def _save_model(self) -> None:
        """Save the trained model to disk."""
        if not self.is_trained:
            return

        model_file = self.model_path / "commit_classifier.joblib"
        metadata_file = self.model_path / "model_metadata.pkl"

        try:
            # Save the scikit-learn model
            joblib.dump(self.model, model_file)

            # Save metadata
            metadata = {
                "label_encoder": self.label_encoder,
                "is_trained": self.is_trained,
                "training_timestamp": self.training_timestamp,
                "feature_importance": self.feature_importance,
                "class_names": self.class_names,
                "training_metrics": self.training_metrics,
                "config": self.config,
            }

            with open(metadata_file, "wb") as f:
                pickle.dump(metadata, f)

            logger.info(f"Model saved to {model_file}")

        except Exception as e:
            logger.error(f"Failed to save model: {e}")

    def _load_model(self) -> bool:
        """Load a previously trained model from disk.

        Returns:
            True if model loaded successfully, False otherwise
        """
        if not SKLEARN_AVAILABLE:
            return False

        model_file = self.model_path / "commit_classifier.joblib"
        metadata_file = self.model_path / "model_metadata.pkl"

        if not (model_file.exists() and metadata_file.exists()):
            return False

        try:
            # Load the scikit-learn model
            self.model = joblib.load(model_file)

            # Load metadata
            with open(metadata_file, "rb") as f:
                metadata = pickle.load(f)

            self.label_encoder = metadata["label_encoder"]
            self.is_trained = metadata["is_trained"]
            self.training_timestamp = metadata["training_timestamp"]
            self.feature_importance = metadata["feature_importance"]
            self.class_names = metadata["class_names"]
            self.training_metrics = metadata["training_metrics"]

            # Check if model is too old (older than 30 days)
            if self.training_timestamp:
                age = datetime.now() - self.training_timestamp
                if age > timedelta(days=30):
                    logger.warning(f"Loaded model is {age.days} days old. Consider retraining.")

            logger.info(f"Model loaded from {model_file}")
            return True

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            return False
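
    # Persistence note: _load_model() runs from __init__, so a new instance
    # pointed at the same model_path picks up a previously trained model:
    #   warm = CommitClassificationModel(model_path=Path(".gitflow-cache/classification"))
    #   warm.is_trained  # True if saved model files were found
    # Caveat: the metadata file is read with pickle.load, so only load model
    # directories from trusted sources.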

    def _fallback_predictions(self, commits: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Provide fallback predictions when ML model is not available.

        Args:
            commits: List of commit data dictionaries

        Returns:
            List of basic prediction dictionaries
        """
        results = []

        for commit in commits:
            message = commit.get("message", "").lower()

            # Simple rule-based fallback classification
            predicted_class = "chore"  # Default
            confidence = 0.3  # Low confidence for rule-based

            if any(word in message for word in ["fix", "bug", "error", "issue"]):
                predicted_class = "bugfix"
                confidence = 0.6
            elif any(word in message for word in ["feat", "add", "implement", "new"]):
                predicted_class = "feature"
                confidence = 0.6
            elif any(word in message for word in ["doc", "readme", "comment"]):
                predicted_class = "docs"
                confidence = 0.7
            elif any(word in message for word in ["test", "spec", "coverage"]):
                predicted_class = "test"
                confidence = 0.7
            elif any(word in message for word in ["refactor", "cleanup", "optimize"]):
                predicted_class = "refactor"
                confidence = 0.6
            elif any(word in message for word in ["config", "setting", "env"]):
                predicted_class = "config"
                confidence = 0.6

            results.append(
                {
                    "commit_hash": commit.get("hash", ""),
                    "predicted_class": predicted_class,
                    "confidence": confidence,
                    "class_probabilities": {predicted_class: confidence},
                }
            )

        return results
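
    # Note: the keyword rules above are evaluated in order, so a message matching
    # several groups (e.g. both "fix" and "new") takes the first match ("bugfix");
    # anything unmatched falls through to the "chore" default at 0.3 confidence.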

    def get_model_info(self) -> dict[str, Any]:
        """Get information about the current model state.

        Returns:
            Dictionary with model information
        """
        return {
            "is_trained": self.is_trained,
            "sklearn_available": SKLEARN_AVAILABLE,
            "training_timestamp": self.training_timestamp,
            "class_names": self.class_names,
            "n_classes": len(self.class_names) if self.class_names else 0,
            "training_metrics": self.training_metrics,
            "model_path": str(self.model_path),
        }

    def retrain_needed(self, days_old: int = 30) -> bool:
        """Check if model retraining is recommended.

        Args:
            days_old: Age threshold in days

        Returns:
            True if retraining is recommended
        """
        if not self.is_trained or not self.training_timestamp:
            return True

        age = datetime.now() - self.training_timestamp
        return age.days > days_old
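
Taken together, the new module supports a train / persist / predict / retrain-check loop. The sketch below wires those calls end to end. It is illustrative only: the commit dictionaries and labels are hypothetical placeholders, and it assumes gitflow-analytics 1.3.6 is installed together with scikit-learn.

# End-to-end sketch (hypothetical data; a real training set needs >= 10 commits).
from pathlib import Path

from gitflow_analytics.classification.model import CommitClassificationModel

commits = [
    {"hash": "a1b2c3", "message": "fix: handle empty config file"},
    {"hash": "d4e5f6", "message": "feat: add weekly trends report"},
    # ... more labeled commits ...
]
labels = ["bugfix", "feature"]  # one label per commit, same order

model = CommitClassificationModel(model_path=Path(".gitflow-cache/classification"))

# Retrain if no saved model was loaded or the saved one is older than 30 days.
if model.retrain_needed(days_old=30):
    metrics = model.train(commits, labels, validation_split=0.2)
    print(f"validation accuracy: {metrics['accuracy']:.3f}")

# Classify commits; falls back to keyword rules if scikit-learn is missing.
for prediction in model.predict(commits):
    print(prediction["commit_hash"], prediction["predicted_class"], prediction["confidence"])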