gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/classification/classifier.py (new file)

```diff
@@ -0,0 +1,464 @@
+"""Main commit classification orchestrator.
+
+This module provides the primary interface for commit classification,
+orchestrating feature extraction, model training, and prediction.
+It integrates with GitFlow Analytics' existing infrastructure and
+provides both training and inference capabilities.
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+
+from .feature_extractor import FeatureExtractor
+from .linguist_analyzer import LinguistAnalyzer
+from .model import CommitClassificationModel
+
+logger = logging.getLogger(__name__)
+
+
+class CommitClassifier:
+    """Main interface for commit classification.
+
+    This class provides a high-level interface for commit classification,
+    handling the entire pipeline from feature extraction to prediction.
+    It's designed to integrate seamlessly with GitFlow Analytics while
+    providing standalone functionality for other use cases.
+
+    Key capabilities:
+    - Automated feature extraction from git commits
+    - Model training with cross-validation
+    - Batch and single commit prediction
+    - Performance monitoring and metrics
+    - Integration with existing GitFlow Analytics caching
+    """
+
+    def __init__(self, config: Optional[dict[str, Any]] = None, cache_dir: Optional[Path] = None):
+        """Initialize the commit classifier.
+
+        Args:
+            config: Configuration dictionary for classification parameters
+            cache_dir: Directory for caching models and intermediate results
+        """
+        self.config = config or {}
+
+        # Setup paths
+        self.cache_dir = cache_dir or Path(".gitflow-cache")
+        self.model_path = self.cache_dir / "classification"
+        self.model_path.mkdir(parents=True, exist_ok=True)
+
+        # Initialize components
+        self.feature_extractor = FeatureExtractor()
+        self.linguist_analyzer = LinguistAnalyzer()
+        self.model = CommitClassificationModel(
+            model_path=self.model_path, config=self.config.get("model", {})
+        )
+
+        # Classification configuration
+        self.enabled = self.config.get("enabled", True)
+        self.confidence_threshold = self.config.get("confidence_threshold", 0.5)
+        self.batch_size = self.config.get("batch_size", 100)
+        self.auto_retrain = self.config.get("auto_retrain", True)
+        self.retrain_threshold_days = self.config.get("retrain_threshold_days", 30)
+
+        # Supported classification categories
+        self.classification_categories = {
+            "feature": "New functionality or capabilities",
+            "bugfix": "Bug fixes and error corrections",
+            "refactor": "Code restructuring and optimization",
+            "docs": "Documentation changes and updates",
+            "test": "Testing-related changes",
+            "config": "Configuration and settings changes",
+            "chore": "Maintenance and housekeeping tasks",
+            "security": "Security-related changes",
+            "hotfix": "Emergency production fixes",
+            "style": "Code style and formatting changes",
+            "build": "Build system and dependency changes",
+            "ci": "Continuous integration changes",
+            "revert": "Reverts of previous changes",
+            "merge": "Merge commits and integration",
+            "wip": "Work in progress commits",
+        }
+
+        logger.info(
+            f"CommitClassifier initialized with {len(self.classification_categories)} categories"
+        )
+
+    def train_model(
+        self, training_data: list[tuple[dict[str, Any], str]], validation_split: float = 0.2
+    ) -> dict[str, Any]:
+        """Train the classification model on labeled data.
+
+        Args:
+            training_data: List of (commit_data, label) tuples
+            validation_split: Fraction of data to use for validation
+
+        Returns:
+            Dictionary containing training results and metrics
+        """
+        if not self.enabled:
+            raise RuntimeError("Classification is disabled in configuration")
+
+        if len(training_data) < 20:
+            raise ValueError("Need at least 20 labeled examples for reliable training")
+
+        logger.info(f"Training commit classifier on {len(training_data)} examples")
+
+        # Separate commits and labels
+        commits = [item[0] for item in training_data]
+        labels = [item[1] for item in training_data]
+
+        # Validate labels
+        valid_labels = set(self.classification_categories.keys())
+        invalid_labels = set(labels) - valid_labels
+        if invalid_labels:
+            logger.warning(f"Found invalid labels: {invalid_labels}. Using fallback mapping.")
+            labels = [self._map_fallback_label(label) for label in labels]
+
+        # Train the model
+        training_results = self.model.train(commits, labels, validation_split)
+
+        # Log training summary
+        accuracy = training_results.get("accuracy", 0.0)
+        logger.info(f"Model training completed with accuracy: {accuracy:.3f}")
+
+        return training_results
+
+    def classify_commits(self, commits: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Classify a batch of commits.
+
+        Args:
+            commits: List of commit data dictionaries
+
+        Returns:
+            List of classification results with predictions and metadata
+        """
+        if not self.enabled:
+            logger.info("Classification disabled, returning empty results")
+            return []
+
+        if not commits:
+            return []
+
+        logger.info(f"Classifying {len(commits)} commits")
+
+        # Check if model needs retraining
+        if self.auto_retrain and self.model.retrain_needed(self.retrain_threshold_days):
+            logger.warning("Model may need retraining - consider updating with recent data")
+
+        # Process commits in batches for memory efficiency
+        results = []
+        for i in range(0, len(commits), self.batch_size):
+            batch = commits[i : i + self.batch_size]
+            batch_results = self._classify_batch(batch)
+            results.extend(batch_results)
+
+        logger.info(f"Classification completed for {len(results)} commits")
+        return results
+
+    def _classify_batch(self, commit_batch: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Classify a single batch of commits.
+
+        Args:
+            commit_batch: Batch of commit data dictionaries
+
+        Returns:
+            List of classification results for the batch
+        """
+        # Get model predictions
+        predictions = self.model.predict(commit_batch)
+
+        # Enhance results with additional analysis
+        enhanced_results = []
+        for _i, (commit, prediction) in enumerate(zip(commit_batch, predictions)):
+            # Add file analysis context
+            file_analysis = self.linguist_analyzer.analyze_commit_files(
+                commit.get("files_changed", [])
+            )
+
+            # Determine if prediction is reliable
+            confidence = prediction["confidence"]
+            is_reliable = confidence >= self.confidence_threshold
+
+            # Create enhanced result
+            result = {
+                "commit_hash": commit.get("hash", ""),
+                "commit_message": commit.get("message", ""),
+                "predicted_class": prediction["predicted_class"],
+                "confidence": confidence,
+                "is_reliable_prediction": is_reliable,
+                "class_probabilities": prediction["class_probabilities"],
+                "file_analysis": {
+                    "primary_language": file_analysis["primary_language"],
+                    "primary_activity": file_analysis["primary_activity"],
+                    "file_count": file_analysis["file_count"],
+                    "is_multilingual": file_analysis["is_multilingual"],
+                    "is_cross_functional": file_analysis["is_cross_functional"],
+                },
+                "classification_metadata": {
+                    "model_timestamp": self.model.training_timestamp,
+                    "feature_count": 68,
+                    "categories_available": len(self.classification_categories),
+                },
+            }
+
+            enhanced_results.append(result)
+
+        return enhanced_results
+
+    def classify_single_commit(self, commit: dict[str, Any]) -> dict[str, Any]:
+        """Classify a single commit.
+
+        Args:
+            commit: Commit data dictionary
+
+        Returns:
+            Classification result dictionary
+        """
+        results = self.classify_commits([commit])
+        return results[0] if results else {}
+
+    def get_feature_importance(self, top_n: int = 20) -> list[tuple[str, float]]:
+        """Get feature importance rankings from the trained model.
+
+        Args:
+            top_n: Number of top features to return
+
+        Returns:
+            List of (feature_name, importance_score) tuples
+        """
+        return self.model.get_feature_importance(top_n)
+
+    def analyze_commit_patterns(self, commits: list[dict[str, Any]]) -> dict[str, Any]:
+        """Analyze patterns in a collection of commits.
+
+        Args:
+            commits: List of commit data dictionaries
+
+        Returns:
+            Dictionary with pattern analysis results
+        """
+        if not commits:
+            return {}
+
+        # Classify all commits
+        classifications = self.classify_commits(commits)
+
+        # Aggregate pattern statistics
+        class_counts = {}
+        language_usage = {}
+        activity_patterns = {}
+        confidence_distribution = []
+
+        for result in classifications:
+            # Count classifications
+            predicted_class = result["predicted_class"]
+            class_counts[predicted_class] = class_counts.get(predicted_class, 0) + 1
+
+            # Track confidence scores
+            confidence_distribution.append(result["confidence"])
+
+            # Aggregate language usage
+            primary_lang = result["file_analysis"]["primary_language"]
+            if primary_lang:
+                if primary_lang not in language_usage:
+                    language_usage[primary_lang] = {}
+                if predicted_class not in language_usage[primary_lang]:
+                    language_usage[primary_lang][predicted_class] = 0
+                language_usage[primary_lang][predicted_class] += 1
+
+            # Aggregate activity patterns
+            primary_activity = result["file_analysis"]["primary_activity"]
+            if primary_activity:
+                if primary_activity not in activity_patterns:
+                    activity_patterns[primary_activity] = {}
+                if predicted_class not in activity_patterns[primary_activity]:
+                    activity_patterns[primary_activity][predicted_class] = 0
+                activity_patterns[primary_activity][predicted_class] += 1
+
+        # Calculate statistics
+        total_commits = len(classifications)
+        avg_confidence = (
+            sum(confidence_distribution) / len(confidence_distribution)
+            if confidence_distribution
+            else 0.0
+        )
+
+        return {
+            "total_commits_analyzed": total_commits,
+            "classification_distribution": class_counts,
+            "average_confidence": avg_confidence,
+            "high_confidence_ratio": sum(
+                1 for c in confidence_distribution if c >= self.confidence_threshold
+            )
+            / total_commits,
+            "language_usage_patterns": language_usage,
+            "activity_patterns": activity_patterns,
+            "most_common_class": (
+                max(class_counts.items(), key=lambda x: x[1])[0] if class_counts else None
+            ),
+            "classification_diversity": len(class_counts),
+            "supported_categories": list(self.classification_categories.keys()),
+        }
+
+    def _map_fallback_label(self, label: str) -> str:
+        """Map unknown labels to supported categories.
+
+        Args:
+            label: Original label
+
+        Returns:
+            Mapped label from supported categories
+        """
+        label_lower = label.lower()
+
+        # Common mappings
+        mappings = {
+            "feat": "feature",
+            "fix": "bugfix",
+            "bug_fix": "bugfix",  # From training pipeline
+            "doc": "docs",
+            "documentation": "docs",
+            "testing": "test",
+            "tests": "test",
+            "maintenance": "chore",  # From training pipeline
+            "cleanup": "chore",
+            "optimization": "refactor",
+            "optimize": "refactor",
+            "enhancement": "feature",
+            "improvement": "refactor",
+            "styling": "style",
+            "format": "style",
+        }
+
+        return mappings.get(label_lower, "chore")  # Default to chore
+
+    def get_model_status(self) -> dict[str, Any]:
+        """Get comprehensive status of the classification system.
+
+        Returns:
+            Dictionary with system status and capabilities
+        """
+        model_info = self.model.get_model_info()
+
+        return {
+            "enabled": self.enabled,
+            "model_trained": model_info["is_trained"],
+            "sklearn_available": model_info["sklearn_available"],
+            "training_timestamp": model_info["training_timestamp"],
+            "supported_categories": list(self.classification_categories.keys()),
+            "confidence_threshold": self.confidence_threshold,
+            "batch_size": self.batch_size,
+            "model_path": str(self.model_path),
+            "auto_retrain_enabled": self.auto_retrain,
+            "needs_retraining": self.model.retrain_needed(self.retrain_threshold_days),
+            "training_metrics": model_info.get("training_metrics", {}),
+            "cache_directory": str(self.cache_dir),
+        }
+
+    def export_training_data(self, commits: list[dict[str, Any]], output_path: Path) -> None:
+        """Export commits in a format suitable for manual labeling.
+
+        Args:
+            commits: List of commit data dictionaries
+            output_path: Path to save the training data CSV
+        """
+        import csv
+
+        with open(output_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.writer(f)
+
+            # Write header
+            writer.writerow(
+                [
+                    "hash",
+                    "message",
+                    "author",
+                    "timestamp",
+                    "files_changed",
+                    "insertions",
+                    "deletions",
+                    "primary_language",
+                    "primary_activity",
+                    "suggested_class",
+                    "manual_label",
+                ]
+            )
+
+            # Analyze commits for suggestions
+            for commit in commits:
+                file_analysis = self.linguist_analyzer.analyze_commit_files(
+                    commit.get("files_changed", [])
+                )
+
+                # Get a prediction for suggestion
+                if self.model.is_trained:
+                    prediction = self.classify_single_commit(commit)
+                    suggested_class = prediction.get("predicted_class", "unknown")
+                else:
+                    suggested_class = "unknown"
+
+                # Write row
+                writer.writerow(
+                    [
+                        commit.get("hash", ""),
+                        commit.get("message", ""),
+                        commit.get("author_name", ""),
+                        commit.get("timestamp", ""),
+                        len(commit.get("files_changed", [])),
+                        commit.get("insertions", 0),
+                        commit.get("deletions", 0),
+                        file_analysis["primary_language"] or "unknown",
+                        file_analysis["primary_activity"] or "unknown",
+                        suggested_class,
+                        "",  # Empty column for manual labeling
+                    ]
+                )
+
+        logger.info(f"Training data exported to {output_path}")
+
+    def load_training_data(self, csv_path: Path) -> list[tuple[dict[str, Any], str]]:
+        """Load manually labeled training data from CSV.
+
+        Args:
+            csv_path: Path to CSV file with labeled data
+
+        Returns:
+            List of (commit_data, label) tuples
+        """
+        import csv
+
+        training_data = []
+
+        with open(csv_path, encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+
+            for row in reader:
+                # Skip rows without manual labels
+                if not row.get("manual_label", "").strip():
+                    continue
+
+                # Parse timestamp
+                timestamp_str = row.get("timestamp", "")
+                try:
+                    timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
+                except (ValueError, AttributeError):
+                    timestamp = datetime.now()
+
+                # Create commit data structure
+                commit_data = {
+                    "hash": row.get("hash", ""),
+                    "message": row.get("message", ""),
+                    "author_name": row.get("author", ""),
+                    "timestamp": timestamp,
+                    "files_changed": [],  # Would need to be reconstructed from git
+                    "insertions": int(row.get("insertions", 0) or 0),
+                    "deletions": int(row.get("deletions", 0) or 0),
+                }
+
+                label = row["manual_label"].strip()
+                training_data.append((commit_data, label))
+
+        logger.info(f"Loaded {len(training_data)} labeled examples from {csv_path}")
+        return training_data
```