gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/training/model_loader.py
@@ -0,0 +1,377 @@
+ """Model loader for integrating trained classification models.
+
+ This module provides functionality to load and use trained classification models
+ within the existing GitFlow Analytics ML pipeline. It bridges the gap between
+ the training pipeline and the production classification system.
+
+ WHY: Trained models need to be seamlessly integrated into the existing ML
+ categorization workflow without breaking backward compatibility. This loader
+ provides a unified interface for both rule-based and trained model classification.
+
+ DESIGN DECISIONS:
+ - Backward compatibility: Falls back to rule-based classification if model unavailable
+ - Model versioning: Supports loading specific model versions
+ - Performance: Caches loaded models in memory for efficiency
+ - Integration: Works with existing MLTicketExtractor infrastructure
+ """
+
+ import logging
+ import pickle
+ import time
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Optional
+
+ from ..models.database import ClassificationModel, Database
+
+ logger = logging.getLogger(__name__)
+
+
+ class TrainingModelLoader:
+     """Load and manage trained classification models.
+
+     This class provides functionality to load trained models from the database
+     and storage, integrate them with the existing classification pipeline, and
+     manage model lifecycle (versioning, caching, fallback).
+     """
+
+     def __init__(self, cache_dir: Path) -> None:
+         """Initialize model loader.
+
+         Args:
+             cache_dir: Directory containing training database and models
+         """
+         self.cache_dir = cache_dir
+         self.db = Database(cache_dir / "training.db")
+         self.loaded_models: dict[str, Any] = {}  # Model cache
+         self.model_metadata: dict[str, dict[str, Any]] = {}  # Metadata cache
+
+         logger.info("TrainingModelLoader initialized")
+
+     def get_best_model(self) -> Optional[dict[str, Any]]:
+         """Get the best performing active model.
+
+         Returns:
+             Dictionary with model metadata or None if no models available
+         """
+         with self.db.get_session() as session:
+             best_model = (
+                 session.query(ClassificationModel)
+                 .filter_by(active=True)
+                 .order_by(ClassificationModel.validation_accuracy.desc())
+                 .first()
+             )
+
+             if best_model:
+                 return {
+                     "model_id": best_model.model_id,
+                     "version": best_model.version,
+                     "accuracy": best_model.validation_accuracy,
+                     "categories": best_model.categories,
+                     "model_path": best_model.model_file_path,
+                     "model_type": best_model.model_type,
+                     "created_at": best_model.created_at,
+                 }
+
+             return None
+
+     def load_model(self, model_id: Optional[str] = None) -> tuple[Any, dict[str, Any]]:
+         """Load a trained model by ID or get the best available model.
+
+         Args:
+             model_id: Specific model ID to load, or None for best model
+
+         Returns:
+             Tuple of (loaded_model, model_metadata)
+
+         Raises:
+             FileNotFoundError: If model file not found
+             ValueError: If model_id not found or invalid
+         """
+         # Check cache first
+         cache_key = model_id or "best"
+         if cache_key in self.loaded_models:
+             return self.loaded_models[cache_key], self.model_metadata[cache_key]
+
+         # Get model metadata
+         model_info = self._get_model_by_id(model_id) if model_id else self.get_best_model()
+
+         if not model_info:
+             raise ValueError(
+                 f"No model found with ID: {model_id}" if model_id else "No trained models available"
+             )
+
+         # Load model from file
+         model_path = Path(model_info["model_path"])
+         if not model_path.exists():
+             raise FileNotFoundError(f"Model file not found: {model_path}")
+
+         try:
+             with open(model_path, "rb") as f:
+                 model = pickle.load(f)
+
+             # Cache loaded model
+             self.loaded_models[cache_key] = model
+             self.model_metadata[cache_key] = model_info
+
+             # Update usage statistics
+             self._update_model_usage(model_info["model_id"])
+
+             logger.info(
+                 f"Loaded model {model_info['model_id']} v{model_info['version']} ({model_info['accuracy']:.3f} accuracy)"
+             )
+             return model, model_info
+
+         except Exception as e:
+             raise ValueError(f"Failed to load model from {model_path}: {e}") from e
+
+     def _get_model_by_id(self, model_id: str) -> Optional[dict[str, Any]]:
+         """Get model metadata by ID.
+
+         Args:
+             model_id: Model identifier
+
+         Returns:
+             Model metadata dictionary or None if not found
+         """
+         with self.db.get_session() as session:
+             model = (
+                 session.query(ClassificationModel).filter_by(model_id=model_id, active=True).first()
+             )
+
+             if model:
+                 return {
+                     "model_id": model.model_id,
+                     "version": model.version,
+                     "accuracy": model.validation_accuracy,
+                     "categories": model.categories,
+                     "model_path": model.model_file_path,
+                     "model_type": model.model_type,
+                     "created_at": model.created_at,
+                 }
+
+             return None
+
+     def _update_model_usage(self, model_id: str) -> None:
+         """Update model usage statistics.
+
+         Args:
+             model_id: Model identifier
+         """
+         try:
+             with self.db.get_session() as session:
+                 model = session.query(ClassificationModel).filter_by(model_id=model_id).first()
+                 if model:
+                     model.usage_count = (model.usage_count or 0) + 1
+                     model.last_used = datetime.utcnow()
+                     session.commit()
+         except Exception as e:
+             logger.warning(f"Failed to update model usage for {model_id}: {e}")
+
+     def list_available_models(self) -> list[dict[str, Any]]:
+         """List all available trained models.
+
+         Returns:
+             List of model metadata dictionaries
+         """
+         models = []
+
+         with self.db.get_session() as session:
+             db_models = (
+                 session.query(ClassificationModel)
+                 .filter_by(active=True)
+                 .order_by(ClassificationModel.validation_accuracy.desc())
+                 .all()
+             )
+
+             for model in db_models:
+                 models.append(
+                     {
+                         "model_id": model.model_id,
+                         "version": model.version,
+                         "accuracy": model.validation_accuracy,
+                         "categories": model.categories,
+                         "model_type": model.model_type,
+                         "created_at": model.created_at,
+                         "usage_count": model.usage_count or 0,
+                         "model_size_mb": self._get_model_file_size(model.model_file_path),
+                     }
+                 )
+
+         return models
+
+     def _get_model_file_size(self, model_path: str) -> float:
+         """Get model file size in MB.
+
+         Args:
+             model_path: Path to model file
+
+         Returns:
+             File size in MB
+         """
+         try:
+             path = Path(model_path)
+             if path.exists():
+                 return path.stat().st_size / (1024 * 1024)
+         except Exception:
+             pass
+         return 0.0
+
+     def predict_commit_category(
+         self,
+         message: str,
+         files_changed: Optional[list[str]] = None,
+         model_id: Optional[str] = None,
+     ) -> dict[str, Any]:
+         """Predict commit category using a trained model.
+
+         This method provides a unified interface for commit classification
+         that can be integrated into the existing ML pipeline.
+
+         Args:
+             message: Commit message
+             files_changed: List of changed files (optional)
+             model_id: Specific model to use (optional, uses best model if None)
+
+         Returns:
+             Dictionary with prediction results:
+             {
+                 'category': str,
+                 'confidence': float,
+                 'method': 'trained_model',
+                 'model_info': dict,
+                 'alternatives': list,
+                 'processing_time_ms': float
+             }
+         """
+         start_time = time.time()
+
+         try:
+             # Load model
+             model, model_info = self.load_model(model_id)
+
+             # Prepare features (simplified - in production would use same vectorizer as training)
+             # This is a basic implementation - real implementation would need the training vectorizer
+             prediction_scores = model.predict_proba([message])
+             prediction = model.predict([message])[0]
+
+             # Get confidence from prediction probabilities
+             max_confidence = float(prediction_scores[0].max())
+
+             # Map model prediction to standard categories
+             mapped_category = self._map_model_category(prediction)
+
+             processing_time = (time.time() - start_time) * 1000
+
+             result = {
+                 "category": mapped_category,
+                 "confidence": max_confidence,
+                 "method": "trained_model",
+                 "model_info": {
+                     "model_id": model_info["model_id"],
+                     "version": model_info["version"],
+                     "accuracy": model_info["accuracy"],
+                 },
+                 "alternatives": self._get_alternative_predictions(
+                     prediction_scores[0], model.classes_
+                 ),
+                 "processing_time_ms": processing_time,
+             }
+
+             return result
+
+         except Exception as e:
+             logger.warning(f"Trained model prediction failed: {e}")
+             # Return error indicator
+             return {
+                 "category": "other",
+                 "confidence": 0.0,
+                 "method": "failed",
+                 "error": str(e),
+                 "processing_time_ms": (time.time() - start_time) * 1000,
+             }
+
+     def _map_model_category(self, prediction: str) -> str:
+         """Map model prediction to standard category names.
+
+         Args:
+             prediction: Raw model prediction
+
+         Returns:
+             Standardized category name
+         """
+         # This mapping should match the training category mapping
+         mapping = {
+             "bug_fix": "bug_fix",
+             "feature": "feature",
+             "refactor": "refactor",
+             "documentation": "documentation",
+             "test": "test",
+             "maintenance": "maintenance",
+             "style": "style",
+             "build": "build",
+         }
+
+         return mapping.get(prediction, "other")
+
+     def _get_alternative_predictions(
+         self, prediction_scores: Any, classes: list[str]
+     ) -> list[dict[str, Any]]:
+         """Get alternative predictions with confidence scores.
+
+         Args:
+             prediction_scores: Model prediction probability scores
+             classes: Model class names
+
+         Returns:
+             List of alternative predictions sorted by confidence
+         """
+         alternatives = []
+
+         # Get top 3 alternatives (excluding the primary prediction)
+         score_indices = prediction_scores.argsort()[::-1]  # Sort descending
+
+         for i, idx in enumerate(score_indices[1:4]):  # Skip first (primary), take next 3
+             alternatives.append(
+                 {
+                     "category": self._map_model_category(classes[idx]),
+                     "confidence": float(prediction_scores[idx]),
+                     "rank": i + 2,
+                 }
+             )
+
+         return alternatives
+
+     def get_model_statistics(self) -> dict[str, Any]:
+         """Get comprehensive model loading and usage statistics.
+
+         Returns:
+             Dictionary with model statistics
+         """
+         stats = {
+             "loaded_models_count": len(self.loaded_models),
+             "available_models_count": 0,
+             "total_usage_count": 0,
+             "best_model_accuracy": 0.0,
+             "model_types": {},
+             "memory_usage_mb": 0.0,
+         }
+
+         # Get database statistics
+         with self.db.get_session() as session:
+             models = session.query(ClassificationModel).filter_by(active=True).all()
+             stats["available_models_count"] = len(models)
+
+             if models:
+                 stats["total_usage_count"] = sum(m.usage_count or 0 for m in models)
+                 stats["best_model_accuracy"] = max(m.validation_accuracy or 0 for m in models)
+
+                 # Count model types
+                 for model in models:
+                     model_type = model.model_type
+                     stats["model_types"][model_type] = stats["model_types"].get(model_type, 0) + 1
+
+         # Estimate memory usage (rough approximation)
+         stats["memory_usage_mb"] = len(self.loaded_models) * 5.0  # Rough estimate
+
+         return stats
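
For reference, a minimal usage sketch of the TrainingModelLoader added above. It assumes a trained scikit-learn text-classification pipeline (one that embeds its own vectorizer, so predict/predict_proba accept raw strings) has already been registered in training.db by the training pipeline; the cache path below is hypothetical:

    from pathlib import Path

    from gitflow_analytics.training.model_loader import TrainingModelLoader

    # Hypothetical cache directory; in practice, the same directory the
    # training pipeline writes training.db and model files into.
    loader = TrainingModelLoader(cache_dir=Path("~/.gitflow-analytics/cache").expanduser())

    # Classify one commit message using the best available active model.
    result = loader.predict_commit_category("fix: handle missing cache directory on startup")
    print(result["category"], result["confidence"], result["method"])

    # Enumerate registered models, ordered by validation accuracy.
    for info in loader.list_available_models():
        print(info["model_id"], info["version"], info["accuracy"])

Because predict_commit_category catches all exceptions and returns method "failed" with category "other", a caller can treat a missing or unloadable model as the signal to fall back to rule-based classification, matching the backward-compatibility goal stated in the module docstring.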