gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/extractors/ml_tickets.py (new file)
@@ -0,0 +1,1100 @@
+ """ML-enhanced ticket reference extraction with sophisticated commit categorization.
+
+ This module extends the basic TicketExtractor with machine learning capabilities for
+ better commit categorization. It integrates with the existing qualitative analysis
+ infrastructure to provide hybrid rule-based + ML classification.
+
+ WHY: Traditional regex-based categorization has limitations in understanding context
+ and nuanced commit messages. This ML-enhanced version provides better accuracy while
+ maintaining backward compatibility and performance through intelligent caching.
+
+ DESIGN DECISIONS:
+ - Hybrid approach: Falls back to rule-based when ML confidence is low
+ - Confidence scoring: All classifications include confidence scores for reliability
+ - Caching strategy: ML predictions are cached to maintain performance
+ - Feature extraction: Uses both message content and file patterns for better accuracy
+ - Integration: Leverages existing ChangeTypeClassifier from qualitative analysis
+
+ PERFORMANCE: Designed to handle large repositories efficiently with:
+ - Batch processing for ML predictions
+ - Intelligent caching of ML results
+ - Fallback to fast rule-based classification when appropriate
+ """
+
+ import logging
+ import sqlite3
+ import time
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any, Optional
+
+ from ..qualitative.classifiers.change_type import ChangeTypeClassifier
+ from ..qualitative.classifiers.llm_commit_classifier import LLMCommitClassifier, LLMConfig
+ from ..qualitative.models.schemas import ChangeTypeConfig
+ from .tickets import TicketExtractor, filter_git_artifacts
+
+ # Import training model loader with fallback
+ try:
+     from ..training.model_loader import TrainingModelLoader
+
+     TRAINING_LOADER_AVAILABLE = True
+ except ImportError:
+     TRAINING_LOADER_AVAILABLE = False
+
+ try:
+     import spacy
+     from spacy.tokens import Doc
+
+     SPACY_AVAILABLE = True
+ except ImportError:
+     SPACY_AVAILABLE = False
+     Doc = Any
+
+ logger = logging.getLogger(__name__)
+
+
+ class MLTicketExtractor(TicketExtractor):
+     """ML-enhanced ticket extractor with sophisticated commit categorization.
+
+     This extractor extends the basic TicketExtractor with machine learning capabilities
+     while maintaining full backward compatibility. It uses a hybrid approach combining
+     rule-based patterns with ML-based semantic analysis for improved accuracy.
+
+     Key features:
+     - Hybrid categorization (ML + rule-based fallback)
+     - Confidence scoring for all predictions
+     - Intelligent caching for performance
+     - Feature extraction from commit message and file patterns
+     - Integration with existing qualitative analysis infrastructure
+     """
+
+     def __init__(
+         self,
+         allowed_platforms: Optional[list[str]] = None,
+         untracked_file_threshold: int = 1,
+         ml_config: Optional[dict[str, Any]] = None,
+         llm_config: Optional[dict[str, Any]] = None,
+         cache_dir: Optional[Path] = None,
+         enable_ml: bool = True,
+         enable_llm: bool = False,
+     ) -> None:
+         """Initialize ML-enhanced ticket extractor.
+
+         Args:
+             allowed_platforms: List of platforms to extract tickets from
+             untracked_file_threshold: Minimum files changed for significant commits
+             ml_config: Configuration for ML categorization (optional)
+             llm_config: Configuration for LLM classification (optional)
+             cache_dir: Directory for caching ML predictions
+             enable_ml: Whether to enable ML features (fallback to rule-based if False)
+             enable_llm: Whether to enable LLM classification (fallback to ML/rules if False)
+         """
+         # Initialize parent class
+         super().__init__(allowed_platforms, untracked_file_threshold)
+
+         self.enable_ml = enable_ml and SPACY_AVAILABLE
+         self.enable_llm = enable_llm
+         self.cache_dir = cache_dir or Path(".gitflow-cache")
+         self.cache_dir.mkdir(exist_ok=True)
+
+         # ML configuration with sensible defaults
+         default_ml_config = {
+             "min_confidence": 0.6,
+             "semantic_weight": 0.7,
+             "file_pattern_weight": 0.3,
+             "hybrid_threshold": 0.5,  # Confidence threshold for using ML vs rule-based
+             "cache_duration_days": 30,
+             "batch_size": 100,
+             "enable_caching": True,
+         }
+
+         self.ml_config = {**default_ml_config, **(ml_config or {})}
+
+         # LLM configuration with sensible defaults
+         default_llm_config = {
+             "api_key": None,
+             "model": "mistralai/mistral-7b-instruct",
+             "confidence_threshold": 0.7,
+             "max_tokens": 50,
+             "temperature": 0.1,
+             "timeout_seconds": 30.0,
+             "cache_duration_days": 90,
+             "enable_caching": True,
+             "max_daily_requests": 1000,
+             "domain_terms": {},
+         }
+
+         self.llm_config_dict = {**default_llm_config, **(llm_config or {})}
+
+         # Initialize ML components
+         self.change_type_classifier = None
+         self.nlp_model = None
+         self.ml_cache = None
+         self.trained_model_loader = None
+         self.llm_classifier = None
+
+         if self.enable_ml:
+             self._initialize_ml_components()
+
+         # Initialize LLM classifier if enabled
+         if self.enable_llm:
+             self._initialize_llm_classifier()
+
+         # Initialize trained model loader if available
+         if TRAINING_LOADER_AVAILABLE and self.enable_ml:
+             try:
+                 self.trained_model_loader = TrainingModelLoader(self.cache_dir)
+                 logger.info("Trained model loader initialized")
+             except Exception as e:
+                 logger.warning(f"Failed to initialize trained model loader: {e}")
+                 self.trained_model_loader = None
+
+         logger.info(
+             f"MLTicketExtractor initialized with ML {'enabled' if self.enable_ml else 'disabled'}, LLM {'enabled' if self.enable_llm else 'disabled'}"
+         )
+
+     def _initialize_ml_components(self) -> None:
+         """Initialize ML components (ChangeTypeClassifier and spaCy model).
+
+         WHY: Separate initialization allows for graceful degradation if ML components
+         fail to load. The extractor will fall back to rule-based classification.
+         """
+         try:
+             # Initialize ChangeTypeClassifier
+             change_type_config = ChangeTypeConfig(
+                 min_confidence=self.ml_config["min_confidence"],
+                 semantic_weight=self.ml_config["semantic_weight"],
+                 file_pattern_weight=self.ml_config["file_pattern_weight"],
+             )
+             self.change_type_classifier = ChangeTypeClassifier(change_type_config)
+
+             # Initialize spaCy model (try English first, then basic)
+             try:
+                 self.nlp_model = spacy.load("en_core_web_sm")
+                 logger.info("spaCy model 'en_core_web_sm' loaded successfully")
+             except OSError:
+                 logger.warning(
+                     "spaCy model 'en_core_web_sm' not found. Trying alternative model..."
+                 )
+                 try:
+                     self.nlp_model = spacy.load("en_core_web_md")
+                     logger.info("spaCy model 'en_core_web_md' loaded successfully")
+                 except OSError:
+                     logger.warning(
+                         "No spaCy models found. ML categorization will gracefully fall back to rule-based classification. "
+                         "To enable ML features, install a spaCy model: python -m spacy download en_core_web_sm"
+                     )
+                     self.nlp_model = None
+
+             # Initialize ML cache
+             if self.ml_config["enable_caching"]:
+                 self._initialize_ml_cache()
+
+             logger.info("ML components initialized successfully")
+
+         except Exception as e:
+             logger.warning(f"Failed to initialize ML components: {e}")
+             logger.info("Analysis will continue with rule-based classification only")
+             self.enable_ml = False
+
+     def _initialize_llm_classifier(self) -> None:
+         """Initialize LLM classifier for commit categorization.
+
+         WHY: LLM-based classification can provide more nuanced understanding
+         of commit messages compared to rule-based or traditional ML approaches.
+         This method handles graceful degradation if LLM setup fails.
+         """
+         try:
+             # Create LLM configuration object
+             llm_config = LLMConfig(
+                 api_key=self.llm_config_dict.get("api_key"),
+                 model=self.llm_config_dict.get("model", "mistralai/mistral-7b-instruct"),
+                 confidence_threshold=self.llm_config_dict.get("confidence_threshold", 0.7),
+                 max_tokens=self.llm_config_dict.get("max_tokens", 50),
+                 temperature=self.llm_config_dict.get("temperature", 0.1),
+                 timeout_seconds=self.llm_config_dict.get("timeout_seconds", 30.0),
+                 cache_duration_days=self.llm_config_dict.get("cache_duration_days", 90),
+                 enable_caching=self.llm_config_dict.get("enable_caching", True),
+                 max_daily_requests=self.llm_config_dict.get("max_daily_requests", 1000),
+                 domain_terms=self.llm_config_dict.get("domain_terms", {}),
+             )
+
+             # Initialize LLM classifier
+             self.llm_classifier = LLMCommitClassifier(llm_config, self.cache_dir)
+             logger.info(f"LLM classifier initialized with model: {llm_config.model}")
+
+         except Exception as e:
+             logger.warning(f"Failed to initialize LLM classifier: {e}")
+             logger.info("Analysis will continue without LLM classification")
+             self.enable_llm = False
+             self.llm_classifier = None
+
+     def _initialize_ml_cache(self) -> None:
+         """Initialize SQLite cache for ML predictions.
+
+         WHY: ML predictions can be expensive, so we cache results to improve performance
+         on subsequent runs. The cache includes expiration and invalidation logic.
+         """
+         try:
+             cache_path = self.cache_dir / "ml_predictions.db"
+             self.ml_cache = MLPredictionCache(cache_path, self.ml_config["cache_duration_days"])
+             logger.debug("ML prediction cache initialized")
+         except Exception as e:
+             logger.warning(f"Failed to initialize ML cache: {e}")
+             self.ml_cache = None
+
+     def categorize_commit(self, message: str, files_changed: Optional[list[str]] = None) -> str:
+         """Categorize a commit using LLM -> ML -> rule-based fallback approach.
+
+         This method extends the parent's categorize_commit with LLM and ML capabilities
+         while maintaining backward compatibility. It returns the same category strings as
+         the parent class.
+
+         Classification priority:
+         1. LLM-based classification (if enabled and confident)
+         2. ML-based classification (if enabled and confident)
+         3. Rule-based classification (always available)
+
+         Args:
+             message: The commit message to categorize
+             files_changed: Optional list of changed files for additional context
+
+         Returns:
+             String category (bug_fix, feature, refactor, documentation,
+             maintenance, test, style, build, or other)
+         """
+         if not message:
+             return "other"
+
+         # Filter git artifacts for cleaner classification
+         cleaned_message = filter_git_artifacts(message)
+         if not cleaned_message:
+             return "other"
+
+         # Try LLM classification first if enabled
+         if self.enable_llm and self.llm_classifier:
+             llm_result = self._llm_categorize_commit(cleaned_message, files_changed or [])
+             if llm_result and llm_result["confidence"] >= self.llm_config_dict.get(
+                 "confidence_threshold", 0.7
+             ):
+                 # Map LLM categories to parent class categories
+                 mapped_category = self._map_llm_to_parent_category(llm_result["category"])
+                 return mapped_category
+
+         # Fall back to ML categorization if enabled
+         if self.enable_ml:
+             ml_result = self._ml_categorize_commit(cleaned_message, files_changed or [])
+             if ml_result and ml_result["confidence"] >= self.ml_config["hybrid_threshold"]:
+                 # Map ML categories to parent class categories
+                 mapped_category = self._map_ml_to_parent_category(ml_result["category"])
+                 return mapped_category
+
+         # Final fallback to parent's rule-based categorization
+         return super().categorize_commit(cleaned_message)
+
+     def categorize_commit_with_confidence(
+         self, message: str, files_changed: Optional[list[str]] = None
+     ) -> dict[str, Any]:
+         """Categorize commit with detailed confidence information.
+
+         This is the main entry point for getting detailed categorization results
+         including confidence scores, alternative predictions, and processing metadata.
+
+         Args:
+             message: The commit message to categorize
+             files_changed: Optional list of changed files for additional context
+
+         Returns:
+             Dictionary with categorization results:
+             {
+                 'category': str,
+                 'confidence': float,
+                 'method': str ('ml', 'rules', 'cached'),
+                 'alternatives': List[Dict],
+                 'features': Dict,
+                 'processing_time_ms': float
+             }
+         """
+         start_time = time.time()
+
+         if not message:
+             return {
+                 "category": "other",
+                 "confidence": 1.0,
+                 "method": "default",
+                 "alternatives": [],
+                 "features": {},
+                 "processing_time_ms": 0.0,
+             }
+
+         # Filter git artifacts for cleaner classification
+         cleaned_message = filter_git_artifacts(message)
+         if not cleaned_message:
+             return {
+                 "category": "other",
+                 "confidence": 0.3,
+                 "method": "filtered_empty",
+                 "alternatives": [],
+                 "features": {},
+                 "processing_time_ms": (time.time() - start_time) * 1000,
+             }
+
+         files_changed = files_changed or []
+
+         # Check cache first
+         if self.ml_cache and self.ml_config["enable_caching"]:
+             cached_result = self.ml_cache.get_prediction(cleaned_message, files_changed)
+             if cached_result:
+                 cached_result["processing_time_ms"] = (time.time() - start_time) * 1000
+                 return cached_result
+
+         # Try LLM categorization first if enabled
+         if self.enable_llm and self.llm_classifier:
+             llm_result = self._llm_categorize_commit_detailed(cleaned_message, files_changed)
+             if llm_result and llm_result["confidence"] >= self.llm_config_dict.get(
+                 "confidence_threshold", 0.7
+             ):
+                 # Map to parent categories and cache result
+                 llm_result["category"] = self._map_llm_to_parent_category(llm_result["category"])
+                 llm_result["processing_time_ms"] = (time.time() - start_time) * 1000
+
+                 if self.ml_cache and self.ml_config["enable_caching"]:
+                     self.ml_cache.store_prediction(cleaned_message, files_changed, llm_result)
+
+                 return llm_result
+
+         # Fall back to ML categorization
+         if self.enable_ml:
+             ml_result = self._ml_categorize_commit_detailed(cleaned_message, files_changed)
+             if ml_result and ml_result["confidence"] >= self.ml_config["hybrid_threshold"]:
+                 # Map to parent categories and cache result
+                 ml_result["category"] = self._map_ml_to_parent_category(ml_result["category"])
+                 ml_result["processing_time_ms"] = (time.time() - start_time) * 1000
+
+                 if self.ml_cache and self.ml_config["enable_caching"]:
+                     self.ml_cache.store_prediction(cleaned_message, files_changed, ml_result)
+
+                 return ml_result
+
+         # Fall back to rule-based categorization
+         rule_category = super().categorize_commit(cleaned_message)
+         rule_result = {
+             "category": rule_category,
+             "confidence": 0.8 if rule_category != "other" else 0.3,
+             "method": "rules",
+             "alternatives": [],
+             "features": {"rule_based": True},
+             "processing_time_ms": (time.time() - start_time) * 1000,
+         }
+
+         if self.ml_cache and self.ml_config["enable_caching"]:
+             self.ml_cache.store_prediction(message, files_changed, rule_result)
+
+         return rule_result
+
+     def _ml_categorize_commit(
+         self, message: str, files_changed: list[str]
+     ) -> Optional[dict[str, Any]]:
+         """Internal ML categorization method (simplified version).
+
+         Args:
+             message: Commit message
+             files_changed: List of changed files
+
+         Returns:
+             Dictionary with category and confidence, or None if ML unavailable
+         """
+         if not self.change_type_classifier or not message:
+             return None
+
+         try:
+             # Process message with spaCy if available
+             doc = None
+             if self.nlp_model:
+                 doc = self.nlp_model(message)
+
+             # Get ML classification
+             ml_category, confidence = self.change_type_classifier.classify(
+                 message, doc, files_changed
+             )
+
+             if ml_category and ml_category != "unknown":
+                 return {"category": ml_category, "confidence": confidence}
+
+         except Exception as e:
+             logger.warning(f"ML categorization failed: {e}")
+
+         return None
+
+     def _ml_categorize_commit_detailed(
+         self, message: str, files_changed: list[str]
+     ) -> Optional[dict[str, Any]]:
+         """Detailed ML categorization with comprehensive metadata.
+
+         Tries trained models first, then falls back to built-in ML classification.
+
+         Args:
+             message: Commit message
+             files_changed: List of changed files
+
+         Returns:
+             Detailed categorization result dictionary or None if ML unavailable
+         """
+         if not message:
+             return None
+
+         # Try trained model first if available
+         if self.trained_model_loader:
+             try:
+                 trained_result = self.trained_model_loader.predict_commit_category(
+                     message, files_changed
+                 )
+                 if (
+                     trained_result["method"] != "failed"
+                     and trained_result["confidence"] >= self.ml_config["hybrid_threshold"]
+                 ):
+                     return trained_result
+             except Exception as e:
+                 logger.debug(f"Trained model prediction failed, falling back to built-in ML: {e}")
+
+         # Fall back to built-in ML classification
+         if not self.change_type_classifier:
+             return None
+
+         try:
+             # Process message with spaCy
+             doc = None
+             features = {}
+             if self.nlp_model:
+                 doc = self.nlp_model(message)
+                 features = self._extract_features(message, doc, files_changed)
+
+             # Get ML classification
+             ml_category, confidence = self.change_type_classifier.classify(
+                 message, doc, files_changed
+             )
+
+             if ml_category and ml_category != "unknown":
+                 return {
+                     "category": ml_category,
+                     "confidence": confidence,
+                     "method": "builtin_ml",
+                     "alternatives": self._get_alternative_predictions(message, doc, files_changed),
+                     "features": features,
+                 }
+
+         except Exception as e:
+             logger.warning(f"Built-in ML categorization failed: {e}")
+
+         return None
+
+     def _extract_features(
+         self, message: str, doc: Optional[Doc], files_changed: list[str]
+     ) -> dict[str, Any]:
+         """Extract features used for ML classification.
+
+         Args:
+             message: Commit message
+             doc: spaCy processed document
+             files_changed: List of changed files
+
+         Returns:
+             Dictionary of extracted features
+         """
+         features = {
+             "message_length": len(message),
+             "word_count": len(message.split()),
+             "files_count": len(files_changed),
+             "file_extensions": list(
+                 set(Path(f).suffix.lower() for f in files_changed if Path(f).suffix)
+             ),
+         }
+
+         if doc:
+             features.update(
+                 {
+                     "has_verbs": any(token.pos_ == "VERB" for token in doc),
+                     "has_entities": len(doc.ents) > 0,
+                     "sentiment_polarity": 0.0,  # Placeholder - could add sentiment analysis
+                 }
+             )
+
+         return features
+
+     def _get_alternative_predictions(
+         self, message: str, doc: Optional[Doc], files_changed: list[str]
+     ) -> list[dict[str, Any]]:
+         """Get alternative predictions with lower confidence scores.
+
+         This is a simplified version - in a full implementation, you would
+         get all classification scores and return top N alternatives.
+
+         Args:
+             message: Commit message
+             doc: spaCy processed document
+             files_changed: List of changed files
+
+         Returns:
+             List of alternative predictions
+         """
+         # Placeholder implementation - could be enhanced to return actual alternatives
+         alternatives = []
+
+         # Add rule-based prediction as alternative
+         rule_category = super().categorize_commit(message)
+         if rule_category != "other":
+             alternatives.append({"category": rule_category, "confidence": 0.6, "method": "rules"})
+
+         return alternatives[:3]  # Top 3 alternatives
+
+     def _llm_categorize_commit(
+         self, message: str, files_changed: list[str]
+     ) -> Optional[dict[str, Any]]:
+         """Internal LLM categorization method (simplified version).
+
+         Args:
+             message: Cleaned commit message (git artifacts already filtered)
+             files_changed: List of changed files
+
+         Returns:
+             Dictionary with category and confidence, or None if LLM unavailable
+         """
+         if not self.llm_classifier or not message:
+             return None
+
+         try:
+             # Get LLM classification
+             llm_result = self.llm_classifier.classify_commit(message, files_changed)
+
+             if (
+                 llm_result
+                 and llm_result.get("category")
+                 and llm_result["category"] != "maintenance"
+             ):
+                 return {"category": llm_result["category"], "confidence": llm_result["confidence"]}
+             elif (
+                 llm_result
+                 and llm_result.get("category") == "maintenance"
+                 and llm_result["confidence"] >= 0.8
+             ):
+                 # Accept maintenance category only if high confidence
+                 return {"category": llm_result["category"], "confidence": llm_result["confidence"]}
+
+         except Exception as e:
+             logger.warning(f"LLM categorization failed: {e}")
+
+         return None
+
+     def _llm_categorize_commit_detailed(
+         self, message: str, files_changed: list[str]
+     ) -> Optional[dict[str, Any]]:
+         """Detailed LLM categorization with comprehensive metadata.
+
+         Args:
+             message: Cleaned commit message (git artifacts already filtered)
+             files_changed: List of changed files
+
+         Returns:
+             Detailed categorization result dictionary or None if LLM unavailable
+         """
+         if not self.llm_classifier or not message:
+             return None
+
+         try:
+             # Get detailed LLM classification
+             llm_result = self.llm_classifier.classify_commit(message, files_changed)
+
+             if llm_result and llm_result.get("category"):
+                 return {
+                     "category": llm_result["category"],
+                     "confidence": llm_result["confidence"],
+                     "method": "llm",
+                     "reasoning": llm_result.get("reasoning", "LLM-based classification"),
+                     "model": llm_result.get("model", "unknown"),
+                     "alternatives": llm_result.get("alternatives", []),
+                     "features": {"llm_classification": True},
+                 }
+
+         except Exception as e:
+             logger.warning(f"Detailed LLM categorization failed: {e}")
+
+         return None
+
+     def _map_llm_to_parent_category(self, llm_category: str) -> str:
+         """Map LLM categories to parent class categories.
+
+         WHY: The LLM classifier uses a streamlined 7-category system while the parent
+         TicketExtractor uses different category names. This mapping ensures
+         backward compatibility with existing reports and analysis.
+
+         Args:
+             llm_category: Category from LLM classifier
+
+         Returns:
+             Category compatible with parent class
+         """
+         # Map from LLM's 7 streamlined categories to parent categories
+         mapping = {
+             "feature": "feature",  # New functionality -> feature
+             "bugfix": "bug_fix",  # Bug fixes -> bug_fix (parent uses underscore)
+             "maintenance": "maintenance",  # Maintenance -> maintenance
+             "integration": "build",  # Integration -> build (closest parent category)
+             "content": "documentation",  # Content -> documentation
+             "media": "other",  # Media -> other (no direct parent equivalent)
+             "localization": "other",  # Localization -> other (no direct parent equivalent)
+         }
+
+         return mapping.get(llm_category, "other")
+
+     def _map_ml_to_parent_category(self, ml_category: str) -> str:
+         """Map ML categories to parent class categories.
+
+         WHY: The ChangeTypeClassifier uses different category names than the parent
+         TicketExtractor. This mapping ensures backward compatibility.
+
+         Args:
+             ml_category: Category from ML classifier
+
+         Returns:
+             Category compatible with parent class
+         """
+         mapping = {
+             "feature": "feature",
+             "bugfix": "bug_fix",
+             "refactor": "refactor",
+             "docs": "documentation",
+             "test": "test",
+             "chore": "maintenance",
+             "security": "bug_fix",  # Security fixes are a type of bug fix
+             "hotfix": "bug_fix",  # Hotfixes are urgent bug fixes
+             "config": "maintenance",  # Configuration changes are maintenance
+         }
+
+         return mapping.get(ml_category, "other")
+
+     def analyze_ticket_coverage(
+         self, commits: list[dict[str, Any]], prs: list[dict[str, Any]]
+     ) -> dict[str, Any]:
+         """Enhanced ticket coverage analysis with ML categorization insights.
+
+         This method extends the parent's analysis with ML-specific insights including
+         confidence distributions, method breakdowns, and prediction quality metrics.
+
+         Args:
+             commits: List of commit data
+             prs: List of PR data
+
+         Returns:
+             Enhanced analysis results with ML insights
+         """
+         # Get base analysis from parent
+         base_analysis = super().analyze_ticket_coverage(commits, prs)
+
+         if not self.enable_ml:
+             # Add indicator that ML was not used
+             base_analysis["ml_analysis"] = {
+                 "enabled": False,
+                 "reason": "ML components not available or disabled",
+             }
+             return base_analysis
+
+         # Enhance with ML-specific analysis
+         ml_analysis = self._analyze_ml_categorization_quality(commits)
+         base_analysis["ml_analysis"] = ml_analysis
+
+         # Enhance untracked commits with confidence scores
+         if "untracked_commits" in base_analysis:
+             self._enhance_untracked_commits(base_analysis["untracked_commits"])
+
+         return base_analysis
+
+     def _analyze_ml_categorization_quality(self, commits: list[dict[str, Any]]) -> dict[str, Any]:
+         """Analyze the quality and distribution of ML categorizations.
+
+         Args:
+             commits: List of commit data
+
+         Returns:
+             ML analysis results including confidence distributions and method usage
+         """
+         ml_stats = {
+             "enabled": True,
+             "total_ml_predictions": 0,
+             "total_rule_predictions": 0,
+             "total_cached_predictions": 0,
+             "avg_confidence": 0.0,
+             "confidence_distribution": {"high": 0, "medium": 0, "low": 0},
+             "method_breakdown": defaultdict(int),
+             "category_confidence": defaultdict(list),
+             "processing_time_stats": {"total_ms": 0.0, "avg_ms": 0.0},
+         }
+
+         total_confidence = 0.0
+         total_processing_time = 0.0
+         processed_commits = 0
+
+         for commit in commits:
+             # Get files_changed count efficiently with proper type handling
+             files_count = commit.get("files_changed_count")
+             if files_count is None:
+                 files_changed = commit.get("files_changed", 0)
+                 if isinstance(files_changed, int):
+                     files_count = files_changed
+                 elif isinstance(files_changed, list):
+                     files_count = len(files_changed)
+                 else:
+                     logger.warning(
+                         f"Unexpected files_changed type: {type(files_changed)}, defaulting to 0"
+                     )
+                     files_count = 0
+
+             if commit.get("is_merge") or files_count < self.untracked_file_threshold:
+                 continue
+
+             # Get detailed categorization for analysis
+             message = commit.get("message", "")
+             # Normalize files_changed to ensure it's always a list
+             files_changed_raw = commit.get("files_changed", [])
+             if isinstance(files_changed_raw, int):
+                 # If files_changed is an integer count, we can't provide file names
+                 files_changed = []
+             elif isinstance(files_changed_raw, list):
+                 files_changed = files_changed_raw
+             else:
+                 files_changed = []
+
+             result = self.categorize_commit_with_confidence(message, files_changed)
+
+             # Update statistics
+             confidence = result["confidence"]
+             method = result["method"]
+             category = result["category"]
+             processing_time = result.get("processing_time_ms", 0.0)
+
+             total_confidence += confidence
+             total_processing_time += processing_time
+             processed_commits += 1
+
+             # Method breakdown
+             ml_stats["method_breakdown"][method] += 1
+             if method == "ml":
+                 ml_stats["total_ml_predictions"] += 1
+             elif method == "rules":
+                 ml_stats["total_rule_predictions"] += 1
+             elif method == "cached":
+                 ml_stats["total_cached_predictions"] += 1
+
+             # Confidence distribution
+             if confidence >= 0.8:
+                 ml_stats["confidence_distribution"]["high"] += 1
+             elif confidence >= 0.6:
+                 ml_stats["confidence_distribution"]["medium"] += 1
+             else:
+                 ml_stats["confidence_distribution"]["low"] += 1
+
+             # Category confidence tracking
+             ml_stats["category_confidence"][category].append(confidence)
+
+         # Calculate averages
+         if processed_commits > 0:
+             ml_stats["avg_confidence"] = total_confidence / processed_commits
+             ml_stats["processing_time_stats"] = {
+                 "total_ms": total_processing_time,
+                 "avg_ms": total_processing_time / processed_commits,
+             }
+
+         # Convert defaultdicts to regular dicts for JSON serialization
+         ml_stats["method_breakdown"] = dict(ml_stats["method_breakdown"])
+         ml_stats["category_confidence"] = {
+             cat: {"avg": sum(confidences) / len(confidences), "count": len(confidences)}
+             for cat, confidences in ml_stats["category_confidence"].items()
+         }
+
+         return ml_stats
+
+     def _enhance_untracked_commits(self, untracked_commits: list[dict[str, Any]]) -> None:
+         """Enhance untracked commits with ML confidence scores and metadata.
+
+         Args:
+             untracked_commits: List of untracked commit data to enhance in-place
+         """
+         for commit in untracked_commits:
+             message = commit.get("full_message", commit.get("message", ""))
+             files_changed = []  # Would need to extract from commit data
+
+             # Get detailed categorization
+             result = self.categorize_commit_with_confidence(message, files_changed)
+
+             # Add ML-specific fields
+             commit["ml_confidence"] = result["confidence"]
+             commit["ml_method"] = result["method"]
+             commit["ml_alternatives"] = result.get("alternatives", [])
+             commit["ml_processing_time_ms"] = result.get("processing_time_ms", 0.0)
+
+     def get_ml_statistics(self) -> dict[str, Any]:
+         """Get comprehensive ML and LLM usage and performance statistics.
+
+         Returns:
+             Dictionary with ML/LLM performance metrics and usage statistics
+         """
+         stats = {
+             "ml_enabled": self.enable_ml,
+             "llm_enabled": self.enable_llm,
+             "spacy_available": SPACY_AVAILABLE,
+             "training_loader_available": TRAINING_LOADER_AVAILABLE,
+             "components_loaded": {
+                 "change_type_classifier": self.change_type_classifier is not None,
+                 "nlp_model": self.nlp_model is not None,
+                 "ml_cache": self.ml_cache is not None,
+                 "trained_model_loader": self.trained_model_loader is not None,
+                 "llm_classifier": self.llm_classifier is not None,
+             },
+             "configuration": {
+                 "ml_config": self.ml_config.copy(),
+                 "llm_config": self.llm_config_dict.copy(),
+             },
+         }
+
+         # Add cache statistics if available
+         if self.ml_cache:
+             stats["cache_statistics"] = self.ml_cache.get_statistics()
+
+         # Add trained model statistics if available
+         if self.trained_model_loader:
+             try:
+                 stats["trained_model_statistics"] = self.trained_model_loader.get_model_statistics()
+             except Exception as e:
+                 logger.warning(f"Failed to get trained model statistics: {e}")
+                 stats["trained_model_statistics"] = {"error": str(e)}
+
+         # Add LLM statistics if available
+         if self.llm_classifier:
+             try:
+                 stats["llm_statistics"] = self.llm_classifier.get_statistics()
+             except Exception as e:
+                 logger.warning(f"Failed to get LLM statistics: {e}")
+                 stats["llm_statistics"] = {"error": str(e)}
+
+         return stats
+
+
+ class MLPredictionCache:
+     """SQLite-based cache for ML predictions with expiration support.
+
+     WHY: ML predictions can be expensive, especially for large repositories.
+     This cache stores predictions with metadata to avoid re-processing identical
+     commit messages and file patterns.
+
+     DESIGN: Uses SQLite for persistence across runs with:
+     - Expiration based on configurable time periods
+     - Hash-based keys for efficient lookup
+     - Metadata storage for cache invalidation
+     """
+
+     def __init__(self, cache_path: Path, expiration_days: int = 30):
+         """Initialize ML prediction cache.
+
+         Args:
+             cache_path: Path to SQLite cache database
+             expiration_days: Number of days to keep predictions
+         """
+         self.cache_path = cache_path
+         self.expiration_days = expiration_days
+         self._init_database()
+
+     def _init_database(self) -> None:
+         """Initialize SQLite database with prediction cache table."""
+         with sqlite3.connect(self.cache_path) as conn:
+             conn.execute(
+                 """
+                 CREATE TABLE IF NOT EXISTS ml_predictions (
+                     key TEXT PRIMARY KEY,
+                     message_hash TEXT NOT NULL,
+                     files_hash TEXT NOT NULL,
+                     category TEXT NOT NULL,
+                     confidence REAL NOT NULL,
+                     method TEXT NOT NULL,
+                     features TEXT,  -- JSON encoded
+                     alternatives TEXT,  -- JSON encoded
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                     expires_at TIMESTAMP NOT NULL
+                 )
+                 """
+             )
+
+             # Create index for efficient cleanup
+             conn.execute(
+                 """
+                 CREATE INDEX IF NOT EXISTS idx_expires_at ON ml_predictions(expires_at)
+                 """
+             )
+
+             conn.commit()
+
+     def _generate_cache_key(self, message: str, files_changed: list[str]) -> tuple[str, str, str]:
+         """Generate cache key components.
+
+         Args:
+             message: Commit message
+             files_changed: List of changed files
+
+         Returns:
+             Tuple of (cache_key, message_hash, files_hash)
+         """
+         import hashlib
+
+         message_hash = hashlib.md5(message.encode("utf-8")).hexdigest()
+         files_hash = hashlib.md5("|".join(sorted(files_changed)).encode("utf-8")).hexdigest()
+         cache_key = f"{message_hash}:{files_hash}"
+
+         return cache_key, message_hash, files_hash
+
+     def get_prediction(self, message: str, files_changed: list[str]) -> Optional[dict[str, Any]]:
+         """Get cached prediction if available and not expired.
+
+         Args:
+             message: Commit message
+             files_changed: List of changed files
+
+         Returns:
+             Cached prediction dictionary or None if not found/expired
+         """
+         cache_key, _, _ = self._generate_cache_key(message, files_changed)
+
+         try:
+             with sqlite3.connect(self.cache_path) as conn:
+                 conn.row_factory = sqlite3.Row
+                 cursor = conn.execute(
+                     """
+                     SELECT category, confidence, method, features, alternatives
+                     FROM ml_predictions
+                     WHERE key = ? AND expires_at > datetime('now')
+                     """,
+                     (cache_key,),
+                 )
+
+                 row = cursor.fetchone()
+                 if row:
+                     import json
+
+                     return {
+                         "category": row["category"],
+                         "confidence": row["confidence"],
+                         "method": "cached",  # Override method to indicate cached result
+                         "features": json.loads(row["features"]) if row["features"] else {},
+                         "alternatives": (
+                             json.loads(row["alternatives"]) if row["alternatives"] else []
+                         ),
+                     }
+
+         except Exception as e:
+             logger.warning(f"Cache lookup failed: {e}")
+
+         return None
+
+     def store_prediction(
+         self, message: str, files_changed: list[str], result: dict[str, Any]
+     ) -> None:
+         """Store prediction in cache with expiration.
+
+         Args:
+             message: Commit message
+             files_changed: List of changed files
+             result: Prediction result to cache
+         """
+         cache_key, message_hash, files_hash = self._generate_cache_key(message, files_changed)
+
+         try:
+             import json
+             from datetime import datetime, timedelta
+
+             expires_at = datetime.now() + timedelta(days=self.expiration_days)
+
+             with sqlite3.connect(self.cache_path) as conn:
+                 conn.execute(
+                     """
+                     INSERT OR REPLACE INTO ml_predictions
+                     (key, message_hash, files_hash, category, confidence, method,
+                      features, alternatives, expires_at)
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                     """,
+                     (
+                         cache_key,
+                         message_hash,
+                         files_hash,
+                         result["category"],
+                         result["confidence"],
+                         result["method"],
+                         json.dumps(result.get("features", {})),
+                         json.dumps(result.get("alternatives", [])),
+                         expires_at,
+                     ),
+                 )
+                 conn.commit()
+
+         except Exception as e:
+             logger.warning(f"Cache storage failed: {e}")
+
+     def cleanup_expired(self) -> int:
+         """Remove expired predictions from cache.
+
+         Returns:
+             Number of expired entries removed
+         """
+         try:
+             with sqlite3.connect(self.cache_path) as conn:
+                 cursor = conn.execute(
+                     """
+                     DELETE FROM ml_predictions WHERE expires_at <= datetime('now')
+                     """
+                 )
+                 conn.commit()
+                 return cursor.rowcount
+
+         except Exception as e:
+             logger.warning(f"Cache cleanup failed: {e}")
+             return 0
+
+     def get_statistics(self) -> dict[str, Any]:
+         """Get cache usage statistics.
+
+         Returns:
+             Dictionary with cache statistics
+         """
+         try:
+             with sqlite3.connect(self.cache_path) as conn:
+                 cursor = conn.execute(
+                     """
+                     SELECT
+                         COUNT(*) as total_entries,
+                         COUNT(CASE WHEN expires_at > datetime('now') THEN 1 END) as active_entries,
+                         COUNT(CASE WHEN expires_at <= datetime('now') THEN 1 END) as expired_entries,
+                         COUNT(DISTINCT method) as unique_methods
+                     FROM ml_predictions
+                     """
+                 )
+
+                 row = cursor.fetchone()
+                 if row:
+                     return {
+                         "total_entries": row[0],
+                         "active_entries": row[1],
+                         "expired_entries": row[2],
+                         "unique_methods": row[3],
+                         "cache_file_size_mb": (
+                             self.cache_path.stat().st_size / (1024 * 1024)
+                             if self.cache_path.exists()
+                             else 0
+                         ),
+                     }
+
+         except Exception as e:
+             logger.warning(f"Cache statistics failed: {e}")
+
+         return {
+             "total_entries": 0,
+             "active_entries": 0,
+             "expired_entries": 0,
+             "unique_methods": 0,
+             "cache_file_size_mb": 0,
+         }