gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4108 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py
@@ -0,0 +1,607 @@
"""LLM-based commit classification orchestrator.

This module provides the main interface for LLM-based commit classification,
orchestrating the various components for a complete classification solution.

WHY: This refactored version separates concerns into focused modules while
maintaining backward compatibility with the existing interface.

DESIGN DECISIONS:
- Main orchestrator delegates to specialized components
- Maintains backward compatibility with existing code
- Supports multiple LLM providers through abstraction
- Provides enhanced rule-based fallback
- Comprehensive error handling and graceful degradation
"""

import logging
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional

from .llm.batch_processor import BatchConfig, BatchProcessor
from .llm.cache import LLMCache
from .llm.openai_client import OpenAIClassifier, OpenAIConfig
from .llm.prompts import PromptVersion

logger = logging.getLogger(__name__)

@dataclass
class LLMConfig:
    """Configuration for LLM-based commit classification.

    Maintains backward compatibility with existing configuration structure.
    """

    # OpenRouter API configuration
    api_key: Optional[str] = None
    api_base_url: str = "https://openrouter.ai/api/v1"
    model: str = "mistralai/mistral-7b-instruct"  # Fast, affordable model

    # Classification parameters
    confidence_threshold: float = 0.7  # Minimum confidence for LLM predictions
    max_tokens: int = 50  # Keep responses short
    temperature: float = 0.1  # Low temperature for consistent results
    timeout_seconds: float = 30.0  # API timeout

    # Caching configuration
    cache_duration_days: int = 90  # Long cache duration for cost optimization
    enable_caching: bool = True

    # Cost optimization
    batch_size: int = 1  # Process one at a time for simplicity
    max_daily_requests: int = 1000  # Rate limiting

    # Domain-specific terms for the organization; Optional so the dataclass
    # default is None rather than a shared mutable dict
    domain_terms: Optional[dict[str, list[str]]] = None

    def __post_init__(self):
        """Initialize default domain terms if not provided."""
        if self.domain_terms is None:
            self.domain_terms = {
                "media": [
                    "video",
                    "audio",
                    "streaming",
                    "player",
                    "media",
                    "content",
                    "broadcast",
                    "live",
                    "recording",
                    "episode",
                    "program",
                ],
                "localization": [
                    "translation",
                    "i18n",
                    "l10n",
                    "locale",
                    "language",
                    "spanish",
                    "french",
                    "german",
                    "italian",
                    "portuguese",
                    "multilingual",
                ],
                "integration": [
                    "api",
                    "webhook",
                    "third-party",
                    "external",
                    "service",
                    "integration",
                    "sync",
                    "import",
                    "export",
                    "connector",
                ],
            }

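# Editor's sketch (illustrative, not part of the packaged file): overriding
# the defaults above. The key string and the extra "payments" domain are
# hypothetical placeholders; any domain_terms passed in suppress the
# defaults filled in by __post_init__.
#
#     config = LLMConfig(
#         api_key="sk-or-...",  # placeholder OpenRouter key
#         confidence_threshold=0.8,
#         domain_terms={"payments": ["billing", "invoice", "checkout"]},
#     )
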
class LLMCommitClassifier:
    """LLM-based commit classifier with modular architecture.

    This refactored version delegates to specialized components for better
    maintainability while preserving the original interface.
    """

    # Streamlined category definitions (same as original)
    CATEGORIES = {
        "feature": "New functionality, capabilities, enhancements, additions",
        "bugfix": "Fixes, errors, issues, crashes, bugs, corrections",
        "maintenance": "Configuration, chores, dependencies, cleanup, refactoring, updates",
        "integration": "Third-party services, APIs, webhooks, external systems",
        "content": "Text, copy, documentation, README updates, comments",
        "media": "Video, audio, streaming, players, visual assets, images",
        "localization": "Translations, i18n, l10n, regional adaptations",
    }

    def __init__(self, config: LLMConfig, cache_dir: Optional[Path] = None):
        """Initialize LLM commit classifier with modular components.

        Args:
            config: LLM configuration
            cache_dir: Directory for caching predictions
        """
        self.config = config
        self.cache_dir = cache_dir or Path(".gitflow-cache")
        self.cache_dir.mkdir(exist_ok=True)

        # Initialize components
        self._init_classifier()
        self._init_cache()
        self._init_batch_processor()
        self._init_rule_patterns()

        # Request tracking for rate limiting (backward compatibility)
        self._daily_requests = 0
        self._last_reset_date = None

        # Cost tracking (backward compatibility)
        self.total_tokens_used = 0
        self.total_cost = 0.0
        self.api_calls_made = 0

        logger.info(f"LLMCommitClassifier initialized with model: {self.config.model}")

    def _init_classifier(self) -> None:
        """Initialize the LLM classifier component.

        WHY: Modular initialization allows easy switching between providers.
        """
        # Convert config to OpenAI config
        openai_config = OpenAIConfig(
            api_key=self.config.api_key,
            api_base_url=self.config.api_base_url,
            model=self.config.model,
            temperature=self.config.temperature,
            max_tokens=self.config.max_tokens,
            timeout_seconds=self.config.timeout_seconds,
            max_daily_requests=self.config.max_daily_requests,
            use_openrouter=True,  # Default to OpenRouter
        )

        # Initialize classifier
        try:
            self.classifier = OpenAIClassifier(
                config=openai_config,
                cache_dir=self.cache_dir,
                prompt_version=PromptVersion.V3_CONTEXTUAL,
            )

            # Set domain terms in prompt generator
            self.classifier.prompt_generator.domain_terms = self.config.domain_terms

        except ImportError as e:
            logger.warning(f"Failed to initialize LLM classifier: {e}")
            self.classifier = None

    def _init_cache(self) -> None:
        """Initialize the caching component.

        WHY: Separate cache initialization for better error handling.
        """
        self.cache: Optional[LLMCache] = None
        if self.config.enable_caching:
            try:
                cache_path = self.cache_dir / "llm_predictions.db"
                self.cache = LLMCache(
                    cache_path=cache_path, expiration_days=self.config.cache_duration_days
                )
            except Exception as e:
                logger.warning(f"Failed to initialize LLM cache: {e}")
                self.cache = None

    def _init_batch_processor(self) -> None:
        """Initialize the batch processing component.

        WHY: Batch processing improves efficiency for large-scale classification.
        """
        batch_config = BatchConfig(
            batch_size=self.config.batch_size, show_progress=True, continue_on_batch_failure=True
        )
        self.batch_processor = BatchProcessor(batch_config)

    def _init_rule_patterns(self) -> None:
        """Initialize rule-based patterns for fallback classification.

        WHY: Rule-based fallback ensures classification works even
        when LLM is unavailable.
        """
        self.rule_patterns = {
            "feature": [
                r"^(feat|feature)[\(\:]",
                r"^add[\(\:]",
                r"^implement[\(\:]",
                r"^create[\(\:]",
                r"add.*feature",
                r"implement.*feature",
                r"create.*feature",
                r"new.*feature",
                r"introduce.*feature",
                r"^enhancement[\(\:]",
            ],
            "bugfix": [
                r"^(fix|bug|hotfix|patch)[\(\:]",
                r"fix.*bug(?!.*format)",
                r"fix.*issue(?!.*format)",
                r"resolve.*bug",
                r"correct.*bug",
                r"repair.*",
                r"^hotfix[\(\:]",
                r"patch.*bug",
                r"debug.*",
            ],
            "maintenance": [
                r"^(chore|refactor|style|deps|build|ci|test)[\(\:]",
                r"^update[\(\:]",
                r"^bump[\(\:]",
                r"^upgrade[\(\:]",
                r"refactor.*",
                r"cleanup",
                r"update.*depend",
                r"bump.*version",
                r"configure.*",
                r"maintenance",
                r"organize.*",
                r"format.*",
                r"style.*",
                r"lint.*",
                r"improve.*performance",
                r"optimize.*",
            ],
            "content": [
                r"^(docs|doc|readme)[\(\:]",
                r"update.*readme",
                r"documentation",
                r"^comment[\(\:]",
                r"doc.*update",
                r"add.*comment",
                r"update.*doc",
                r"add.*documentation",
            ],
        }

    def classify_commit(
        self, message: str, files_changed: Optional[list[str]] = None
    ) -> dict[str, Any]:
        """Classify a commit message using LLM or fallback methods.

        Args:
            message: Cleaned commit message
            files_changed: Optional list of changed files

        Returns:
            Classification result dictionary (backward compatible format)
        """
        start_time = time.time()

        # Check for empty message
        if not message or not message.strip():
            return self._create_result("maintenance", 0.3, "empty_message", start_time)

        # Try cache first
        if self.cache:
            cached_result = self.cache.get(message, files_changed)
            if cached_result:
                cached_result["processing_time_ms"] = (time.time() - start_time) * 1000
                return cached_result

        # Try LLM classification if available and configured
        if self.classifier and self.config.api_key:
            try:
                # Check rate limits
                if self._check_rate_limits():
                    result = self.classifier.classify_commit(message, files_changed)

                    # Check if LLM actually succeeded
                    if result.method == "llm":
                        # Update statistics for backward compatibility
                        self.api_calls_made += 1
                        self._daily_requests += 1

                        # Get cost information from classifier
                        stats = self.classifier.get_statistics()
                        self.total_tokens_used = stats.get("total_tokens_used", 0)
                        self.total_cost = stats.get("total_cost", 0.0)

                        # Convert to backward compatible format
                        result_dict = result.to_dict()

                        # Cache successful result
                        if self.cache:
                            self.cache.store(message, files_changed, result_dict)

                        return result_dict
                    # If method is not 'llm', fall through to rule-based
                else:
                    logger.debug("Rate limit exceeded, using rule-based fallback")
            except Exception as e:
                logger.debug(f"LLM classification not available: {e}")

        # Fall back to enhanced rule-based classification
        return self._enhanced_rule_based_classification(message, files_changed or [])

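    # Editor's sketch (illustrative, not part of the packaged file): with no
    # API key configured, the call above lands in the rule-based fallback and
    # returns the dictionary shape built by _create_result, e.g.:
    #
    #     classifier.classify_commit("fix: crash on login", ["auth/login.py"])
    #     # -> {"category": "bugfix", "confidence": 0.8,
    #     #     "method": "rule_enhanced",
    #     #     "reasoning": "Matched pattern: ^(fix|bug|hotfix|patch)[\(\:]",
    #     #     "model": "rule-based", "alternatives": [],
    #     #     "processing_time_ms": <elapsed ms>}
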
    def classify_commits_batch(
        self,
        commits: list[dict[str, Any]],
        batch_id: Optional[str] = None,
        include_confidence: bool = True,
    ) -> list[dict[str, Any]]:
        """Classify a batch of commits.

        Args:
            commits: List of commit dictionaries
            batch_id: Optional batch identifier
            include_confidence: Whether to include confidence scores

        Returns:
            List of classification results (backward compatible format)
        """

        def classify_func(commit: dict[str, Any]) -> dict[str, Any]:
            """Classification function for batch processor."""
            message = commit.get("message", "")
            files_changed = []

            # Extract files from commit
            if "files_changed" in commit:
                fc = commit["files_changed"]
                if isinstance(fc, list):
                    files_changed = fc

            return self.classify_commit(message, files_changed)

        # Use batch processor
        results = self.batch_processor.process_commits(
            commits, classify_func, f"Classifying {len(commits)} commits"
        )

        # Add batch_id if provided
        if batch_id:
            for result in results:
                result["batch_id"] = batch_id

        logger.info(f"Batch {batch_id}: Classified {len(results)} commits")
        return results

    def _enhanced_rule_based_classification(
        self, message: str, files_changed: list[str]
    ) -> dict[str, Any]:
        """Enhanced rule-based classification as fallback.

        Args:
            message: Commit message
            files_changed: List of changed files

        Returns:
            Classification result dictionary
        """
        start_time = time.time()  # so _create_result reports real elapsed time
        message_lower = message.lower()

        # Check style/formatting first
        if re.search(r"^(style|format)[\(\:]", message_lower):
            return self._create_result(
                "maintenance", 0.8, "rule_enhanced", start_time, "Style/formatting commit"
            )

        # Check other patterns
        for category, patterns in self.rule_patterns.items():
            for pattern in patterns:
                if re.search(pattern, message_lower):
                    return self._create_result(
                        category, 0.8, "rule_enhanced", start_time, f"Matched pattern: {pattern}"
                    )

        # File-based analysis
        if files_changed:
            category = self._analyze_files(files_changed)
            if category:
                return self._create_result(
                    category, 0.7, "rule_enhanced", start_time, "File-based classification"
                )

        # Semantic analysis
        category = self._semantic_analysis(message_lower)
        if category:
            return self._create_result(
                category, 0.6, "rule_enhanced", start_time, f"Semantic indicator for {category}"
            )

        # Default fallback
        if len(message.split()) >= 5:
            return self._create_result(
                "feature", 0.4, "rule_enhanced", start_time, "Detailed commit suggests feature"
            )
        elif any(term in message_lower for term in ["urgent", "critical", "!"]):
            return self._create_result(
                "bugfix", 0.5, "rule_enhanced", start_time, "Urgent language suggests bug fix"
            )
        else:
            return self._create_result(
                "maintenance", 0.3, "rule_enhanced", start_time, "General maintenance work"
            )

    def _analyze_files(self, files_changed: list[str]) -> Optional[str]:
        """Analyze files to determine category.

        Args:
            files_changed: List of changed files

        Returns:
            Category or None
        """
        file_patterns = []

        for file_path in files_changed:
            file_lower = file_path.lower()
            ext = Path(file_path).suffix.lower()

            if any(term in file_lower for term in ["readme", "doc", "changelog", ".md"]):
                file_patterns.append("documentation")
            elif any(term in file_lower for term in ["test", "spec", "__test__"]):
                file_patterns.append("test")
            elif any(term in file_lower for term in ["config", "package.json", ".yml"]):
                file_patterns.append("configuration")
            elif ext in [".jpg", ".png", ".gif", ".mp4", ".mp3", ".svg"]:
                file_patterns.append("media")

        # Determine category from patterns
        if "documentation" in file_patterns:
            return "content"
        elif "test" in file_patterns or "configuration" in file_patterns:
            return "maintenance"
        elif "media" in file_patterns:
            return "media"

        return None

    def _semantic_analysis(self, message_lower: str) -> Optional[str]:
        """Perform semantic analysis on message.

        Args:
            message_lower: Lowercase commit message

        Returns:
            Category or None
        """
        semantic_indicators = {
            "feature": ["implement new", "create new", "introduce new", "develop", "build new"],
            "bugfix": [
                "resolve error",
                "correct issue",
                "repair bug",
                "solve problem",
                "address bug",
            ],
            "maintenance": [
                "update config",
                "upgrade",
                "modify existing",
                "change setting",
                "improve performance",
            ],
            "content": ["document", "explain", "describe", "clarify", "write documentation"],
        }

        for category, indicators in semantic_indicators.items():
            if any(indicator in message_lower for indicator in indicators):
                return category

        return None

    def _check_rate_limits(self) -> bool:
        """Check if we're within daily rate limits.

        Returns:
            True if request is allowed
        """
        from datetime import datetime

        current_date = datetime.now().date()

        # Reset counter if new day
        if current_date != self._last_reset_date:
            self._daily_requests = 0
            self._last_reset_date = current_date

        return self._daily_requests < self.config.max_daily_requests

    def _create_result(
        self,
        category: str,
        confidence: float,
        method: str,
        start_time: float,
        reasoning: Optional[str] = None,
    ) -> dict[str, Any]:
        """Create a standardized result dictionary.

        Args:
            category: Classification category
            confidence: Confidence score
            method: Classification method
            start_time: Processing start time
            reasoning: Optional reasoning text

        Returns:
            Result dictionary (backward compatible format)
        """
        return {
            "category": category,
            "confidence": confidence,
            "method": method,
            "reasoning": reasoning or f"Classified using {method}",
            "model": self.config.model if method == "llm" else "rule-based",
            "alternatives": [],
            "processing_time_ms": (time.time() - start_time) * 1000,
        }

    def get_statistics(self) -> dict[str, Any]:
        """Get classifier usage statistics.

        Returns:
            Dictionary with usage statistics (backward compatible)
        """
        stats = {
            "daily_requests": self._daily_requests,
            "max_daily_requests": self.config.max_daily_requests,
            "model": self.config.model,
            "cache_enabled": self.config.enable_caching,
            "api_configured": bool(self.config.api_key),
            "total_tokens_used": self.total_tokens_used,
            "total_cost": self.total_cost,
            "api_calls_made": self.api_calls_made,
            "average_tokens_per_call": (
                self.total_tokens_used / self.api_calls_made if self.api_calls_made > 0 else 0
            ),
        }

        # Add cache statistics
        if self.cache:
            stats["cache_statistics"] = self.cache.get_statistics()

        # Add batch processor statistics
        if self.batch_processor:
            stats["batch_statistics"] = self.batch_processor.get_statistics()

        # Add classifier statistics if available
        if self.classifier:
            stats["classifier_statistics"] = self.classifier.get_statistics()

        return stats

# Legacy class for backward compatibility
class LLMPredictionCache:
    """Legacy cache class for backward compatibility.

    This wraps the new LLMCache to maintain the old interface.
    """

    def __init__(self, cache_path: Path, expiration_days: int = 90):
        """Initialize legacy cache wrapper."""
        self.cache = LLMCache(cache_path, expiration_days)

    def get_prediction(self, message: str, files_changed: list[str]) -> Optional[dict[str, Any]]:
        """Get cached prediction (legacy interface)."""
        return self.cache.get(message, files_changed)

    def store_prediction(
        self, message: str, files_changed: list[str], result: dict[str, Any]
    ) -> None:
        """Store prediction (legacy interface)."""
        self.cache.store(message, files_changed, result)

    def cleanup_expired(self) -> int:
        """Remove expired predictions (legacy interface)."""
        return self.cache.cleanup_expired()

    def get_statistics(self) -> dict[str, Any]:
        """Get cache statistics (legacy interface)."""
        return self.cache.get_statistics()
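The module above gives a single entry point for commit classification with layered fallbacks (cache, then LLM, then rules). The following is a minimal driver sketch assembled only from the definitions shown in this diff; the commit messages and file paths are made-up placeholders, and it assumes the optional LLM client constructs cleanly without a key, as the ImportError guard in _init_classifier suggests. With no api_key set, classify_commit skips the LLM path entirely and exercises the rule-based fallback.

    from pathlib import Path

    from gitflow_analytics.qualitative.classifiers.llm_commit_classifier import (
        LLMCommitClassifier,
        LLMConfig,
    )

    # No api_key configured: the LLM branch in classify_commit is skipped,
    # so this should run offline via the enhanced rule-based fallback.
    classifier = LLMCommitClassifier(LLMConfig(), cache_dir=Path(".gitflow-cache"))

    single = classifier.classify_commit(
        "fix: crash when streaming episode audio", ["player/audio.py"]
    )
    print(single["category"], single["method"])  # expected: bugfix rule_enhanced

    batch = classifier.classify_commits_batch(
        [
            {"message": "docs: update README", "files_changed": ["README.md"]},
            {"message": "feat: add webhook sync"},
        ],
        batch_id="demo",
    )
    print(classifier.get_statistics()["api_calls_made"])  # 0 without an API key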