gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4108 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/classifiers/llm/__init__.py (new file)
@@ -0,0 +1,35 @@
"""LLM classifier module components.

This module provides modular components for LLM-based commit classification.
"""

from .base import BaseLLMClassifier, ClassificationResult, LLMProviderConfig
from .batch_processor import BatchConfig, BatchProcessor, BatchResult
from .cache import LLMCache
from .cost_tracker import CostRecord, CostTracker, ModelPricing
from .openai_client import OpenAIClassifier, OpenAIConfig
from .prompts import PromptGenerator, PromptTemplate, PromptVersion
from .response_parser import ResponseParser

__all__ = [
    # Base classes
    "BaseLLMClassifier",
    "ClassificationResult",
    "LLMProviderConfig",
    # Prompts
    "PromptGenerator",
    "PromptVersion",
    "PromptTemplate",
    # Providers
    "OpenAIClassifier",
    "OpenAIConfig",
    # Components
    "ResponseParser",
    "CostTracker",
    "ModelPricing",
    "CostRecord",
    "BatchProcessor",
    "BatchConfig",
    "BatchResult",
    "LLMCache",
]
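
This __init__ module flattens the submodules into a single import surface. A minimal consumption sketch, assuming gitflow-analytics 1.3.6 is installed; the API key and model name are placeholder values:

# Hedged sketch: exercises only the names exported above.
from gitflow_analytics.qualitative.classifiers.llm import BatchProcessor, LLMProviderConfig

config = LLMProviderConfig(api_key="sk-...", model="gpt-4o-mini", max_tokens=50)
config.validate()  # defined in base.py below; raises ValueError on invalid settings
processor = BatchProcessor()  # defaults to BatchConfig(batch_size=50)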

gitflow_analytics/qualitative/classifiers/llm/base.py (new file)
@@ -0,0 +1,193 @@
"""Base interface for LLM classifiers.

This module defines the abstract base class for all LLM-based classifiers,
establishing a consistent interface for different LLM providers.

WHY: Different LLM providers (OpenAI, Anthropic, OpenRouter, etc.) have different
APIs but should provide the same classification interface. This abstraction allows
easy switching between providers without changing the rest of the codebase.

DESIGN DECISIONS:
- Use ABC for enforcing interface implementation
- Define standard result format for all providers
- Include confidence scores and reasoning in results
- Support batch processing for efficiency
- Provide cost tracking interface
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional


@dataclass
class ClassificationResult:
    """Standard result format for LLM classification.

    WHY: Consistent result format across all LLM providers ensures
    downstream code doesn't need provider-specific handling.
    """

    category: str
    confidence: float
    method: str  # 'llm', 'cached', 'rule_fallback', etc.
    reasoning: str
    model: str
    alternatives: list[dict[str, Any]]  # Alternative classifications with scores
    processing_time_ms: float
    batch_id: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary format for serialization."""
        result = {
            "category": self.category,
            "confidence": self.confidence,
            "method": self.method,
            "reasoning": self.reasoning,
            "model": self.model,
            "alternatives": self.alternatives,
            "processing_time_ms": self.processing_time_ms,
        }
        if self.batch_id:
            result["batch_id"] = self.batch_id
        return result


@dataclass
class LLMProviderConfig:
    """Base configuration for LLM providers.

    WHY: Common configuration options that all providers need,
    with ability to extend for provider-specific settings.
    """

    api_key: Optional[str] = None
    model: str = "default"
    temperature: float = 0.1
    max_tokens: int = 50
    timeout_seconds: float = 30.0
    max_retries: int = 3
    retry_delay_seconds: float = 1.0

    # Rate limiting
    max_daily_requests: int = 1000
    max_requests_per_minute: int = 60

    # Cost tracking
    enable_cost_tracking: bool = True
    cost_warning_threshold: float = 10.0  # USD

    def validate(self) -> None:
        """Validate configuration settings.

        Raises:
            ValueError: If configuration is invalid
        """
        if self.temperature < 0 or self.temperature > 2:
            raise ValueError(f"Temperature must be between 0 and 2, got {self.temperature}")
        if self.max_tokens < 1:
            raise ValueError(f"max_tokens must be positive, got {self.max_tokens}")
        if self.timeout_seconds <= 0:
            raise ValueError(f"timeout_seconds must be positive, got {self.timeout_seconds}")


class BaseLLMClassifier(ABC):
    """Abstract base class for LLM-based classifiers.

    WHY: Defines the interface that all LLM providers must implement,
    ensuring consistency and allowing provider switching.
    """

    def __init__(self, config: LLMProviderConfig, cache_dir: Optional[Path] = None):
        """Initialize LLM classifier.

        Args:
            config: Provider-specific configuration
            cache_dir: Directory for caching predictions
        """
        config.validate()
        self.config = config
        self.cache_dir = cache_dir or Path(".gitflow-cache")
        self.cache_dir.mkdir(exist_ok=True)

        # Cost tracking
        self.total_tokens_used = 0
        self.total_cost = 0.0
        self.api_calls_made = 0

    @abstractmethod
    def classify_commit(
        self, message: str, files_changed: Optional[list[str]] = None
    ) -> ClassificationResult:
        """Classify a single commit message.

        Args:
            message: Commit message to classify
            files_changed: Optional list of changed files for context

        Returns:
            Classification result with category and metadata
        """
        pass

    @abstractmethod
    def classify_commits_batch(
        self, commits: list[dict[str, Any]], batch_id: Optional[str] = None
    ) -> list[ClassificationResult]:
        """Classify a batch of commits.

        Args:
            commits: List of commit dictionaries
            batch_id: Optional batch identifier for tracking

        Returns:
            List of classification results
        """
        pass

    @abstractmethod
    def get_provider_name(self) -> str:
        """Get the name of the LLM provider.

        Returns:
            Provider name (e.g., 'openai', 'anthropic', 'openrouter')
        """
        pass

    @abstractmethod
    def estimate_cost(self, text: str) -> float:
        """Estimate the cost of classifying the given text.

        Args:
            text: Text to be classified

        Returns:
            Estimated cost in USD
        """
        pass

    def get_statistics(self) -> dict[str, Any]:
        """Get usage statistics for this classifier.

        Returns:
            Dictionary with usage statistics
        """
        return {
            "provider": self.get_provider_name(),
            "model": self.config.model,
            "api_calls_made": self.api_calls_made,
            "total_tokens_used": self.total_tokens_used,
            "total_cost": self.total_cost,
            "average_tokens_per_call": (
                self.total_tokens_used / self.api_calls_made if self.api_calls_made > 0 else 0
            ),
            "cost_warning_threshold": self.config.cost_warning_threshold,
            "approaching_cost_limit": self.total_cost > self.config.cost_warning_threshold * 0.8,
        }

    def reset_statistics(self) -> None:
        """Reset usage statistics."""
        self.total_tokens_used = 0
        self.total_cost = 0.0
        self.api_calls_made = 0
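
The four @abstractmethod hooks above are the entire provider contract. A hypothetical minimal subclass (not part of the package) shows what a concrete provider must implement; a real provider such as OpenAIClassifier would call its API inside classify_commit:

import time
from typing import Any, Optional

# Assumes gitflow-analytics>=1.3.6 is installed.
from gitflow_analytics.qualitative.classifiers.llm import (
    BaseLLMClassifier,
    ClassificationResult,
    LLMProviderConfig,
)


class EchoClassifier(BaseLLMClassifier):
    """Toy provider that labels every commit 'maintenance'."""

    def classify_commit(
        self, message: str, files_changed: Optional[list[str]] = None
    ) -> ClassificationResult:
        start = time.time()
        self.api_calls_made += 1  # usage counters are defined on the base class
        return ClassificationResult(
            category="maintenance",
            confidence=0.5,
            method="llm",
            reasoning="stub: no model consulted",
            model=self.config.model,
            alternatives=[],
            processing_time_ms=(time.time() - start) * 1000,
        )

    def classify_commits_batch(
        self, commits: list[dict[str, Any]], batch_id: Optional[str] = None
    ) -> list[ClassificationResult]:
        return [self.classify_commit(c.get("message", "")) for c in commits]

    def get_provider_name(self) -> str:
        return "echo"

    def estimate_cost(self, text: str) -> float:
        return 0.0


clf = EchoClassifier(LLMProviderConfig(model="test"))  # creates .gitflow-cache/
print(clf.get_statistics()["provider"])  # -> "echo"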

gitflow_analytics/qualitative/classifiers/llm/batch_processor.py (new file)
@@ -0,0 +1,383 @@
"""Batch processing logic for efficient LLM classification.

This module handles batch processing of commits for classification,
including progress tracking, error handling, and result aggregation.

WHY: Processing commits in batches improves efficiency, enables better
progress tracking, and allows for optimizations like parallel processing.

DESIGN DECISIONS:
- Support configurable batch sizes
- Provide detailed progress feedback
- Handle failures gracefully without losing progress
- Support resume from partial completion
- Track batch-level metrics
"""

import hashlib
import logging
import time
from dataclasses import dataclass
from typing import Any, Callable, Optional

from ....core.progress import get_progress_service

logger = logging.getLogger(__name__)


@dataclass
class BatchConfig:
    """Configuration for batch processing.

    WHY: Centralizing batch configuration makes it easy to tune
    performance characteristics for different scenarios.
    """

    batch_size: int = 50  # Number of commits per batch
    max_parallel_batches: int = 1  # Currently serial, but structured for future parallel support
    retry_failed_batches: bool = True
    continue_on_batch_failure: bool = True
    show_progress: bool = True
    progress_nested: bool = True  # Show nested progress bars

    def validate(self) -> None:
        """Validate batch configuration."""
        if self.batch_size < 1:
            raise ValueError(f"Batch size must be positive, got {self.batch_size}")
        if self.max_parallel_batches < 1:
            raise ValueError(
                f"Max parallel batches must be positive, got {self.max_parallel_batches}"
            )


@dataclass
class BatchResult:
    """Result of processing a single batch.

    WHY: Structured batch results enable better error handling
    and performance analysis.
    """

    batch_id: str
    total_items: int
    successful_items: int
    failed_items: int
    results: list[dict[str, Any]]
    errors: list[dict[str, Any]]
    processing_time_seconds: float
    retry_count: int = 0


class BatchProcessor:
    """Processes commits in batches for LLM classification.

    WHY: Batch processing improves efficiency and provides better
    user feedback for large-scale classification tasks.
    """

    def __init__(self, config: Optional[BatchConfig] = None):
        """Initialize batch processor.

        Args:
            config: Batch processing configuration
        """
        self.config = config or BatchConfig()
        self.config.validate()

        # Processing statistics
        self.total_processed = 0
        self.total_successful = 0
        self.total_failed = 0
        self.batch_results: list[BatchResult] = []

    def process_commits(
        self,
        commits: list[dict[str, Any]],
        classifier_func: Callable[[dict[str, Any]], dict[str, Any]],
        job_description: str = "Processing commits",
    ) -> list[dict[str, Any]]:
        """Process commits in batches using the provided classifier.

        Args:
            commits: List of commits to process
            classifier_func: Function to classify a single commit
            job_description: Description for progress tracking

        Returns:
            List of classification results for all commits
        """
        if not commits:
            return []

        # Split into batches
        batches = self._create_batches(commits)
        logger.info(f"Processing {len(commits)} commits in {len(batches)} batches")

        # Get progress service
        progress = get_progress_service()

        all_results = []

        # Process batches
        if self.config.show_progress:
            with progress.progress(
                total=len(batches), description=job_description, unit="batch", leave=True
            ) as batch_ctx:
                for i, batch in enumerate(batches, 1):
                    batch_id = self._generate_batch_id(batch, i)
                    progress.set_description(
                        batch_ctx, f"{job_description} (batch {i}/{len(batches)})"
                    )

                    # Process single batch
                    batch_result = self._process_single_batch(
                        batch, batch_id, classifier_func, progress
                    )

                    # Collect results
                    all_results.extend(batch_result.results)
                    self.batch_results.append(batch_result)

                    # Update progress
                    progress.update(batch_ctx, 1)

                    # Update statistics
                    self.total_processed += batch_result.total_items
                    self.total_successful += batch_result.successful_items
                    self.total_failed += batch_result.failed_items

                    # Log batch summary
                    if batch_result.failed_items > 0:
                        logger.warning(
                            f"Batch {batch_id}: {batch_result.failed_items}/{batch_result.total_items} failed"
                        )
        else:
            # Process without progress bars
            for i, batch in enumerate(batches, 1):
                batch_id = self._generate_batch_id(batch, i)
                batch_result = self._process_single_batch(batch, batch_id, classifier_func, None)
                all_results.extend(batch_result.results)
                self.batch_results.append(batch_result)

                self.total_processed += batch_result.total_items
                self.total_successful += batch_result.successful_items
                self.total_failed += batch_result.failed_items

        # Log final summary
        logger.info(
            f"Batch processing complete: {self.total_successful}/{self.total_processed} successful"
        )

        return all_results

    def _create_batches(self, commits: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
        """Split commits into batches.

        Args:
            commits: List of commits to batch

        Returns:
            List of commit batches
        """
        batches = []
        for i in range(0, len(commits), self.config.batch_size):
            batch = commits[i : i + self.config.batch_size]
            batches.append(batch)
        return batches

    def _generate_batch_id(self, batch: list[dict[str, Any]], batch_num: int) -> str:
        """Generate a unique ID for a batch.

        Args:
            batch: Batch of commits
            batch_num: Batch number

        Returns:
            Unique batch ID
        """
        # Create hash from first and last commit in batch
        if batch:
            first_msg = batch[0].get("message", "")
            last_msg = batch[-1].get("message", "")
            content = f"{batch_num}:{first_msg}:{last_msg}"
            return hashlib.md5(content.encode()).hexdigest()[:8]
        return f"batch_{batch_num}"

    def _process_single_batch(
        self,
        batch: list[dict[str, Any]],
        batch_id: str,
        classifier_func: Callable[[dict[str, Any]], dict[str, Any]],
        progress: Optional[Any],
    ) -> BatchResult:
        """Process a single batch of commits.

        Args:
            batch: Batch of commits to process
            batch_id: Unique batch identifier
            classifier_func: Classification function
            progress: Progress tracking service

        Returns:
            BatchResult with processing outcomes
        """
        start_time = time.time()
        results = []
        errors = []

        # Show nested progress for individual commits if configured
        if self.config.show_progress and self.config.progress_nested and progress:
            with progress.progress(
                total=len(batch),
                description=f"Batch {batch_id[:8]}",
                unit="commit",
                nested=True,
                leave=False,
            ) as commit_ctx:
                for j, commit in enumerate(batch, 1):
                    # Update progress description
                    message_preview = commit.get("message", "")[:30]
                    progress.set_description(
                        commit_ctx, f"Batch {batch_id[:8]} ({j}/{len(batch)}): {message_preview}..."
                    )

                    # Process commit
                    result, error = self._process_single_commit(commit, batch_id, classifier_func)

                    if result:
                        results.append(result)
                    if error:
                        errors.append(error)

                    # Update progress
                    progress.update(commit_ctx, 1)
        else:
            # Process without nested progress
            for commit in batch:
                result, error = self._process_single_commit(commit, batch_id, classifier_func)

                if result:
                    results.append(result)
                if error:
                    errors.append(error)

        # Create batch result
        processing_time = time.time() - start_time

        return BatchResult(
            batch_id=batch_id,
            total_items=len(batch),
            successful_items=len(results),
            failed_items=len(errors),
            results=results,
            errors=errors,
            processing_time_seconds=processing_time,
        )

    def _process_single_commit(
        self,
        commit: dict[str, Any],
        batch_id: str,
        classifier_func: Callable[[dict[str, Any]], dict[str, Any]],
    ) -> tuple[Optional[dict[str, Any]], Optional[dict[str, Any]]]:
        """Process a single commit within a batch.

        Args:
            commit: Commit to process
            batch_id: Batch identifier
            classifier_func: Classification function

        Returns:
            Tuple of (result, error) where one will be None
        """
        try:
            # Call classifier function
            result = classifier_func(commit)

            # Add batch ID to result
            result["batch_id"] = batch_id

            # Add original commit data if not present
            if "commit_hash" not in result and "hash" in commit:
                result["commit_hash"] = commit["hash"]
            if "author" not in result and "author" in commit:
                result["author"] = commit["author"]

            return result, None

        except Exception as e:
            logger.debug(f"Failed to classify commit: {e}")

            # Create error record
            error = {
                "batch_id": batch_id,
                "commit_hash": commit.get("hash", "unknown"),
                "message": commit.get("message", "")[:100],
                "error": str(e),
                "error_type": type(e).__name__,
            }

            # Return fallback result if configured to continue
            if self.config.continue_on_batch_failure:
                fallback_result = {
                    "category": "maintenance",
                    "confidence": 0.1,
                    "method": "error_fallback",
                    "reasoning": f"Classification failed: {str(e)}",
                    "batch_id": batch_id,
                    "commit_hash": commit.get("hash", "unknown"),
                    "error": True,
                }
                return fallback_result, error

            return None, error

    def get_statistics(self) -> dict[str, Any]:
        """Get batch processing statistics.

        Returns:
            Dictionary with processing statistics
        """
        if not self.batch_results:
            return {
                "total_batches": 0,
                "total_processed": 0,
                "total_successful": 0,
                "total_failed": 0,
                "success_rate": 0.0,
                "average_batch_time": 0.0,
            }

        total_time = sum(br.processing_time_seconds for br in self.batch_results)

        return {
            "total_batches": len(self.batch_results),
            "total_processed": self.total_processed,
            "total_successful": self.total_successful,
            "total_failed": self.total_failed,
            "success_rate": (
                self.total_successful / self.total_processed if self.total_processed > 0 else 0.0
            ),
            "average_batch_time": total_time / len(self.batch_results),
            "total_processing_time": total_time,
            "batch_size": self.config.batch_size,
            "batches_with_errors": sum(1 for br in self.batch_results if br.failed_items > 0),
        }

    def get_failed_commits(self) -> list[dict[str, Any]]:
        """Get list of all failed commits.

        Returns:
            List of error records for failed commits
        """
        failed = []
        for batch_result in self.batch_results:
            failed.extend(batch_result.errors)
        return failed

    def reset_statistics(self) -> None:
        """Reset all processing statistics."""
        self.total_processed = 0
        self.total_successful = 0
        self.total_failed = 0
        self.batch_results = []
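
Because process_commits takes an arbitrary classifier_func, the processor is provider-agnostic: any callable mapping a commit dict to a result dict works, and exceptions route through the error_fallback path above. A usage sketch with a stand-in rule-based function (hypothetical, not part of the package; assumes gitflow-analytics>=1.3.6 is installed so the relative progress-service import resolves):

from typing import Any

from gitflow_analytics.qualitative.classifiers.llm import BatchConfig, BatchProcessor


def rule_based(commit: dict[str, Any]) -> dict[str, Any]:
    # Stand-in for an LLM call: crude prefix matching on the commit message.
    msg = commit.get("message", "").lower()
    category = "bugfix" if msg.startswith("fix") else "feature"
    return {"category": category, "confidence": 0.9, "method": "rule"}


processor = BatchProcessor(BatchConfig(batch_size=2, show_progress=False))
commits = [
    {"hash": "a1b2c3d", "message": "fix: handle empty repo"},
    {"hash": "d4e5f6a", "message": "feat: add branch health report"},
    {"hash": "b7c8d9e", "message": "fix: off-by-one in batching"},
]
results = processor.process_commits(commits, rule_based)
assert len(results) == 3 and all("batch_id" in r for r in results)
print(processor.get_statistics()["success_rate"])  # 1.0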