gitflow-analytics 1.0.0-py3-none-any.whl → 1.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. gitflow_analytics/__init__.py +11 -9
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/cli.py +691 -243
  4. gitflow_analytics/cli_rich.py +353 -0
  5. gitflow_analytics/config.py +389 -96
  6. gitflow_analytics/core/analyzer.py +175 -78
  7. gitflow_analytics/core/branch_mapper.py +132 -132
  8. gitflow_analytics/core/cache.py +242 -173
  9. gitflow_analytics/core/identity.py +214 -178
  10. gitflow_analytics/extractors/base.py +13 -11
  11. gitflow_analytics/extractors/story_points.py +70 -59
  12. gitflow_analytics/extractors/tickets.py +111 -88
  13. gitflow_analytics/integrations/github_integration.py +91 -77
  14. gitflow_analytics/integrations/jira_integration.py +284 -0
  15. gitflow_analytics/integrations/orchestrator.py +99 -72
  16. gitflow_analytics/metrics/dora.py +183 -179
  17. gitflow_analytics/models/database.py +191 -54
  18. gitflow_analytics/qualitative/__init__.py +30 -0
  19. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  20. gitflow_analytics/qualitative/classifiers/change_type.py +468 -0
  21. gitflow_analytics/qualitative/classifiers/domain_classifier.py +399 -0
  22. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +436 -0
  23. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +412 -0
  24. gitflow_analytics/qualitative/core/__init__.py +13 -0
  25. gitflow_analytics/qualitative/core/llm_fallback.py +653 -0
  26. gitflow_analytics/qualitative/core/nlp_engine.py +373 -0
  27. gitflow_analytics/qualitative/core/pattern_cache.py +457 -0
  28. gitflow_analytics/qualitative/core/processor.py +540 -0
  29. gitflow_analytics/qualitative/models/__init__.py +25 -0
  30. gitflow_analytics/qualitative/models/schemas.py +272 -0
  31. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  32. gitflow_analytics/qualitative/utils/batch_processor.py +326 -0
  33. gitflow_analytics/qualitative/utils/cost_tracker.py +343 -0
  34. gitflow_analytics/qualitative/utils/metrics.py +347 -0
  35. gitflow_analytics/qualitative/utils/text_processing.py +243 -0
  36. gitflow_analytics/reports/analytics_writer.py +25 -8
  37. gitflow_analytics/reports/csv_writer.py +60 -32
  38. gitflow_analytics/reports/narrative_writer.py +21 -15
  39. gitflow_analytics/tui/__init__.py +5 -0
  40. gitflow_analytics/tui/app.py +721 -0
  41. gitflow_analytics/tui/screens/__init__.py +8 -0
  42. gitflow_analytics/tui/screens/analysis_progress_screen.py +487 -0
  43. gitflow_analytics/tui/screens/configuration_screen.py +547 -0
  44. gitflow_analytics/tui/screens/loading_screen.py +358 -0
  45. gitflow_analytics/tui/screens/main_screen.py +304 -0
  46. gitflow_analytics/tui/screens/results_screen.py +698 -0
  47. gitflow_analytics/tui/widgets/__init__.py +7 -0
  48. gitflow_analytics/tui/widgets/data_table.py +257 -0
  49. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  50. gitflow_analytics/tui/widgets/progress_widget.py +192 -0
  51. gitflow_analytics-1.0.3.dist-info/METADATA +490 -0
  52. gitflow_analytics-1.0.3.dist-info/RECORD +62 -0
  53. gitflow_analytics-1.0.0.dist-info/METADATA +0 -201
  54. gitflow_analytics-1.0.0.dist-info/RECORD +0 -30
  55. {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/WHEEL +0 -0
  56. {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/entry_points.txt +0 -0
  57. {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/licenses/LICENSE +0 -0
  58. {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,272 @@
+"""Data models and configuration schemas for qualitative analysis."""
+
+import time
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class QualitativeCommitData:
+    """Enhanced commit data with qualitative analysis results.
+
+    This class extends basic commit information with semantic analysis results
+    including change type, business domain, risk assessment, and processing metadata.
+    """
+
+    # Existing commit data from GitFlow Analytics
+    hash: str
+    message: str
+    author_name: str
+    author_email: str
+    timestamp: datetime
+    files_changed: List[str]
+    insertions: int
+    deletions: int
+
+    # New qualitative analysis fields
+    change_type: str  # feature|bugfix|refactor|docs|test|chore|security|hotfix|config
+    change_type_confidence: float  # 0.0-1.0
+    business_domain: str  # frontend|backend|database|infrastructure|mobile|devops|unknown
+    domain_confidence: float  # 0.0-1.0
+    risk_level: str  # low|medium|high|critical
+    risk_factors: List[str]  # List of identified risk factors
+    intent_signals: Dict[str, Any]  # Intent analysis results
+    collaboration_patterns: Dict[str, Any]  # Team interaction patterns
+    technical_context: Dict[str, Any]  # Technical context information
+
+    # Processing metadata
+    processing_method: str  # 'nlp' or 'llm'
+    processing_time_ms: float
+    confidence_score: float  # Overall confidence in analysis
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            'hash': self.hash,
+            'message': self.message,
+            'author_name': self.author_name,
+            'author_email': self.author_email,
+            'timestamp': self.timestamp.isoformat(),
+            'files_changed': self.files_changed,
+            'insertions': self.insertions,
+            'deletions': self.deletions,
+            'change_type': self.change_type,
+            'change_type_confidence': self.change_type_confidence,
+            'business_domain': self.business_domain,
+            'domain_confidence': self.domain_confidence,
+            'risk_level': self.risk_level,
+            'risk_factors': self.risk_factors,
+            'intent_signals': self.intent_signals,
+            'collaboration_patterns': self.collaboration_patterns,
+            'technical_context': self.technical_context,
+            'processing_method': self.processing_method,
+            'processing_time_ms': self.processing_time_ms,
+            'confidence_score': self.confidence_score,
+        }
+
+
+@dataclass
+class ChangeTypeConfig:
+    """Configuration for change type classification."""
+
+    min_confidence: float = 0.7
+    semantic_weight: float = 0.6  # Weight for semantic features
+    file_pattern_weight: float = 0.4  # Weight for file pattern signals
+    enable_custom_patterns: bool = True
+    custom_patterns: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
+
+
+@dataclass
+class IntentConfig:
+    """Configuration for intent analysis."""
+
+    urgency_keywords: Dict[str, List[str]] = field(default_factory=lambda: {
+        'critical': ['critical', 'urgent', 'hotfix', 'emergency', 'immediate'],
+        'important': ['important', 'priority', 'asap', 'needed'],
+        'routine': ['routine', 'regular', 'normal', 'standard']
+    })
+    confidence_threshold: float = 0.6
+    sentiment_analysis: bool = True
+
+
+@dataclass
+class DomainConfig:
+    """Configuration for domain classification."""
+
+    file_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
+        'frontend': ['*.js', '*.jsx', '*.ts', '*.tsx', '*.vue', '*.html', '*.css', '*.scss'],
+        'backend': ['*.py', '*.java', '*.go', '*.rb', '*.php', '*.cs', '*.cpp'],
+        'database': ['*.sql', 'migrations/*', 'schema/*', '**/models/**'],
+        'infrastructure': ['Dockerfile', '*.yaml', '*.yml', 'terraform/*', '*.tf'],
+        'mobile': ['*.swift', '*.kt', '*.java', 'android/*', 'ios/*'],
+        'devops': ['*.yml', '*.yaml', 'ci/*', '.github/*', 'docker/*']
+    })
+    keyword_patterns: Dict[str, List[str]] = field(default_factory=lambda: {
+        'frontend': ['ui', 'component', 'styling', 'interface', 'layout'],
+        'backend': ['api', 'endpoint', 'service', 'server', 'logic'],
+        'database': ['query', 'schema', 'migration', 'data', 'model'],
+        'infrastructure': ['deploy', 'config', 'environment', 'setup'],
+        'mobile': ['android', 'ios', 'mobile', 'app'],
+        'devops': ['build', 'pipeline', 'deploy', 'ci', 'docker']
+    })
+    min_confidence: float = 0.6
+
+
+@dataclass
+class RiskConfig:
+    """Configuration for risk analysis."""
+
+    high_risk_patterns: List[str] = field(default_factory=lambda: [
+        # Security-related patterns
+        'password', 'secret', 'key', 'token', 'auth', 'security',
+        # Critical system patterns
+        'production', 'prod', 'critical', 'emergency',
+        # Infrastructure patterns
+        'database', 'migration', 'schema', 'deploy',
+        # Large change patterns
+        'refactor', 'rewrite', 'restructure'
+    ])
+    medium_risk_patterns: List[str] = field(default_factory=lambda: [
+        'config', 'configuration', 'settings', 'environment',
+        'api', 'endpoint', 'service', 'integration'
+    ])
+    file_risk_patterns: Dict[str, str] = field(default_factory=lambda: {
+        # High risk file patterns
+        '**/*prod*': 'high',
+        '**/migrations/**': 'high',
+        '**/schema/**': 'high',
+        'Dockerfile': 'medium',
+        '*.yml': 'medium',
+        '*.yaml': 'medium',
+        '**/*config*': 'medium'
+    })
+    size_thresholds: Dict[str, int] = field(default_factory=lambda: {
+        'large_commit_files': 20,  # Files changed
+        'large_commit_lines': 500,  # Lines changed
+        'massive_commit_lines': 2000  # Very large changes
+    })
+
+
+@dataclass
+class NLPConfig:
+    """Configuration for NLP processing engine."""
+
+    spacy_model: str = "en_core_web_sm"
+    spacy_batch_size: int = 1000
+    fast_mode: bool = True  # Disable parser/NER for speed
+
+    # Component configurations
+    change_type_config: ChangeTypeConfig = field(default_factory=ChangeTypeConfig)
+    intent_config: IntentConfig = field(default_factory=IntentConfig)
+    domain_config: DomainConfig = field(default_factory=DomainConfig)
+    risk_config: RiskConfig = field(default_factory=RiskConfig)
+
+    # Performance settings
+    enable_parallel_processing: bool = True
+    max_workers: int = 4
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for LLM fallback processing via OpenRouter."""
+
+    # OpenRouter API settings
+    openrouter_api_key: str = "${OPENROUTER_API_KEY}"
+    base_url: str = "https://openrouter.ai/api/v1"
+
+    # Model selection strategy
+    primary_model: str = "anthropic/claude-3-haiku"  # Fast, cheap classification
+    fallback_model: str = "meta-llama/llama-3.1-8b-instruct:free"  # Free fallback
+    complex_model: str = "anthropic/claude-3-sonnet"  # For complex cases
+
+    # Model routing thresholds
+    complexity_threshold: float = 0.5  # Route complex cases to better model
+    cost_threshold_per_1k: float = 0.01  # Max cost per 1k commits
+
+    # Processing settings
+    max_tokens: int = 1000
+    temperature: float = 0.1
+
+    # Batching settings
+    max_group_size: int = 10  # Process up to 10 commits per batch
+    similarity_threshold: float = 0.8  # Group similar commits together
+
+    # Rate limiting
+    requests_per_minute: int = 200  # Higher limit with OpenRouter
+    max_retries: int = 3
+
+    # Cost control
+    max_daily_cost: float = 5.0  # Max daily spend in USD
+    enable_cost_tracking: bool = True
+
+
+@dataclass
+class CacheConfig:
+    """Configuration for qualitative analysis caching."""
+
+    cache_dir: str = ".qualitative_cache"
+    semantic_cache_size: int = 10000  # Max cached patterns
+    pattern_cache_ttl_hours: int = 168  # 1 week
+
+    # Learning settings
+    enable_pattern_learning: bool = True
+    learning_threshold: int = 10  # Min examples to learn pattern
+    confidence_boost_factor: float = 0.1  # Boost for learned patterns
+
+    # Cache optimization
+    enable_compression: bool = True
+    max_cache_size_mb: int = 100
+
+
+@dataclass
+class QualitativeConfig:
+    """Main configuration for qualitative analysis system.
+
+    This configuration orchestrates the entire qualitative analysis pipeline,
+    balancing performance, accuracy, and cost through intelligent NLP and
+    strategic LLM usage.
+    """
+
+    # Processing settings
+    enabled: bool = True
+    batch_size: int = 1000  # Commits processed per batch
+    max_llm_fallback_pct: float = 0.15  # Max 15% of commits use LLM
+    confidence_threshold: float = 0.7  # Min confidence for NLP results
+
+    # Component configurations
+    nlp_config: NLPConfig = field(default_factory=NLPConfig)
+    llm_config: LLMConfig = field(default_factory=LLMConfig)
+    cache_config: CacheConfig = field(default_factory=CacheConfig)
+
+    # Performance monitoring
+    enable_performance_tracking: bool = True
+    target_processing_time_ms: float = 2.0  # Target per-commit processing time
+
+    # Quality settings
+    min_overall_confidence: float = 0.6  # Min confidence for any result
+    enable_quality_feedback: bool = True  # Learn from corrections
+
+    def validate(self) -> List[str]:
+        """Validate configuration and return any warnings.
+
+        Returns:
+            List of validation warning messages.
+        """
+        warnings = []
+
+        if self.max_llm_fallback_pct > 0.3:
+            warnings.append("LLM fallback percentage > 30% may result in high costs")
+
+        if self.confidence_threshold > 0.9:
+            warnings.append("Very high confidence threshold may route too many commits to LLM")
+
+        if self.batch_size > 5000:
+            warnings.append("Large batch size may cause memory issues")
+
+        # Validate LLM config if API key is set
+        if self.llm_config.openrouter_api_key and self.llm_config.openrouter_api_key != "${OPENROUTER_API_KEY}":
+            if self.llm_config.max_daily_cost < 1.0:
+                warnings.append("Very low daily cost limit may restrict LLM usage")
+
+        return warnings
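
The classes above are plain dataclasses, so wiring a configuration together is ordinary keyword construction followed by validate(). A minimal usage sketch, assuming the module is importable under the gitflow_analytics.qualitative.models.schemas path listed in the file list above; the API key value and cost limit are illustrative, not package defaults:

from gitflow_analytics.qualitative.models.schemas import LLMConfig, QualitativeConfig

# Route at most 10% of commits to the LLM fallback and cap daily spend at $2
# (illustrative values, not defaults).
config = QualitativeConfig(
    max_llm_fallback_pct=0.10,
    llm_config=LLMConfig(openrouter_api_key="sk-or-...", max_daily_cost=2.0),
)

# validate() returns warning strings instead of raising, so the caller decides
# how strict to be about questionable settings.
for warning in config.validate():
    print(f"config warning: {warning}")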
@@ -0,0 +1,13 @@
+"""Utility functions for qualitative analysis."""
+
+from .text_processing import TextProcessor
+from .batch_processor import BatchProcessor
+from .metrics import PerformanceMetrics
+from .cost_tracker import CostTracker
+
+__all__ = [
+    "TextProcessor",
+    "BatchProcessor",
+    "PerformanceMetrics",
+    "CostTracker",
+]
@@ -0,0 +1,326 @@
+"""Batch processing utilities for efficient commit analysis."""
+
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar
+from threading import Lock
+import time
+
+T = TypeVar('T')
+R = TypeVar('R')
+
+
+class BatchProcessor:
+    """Efficient batch processing for commit analysis.
+
+    This class provides utilities for processing large numbers of commits
+    in batches with parallel execution, progress tracking, and error handling.
+    """
+
+    def __init__(self, batch_size: int = 1000, max_workers: int = 4):
+        """Initialize batch processor.
+
+        Args:
+            batch_size: Number of items to process per batch
+            max_workers: Maximum number of worker threads
+        """
+        self.batch_size = batch_size
+        self.max_workers = max_workers
+        self.logger = logging.getLogger(__name__)
+        self._stats_lock = Lock()
+        self._processing_stats = {
+            'total_processed': 0,
+            'total_errors': 0,
+            'batch_times': [],
+            'start_time': None
+        }
+
+    def create_batches(self, items: List[T], batch_size: Optional[int] = None) -> Iterator[List[T]]:
+        """Split items into batches for processing.
+
+        Args:
+            items: List of items to batch
+            batch_size: Override default batch size
+
+        Yields:
+            Batches of items
+        """
+        batch_size = batch_size or self.batch_size
+
+        for i in range(0, len(items), batch_size):
+            yield items[i:i + batch_size]
+
+    def process_batches(self, items: List[T], processor_func: Callable[[List[T]], List[R]],
+                        parallel: bool = True) -> List[R]:
+        """Process items in batches with optional parallelization.
+
+        Args:
+            items: Items to process
+            processor_func: Function that processes a batch and returns results
+            parallel: Whether to use parallel processing
+
+        Returns:
+            List of all processing results
+        """
+        if not items:
+            return []
+
+        self._reset_stats()
+        self._processing_stats['start_time'] = time.time()
+
+        batches = list(self.create_batches(items))
+        self.logger.info(f"Processing {len(items)} items in {len(batches)} batches")
+
+        all_results = []
+
+        if parallel and len(batches) > 1:
+            all_results = self._process_parallel(batches, processor_func)
+        else:
+            all_results = self._process_sequential(batches, processor_func)
+
+        self._log_final_stats(len(items))
+        return all_results
+
+    def process_with_callback(self, items: List[T], processor_func: Callable[[List[T]], List[R]],
+                              progress_callback: Optional[Callable[[int, int], None]] = None) -> List[R]:
+        """Process batches with progress callback.
+
+        Args:
+            items: Items to process
+            processor_func: Function that processes a batch
+            progress_callback: Callback for progress updates (processed, total)
+
+        Returns:
+            List of all processing results
+        """
+        if not items:
+            return []
+
+        self._reset_stats()
+        batches = list(self.create_batches(items))
+        all_results = []
+        processed_count = 0
+
+        for i, batch in enumerate(batches):
+            batch_start = time.time()
+
+            try:
+                batch_results = processor_func(batch)
+                all_results.extend(batch_results)
+                processed_count += len(batch)
+
+                with self._stats_lock:
+                    self._processing_stats['total_processed'] += len(batch)
+                    self._processing_stats['batch_times'].append(time.time() - batch_start)
+
+            except Exception as e:
+                self.logger.error(f"Error processing batch {i}: {e}")
+                with self._stats_lock:
+                    self._processing_stats['total_errors'] += len(batch)
+
+            # Call progress callback if provided
+            if progress_callback:
+                progress_callback(processed_count, len(items))
+
+        return all_results
+
+    def _process_parallel(self, batches: List[List[T]],
+                          processor_func: Callable[[List[T]], List[R]]) -> List[R]:
+        """Process batches in parallel using ThreadPoolExecutor.
+
+        Args:
+            batches: List of batches to process
+            processor_func: Function to process each batch
+
+        Returns:
+            Combined results from all batches
+        """
+        all_results = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit all batches
+            future_to_batch = {
+                executor.submit(self._process_batch_with_timing, batch, processor_func): i
+                for i, batch in enumerate(batches)
+            }
+
+            # Collect results as they complete
+            for future in as_completed(future_to_batch):
+                batch_idx = future_to_batch[future]
+
+                try:
+                    batch_results, batch_time = future.result()
+                    all_results.extend(batch_results)
+
+                    with self._stats_lock:
+                        self._processing_stats['total_processed'] += len(batches[batch_idx])
+                        self._processing_stats['batch_times'].append(batch_time)
+
+                except Exception as e:
+                    self.logger.error(f"Error processing batch {batch_idx}: {e}")
+                    with self._stats_lock:
+                        self._processing_stats['total_errors'] += len(batches[batch_idx])
+
+        return all_results
+
+    def _process_sequential(self, batches: List[List[T]],
+                            processor_func: Callable[[List[T]], List[R]]) -> List[R]:
+        """Process batches sequentially.
+
+        Args:
+            batches: List of batches to process
+            processor_func: Function to process each batch
+
+        Returns:
+            Combined results from all batches
+        """
+        all_results = []
+
+        for i, batch in enumerate(batches):
+            try:
+                batch_results, batch_time = self._process_batch_with_timing(batch, processor_func)
+                all_results.extend(batch_results)
+
+                self._processing_stats['total_processed'] += len(batch)
+                self._processing_stats['batch_times'].append(batch_time)
+
+            except Exception as e:
+                self.logger.error(f"Error processing batch {i}: {e}")
+                self._processing_stats['total_errors'] += len(batch)
+
+        return all_results
+
+    def _process_batch_with_timing(self, batch: List[T],
+                                   processor_func: Callable[[List[T]], List[R]]) -> tuple[List[R], float]:
+        """Process a single batch with timing.
+
+        Args:
+            batch: Batch to process
+            processor_func: Processing function
+
+        Returns:
+            Tuple of (results, processing_time_seconds)
+        """
+        start_time = time.time()
+        results = processor_func(batch)
+        processing_time = time.time() - start_time
+
+        return results, processing_time
+
+    def _reset_stats(self) -> None:
+        """Reset processing statistics."""
+        with self._stats_lock:
+            self._processing_stats = {
+                'total_processed': 0,
+                'total_errors': 0,
+                'batch_times': [],
+                'start_time': time.time()
+            }
+
+    def _log_final_stats(self, total_items: int) -> None:
+        """Log final processing statistics.
+
+        Args:
+            total_items: Total number of items processed
+        """
+        with self._stats_lock:
+            stats = self._processing_stats.copy()
+
+        if not stats['batch_times']:
+            return
+
+        total_time = time.time() - stats['start_time']
+        avg_batch_time = sum(stats['batch_times']) / len(stats['batch_times'])
+        items_per_second = stats['total_processed'] / total_time if total_time > 0 else 0
+
+        self.logger.info(
+            f"Batch processing complete: {stats['total_processed']}/{total_items} items processed "
+            f"in {total_time:.2f}s ({items_per_second:.1f} items/s), "
+            f"{stats['total_errors']} errors, avg batch time: {avg_batch_time:.2f}s"
+        )
+
+    def get_processing_stats(self) -> Dict[str, Any]:
+        """Get current processing statistics.
+
+        Returns:
+            Dictionary with processing statistics
+        """
+        with self._stats_lock:
+            stats = self._processing_stats.copy()
+
+        if stats['start_time'] and stats['batch_times']:
+            elapsed_time = time.time() - stats['start_time']
+            avg_batch_time = sum(stats['batch_times']) / len(stats['batch_times'])
+            items_per_second = stats['total_processed'] / elapsed_time if elapsed_time > 0 else 0
+
+            return {
+                'total_processed': stats['total_processed'],
+                'total_errors': stats['total_errors'],
+                'elapsed_time_seconds': elapsed_time,
+                'avg_batch_time_seconds': avg_batch_time,
+                'items_per_second': items_per_second,
+                'batches_completed': len(stats['batch_times']),
+                'error_rate': stats['total_errors'] / (stats['total_processed'] + stats['total_errors']) if (stats['total_processed'] + stats['total_errors']) > 0 else 0.0
+            }
+        else:
+            return {
+                'total_processed': 0,
+                'total_errors': 0,
+                'elapsed_time_seconds': 0,
+                'avg_batch_time_seconds': 0,
+                'items_per_second': 0,
+                'batches_completed': 0,
+                'error_rate': 0.0
+            }
+
+
+class ProgressTracker:
+    """Simple progress tracking for long-running operations."""
+
+    def __init__(self, total: int, description: str = "Processing"):
+        """Initialize progress tracker.
+
+        Args:
+            total: Total number of items to process
+            description: Description of the operation
+        """
+        self.total = total
+        self.description = description
+        self.processed = 0
+        self.start_time = time.time()
+        self.last_report = 0
+        self.logger = logging.getLogger(__name__)
+
+    def update(self, count: int = 1) -> None:
+        """Update progress count.
+
+        Args:
+            count: Number of items processed since last update
+        """
+        self.processed += count
+
+        # Report progress every 10% or every 1000 items, whichever is less frequent
+        report_interval = max(self.total // 10, 1000)
+
+        if self.processed - self.last_report >= report_interval or self.processed >= self.total:
+            self._report_progress()
+            self.last_report = self.processed
+
+    def _report_progress(self) -> None:
+        """Report current progress."""
+        elapsed_time = time.time() - self.start_time
+        percentage = (self.processed / self.total) * 100 if self.total > 0 else 0
+        rate = self.processed / elapsed_time if elapsed_time > 0 else 0
+
+        # Estimate time remaining
+        if rate > 0 and self.processed < self.total:
+            remaining_items = self.total - self.processed
+            eta_seconds = remaining_items / rate
+            eta_str = f", ETA: {eta_seconds:.0f}s"
+        else:
+            eta_str = ""
+
+        self.logger.info(
+            f"{self.description}: {self.processed}/{self.total} ({percentage:.1f}%) "
+            f"at {rate:.1f} items/s{eta_str}"
+        )
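
For orientation, a short sketch of how a caller might combine BatchProcessor and ProgressTracker from this new module; classify_batch is a hypothetical stand-in for whatever function maps a batch of commit messages to results, not part of the package:

from gitflow_analytics.qualitative.utils.batch_processor import BatchProcessor, ProgressTracker

def classify_batch(batch):
    # Stand-in analysis step: a real caller would run NLP/LLM classification here.
    return [len(message) for message in batch]

commits = [f"commit {i}" for i in range(5000)]
processor = BatchProcessor(batch_size=500, max_workers=4)
tracker = ProgressTracker(total=len(commits), description="Classifying commits")

results = processor.process_with_callback(
    commits,
    classify_batch,
    # The callback receives (processed_so_far, total); feed only the delta to the tracker.
    progress_callback=lambda done, total: tracker.update(done - tracker.processed),
)
print(processor.get_processing_stats())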