gitflow-analytics 1.0.1 (py3-none-any.whl) → 1.3.6 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/gitflow_analytics/qualitative/models/schemas.py
@@ -0,0 +1,306 @@
+"""Data models and configuration schemas for qualitative analysis."""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any
+
+
+@dataclass
+class QualitativeCommitData:
+    """Enhanced commit data with qualitative analysis results.
+
+    This class extends basic commit information with semantic analysis results
+    including change type, business domain, risk assessment, and processing metadata.
+    """
+
+    # Existing commit data from GitFlow Analytics
+    hash: str
+    message: str
+    author_name: str
+    author_email: str
+    timestamp: datetime
+    files_changed: list[str]
+    insertions: int
+    deletions: int
+
+    # New qualitative analysis fields
+    change_type: str  # feature|bugfix|refactor|docs|test|chore|security|hotfix|config
+    change_type_confidence: float  # 0.0-1.0
+    business_domain: str  # frontend|backend|database|infrastructure|mobile|devops|unknown
+    domain_confidence: float  # 0.0-1.0
+    risk_level: str  # low|medium|high|critical
+    risk_factors: list[str]  # List of identified risk factors
+    intent_signals: dict[str, Any]  # Intent analysis results
+    collaboration_patterns: dict[str, Any]  # Team interaction patterns
+    technical_context: dict[str, Any]  # Technical context information
+
+    # Processing metadata
+    processing_method: str  # 'nlp' or 'llm'
+    processing_time_ms: float
+    confidence_score: float  # Overall confidence in analysis
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "hash": self.hash,
+            "message": self.message,
+            "author_name": self.author_name,
+            "author_email": self.author_email,
+            "timestamp": self.timestamp.isoformat(),
+            "files_changed": self.files_changed,
+            "insertions": self.insertions,
+            "deletions": self.deletions,
+            "change_type": self.change_type,
+            "change_type_confidence": self.change_type_confidence,
+            "business_domain": self.business_domain,
+            "domain_confidence": self.domain_confidence,
+            "risk_level": self.risk_level,
+            "risk_factors": self.risk_factors,
+            "intent_signals": self.intent_signals,
+            "collaboration_patterns": self.collaboration_patterns,
+            "technical_context": self.technical_context,
+            "processing_method": self.processing_method,
+            "processing_time_ms": self.processing_time_ms,
+            "confidence_score": self.confidence_score,
+        }
+
+
+@dataclass
+class ChangeTypeConfig:
+    """Configuration for change type classification."""
+
+    min_confidence: float = 0.7
+    semantic_weight: float = 0.6  # Weight for semantic features
+    file_pattern_weight: float = 0.4  # Weight for file pattern signals
+    enable_custom_patterns: bool = True
+    custom_patterns: dict[str, dict[str, list[str]]] = field(default_factory=dict)
+
+
+@dataclass
+class IntentConfig:
+    """Configuration for intent analysis."""
+
+    urgency_keywords: dict[str, list[str]] = field(
+        default_factory=lambda: {
+            "critical": ["critical", "urgent", "hotfix", "emergency", "immediate"],
+            "important": ["important", "priority", "asap", "needed"],
+            "routine": ["routine", "regular", "normal", "standard"],
+        }
+    )
+    confidence_threshold: float = 0.6
+    sentiment_analysis: bool = True
+
+
+@dataclass
+class DomainConfig:
+    """Configuration for domain classification."""
+
+    file_patterns: dict[str, list[str]] = field(
+        default_factory=lambda: {
+            "frontend": ["*.js", "*.jsx", "*.ts", "*.tsx", "*.vue", "*.html", "*.css", "*.scss"],
+            "backend": ["*.py", "*.java", "*.go", "*.rb", "*.php", "*.cs", "*.cpp"],
+            "database": ["*.sql", "migrations/*", "schema/*", "**/models/**"],
+            "infrastructure": ["Dockerfile", "*.yaml", "*.yml", "terraform/*", "*.tf"],
+            "mobile": ["*.swift", "*.kt", "*.java", "android/*", "ios/*"],
+            "devops": ["*.yml", "*.yaml", "ci/*", ".github/*", "docker/*"],
+        }
+    )
+    keyword_patterns: dict[str, list[str]] = field(
+        default_factory=lambda: {
+            "frontend": ["ui", "component", "styling", "interface", "layout"],
+            "backend": ["api", "endpoint", "service", "server", "logic"],
+            "database": ["query", "schema", "migration", "data", "model"],
+            "infrastructure": ["deploy", "config", "environment", "setup"],
+            "mobile": ["android", "ios", "mobile", "app"],
+            "devops": ["build", "pipeline", "deploy", "ci", "docker"],
+        }
+    )
+    min_confidence: float = 0.6
+
+
+@dataclass
+class RiskConfig:
+    """Configuration for risk analysis."""
+
+    high_risk_patterns: list[str] = field(
+        default_factory=lambda: [
+            # Security-related patterns
+            "password",
+            "secret",
+            "key",
+            "token",
+            "auth",
+            "security",
+            # Critical system patterns
+            "production",
+            "prod",
+            "critical",
+            "emergency",
+            # Infrastructure patterns
+            "database",
+            "migration",
+            "schema",
+            "deploy",
+            # Large change patterns
+            "refactor",
+            "rewrite",
+            "restructure",
+        ]
+    )
+    medium_risk_patterns: list[str] = field(
+        default_factory=lambda: [
+            "config",
+            "configuration",
+            "settings",
+            "environment",
+            "api",
+            "endpoint",
+            "service",
+            "integration",
+        ]
+    )
+    file_risk_patterns: dict[str, str] = field(
+        default_factory=lambda: {
+            # High risk file patterns
+            "**/*prod*": "high",
+            "**/migrations/**": "high",
+            "**/schema/**": "high",
+            "Dockerfile": "medium",
+            "*.yml": "medium",
+            "*.yaml": "medium",
+            "**/*config*": "medium",
+        }
+    )
+    size_thresholds: dict[str, int] = field(
+        default_factory=lambda: {
+            "large_commit_files": 20,  # Files changed
+            "large_commit_lines": 500,  # Lines changed
+            "massive_commit_lines": 2000,  # Very large changes
+        }
+    )
+
+
+@dataclass
+class NLPConfig:
+    """Configuration for NLP processing engine."""
+
+    spacy_model: str = "en_core_web_sm"
+    spacy_batch_size: int = 1000
+    fast_mode: bool = True  # Disable parser/NER for speed
+
+    # Component configurations
+    change_type_config: ChangeTypeConfig = field(default_factory=ChangeTypeConfig)
+    intent_config: IntentConfig = field(default_factory=IntentConfig)
+    domain_config: DomainConfig = field(default_factory=DomainConfig)
+    risk_config: RiskConfig = field(default_factory=RiskConfig)
+
+    # Performance settings
+    enable_parallel_processing: bool = True
+    max_workers: int = 4
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for LLM fallback processing via OpenRouter."""
+
+    # OpenRouter API settings
+    openrouter_api_key: str = "${OPENROUTER_API_KEY}"
+    base_url: str = "https://openrouter.ai/api/v1"
+
+    # Model selection strategy
+    primary_model: str = "anthropic/claude-3-haiku"  # Fast, cheap classification
+    fallback_model: str = "meta-llama/llama-3.1-8b-instruct:free"  # Free fallback
+    complex_model: str = "anthropic/claude-3-sonnet"  # For complex cases
+
+    # Model routing thresholds
+    complexity_threshold: float = 0.5  # Route complex cases to better model
+    cost_threshold_per_1k: float = 0.01  # Max cost per 1k commits
+
+    # Processing settings
+    max_tokens: int = 1000
+    temperature: float = 0.1
+
+    # Batching settings
+    max_group_size: int = 10  # Process up to 10 commits per batch
+    similarity_threshold: float = 0.8  # Group similar commits together
+
+    # Rate limiting
+    requests_per_minute: int = 200  # Higher limit with OpenRouter
+    max_retries: int = 3
+
+    # Cost control
+    max_daily_cost: float = 5.0  # Max daily spend in USD
+    enable_cost_tracking: bool = True
+
+
+@dataclass
+class CacheConfig:
+    """Configuration for qualitative analysis caching."""
+
+    cache_dir: str = ".qualitative_cache"
+    semantic_cache_size: int = 10000  # Max cached patterns
+    pattern_cache_ttl_hours: int = 168  # 1 week
+
+    # Learning settings
+    enable_pattern_learning: bool = True
+    learning_threshold: int = 10  # Min examples to learn pattern
+    confidence_boost_factor: float = 0.1  # Boost for learned patterns
+
+    # Cache optimization
+    enable_compression: bool = True
+    max_cache_size_mb: int = 100
+
+
+@dataclass
+class QualitativeConfig:
+    """Main configuration for qualitative analysis system.
+
+    This configuration orchestrates the entire qualitative analysis pipeline,
+    balancing performance, accuracy, and cost through intelligent NLP and
+    strategic LLM usage.
+    """
+
+    # Processing settings
+    enabled: bool = True
+    batch_size: int = 1000  # Commits processed per batch
+    max_llm_fallback_pct: float = 0.15  # Max 15% of commits use LLM
+    confidence_threshold: float = 0.7  # Min confidence for NLP results
+
+    # Component configurations
+    nlp_config: NLPConfig = field(default_factory=NLPConfig)
+    llm_config: LLMConfig = field(default_factory=LLMConfig)
+    cache_config: CacheConfig = field(default_factory=CacheConfig)
+
+    # Performance monitoring
+    enable_performance_tracking: bool = True
+    target_processing_time_ms: float = 2.0  # Target per-commit processing time
+
+    # Quality settings
+    min_overall_confidence: float = 0.6  # Min confidence for any result
+    enable_quality_feedback: bool = True  # Learn from corrections
+
+    def validate(self) -> list[str]:
+        """Validate configuration and return any warnings.
+
+        Returns:
+            List of validation warning messages.
+        """
+        warnings = []
+
+        if self.max_llm_fallback_pct > 0.3:
+            warnings.append("LLM fallback percentage > 30% may result in high costs")
+
+        if self.confidence_threshold > 0.9:
+            warnings.append("Very high confidence threshold may route too many commits to LLM")
+
+        if self.batch_size > 5000:
+            warnings.append("Large batch size may cause memory issues")
+
+        # Validate LLM config if API key is set
+        if (
+            self.llm_config.openrouter_api_key
+            and self.llm_config.openrouter_api_key != "${OPENROUTER_API_KEY}"
+        ) and self.llm_config.max_daily_cost < 1.0:
+            warnings.append("Very low daily cost limit may restrict LLM usage")
+
+        return warnings
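
The dataclasses above compose into a single QualitativeConfig entry point. A minimal usage sketch (illustrative only, not taken from the package documentation; the import path follows the file list above, and the field overrides are arbitrary examples):

from gitflow_analytics.qualitative.models.schemas import LLMConfig, QualitativeConfig

config = QualitativeConfig(
    batch_size=2000,
    max_llm_fallback_pct=0.35,  # deliberately above 0.3 to trigger a cost warning
    llm_config=LLMConfig(max_daily_cost=0.5),  # only flagged if a real API key is configured
)

for warning in config.validate():
    print(f"config warning: {warning}")

Note that validate() only returns warning strings; it does not raise, so callers are expected to log or surface the returned messages themselves.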
--- /dev/null
+++ b/gitflow_analytics/qualitative/utils/__init__.py
@@ -0,0 +1,13 @@
+"""Utility functions for qualitative analysis."""
+
+from .batch_processor import BatchProcessor
+from .cost_tracker import CostTracker
+from .metrics import PerformanceMetrics
+from .text_processing import TextProcessor
+
+__all__ = [
+    "TextProcessor",
+    "BatchProcessor",
+    "PerformanceMetrics",
+    "CostTracker",
+]
--- /dev/null
+++ b/gitflow_analytics/qualitative/utils/batch_processor.py
@@ -0,0 +1,339 @@
+"""Batch processing utilities for efficient commit analysis."""
+
+import logging
+import time
+from collections.abc import Iterator
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
+from typing import Any, Callable, Optional, TypeVar
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+class BatchProcessor:
+    """Efficient batch processing for commit analysis.
+
+    This class provides utilities for processing large numbers of commits
+    in batches with parallel execution, progress tracking, and error handling.
+    """
+
+    def __init__(self, batch_size: int = 1000, max_workers: int = 4):
+        """Initialize batch processor.
+
+        Args:
+            batch_size: Number of items to process per batch
+            max_workers: Maximum number of worker threads
+        """
+        self.batch_size = batch_size
+        self.max_workers = max_workers
+        self.logger = logging.getLogger(__name__)
+        self._stats_lock = Lock()
+        self._processing_stats = {
+            "total_processed": 0,
+            "total_errors": 0,
+            "batch_times": [],
+            "start_time": None,
+        }
+
+    def create_batches(self, items: list[T], batch_size: Optional[int] = None) -> Iterator[list[T]]:
+        """Split items into batches for processing.
+
+        Args:
+            items: List of items to batch
+            batch_size: Override default batch size
+
+        Yields:
+            Batches of items
+        """
+        batch_size = batch_size or self.batch_size
+
+        for i in range(0, len(items), batch_size):
+            yield items[i : i + batch_size]
+
+    def process_batches(
+        self, items: list[T], processor_func: Callable[[list[T]], list[R]], parallel: bool = True
+    ) -> list[R]:
+        """Process items in batches with optional parallelization.
+
+        Args:
+            items: Items to process
+            processor_func: Function that processes a batch and returns results
+            parallel: Whether to use parallel processing
+
+        Returns:
+            List of all processing results
+        """
+        if not items:
+            return []
+
+        self._reset_stats()
+        self._processing_stats["start_time"] = time.time()
+
+        batches = list(self.create_batches(items))
+        self.logger.info(f"Processing {len(items)} items in {len(batches)} batches")
+
+        all_results = []
+
+        if parallel and len(batches) > 1:
+            all_results = self._process_parallel(batches, processor_func)
+        else:
+            all_results = self._process_sequential(batches, processor_func)
+
+        self._log_final_stats(len(items))
+        return all_results
+
+    def process_with_callback(
+        self,
+        items: list[T],
+        processor_func: Callable[[list[T]], list[R]],
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+    ) -> list[R]:
+        """Process batches with progress callback.
+
+        Args:
+            items: Items to process
+            processor_func: Function that processes a batch
+            progress_callback: Callback for progress updates (processed, total)
+
+        Returns:
+            List of all processing results
+        """
+        if not items:
+            return []
+
+        self._reset_stats()
+        batches = list(self.create_batches(items))
+        all_results = []
+        processed_count = 0
+
+        for i, batch in enumerate(batches):
+            batch_start = time.time()
+
+            try:
+                batch_results = processor_func(batch)
+                all_results.extend(batch_results)
+                processed_count += len(batch)
+
+                with self._stats_lock:
+                    self._processing_stats["total_processed"] += len(batch)
+                    self._processing_stats["batch_times"].append(time.time() - batch_start)
+
+            except Exception as e:
+                self.logger.error(f"Error processing batch {i}: {e}")
+                with self._stats_lock:
+                    self._processing_stats["total_errors"] += len(batch)
+
+            # Call progress callback if provided
+            if progress_callback:
+                progress_callback(processed_count, len(items))
+
+        return all_results
+
+    def _process_parallel(
+        self, batches: list[list[T]], processor_func: Callable[[list[T]], list[R]]
+    ) -> list[R]:
+        """Process batches in parallel using ThreadPoolExecutor.
+
+        Args:
+            batches: List of batches to process
+            processor_func: Function to process each batch
+
+        Returns:
+            Combined results from all batches
+        """
+        all_results = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit all batches
+            future_to_batch = {
+                executor.submit(self._process_batch_with_timing, batch, processor_func): i
+                for i, batch in enumerate(batches)
+            }
+
+            # Collect results as they complete
+            for future in as_completed(future_to_batch):
+                batch_idx = future_to_batch[future]
+
+                try:
+                    batch_results, batch_time = future.result()
+                    all_results.extend(batch_results)
+
+                    with self._stats_lock:
+                        self._processing_stats["total_processed"] += len(batches[batch_idx])
+                        self._processing_stats["batch_times"].append(batch_time)
+
+                except Exception as e:
+                    self.logger.error(f"Error processing batch {batch_idx}: {e}")
+                    with self._stats_lock:
+                        self._processing_stats["total_errors"] += len(batches[batch_idx])
+
+        return all_results
+
+    def _process_sequential(
+        self, batches: list[list[T]], processor_func: Callable[[list[T]], list[R]]
+    ) -> list[R]:
+        """Process batches sequentially.
+
+        Args:
+            batches: List of batches to process
+            processor_func: Function to process each batch
+
+        Returns:
+            Combined results from all batches
+        """
+        all_results = []
+
+        for i, batch in enumerate(batches):
+            try:
+                batch_results, batch_time = self._process_batch_with_timing(batch, processor_func)
+                all_results.extend(batch_results)
+
+                self._processing_stats["total_processed"] += len(batch)
+                self._processing_stats["batch_times"].append(batch_time)
+
+            except Exception as e:
+                self.logger.error(f"Error processing batch {i}: {e}")
+                self._processing_stats["total_errors"] += len(batch)
+
+        return all_results
+
+    def _process_batch_with_timing(
+        self, batch: list[T], processor_func: Callable[[list[T]], list[R]]
+    ) -> tuple[list[R], float]:
+        """Process a single batch with timing.
+
+        Args:
+            batch: Batch to process
+            processor_func: Processing function
+
+        Returns:
+            Tuple of (results, processing_time_seconds)
+        """
+        start_time = time.time()
+        results = processor_func(batch)
+        processing_time = time.time() - start_time
+
+        return results, processing_time
+
+    def _reset_stats(self) -> None:
+        """Reset processing statistics."""
+        with self._stats_lock:
+            self._processing_stats = {
+                "total_processed": 0,
+                "total_errors": 0,
+                "batch_times": [],
+                "start_time": time.time(),
+            }
+
+    def _log_final_stats(self, total_items: int) -> None:
+        """Log final processing statistics.
+
+        Args:
+            total_items: Total number of items processed
+        """
+        with self._stats_lock:
+            stats = self._processing_stats.copy()
+
+        if not stats["batch_times"]:
+            return
+
+        total_time = time.time() - stats["start_time"]
+        avg_batch_time = sum(stats["batch_times"]) / len(stats["batch_times"])
+        items_per_second = stats["total_processed"] / total_time if total_time > 0 else 0
+
+        self.logger.info(
+            f"Batch processing complete: {stats['total_processed']}/{total_items} items processed "
+            f"in {total_time:.2f}s ({items_per_second:.1f} items/s), "
+            f"{stats['total_errors']} errors, avg batch time: {avg_batch_time:.2f}s"
+        )
+
+    def get_processing_stats(self) -> dict[str, Any]:
+        """Get current processing statistics.
+
+        Returns:
+            Dictionary with processing statistics
+        """
+        with self._stats_lock:
+            stats = self._processing_stats.copy()
+
+        if stats["start_time"] and stats["batch_times"]:
+            elapsed_time = time.time() - stats["start_time"]
+            avg_batch_time = sum(stats["batch_times"]) / len(stats["batch_times"])
+            items_per_second = stats["total_processed"] / elapsed_time if elapsed_time > 0 else 0
+
+            return {
+                "total_processed": stats["total_processed"],
+                "total_errors": stats["total_errors"],
+                "elapsed_time_seconds": elapsed_time,
+                "avg_batch_time_seconds": avg_batch_time,
+                "items_per_second": items_per_second,
+                "batches_completed": len(stats["batch_times"]),
+                "error_rate": (
+                    stats["total_errors"] / (stats["total_processed"] + stats["total_errors"])
+                    if (stats["total_processed"] + stats["total_errors"]) > 0
+                    else 0.0
+                ),
+            }
+        else:
+            return {
+                "total_processed": 0,
+                "total_errors": 0,
+                "elapsed_time_seconds": 0,
+                "avg_batch_time_seconds": 0,
+                "items_per_second": 0,
+                "batches_completed": 0,
+                "error_rate": 0.0,
+            }
+
+
+class ProgressTracker:
+    """Simple progress tracking for long-running operations."""
+
+    def __init__(self, total: int, description: str = "Processing"):
+        """Initialize progress tracker.
+
+        Args:
+            total: Total number of items to process
+            description: Description of the operation
+        """
+        self.total = total
+        self.description = description
+        self.processed = 0
+        self.start_time = time.time()
+        self.last_report = 0
+        self.logger = logging.getLogger(__name__)
+
+    def update(self, count: int = 1) -> None:
+        """Update progress count.
+
+        Args:
+            count: Number of items processed since last update
+        """
+        self.processed += count
+
+        # Report progress every 10% or every 1000 items, whichever is less frequent
+        report_interval = max(self.total // 10, 1000)
+
+        if self.processed - self.last_report >= report_interval or self.processed >= self.total:
+            self._report_progress()
+            self.last_report = self.processed
+
+    def _report_progress(self) -> None:
+        """Report current progress."""
+        elapsed_time = time.time() - self.start_time
+        percentage = (self.processed / self.total) * 100 if self.total > 0 else 0
+        rate = self.processed / elapsed_time if elapsed_time > 0 else 0
+
+        # Estimate time remaining
+        if rate > 0 and self.processed < self.total:
+            remaining_items = self.total - self.processed
+            eta_seconds = remaining_items / rate
+            eta_str = f", ETA: {eta_seconds:.0f}s"
+        else:
+            eta_str = ""
+
+        self.logger.info(
+            f"{self.description}: {self.processed}/{self.total} ({percentage:.1f}%) "
+            f"at {rate:.1f} items/s{eta_str}"
+        )
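
For context, a minimal usage sketch of the BatchProcessor added above (illustrative only; classify_batch is a hypothetical stand-in for real per-batch work, and the import path follows the file list):

import logging

from gitflow_analytics.qualitative.utils.batch_processor import BatchProcessor

logging.basicConfig(level=logging.INFO)


def classify_batch(batch: list[str]) -> list[str]:
    # Stand-in for real per-batch work (e.g. NLP classification of commit messages).
    return [message.upper() for message in batch]


processor = BatchProcessor(batch_size=500, max_workers=2)
messages = [f"commit message {i}" for i in range(2000)]

results = processor.process_batches(messages, classify_batch, parallel=True)
print(len(results), processor.get_processing_stats()["total_errors"])

process_batches handles batching, threading, and per-batch error logging itself, so the processor function only needs to map one batch of inputs to one list of outputs.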