gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/core/processor.py (new file)
@@ -0,0 +1,673 @@
+ """Main orchestrator for qualitative analysis of Git commits."""
2
+
3
+ import logging
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Any, Optional
7
+
8
+ from ...core.schema_version import create_schema_manager
9
+ from ...models.database import Database
10
+ from ..models.schemas import QualitativeCommitData, QualitativeConfig
11
+ from ..utils.batch_processor import BatchProcessor, ProgressTracker
12
+ from ..utils.metrics import PerformanceMetrics
13
+ from .llm_fallback import LLMFallback
14
+ from .nlp_engine import NLPEngine
15
+ from .pattern_cache import PatternCache
16
+
17
+
18
+ class QualitativeProcessor:
19
+ """Main orchestrator for qualitative analysis of Git commits.
20
+
21
+ This processor coordinates the entire qualitative analysis pipeline:
22
+ 1. Pattern cache lookup for known commit patterns
23
+ 2. Fast NLP processing for most commits
24
+ 3. Strategic LLM fallback for uncertain cases
25
+ 4. Pattern learning and cache updates
26
+ 5. Performance monitoring and optimization
27
+
28
+ The system is designed to process 10,000+ commits in under 60 seconds
29
+ while maintaining high accuracy and keeping LLM costs low.
30
+ """
31
+
32
+ def __init__(
33
+ self, config: QualitativeConfig, database: Database, cache_dir: Optional[Path] = None
34
+ ):
35
+ """Initialize qualitative processor.
36
+
37
+ Args:
38
+ config: Configuration for qualitative analysis
39
+ database: Database instance for caching and storage
40
+ cache_dir: Cache directory for schema versioning
41
+ """
42
+ self.config = config
43
+ self.database = database
44
+ self.logger = logging.getLogger(__name__)
45
+
46
+ # Initialize schema version manager
47
+ if cache_dir is None:
48
+ cache_dir = Path(config.cache_config.cache_dir)
49
+ self.schema_manager = create_schema_manager(cache_dir)
50
+
51
+ # Initialize core components
52
+ self.nlp_engine = NLPEngine(config.nlp_config)
53
+ self.pattern_cache = PatternCache(config.cache_config, database)
54
+
55
+ # Initialize LLM fallback if enabled
56
+ self.llm_fallback = None
57
+ if config.llm_config.openrouter_api_key:
58
+ try:
59
+ self.llm_fallback = LLMFallback(config.llm_config, cache_dir)
60
+ self.logger.info("LLM fallback system initialized")
61
+ except Exception as e:
62
+ self.logger.warning(f"LLM fallback initialization failed: {e}")
63
+ else:
64
+ self.logger.info("LLM fallback disabled (no API key configured)")
65
+
66
+ # Initialize utilities
67
+ self.batch_processor = BatchProcessor(
68
+ batch_size=config.batch_size, max_workers=config.nlp_config.max_workers
69
+ )
70
+ self.metrics = PerformanceMetrics()
71
+
72
+ # Processing statistics
73
+ self.processing_stats = {
74
+ "total_processed": 0,
75
+ "cache_hits": 0,
76
+ "nlp_processed": 0,
77
+ "llm_processed": 0,
78
+ "processing_start_time": None,
79
+ "last_optimization": None,
80
+ }
81
+
82
+ self.logger.info("Qualitative processor initialized")
83
+
84
+ def _filter_commits_for_processing(
85
+ self, commits: list[dict[str, Any]], force_reprocess: bool = False
86
+ ) -> list[dict[str, Any]]:
87
+ """Filter commits to only those that need processing based on schema versioning."""
88
+ if force_reprocess:
89
+ return commits
90
+
91
+ # Convert config to dict for schema comparison
92
+ config_dict = {
93
+ "nlp_config": self.config.nlp_config.__dict__,
94
+ "llm_config": self.config.llm_config.__dict__,
95
+ "cache_config": self.config.cache_config.__dict__,
96
+ "confidence_threshold": self.config.confidence_threshold,
97
+ "max_llm_fallback_pct": self.config.max_llm_fallback_pct,
98
+ }
99
+
100
+ # Check if schema has changed
101
+ schema_changed = self.schema_manager.has_schema_changed("qualitative", config_dict)
102
+
103
+ if schema_changed:
104
+ self.logger.info("Qualitative analysis schema has changed, reprocessing all commits")
105
+ return commits
106
+
107
+ # Filter by date - only process commits after last processed date
108
+ last_processed = self.schema_manager.get_last_processed_date("qualitative")
109
+ if not last_processed:
110
+ self.logger.info("No previous processing date found, processing all commits")
111
+ return commits
112
+
113
+ # Filter commits by date
114
+ commits_to_process = []
115
+ for commit in commits:
116
+ commit_date = commit.get("timestamp")
117
+ if commit_date and commit_date > last_processed:
118
+ commits_to_process.append(commit)
119
+
120
+ return commits_to_process
121
+
122
+ def _get_existing_results(self, commits: list[dict[str, Any]]) -> list[QualitativeCommitData]:
123
+ """Get existing qualitative results for commits from the database."""
124
+ results = []
125
+
126
+ # Try to load existing results from database
127
+ # This is a simplified version - in practice you'd query the qualitative_commits table
128
+ for commit in commits:
129
+ # Create minimal result indicating no processing needed
130
+ result = QualitativeCommitData(
131
+ hash=commit.get("hash", ""),
132
+ message=commit.get("message", ""),
133
+ author_name=commit.get("author_name", ""),
134
+ author_email=commit.get("author_email", ""),
135
+ timestamp=commit.get("timestamp"),
136
+ files_changed=commit.get("files_changed", []),
137
+ insertions=commit.get("insertions", 0),
138
+ deletions=commit.get("deletions", 0),
139
+ change_type="unknown",
140
+ change_type_confidence=0.0,
141
+ business_domain="unknown",
142
+ domain_confidence=0.0,
143
+ risk_level="low",
144
+ risk_factors=[],
145
+ intent_signals={},
146
+ collaboration_patterns={},
147
+ technical_context={},
148
+ processing_method="cached",
149
+ processing_time_ms=0.0,
150
+ confidence_score=0.0,
151
+ )
152
+ results.append(result)
153
+
154
+ return results
155
+
156
+ def _update_schema_tracking(self, commits: list[dict[str, Any]]):
157
+ """Update schema version tracking after processing commits."""
158
+ if not commits:
159
+ return
160
+
161
+ # Convert config to dict for schema tracking
162
+ config_dict = {
163
+ "nlp_config": self.config.nlp_config.__dict__,
164
+ "llm_config": self.config.llm_config.__dict__,
165
+ "cache_config": self.config.cache_config.__dict__,
166
+ "confidence_threshold": self.config.confidence_threshold,
167
+ "max_llm_fallback_pct": self.config.max_llm_fallback_pct,
168
+ }
169
+
170
+ # Find the latest commit date
171
+ latest_date = max(commit.get("timestamp") for commit in commits if commit.get("timestamp"))
172
+
173
+ # Update schema version with latest processed date
174
+ self.schema_manager.update_schema_version("qualitative", config_dict, latest_date)
175
+ self.schema_manager.mark_date_processed("qualitative", latest_date, config_dict)
176
+
177
+ def process_commits(
178
+ self,
179
+ commits: list[dict[str, Any]],
180
+ show_progress: bool = True,
181
+ force_reprocess: bool = False,
182
+ ) -> list[QualitativeCommitData]:
183
+ """Process commits with qualitative analysis using incremental processing.
184
+
185
+ Args:
186
+ commits: List of commit dictionaries from GitFlow Analytics
187
+ show_progress: Whether to show progress information
188
+ force_reprocess: Force reprocessing even if schema hasn't changed
189
+
190
+ Returns:
191
+ List of QualitativeCommitData with analysis results
192
+ """
193
+ if not commits:
194
+ return []
195
+
196
+ if not self.config.enabled:
197
+ self.logger.info("Qualitative analysis disabled in configuration")
198
+ return self._create_disabled_results(commits)
199
+
200
+ # Filter commits for incremental processing
201
+ commits_to_process = self._filter_commits_for_processing(commits, force_reprocess)
202
+
203
+ if not commits_to_process:
204
+ self.logger.info("No commits require processing (all up-to-date)")
205
+ # Return existing results for all commits
206
+ return self._get_existing_results(commits)
207
+
208
+ self.processing_stats["processing_start_time"] = time.time()
209
+ self.logger.info(
210
+ f"Starting qualitative analysis of {len(commits_to_process)} commits "
211
+ f"({len(commits) - len(commits_to_process)} already processed)"
212
+ )
213
+
214
+ # Setup progress tracking
215
+ progress_tracker = (
216
+ ProgressTracker(total=len(commits), description="Qualitative Analysis")
217
+ if show_progress
218
+ else None
219
+ )
220
+
221
+ # Step 1: Check cache for known patterns
222
+ cached_results, uncached_commits = self._check_cache(commits, progress_tracker)
223
+ self.logger.info(
224
+ f"Cache provided {len(cached_results)} results, processing {len(uncached_commits)} commits"
225
+ )
226
+
227
+ # Step 2: Process uncached commits with NLP
228
+ nlp_results = []
229
+ if uncached_commits:
230
+ nlp_results = self._process_with_nlp(uncached_commits, progress_tracker)
231
+
232
+ # Step 3: Identify uncertain cases for LLM processing
233
+ confident_results, uncertain_commits = self._separate_by_confidence(nlp_results)
234
+ self.logger.info(
235
+ f"NLP confident: {len(confident_results)}, uncertain: {len(uncertain_commits)}"
236
+ )
237
+
238
+ # Step 4: Process uncertain cases with LLM if available
239
+ llm_results = []
240
+ if uncertain_commits and self.llm_fallback:
241
+ llm_results = self._process_with_llm(uncertain_commits, progress_tracker)
242
+ else:
243
+ # If no LLM available, keep uncertain results with lower confidence
244
+ llm_results = [
245
+ self._convert_to_uncertain_result(commit) for commit in uncertain_commits
246
+ ]
247
+
248
+ # Step 5: Update cache with new high-confidence patterns
249
+ all_new_results = confident_results + llm_results
250
+ if self.config.cache_config.enable_pattern_learning:
251
+ self.pattern_cache.learn_from_results(all_new_results)
252
+
253
+ # Step 6: Combine all results
254
+ all_results = cached_results + confident_results + llm_results
255
+
256
+ # Update processing statistics
257
+ self._update_processing_stats(
258
+ len(commits), len(cached_results), len(confident_results), len(llm_results)
259
+ )
260
+
261
+ # Periodic cache optimization
262
+ if self._should_optimize_cache():
263
+ self._optimize_system()
264
+
265
+ # Update schema tracking after successful processing
266
+ self._update_schema_tracking(commits_to_process)
267
+
268
+ self.logger.info(
269
+ f"Qualitative analysis completed in {time.time() - self.processing_stats['processing_start_time']:.2f}s"
270
+ )
271
+ return all_results
272
+
273
+ def _check_cache(
274
+ self, commits: list[dict[str, Any]], progress_tracker: Optional[ProgressTracker]
275
+ ) -> tuple[list[QualitativeCommitData], list[dict[str, Any]]]:
276
+ """Check pattern cache for known commit patterns.
277
+
278
+ Args:
279
+ commits: List of commit dictionaries
280
+ progress_tracker: Optional progress tracker
281
+
282
+ Returns:
283
+ Tuple of (cached_results, uncached_commits)
284
+ """
285
+ cached_results = []
286
+ uncached_commits = []
287
+
288
+ for commit in commits:
289
+ cached_result = self.pattern_cache.lookup_pattern(
290
+ commit.get("message", ""), commit.get("files_changed", [])
291
+ )
292
+
293
+ if cached_result:
294
+ # Convert cached result to QualitativeCommitData
295
+ result = self._create_result_from_cache(commit, cached_result)
296
+ cached_results.append(result)
297
+ self.processing_stats["cache_hits"] += 1
298
+ else:
299
+ uncached_commits.append(commit)
300
+
301
+ if progress_tracker:
302
+ progress_tracker.update(1)
303
+
304
+ return cached_results, uncached_commits
305
+
306
+ def _process_with_nlp(
307
+ self, commits: list[dict[str, Any]], progress_tracker: Optional[ProgressTracker]
308
+ ) -> list[QualitativeCommitData]:
309
+ """Process commits using NLP engine.
310
+
311
+ Args:
312
+ commits: List of commit dictionaries
313
+ progress_tracker: Optional progress tracker
314
+
315
+ Returns:
316
+ List of QualitativeCommitData from NLP processing
317
+ """
318
+ if not commits:
319
+ return []
320
+
321
+ def process_batch_with_progress(batch: list[dict[str, Any]]) -> list[QualitativeCommitData]:
322
+ results = self.nlp_engine.process_batch(batch)
323
+ if progress_tracker:
324
+ progress_tracker.update(len(batch))
325
+ return results
326
+
327
+ # Use batch processing for efficiency
328
+ if self.config.nlp_config.enable_parallel_processing and len(commits) > 1000:
329
+ all_results = self.batch_processor.process_batches(
330
+ commits, process_batch_with_progress, parallel=True
331
+ )
332
+ else:
333
+ all_results = self.batch_processor.process_batches(
334
+ commits, process_batch_with_progress, parallel=False
335
+ )
336
+
337
+ self.processing_stats["nlp_processed"] += len(commits)
338
+ return all_results
339
+
340
+ def _separate_by_confidence(
341
+ self, results: list[QualitativeCommitData]
342
+ ) -> tuple[list[QualitativeCommitData], list[dict[str, Any]]]:
343
+ """Separate results by confidence threshold.
344
+
345
+ Args:
346
+ results: List of NLP analysis results
347
+
348
+ Returns:
349
+ Tuple of (confident_results, uncertain_commit_dicts)
350
+ """
351
+ confident_results = []
352
+ uncertain_commits = []
353
+
354
+ for result in results:
355
+ if result.confidence_score >= self.config.confidence_threshold:
356
+ confident_results.append(result)
357
+ else:
358
+ # Convert back to commit dict for LLM processing
359
+ commit_dict = {
360
+ "hash": result.hash,
361
+ "message": result.message,
362
+ "author_name": result.author_name,
363
+ "author_email": result.author_email,
364
+ "timestamp": result.timestamp,
365
+ "files_changed": result.files_changed,
366
+ "insertions": result.insertions,
367
+ "deletions": result.deletions,
368
+ }
369
+ uncertain_commits.append(commit_dict)
370
+
371
+ return confident_results, uncertain_commits
372
+
373
+ def _process_with_llm(
374
+ self, commits: list[dict[str, Any]], progress_tracker: Optional[ProgressTracker]
375
+ ) -> list[QualitativeCommitData]:
376
+ """Process uncertain commits with LLM fallback.
377
+
378
+ Args:
379
+ commits: List of uncertain commit dictionaries
380
+ progress_tracker: Optional progress tracker
381
+
382
+ Returns:
383
+ List of QualitativeCommitData from LLM processing
384
+ """
385
+ if not commits or not self.llm_fallback:
386
+ return []
387
+
388
+ # Check LLM usage limits
389
+ max_llm_commits = int(len(commits) * self.config.max_llm_fallback_pct)
390
+ if len(commits) > max_llm_commits:
391
+ self.logger.warning(
392
+ f"LLM limit reached: processing {max_llm_commits} of {len(commits)} uncertain commits"
393
+ )
394
+ commits = commits[:max_llm_commits]
395
+
396
+ # Group similar commits for batch processing
397
+ grouped_commits = self.llm_fallback.group_similar_commits(commits)
398
+ self.logger.debug(
399
+ f"Grouped {len(commits)} commits into {len(grouped_commits)} groups for LLM processing"
400
+ )
401
+
402
+ all_results = []
403
+
404
+ for group in grouped_commits:
405
+ try:
406
+ group_results = self.llm_fallback.process_group(group)
407
+ all_results.extend(group_results)
408
+
409
+ if progress_tracker:
410
+ progress_tracker.update(len(group))
411
+
412
+ except Exception as e:
413
+ self.logger.error(f"LLM processing failed for group of {len(group)} commits: {e}")
414
+ # Create fallback results for this group
415
+ fallback_results = [self._convert_to_uncertain_result(commit) for commit in group]
416
+ all_results.extend(fallback_results)
417
+
418
+ if progress_tracker:
419
+ progress_tracker.update(len(group))
420
+
421
+ self.processing_stats["llm_processed"] += len(commits)
422
+ return all_results
423
+
424
+ def _create_result_from_cache(
425
+ self, commit: dict[str, Any], cached_data: dict[str, Any]
426
+ ) -> QualitativeCommitData:
427
+ """Create QualitativeCommitData from cached pattern.
428
+
429
+ Args:
430
+ commit: Original commit dictionary
431
+ cached_data: Cached classification data
432
+
433
+ Returns:
434
+ QualitativeCommitData object
435
+ """
436
+ return QualitativeCommitData(
437
+ # Copy commit fields
438
+ hash=commit.get("hash", ""),
439
+ message=commit.get("message", ""),
440
+ author_name=commit.get("author_name", ""),
441
+ author_email=commit.get("author_email", ""),
442
+ timestamp=commit.get("timestamp", time.time()),
443
+ files_changed=commit.get("files_changed", []),
444
+ insertions=commit.get("insertions", 0),
445
+ deletions=commit.get("deletions", 0),
446
+ # Use cached classification data
447
+ change_type=cached_data.get("change_type", "unknown"),
448
+ change_type_confidence=cached_data.get("change_type_confidence", 0.5),
449
+ business_domain=cached_data.get("business_domain", "unknown"),
450
+ domain_confidence=cached_data.get("domain_confidence", 0.5),
451
+ risk_level=cached_data.get("risk_level", "medium"),
452
+ risk_factors=cached_data.get("risk_factors", []),
453
+ intent_signals=cached_data.get("intent_signals", {}),
454
+ collaboration_patterns=cached_data.get("collaboration_patterns", {}),
455
+ technical_context={"processing_method": "cached"},
456
+ # Processing metadata
457
+ processing_method="cache",
458
+ processing_time_ms=0.5, # Very fast for cached results
459
+ confidence_score=cached_data.get("confidence_score", 0.5),
460
+ )
461
+
462
+ def _convert_to_uncertain_result(self, commit: dict[str, Any]) -> QualitativeCommitData:
463
+ """Convert commit to uncertain result when LLM is unavailable.
464
+
465
+ Args:
466
+ commit: Commit dictionary
467
+
468
+ Returns:
469
+ QualitativeCommitData with uncertain classifications
470
+ """
471
+ return QualitativeCommitData(
472
+ # Copy commit fields
473
+ hash=commit.get("hash", ""),
474
+ message=commit.get("message", ""),
475
+ author_name=commit.get("author_name", ""),
476
+ author_email=commit.get("author_email", ""),
477
+ timestamp=commit.get("timestamp", time.time()),
478
+ files_changed=commit.get("files_changed", []),
479
+ insertions=commit.get("insertions", 0),
480
+ deletions=commit.get("deletions", 0),
481
+ # Uncertain classifications
482
+ change_type="unknown",
483
+ change_type_confidence=0.3,
484
+ business_domain="unknown",
485
+ domain_confidence=0.3,
486
+ risk_level="medium",
487
+ risk_factors=["low_confidence_classification"],
488
+ intent_signals={"confidence": 0.3},
489
+ collaboration_patterns={},
490
+ technical_context={"processing_method": "uncertain_fallback"},
491
+ # Processing metadata
492
+ processing_method="nlp",
493
+ processing_time_ms=1.0,
494
+ confidence_score=0.3,
495
+ )
496
+
497
+ def _create_disabled_results(
498
+ self, commits: list[dict[str, Any]]
499
+ ) -> list[QualitativeCommitData]:
500
+ """Create disabled results when qualitative analysis is turned off.
501
+
502
+ Args:
503
+ commits: List of commit dictionaries
504
+
505
+ Returns:
506
+ List of QualitativeCommitData with disabled status
507
+ """
508
+ results = []
509
+
510
+ for commit in commits:
511
+ result = QualitativeCommitData(
512
+ # Copy commit fields
513
+ hash=commit.get("hash", ""),
514
+ message=commit.get("message", ""),
515
+ author_name=commit.get("author_name", ""),
516
+ author_email=commit.get("author_email", ""),
517
+ timestamp=commit.get("timestamp", time.time()),
518
+ files_changed=commit.get("files_changed", []),
519
+ insertions=commit.get("insertions", 0),
520
+ deletions=commit.get("deletions", 0),
521
+ # Disabled classifications
522
+ change_type="disabled",
523
+ change_type_confidence=0.0,
524
+ business_domain="disabled",
525
+ domain_confidence=0.0,
526
+ risk_level="unknown",
527
+ risk_factors=["qualitative_analysis_disabled"],
528
+ intent_signals={"disabled": True},
529
+ collaboration_patterns={},
530
+ technical_context={"processing_method": "disabled"},
531
+ # Processing metadata
532
+ processing_method="disabled",
533
+ processing_time_ms=0.0,
534
+ confidence_score=0.0,
535
+ )
536
+ results.append(result)
537
+
538
+ return results
539
+
540
+ def _update_processing_stats(
541
+ self, total_commits: int, cached: int, nlp_processed: int, llm_processed: int
542
+ ) -> None:
543
+ """Update processing statistics.
544
+
545
+ Args:
546
+ total_commits: Total number of commits processed
547
+ cached: Number of cache hits
548
+ nlp_processed: Number processed by NLP
549
+ llm_processed: Number processed by LLM
550
+ """
551
+ self.processing_stats["total_processed"] += total_commits
552
+ self.processing_stats["cache_hits"] += cached
553
+ self.processing_stats["nlp_processed"] += nlp_processed
554
+ self.processing_stats["llm_processed"] += llm_processed
555
+
556
+ # Log processing breakdown
557
+ cache_pct = (cached / total_commits) * 100 if total_commits > 0 else 0
558
+ nlp_pct = (nlp_processed / total_commits) * 100 if total_commits > 0 else 0
559
+ llm_pct = (llm_processed / total_commits) * 100 if total_commits > 0 else 0
560
+
561
+ self.logger.info(
562
+ f"Processing breakdown: {cache_pct:.1f}% cached, "
563
+ f"{nlp_pct:.1f}% NLP, {llm_pct:.1f}% LLM"
564
+ )
565
+
566
+ def _should_optimize_cache(self) -> bool:
567
+ """Check if cache optimization should be performed.
568
+
569
+ Returns:
570
+ True if optimization should be performed
571
+ """
572
+ # Optimize every 10,000 commits or every hour
573
+ return bool(
574
+ self.processing_stats["total_processed"] % 10000 == 0
575
+ or self.processing_stats["last_optimization"] is None
576
+ or time.time() - self.processing_stats["last_optimization"] > 3600
577
+ )
578
+
579
+ def _optimize_system(self) -> None:
580
+ """Perform system optimization."""
581
+ self.logger.info("Performing system optimization...")
582
+
583
+ # Optimize pattern cache
584
+ self.pattern_cache.optimize_cache()
585
+
586
+ # Update last optimization time
587
+ self.processing_stats["last_optimization"] = time.time()
588
+
589
+ self.logger.info("System optimization completed")
590
+
591
+ def get_processing_statistics(self) -> dict[str, Any]:
592
+ """Get comprehensive processing statistics.
593
+
594
+ Returns:
595
+ Dictionary with processing statistics
596
+ """
597
+ # Get component statistics
598
+ cache_stats = self.pattern_cache.get_cache_statistics()
599
+ nlp_stats = self.nlp_engine.get_performance_stats()
600
+
601
+ # Calculate processing rates
602
+ total_time = time.time() - (self.processing_stats["processing_start_time"] or time.time())
603
+ commits_per_second = (
604
+ self.processing_stats["total_processed"] / total_time if total_time > 0 else 0
605
+ )
606
+
607
+ # Calculate method percentages
608
+ total = self.processing_stats["total_processed"]
609
+ method_percentages = {
610
+ "cache": (self.processing_stats["cache_hits"] / total * 100) if total > 0 else 0,
611
+ "nlp": (self.processing_stats["nlp_processed"] / total * 100) if total > 0 else 0,
612
+ "llm": (self.processing_stats["llm_processed"] / total * 100) if total > 0 else 0,
613
+ }
614
+
615
+ stats = {
616
+ "processing_summary": {
617
+ "total_commits_processed": self.processing_stats["total_processed"],
618
+ "commits_per_second": commits_per_second,
619
+ "total_processing_time_seconds": total_time,
620
+ "method_breakdown": method_percentages,
621
+ },
622
+ "cache_statistics": cache_stats,
623
+ "nlp_statistics": nlp_stats,
624
+ "configuration": {
625
+ "enabled": self.config.enabled,
626
+ "confidence_threshold": self.config.confidence_threshold,
627
+ "max_llm_fallback_pct": self.config.max_llm_fallback_pct,
628
+ "batch_size": self.config.batch_size,
629
+ },
630
+ }
631
+
632
+ # Add LLM statistics if available
633
+ if self.llm_fallback:
634
+ stats["llm_statistics"] = {
635
+ "cost_tracking": self.llm_fallback.cost_tracker.get_usage_stats(),
636
+ "model_usage": "available",
637
+ }
638
+ else:
639
+ stats["llm_statistics"] = {"model_usage": "disabled"}
640
+
641
+ return stats
642
+
643
+ def validate_setup(self) -> tuple[bool, list[str]]:
644
+ """Validate processor setup and dependencies.
645
+
646
+ Returns:
647
+ Tuple of (is_valid, list_of_issues)
648
+ """
649
+ issues = []
650
+
651
+ # Validate NLP engine
652
+ nlp_valid, nlp_issues = self.nlp_engine.validate_setup()
653
+ if not nlp_valid:
654
+ issues.extend([f"NLP: {issue}" for issue in nlp_issues])
655
+
656
+ # Validate LLM fallback if configured
657
+ if self.config.llm_config.openrouter_api_key and self.llm_fallback is None:
658
+ issues.append("LLM: API key configured but fallback system failed to initialize")
659
+
660
+ # Validate configuration
661
+ config_warnings = self.config.validate()
662
+ issues.extend([f"Config: {warning}" for warning in config_warnings])
663
+
664
+ # Test database connection
665
+ try:
666
+ with self.database.get_session() as session:
667
+ from sqlalchemy import text
668
+
669
+ session.execute(text("SELECT 1"))
670
+ except Exception as e:
671
+ issues.append(f"Database: Connection failed - {e}")
672
+
673
+ return len(issues) == 0, issues
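
For orientation, below is a minimal driver sketch for the new module. The QualitativeProcessor API it exercises (process_commits, validate_setup, get_processing_statistics) and the commit-dict keys come straight from the diff above; the QualitativeConfig() and Database(...) constructor calls are assumptions, since those signatures live in gitflow_analytics/qualitative/models/schemas.py and gitflow_analytics/models/database.py and are not shown in this hunk.

import time
from pathlib import Path

from gitflow_analytics.models.database import Database
from gitflow_analytics.qualitative.core.processor import QualitativeProcessor
from gitflow_analytics.qualitative.models.schemas import QualitativeConfig

cache_dir = Path(".gitflow-cache")
config = QualitativeConfig()  # assumed: default construction is usable
database = Database(cache_dir / "qualitative.db")  # assumed constructor

processor = QualitativeProcessor(config, database, cache_dir=cache_dir)

# Fail fast if the NLP engine, LLM key, config, or database are misconfigured.
is_valid, issues = processor.validate_setup()
if not is_valid:
    raise RuntimeError("Qualitative analysis setup invalid: " + "; ".join(issues))

# Commit dicts use the keys the processor reads: hash, message, author_name,
# author_email, timestamp, files_changed, insertions, deletions. An epoch
# float matches the module's own time.time() fallback for timestamps.
commits = [
    {
        "hash": "abc123",
        "message": "fix: handle empty config file",
        "author_name": "Jane Doe",
        "author_email": "jane@example.com",
        "timestamp": time.time(),
        "files_changed": ["gitflow_analytics/config.py"],
        "insertions": 12,
        "deletions": 3,
    }
]

results = processor.process_commits(commits, show_progress=False)
for r in results:
    print(r.hash, r.change_type, r.confidence_score, r.processing_method)

print(processor.get_processing_statistics()["processing_summary"])

The tiered design is visible in process_commits: the pattern cache answers previously seen patterns at near-zero cost, the NLP engine handles the bulk, and only low-confidence leftovers reach the LLM, with max_llm_fallback_pct capping how many uncertain commits are ever sent to the paid model.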