gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/core/processor.py
@@ -2,260 +2,389 @@
 
 import logging
 import time
-from typing import Dict, List, Any, Tuple, Optional
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
+from typing import Any, Optional
 
+from ...core.schema_version import create_schema_manager
 from ...models.database import Database
-from ..models.schemas import QualitativeConfig, QualitativeCommitData
-from .nlp_engine import NLPEngine
-from .llm_fallback import LLMFallback
-from .pattern_cache import PatternCache
+from ..models.schemas import QualitativeCommitData, QualitativeConfig
 from ..utils.batch_processor import BatchProcessor, ProgressTracker
 from ..utils.metrics import PerformanceMetrics
+from .llm_fallback import LLMFallback
+from .nlp_engine import NLPEngine
+from .pattern_cache import PatternCache
 
 
 class QualitativeProcessor:
     """Main orchestrator for qualitative analysis of Git commits.
-
+
     This processor coordinates the entire qualitative analysis pipeline:
     1. Pattern cache lookup for known commit patterns
     2. Fast NLP processing for most commits
     3. Strategic LLM fallback for uncertain cases
     4. Pattern learning and cache updates
     5. Performance monitoring and optimization
-
+
     The system is designed to process 10,000+ commits in under 60 seconds
     while maintaining high accuracy and keeping LLM costs low.
     """
-
-    def __init__(self, config: QualitativeConfig, database: Database):
+
+    def __init__(
+        self, config: QualitativeConfig, database: Database, cache_dir: Optional[Path] = None
+    ):
         """Initialize qualitative processor.
-
+
         Args:
             config: Configuration for qualitative analysis
             database: Database instance for caching and storage
+            cache_dir: Cache directory for schema versioning
         """
         self.config = config
         self.database = database
         self.logger = logging.getLogger(__name__)
-
+
+        # Initialize schema version manager
+        if cache_dir is None:
+            cache_dir = Path(config.cache_config.cache_dir)
+        self.schema_manager = create_schema_manager(cache_dir)
+
         # Initialize core components
         self.nlp_engine = NLPEngine(config.nlp_config)
         self.pattern_cache = PatternCache(config.cache_config, database)
-
+
         # Initialize LLM fallback if enabled
         self.llm_fallback = None
         if config.llm_config.openrouter_api_key:
             try:
-                self.llm_fallback = LLMFallback(config.llm_config)
+                self.llm_fallback = LLMFallback(config.llm_config, cache_dir)
                 self.logger.info("LLM fallback system initialized")
             except Exception as e:
                 self.logger.warning(f"LLM fallback initialization failed: {e}")
         else:
             self.logger.info("LLM fallback disabled (no API key configured)")
-
+
         # Initialize utilities
         self.batch_processor = BatchProcessor(
-            batch_size=config.batch_size,
-            max_workers=config.nlp_config.max_workers
+            batch_size=config.batch_size, max_workers=config.nlp_config.max_workers
         )
         self.metrics = PerformanceMetrics()
-
+
         # Processing statistics
         self.processing_stats = {
-            'total_processed': 0,
-            'cache_hits': 0,
-            'nlp_processed': 0,
-            'llm_processed': 0,
-            'processing_start_time': None,
-            'last_optimization': None
+            "total_processed": 0,
+            "cache_hits": 0,
+            "nlp_processed": 0,
+            "llm_processed": 0,
+            "processing_start_time": None,
+            "last_optimization": None,
         }
-
+
         self.logger.info("Qualitative processor initialized")
-
-    def process_commits(self, commits: List[Dict[str, Any]],
-                        show_progress: bool = True) -> List[QualitativeCommitData]:
-        """Process commits with qualitative analysis.
-
+
+    def _filter_commits_for_processing(
+        self, commits: list[dict[str, Any]], force_reprocess: bool = False
+    ) -> list[dict[str, Any]]:
+        """Filter commits to only those that need processing based on schema versioning."""
+        if force_reprocess:
+            return commits
+
+        # Convert config to dict for schema comparison
+        config_dict = {
+            "nlp_config": self.config.nlp_config.__dict__,
+            "llm_config": self.config.llm_config.__dict__,
+            "cache_config": self.config.cache_config.__dict__,
+            "confidence_threshold": self.config.confidence_threshold,
+            "max_llm_fallback_pct": self.config.max_llm_fallback_pct,
+        }
+
+        # Check if schema has changed
+        schema_changed = self.schema_manager.has_schema_changed("qualitative", config_dict)
+
+        if schema_changed:
+            self.logger.info("Qualitative analysis schema has changed, reprocessing all commits")
+            return commits
+
+        # Filter by date - only process commits after last processed date
+        last_processed = self.schema_manager.get_last_processed_date("qualitative")
+        if not last_processed:
+            self.logger.info("No previous processing date found, processing all commits")
+            return commits
+
+        # Filter commits by date
+        commits_to_process = []
+        for commit in commits:
+            commit_date = commit.get("timestamp")
+            if commit_date and commit_date > last_processed:
+                commits_to_process.append(commit)
+
+        return commits_to_process
+
+    def _get_existing_results(self, commits: list[dict[str, Any]]) -> list[QualitativeCommitData]:
+        """Get existing qualitative results for commits from the database."""
+        results = []
+
+        # Try to load existing results from database
+        # This is a simplified version - in practice you'd query the qualitative_commits table
+        for commit in commits:
+            # Create minimal result indicating no processing needed
+            result = QualitativeCommitData(
+                hash=commit.get("hash", ""),
+                message=commit.get("message", ""),
+                author_name=commit.get("author_name", ""),
+                author_email=commit.get("author_email", ""),
+                timestamp=commit.get("timestamp"),
+                files_changed=commit.get("files_changed", []),
+                insertions=commit.get("insertions", 0),
+                deletions=commit.get("deletions", 0),
+                change_type="unknown",
+                change_type_confidence=0.0,
+                business_domain="unknown",
+                domain_confidence=0.0,
+                risk_level="low",
+                risk_factors=[],
+                intent_signals={},
+                collaboration_patterns={},
+                technical_context={},
+                processing_method="cached",
+                processing_time_ms=0.0,
+                confidence_score=0.0,
+            )
+            results.append(result)
+
+        return results
+
+    def _update_schema_tracking(self, commits: list[dict[str, Any]]):
+        """Update schema version tracking after processing commits."""
+        if not commits:
+            return
+
+        # Convert config to dict for schema tracking
+        config_dict = {
+            "nlp_config": self.config.nlp_config.__dict__,
+            "llm_config": self.config.llm_config.__dict__,
+            "cache_config": self.config.cache_config.__dict__,
+            "confidence_threshold": self.config.confidence_threshold,
+            "max_llm_fallback_pct": self.config.max_llm_fallback_pct,
+        }
+
+        # Find the latest commit date
+        latest_date = max(commit.get("timestamp") for commit in commits if commit.get("timestamp"))
+
+        # Update schema version with latest processed date
+        self.schema_manager.update_schema_version("qualitative", config_dict, latest_date)
+        self.schema_manager.mark_date_processed("qualitative", latest_date, config_dict)
+
+    def process_commits(
+        self,
+        commits: list[dict[str, Any]],
+        show_progress: bool = True,
+        force_reprocess: bool = False,
+    ) -> list[QualitativeCommitData]:
+        """Process commits with qualitative analysis using incremental processing.
+
         Args:
             commits: List of commit dictionaries from GitFlow Analytics
             show_progress: Whether to show progress information
-
+            force_reprocess: Force reprocessing even if schema hasn't changed
+
         Returns:
             List of QualitativeCommitData with analysis results
         """
         if not commits:
             return []
-
+
         if not self.config.enabled:
             self.logger.info("Qualitative analysis disabled in configuration")
             return self._create_disabled_results(commits)
-
-        self.processing_stats['processing_start_time'] = time.time()
-        self.logger.info(f"Starting qualitative analysis of {len(commits)} commits")
-
+
+        # Filter commits for incremental processing
+        commits_to_process = self._filter_commits_for_processing(commits, force_reprocess)
+
+        if not commits_to_process:
+            self.logger.info("No commits require processing (all up-to-date)")
+            # Return existing results for all commits
+            return self._get_existing_results(commits)
+
+        self.processing_stats["processing_start_time"] = time.time()
+        self.logger.info(
+            f"Starting qualitative analysis of {len(commits_to_process)} commits "
+            f"({len(commits) - len(commits_to_process)} already processed)"
+        )
+
         # Setup progress tracking
-        progress_tracker = ProgressTracker(
-            total=len(commits),
-            description="Qualitative Analysis"
-        ) if show_progress else None
-
+        progress_tracker = (
+            ProgressTracker(total=len(commits), description="Qualitative Analysis")
+            if show_progress
+            else None
+        )
+
         # Step 1: Check cache for known patterns
         cached_results, uncached_commits = self._check_cache(commits, progress_tracker)
-        self.logger.info(f"Cache provided {len(cached_results)} results, processing {len(uncached_commits)} commits")
-
+        self.logger.info(
+            f"Cache provided {len(cached_results)} results, processing {len(uncached_commits)} commits"
+        )
+
         # Step 2: Process uncached commits with NLP
         nlp_results = []
         if uncached_commits:
             nlp_results = self._process_with_nlp(uncached_commits, progress_tracker)
-
+
         # Step 3: Identify uncertain cases for LLM processing
         confident_results, uncertain_commits = self._separate_by_confidence(nlp_results)
-        self.logger.info(f"NLP confident: {len(confident_results)}, uncertain: {len(uncertain_commits)}")
-
+        self.logger.info(
+            f"NLP confident: {len(confident_results)}, uncertain: {len(uncertain_commits)}"
+        )
+
         # Step 4: Process uncertain cases with LLM if available
         llm_results = []
         if uncertain_commits and self.llm_fallback:
             llm_results = self._process_with_llm(uncertain_commits, progress_tracker)
         else:
             # If no LLM available, keep uncertain results with lower confidence
-            llm_results = [self._convert_to_uncertain_result(commit) for commit in uncertain_commits]
-
+            llm_results = [
+                self._convert_to_uncertain_result(commit) for commit in uncertain_commits
+            ]
+
         # Step 5: Update cache with new high-confidence patterns
         all_new_results = confident_results + llm_results
         if self.config.cache_config.enable_pattern_learning:
             self.pattern_cache.learn_from_results(all_new_results)
-
+
         # Step 6: Combine all results
         all_results = cached_results + confident_results + llm_results
-
+
         # Update processing statistics
-        self._update_processing_stats(len(commits), len(cached_results),
-                                      len(confident_results), len(llm_results))
-
+        self._update_processing_stats(
+            len(commits), len(cached_results), len(confident_results), len(llm_results)
+        )
+
         # Periodic cache optimization
         if self._should_optimize_cache():
             self._optimize_system()
-
-        self.logger.info(f"Qualitative analysis completed in {time.time() - self.processing_stats['processing_start_time']:.2f}s")
+
+        # Update schema tracking after successful processing
+        self._update_schema_tracking(commits_to_process)
+
+        self.logger.info(
+            f"Qualitative analysis completed in {time.time() - self.processing_stats['processing_start_time']:.2f}s"
+        )
         return all_results
-
-    def _check_cache(self, commits: List[Dict[str, Any]],
-                     progress_tracker: Optional[ProgressTracker]) -> Tuple[List[QualitativeCommitData], List[Dict[str, Any]]]:
+
+    def _check_cache(
+        self, commits: list[dict[str, Any]], progress_tracker: Optional[ProgressTracker]
+    ) -> tuple[list[QualitativeCommitData], list[dict[str, Any]]]:
         """Check pattern cache for known commit patterns.
-
+
         Args:
             commits: List of commit dictionaries
             progress_tracker: Optional progress tracker
-
+
         Returns:
             Tuple of (cached_results, uncached_commits)
         """
         cached_results = []
         uncached_commits = []
-
+
         for commit in commits:
             cached_result = self.pattern_cache.lookup_pattern(
-                commit.get('message', ''),
-                commit.get('files_changed', [])
+                commit.get("message", ""), commit.get("files_changed", [])
            )
-
+
             if cached_result:
                 # Convert cached result to QualitativeCommitData
                 result = self._create_result_from_cache(commit, cached_result)
                 cached_results.append(result)
-                self.processing_stats['cache_hits'] += 1
+                self.processing_stats["cache_hits"] += 1
             else:
                 uncached_commits.append(commit)
-
+
             if progress_tracker:
                 progress_tracker.update(1)
-
+
         return cached_results, uncached_commits
-
-    def _process_with_nlp(self, commits: List[Dict[str, Any]],
-                          progress_tracker: Optional[ProgressTracker]) -> List[QualitativeCommitData]:
+
+    def _process_with_nlp(
+        self, commits: list[dict[str, Any]], progress_tracker: Optional[ProgressTracker]
+    ) -> list[QualitativeCommitData]:
         """Process commits using NLP engine.
-
+
         Args:
             commits: List of commit dictionaries
             progress_tracker: Optional progress tracker
-
+
         Returns:
             List of QualitativeCommitData from NLP processing
         """
         if not commits:
             return []
-
-        def process_batch_with_progress(batch: List[Dict[str, Any]]) -> List[QualitativeCommitData]:
+
+        def process_batch_with_progress(batch: list[dict[str, Any]]) -> list[QualitativeCommitData]:
             results = self.nlp_engine.process_batch(batch)
             if progress_tracker:
                 progress_tracker.update(len(batch))
             return results
-
+
         # Use batch processing for efficiency
         if self.config.nlp_config.enable_parallel_processing and len(commits) > 1000:
             all_results = self.batch_processor.process_batches(
-                commits,
-                process_batch_with_progress,
-                parallel=True
+                commits, process_batch_with_progress, parallel=True
             )
         else:
             all_results = self.batch_processor.process_batches(
-                commits,
-                process_batch_with_progress,
-                parallel=False
+                commits, process_batch_with_progress, parallel=False
             )
-
-        self.processing_stats['nlp_processed'] += len(commits)
+
+        self.processing_stats["nlp_processed"] += len(commits)
         return all_results
-
-    def _separate_by_confidence(self, results: List[QualitativeCommitData]) -> Tuple[List[QualitativeCommitData], List[Dict[str, Any]]]:
+
+    def _separate_by_confidence(
+        self, results: list[QualitativeCommitData]
+    ) -> tuple[list[QualitativeCommitData], list[dict[str, Any]]]:
         """Separate results by confidence threshold.
-
-        Args:
+
+        Args:
             results: List of NLP analysis results
-
+
         Returns:
             Tuple of (confident_results, uncertain_commit_dicts)
         """
         confident_results = []
         uncertain_commits = []
-
+
         for result in results:
             if result.confidence_score >= self.config.confidence_threshold:
                 confident_results.append(result)
             else:
                 # Convert back to commit dict for LLM processing
                 commit_dict = {
-                    'hash': result.hash,
-                    'message': result.message,
-                    'author_name': result.author_name,
-                    'author_email': result.author_email,
-                    'timestamp': result.timestamp,
-                    'files_changed': result.files_changed,
-                    'insertions': result.insertions,
-                    'deletions': result.deletions
+                    "hash": result.hash,
+                    "message": result.message,
+                    "author_name": result.author_name,
+                    "author_email": result.author_email,
+                    "timestamp": result.timestamp,
+                    "files_changed": result.files_changed,
+                    "insertions": result.insertions,
+                    "deletions": result.deletions,
                 }
                 uncertain_commits.append(commit_dict)
-
+
         return confident_results, uncertain_commits
-
-    def _process_with_llm(self, commits: List[Dict[str, Any]],
-                          progress_tracker: Optional[ProgressTracker]) -> List[QualitativeCommitData]:
+
+    def _process_with_llm(
+        self, commits: list[dict[str, Any]], progress_tracker: Optional[ProgressTracker]
+    ) -> list[QualitativeCommitData]:
         """Process uncertain commits with LLM fallback.
-
+
         Args:
             commits: List of uncertain commit dictionaries
             progress_tracker: Optional progress tracker
-
+
         Returns:
             List of QualitativeCommitData from LLM processing
         """
         if not commits or not self.llm_fallback:
             return []
-
+
         # Check LLM usage limits
         max_llm_commits = int(len(commits) * self.config.max_llm_fallback_pct)
         if len(commits) > max_llm_commits:
@@ -263,278 +392,282 @@ class QualitativeProcessor:
                 f"LLM limit reached: processing {max_llm_commits} of {len(commits)} uncertain commits"
             )
             commits = commits[:max_llm_commits]
-
+
         # Group similar commits for batch processing
         grouped_commits = self.llm_fallback.group_similar_commits(commits)
-        self.logger.debug(f"Grouped {len(commits)} commits into {len(grouped_commits)} groups for LLM processing")
-
+        self.logger.debug(
+            f"Grouped {len(commits)} commits into {len(grouped_commits)} groups for LLM processing"
+        )
+
         all_results = []
-
+
         for group in grouped_commits:
             try:
                 group_results = self.llm_fallback.process_group(group)
                 all_results.extend(group_results)
-
+
                 if progress_tracker:
                     progress_tracker.update(len(group))
-
+
             except Exception as e:
                 self.logger.error(f"LLM processing failed for group of {len(group)} commits: {e}")
                 # Create fallback results for this group
                 fallback_results = [self._convert_to_uncertain_result(commit) for commit in group]
                 all_results.extend(fallback_results)
-
+
                 if progress_tracker:
                     progress_tracker.update(len(group))
-
-        self.processing_stats['llm_processed'] += len(commits)
+
+        self.processing_stats["llm_processed"] += len(commits)
         return all_results
-
-    def _create_result_from_cache(self, commit: Dict[str, Any],
-                                  cached_data: Dict[str, Any]) -> QualitativeCommitData:
+
+    def _create_result_from_cache(
+        self, commit: dict[str, Any], cached_data: dict[str, Any]
+    ) -> QualitativeCommitData:
         """Create QualitativeCommitData from cached pattern.
-
+
         Args:
             commit: Original commit dictionary
             cached_data: Cached classification data
-
+
         Returns:
             QualitativeCommitData object
         """
         return QualitativeCommitData(
             # Copy commit fields
-            hash=commit.get('hash', ''),
-            message=commit.get('message', ''),
-            author_name=commit.get('author_name', ''),
-            author_email=commit.get('author_email', ''),
-            timestamp=commit.get('timestamp', time.time()),
-            files_changed=commit.get('files_changed', []),
-            insertions=commit.get('insertions', 0),
-            deletions=commit.get('deletions', 0),
-
+            hash=commit.get("hash", ""),
+            message=commit.get("message", ""),
+            author_name=commit.get("author_name", ""),
+            author_email=commit.get("author_email", ""),
+            timestamp=commit.get("timestamp", time.time()),
+            files_changed=commit.get("files_changed", []),
+            insertions=commit.get("insertions", 0),
+            deletions=commit.get("deletions", 0),
             # Use cached classification data
-            change_type=cached_data.get('change_type', 'unknown'),
-            change_type_confidence=cached_data.get('change_type_confidence', 0.5),
-            business_domain=cached_data.get('business_domain', 'unknown'),
-            domain_confidence=cached_data.get('domain_confidence', 0.5),
-            risk_level=cached_data.get('risk_level', 'medium'),
-            risk_factors=cached_data.get('risk_factors', []),
-            intent_signals=cached_data.get('intent_signals', {}),
-            collaboration_patterns=cached_data.get('collaboration_patterns', {}),
-            technical_context={'processing_method': 'cached'},
-
+            change_type=cached_data.get("change_type", "unknown"),
+            change_type_confidence=cached_data.get("change_type_confidence", 0.5),
+            business_domain=cached_data.get("business_domain", "unknown"),
+            domain_confidence=cached_data.get("domain_confidence", 0.5),
+            risk_level=cached_data.get("risk_level", "medium"),
+            risk_factors=cached_data.get("risk_factors", []),
+            intent_signals=cached_data.get("intent_signals", {}),
+            collaboration_patterns=cached_data.get("collaboration_patterns", {}),
+            technical_context={"processing_method": "cached"},
             # Processing metadata
-            processing_method='cache',
+            processing_method="cache",
             processing_time_ms=0.5,  # Very fast for cached results
-            confidence_score=cached_data.get('confidence_score', 0.5)
+            confidence_score=cached_data.get("confidence_score", 0.5),
         )
-
-    def _convert_to_uncertain_result(self, commit: Dict[str, Any]) -> QualitativeCommitData:
+
+    def _convert_to_uncertain_result(self, commit: dict[str, Any]) -> QualitativeCommitData:
         """Convert commit to uncertain result when LLM is unavailable.
-
+
         Args:
             commit: Commit dictionary
-
+
         Returns:
             QualitativeCommitData with uncertain classifications
         """
         return QualitativeCommitData(
             # Copy commit fields
-            hash=commit.get('hash', ''),
-            message=commit.get('message', ''),
-            author_name=commit.get('author_name', ''),
-            author_email=commit.get('author_email', ''),
-            timestamp=commit.get('timestamp', time.time()),
-            files_changed=commit.get('files_changed', []),
-            insertions=commit.get('insertions', 0),
-            deletions=commit.get('deletions', 0),
-
+            hash=commit.get("hash", ""),
+            message=commit.get("message", ""),
+            author_name=commit.get("author_name", ""),
+            author_email=commit.get("author_email", ""),
+            timestamp=commit.get("timestamp", time.time()),
+            files_changed=commit.get("files_changed", []),
+            insertions=commit.get("insertions", 0),
+            deletions=commit.get("deletions", 0),
             # Uncertain classifications
-            change_type='unknown',
+            change_type="unknown",
             change_type_confidence=0.3,
-            business_domain='unknown',
+            business_domain="unknown",
             domain_confidence=0.3,
-            risk_level='medium',
-            risk_factors=['low_confidence_classification'],
-            intent_signals={'confidence': 0.3},
+            risk_level="medium",
+            risk_factors=["low_confidence_classification"],
+            intent_signals={"confidence": 0.3},
             collaboration_patterns={},
-            technical_context={'processing_method': 'uncertain_fallback'},
-
+            technical_context={"processing_method": "uncertain_fallback"},
             # Processing metadata
-            processing_method='nlp',
+            processing_method="nlp",
             processing_time_ms=1.0,
-            confidence_score=0.3
+            confidence_score=0.3,
         )
-
-    def _create_disabled_results(self, commits: List[Dict[str, Any]]) -> List[QualitativeCommitData]:
+
+    def _create_disabled_results(
+        self, commits: list[dict[str, Any]]
+    ) -> list[QualitativeCommitData]:
         """Create disabled results when qualitative analysis is turned off.
-
+
         Args:
            commits: List of commit dictionaries
-
+
         Returns:
             List of QualitativeCommitData with disabled status
         """
         results = []
-
+
         for commit in commits:
             result = QualitativeCommitData(
                 # Copy commit fields
-                hash=commit.get('hash', ''),
-                message=commit.get('message', ''),
-                author_name=commit.get('author_name', ''),
-                author_email=commit.get('author_email', ''),
-                timestamp=commit.get('timestamp', time.time()),
-                files_changed=commit.get('files_changed', []),
-                insertions=commit.get('insertions', 0),
-                deletions=commit.get('deletions', 0),
-
+                hash=commit.get("hash", ""),
+                message=commit.get("message", ""),
+                author_name=commit.get("author_name", ""),
+                author_email=commit.get("author_email", ""),
+                timestamp=commit.get("timestamp", time.time()),
+                files_changed=commit.get("files_changed", []),
+                insertions=commit.get("insertions", 0),
+                deletions=commit.get("deletions", 0),
                 # Disabled classifications
-                change_type='disabled',
+                change_type="disabled",
                 change_type_confidence=0.0,
-                business_domain='disabled',
+                business_domain="disabled",
                 domain_confidence=0.0,
-                risk_level='unknown',
-                risk_factors=['qualitative_analysis_disabled'],
-                intent_signals={'disabled': True},
+                risk_level="unknown",
+                risk_factors=["qualitative_analysis_disabled"],
+                intent_signals={"disabled": True},
                 collaboration_patterns={},
-                technical_context={'processing_method': 'disabled'},
-
+                technical_context={"processing_method": "disabled"},
                 # Processing metadata
-                processing_method='disabled',
+                processing_method="disabled",
                 processing_time_ms=0.0,
-                confidence_score=0.0
+                confidence_score=0.0,
             )
             results.append(result)
-
+
         return results
-
-    def _update_processing_stats(self, total_commits: int, cached: int,
-                                 nlp_processed: int, llm_processed: int) -> None:
+
+    def _update_processing_stats(
+        self, total_commits: int, cached: int, nlp_processed: int, llm_processed: int
+    ) -> None:
         """Update processing statistics.
-
+
         Args:
             total_commits: Total number of commits processed
             cached: Number of cache hits
             nlp_processed: Number processed by NLP
             llm_processed: Number processed by LLM
         """
-        self.processing_stats['total_processed'] += total_commits
-        self.processing_stats['cache_hits'] += cached
-        self.processing_stats['nlp_processed'] += nlp_processed
-        self.processing_stats['llm_processed'] += llm_processed
-
+        self.processing_stats["total_processed"] += total_commits
+        self.processing_stats["cache_hits"] += cached
+        self.processing_stats["nlp_processed"] += nlp_processed
+        self.processing_stats["llm_processed"] += llm_processed
+
         # Log processing breakdown
         cache_pct = (cached / total_commits) * 100 if total_commits > 0 else 0
         nlp_pct = (nlp_processed / total_commits) * 100 if total_commits > 0 else 0
         llm_pct = (llm_processed / total_commits) * 100 if total_commits > 0 else 0
-
+
         self.logger.info(
             f"Processing breakdown: {cache_pct:.1f}% cached, "
             f"{nlp_pct:.1f}% NLP, {llm_pct:.1f}% LLM"
         )
-
+
     def _should_optimize_cache(self) -> bool:
         """Check if cache optimization should be performed.
-
+
         Returns:
             True if optimization should be performed
         """
         # Optimize every 10,000 commits or every hour
-        if (self.processing_stats['total_processed'] % 10000 == 0 or
-            (self.processing_stats['last_optimization'] is None) or
-            (time.time() - self.processing_stats['last_optimization'] > 3600)):
-            return True
-        return False
-
+        return bool(
+            self.processing_stats["total_processed"] % 10000 == 0
+            or self.processing_stats["last_optimization"] is None
+            or time.time() - self.processing_stats["last_optimization"] > 3600
+        )
+
     def _optimize_system(self) -> None:
         """Perform system optimization."""
         self.logger.info("Performing system optimization...")
-
+
         # Optimize pattern cache
         self.pattern_cache.optimize_cache()
-
+
         # Update last optimization time
-        self.processing_stats['last_optimization'] = time.time()
-
+        self.processing_stats["last_optimization"] = time.time()
+
         self.logger.info("System optimization completed")
-
-    def get_processing_statistics(self) -> Dict[str, Any]:
+
+    def get_processing_statistics(self) -> dict[str, Any]:
         """Get comprehensive processing statistics.
-
+
         Returns:
             Dictionary with processing statistics
         """
         # Get component statistics
         cache_stats = self.pattern_cache.get_cache_statistics()
         nlp_stats = self.nlp_engine.get_performance_stats()
-
+
         # Calculate processing rates
-        total_time = time.time() - (self.processing_stats['processing_start_time'] or time.time())
-        commits_per_second = self.processing_stats['total_processed'] / total_time if total_time > 0 else 0
-
+        total_time = time.time() - (self.processing_stats["processing_start_time"] or time.time())
+        commits_per_second = (
+            self.processing_stats["total_processed"] / total_time if total_time > 0 else 0
+        )
+
         # Calculate method percentages
-        total = self.processing_stats['total_processed']
+        total = self.processing_stats["total_processed"]
         method_percentages = {
-            'cache': (self.processing_stats['cache_hits'] / total * 100) if total > 0 else 0,
-            'nlp': (self.processing_stats['nlp_processed'] / total * 100) if total > 0 else 0,
-            'llm': (self.processing_stats['llm_processed'] / total * 100) if total > 0 else 0
+            "cache": (self.processing_stats["cache_hits"] / total * 100) if total > 0 else 0,
+            "nlp": (self.processing_stats["nlp_processed"] / total * 100) if total > 0 else 0,
+            "llm": (self.processing_stats["llm_processed"] / total * 100) if total > 0 else 0,
         }
-
+
         stats = {
-            'processing_summary': {
-                'total_commits_processed': self.processing_stats['total_processed'],
-                'commits_per_second': commits_per_second,
-                'total_processing_time_seconds': total_time,
-                'method_breakdown': method_percentages
+            "processing_summary": {
+                "total_commits_processed": self.processing_stats["total_processed"],
+                "commits_per_second": commits_per_second,
+                "total_processing_time_seconds": total_time,
+                "method_breakdown": method_percentages,
+            },
+            "cache_statistics": cache_stats,
+            "nlp_statistics": nlp_stats,
+            "configuration": {
+                "enabled": self.config.enabled,
+                "confidence_threshold": self.config.confidence_threshold,
+                "max_llm_fallback_pct": self.config.max_llm_fallback_pct,
+                "batch_size": self.config.batch_size,
             },
-            'cache_statistics': cache_stats,
-            'nlp_statistics': nlp_stats,
-            'configuration': {
-                'enabled': self.config.enabled,
-                'confidence_threshold': self.config.confidence_threshold,
-                'max_llm_fallback_pct': self.config.max_llm_fallback_pct,
-                'batch_size': self.config.batch_size
-            }
         }
-
+
         # Add LLM statistics if available
         if self.llm_fallback:
-            stats['llm_statistics'] = {
-                'cost_tracking': self.llm_fallback.cost_tracker.get_usage_stats(),
-                'model_usage': 'available'
+            stats["llm_statistics"] = {
+                "cost_tracking": self.llm_fallback.cost_tracker.get_usage_stats(),
+                "model_usage": "available",
             }
         else:
-            stats['llm_statistics'] = {'model_usage': 'disabled'}
-
+            stats["llm_statistics"] = {"model_usage": "disabled"}
+
         return stats
-
-    def validate_setup(self) -> Tuple[bool, List[str]]:
+
+    def validate_setup(self) -> tuple[bool, list[str]]:
         """Validate processor setup and dependencies.
-
+
         Returns:
             Tuple of (is_valid, list_of_issues)
         """
         issues = []
-
+
         # Validate NLP engine
         nlp_valid, nlp_issues = self.nlp_engine.validate_setup()
         if not nlp_valid:
             issues.extend([f"NLP: {issue}" for issue in nlp_issues])
-
+
         # Validate LLM fallback if configured
         if self.config.llm_config.openrouter_api_key and self.llm_fallback is None:
             issues.append("LLM: API key configured but fallback system failed to initialize")
-
+
         # Validate configuration
         config_warnings = self.config.validate()
         issues.extend([f"Config: {warning}" for warning in config_warnings])
-
+
         # Test database connection
         try:
             with self.database.get_session() as session:
-                session.execute("SELECT 1")
+                from sqlalchemy import text
+
+                session.execute(text("SELECT 1"))
         except Exception as e:
             issues.append(f"Database: Connection failed - {e}")
-
-        return len(issues) == 0, issues
+
+        return len(issues) == 0, issues
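
For orientation, the sketch below shows one way the reworked QualitativeProcessor API from the diff above might be driven. Only the processor's own signatures are taken from the diff (the constructor with its new cache_dir parameter, process_commits with the new force_reprocess flag, validate_setup, and get_processing_statistics); the QualitativeConfig() and Database() constructor calls, the cache paths, and the sample commit values are illustrative assumptions rather than the package's documented usage.

from pathlib import Path

from gitflow_analytics.models.database import Database
from gitflow_analytics.qualitative.core.processor import QualitativeProcessor
from gitflow_analytics.qualitative.models.schemas import QualitativeConfig

# Assumed constructors: QualitativeConfig and Database may take different
# arguments in the released package.
config = QualitativeConfig()
database = Database(Path(".gitflow-cache") / "qualitative.db")
processor = QualitativeProcessor(config, database, cache_dir=Path(".gitflow-cache"))

# validate_setup() returns (is_valid, list_of_issues) per the diff above.
is_valid, issues = processor.validate_setup()
if not is_valid:
    print("\n".join(issues))

# Commit dicts use the keys the processor reads in the diff: hash, message,
# author_name, author_email, timestamp, files_changed, insertions, deletions.
commits = [
    {
        "hash": "abc1234",
        "message": "fix: handle empty commit batches",
        "author_name": "Example Author",
        "author_email": "author@example.com",
        "timestamp": 1700000000,  # assumed comparable to the stored last-processed date
        "files_changed": ["gitflow_analytics/qualitative/core/processor.py"],
        "insertions": 12,
        "deletions": 3,
    }
]

# force_reprocess=True bypasses the new schema-version / last-processed-date filtering.
results = processor.process_commits(commits, show_progress=False, force_reprocess=True)
for r in results:
    print(r.change_type, r.confidence_score, r.processing_method)

print(processor.get_processing_statistics()["processing_summary"])

Without force_reprocess, the schema-version and last-processed-date checks introduced in this release can legitimately skip commits that were already analyzed, so a run that processes nothing is not necessarily an error.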