gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4108 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +904 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +441 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1193 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/classification/batch_classifier.py (new file)
@@ -0,0 +1,752 @@
+"""Batch LLM classifier for intelligent commit categorization with context.
+
+This module implements the second step of the two-step fetch/analyze process,
+providing intelligent batch classification of commits using LLM with ticket context.
+"""
+
+import logging
+import uuid
+from collections import defaultdict
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+from ..core.progress import get_progress_service
+from ..models.database import CachedCommit, DailyCommitBatch, Database, DetailedTicketData
+from ..qualitative.classifiers.llm_commit_classifier import LLMCommitClassifier, LLMConfig
+
+logger = logging.getLogger(__name__)
+
+
+class BatchCommitClassifier:
+    """Intelligent batch classifier using LLM with ticket context.
+
+    WHY: This class implements the second step of the two-step process by:
+    - Reading cached commit data organized by day/week
+    - Adding ticket context to improve classification accuracy
+    - Sending batches of commits to LLM for intelligent classification
+    - Falling back to rule-based classification when LLM fails
+    - Storing results with confidence tracking
+
+    DESIGN DECISION: Uses batch processing to reduce API calls and costs
+    while providing better context for classification accuracy.
+
+    PROGRESS REPORTING: Provides granular progress feedback with nested progress bars:
+    - Repository level: Shows which repository is being processed (position 0)
+    - Weekly level: Shows week being processed within repository (position 1)
+    - API batch level: Shows LLM API batches being processed (position 2)
+    Each level shows commit counts and progress indicators for user feedback.
+    """
+
+    def __init__(
+        self,
+        cache_dir: Path,
+        llm_config: Optional[dict[str, Any]] = None,
+        batch_size: int = 50,
+        confidence_threshold: float = 0.7,
+        fallback_enabled: bool = True,
+    ):
+        """Initialize the batch classifier.
+
+        Args:
+            cache_dir: Path to cache directory containing database
+            llm_config: Configuration for LLM classifier
+            batch_size: Number of commits per batch (max 50 for token limits)
+            confidence_threshold: Minimum confidence for LLM classification
+            fallback_enabled: Whether to fall back to rule-based classification
+        """
+        self.cache_dir = cache_dir
+        self.database = Database(cache_dir / "gitflow_cache.db")
+        self.batch_size = min(batch_size, 50)  # Limit for token constraints
+        self.confidence_threshold = confidence_threshold
+        self.fallback_enabled = fallback_enabled
+
+        # Initialize LLM classifier
+        # Handle different config types
+        if isinstance(llm_config, dict):
+            # Convert dict config to LLMConfig object
+            llm_config_obj = LLMConfig(
+                api_key=llm_config.get("api_key", ""),
+                model=llm_config.get("model", "mistralai/mistral-7b-instruct"),
+                max_tokens=llm_config.get("max_tokens", 50),
+                temperature=llm_config.get("temperature", 0.1),
+                confidence_threshold=llm_config.get("confidence_threshold", 0.7),
+                timeout_seconds=llm_config.get("timeout_seconds", 30),
+                cache_duration_days=llm_config.get("cache_duration_days", 7),
+                enable_caching=llm_config.get("enable_caching", True),
+                max_daily_requests=llm_config.get("max_daily_requests", 1000),
+            )
+        elif hasattr(llm_config, "api_key"):
+            # Use provided config object (e.g., mock config for testing)
+            llm_config_obj = llm_config
+        else:
+            # Use default LLMConfig
+            llm_config_obj = LLMConfig()
+
+        self.llm_classifier = LLMCommitClassifier(config=llm_config_obj, cache_dir=cache_dir)
+        logger.info(
+            f"LLM Classifier initialized with API key: {'Yes' if llm_config_obj.api_key else 'No'}"
+        )
+
+        # Rule-based fallback patterns for when LLM fails
+        self.fallback_patterns = {
+            "feature": [
+                r"feat(?:ure)?[\(\:]",
+                r"add(?:ed|ing)?.*(?:feature|functionality|capability)",
+                r"implement(?:ed|ing|s)?",
+                r"introduce(?:d|s)?",
+            ],
+            "bug_fix": [
+                r"fix(?:ed|es|ing)?[\(\:]",
+                r"bug[\(\:]",
+                r"resolve(?:d|s)?",
+                r"repair(?:ed|ing|s)?",
+                r"correct(?:ed|ing|s)?",
+            ],
+            "refactor": [
+                r"refactor(?:ed|ing|s)?[\(\:]",
+                r"restructure(?:d|ing|s)?",
+                r"optimize(?:d|ing|s)?",
+                r"improve(?:d|ing|s)?",
+                r"clean(?:ed|ing)?\s+up",
+            ],
+            "documentation": [
+                r"docs?[\(\:]",
+                r"documentation[\(\:]",
+                r"readme",
+                r"update.*(?:comment|docs?|documentation)",
+            ],
+            "maintenance": [
+                r"chore[\(\:]",
+                r"maintenance[\(\:]",
+                r"update.*(?:dependencies|deps)",
+                r"bump.*version",
+                r"cleanup",
+            ],
+            "test": [
+                r"test(?:s|ing)?[\(\:]",
+                r"spec[\(\:]",
+                r"add.*(?:test|spec)",
+                r"fix.*test",
+            ],
+            "style": [
+                r"style[\(\:]",
+                r"format(?:ted|ting)?[\(\:]",
+                r"lint(?:ed|ing)?",
+                r"prettier",
+                r"whitespace",
+            ],
+            "build": [
+                r"build[\(\:]",
+                r"ci[\(\:]",
+                r"deploy(?:ed|ment)?",
+                r"docker",
+                r"webpack",
+                r"package\.json",
+            ],
+        }
+
+    def classify_date_range(
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        project_keys: Optional[list[str]] = None,
+        force_reclassify: bool = False,
+    ) -> dict[str, Any]:
+        """Classify all commits in a date range using batch processing.
+
+        Args:
+            start_date: Start date for classification
+            end_date: End date for classification
+            project_keys: Optional list of specific projects to classify
+            force_reclassify: Whether to reclassify already processed batches
+
+        Returns:
+            Dictionary containing classification results and statistics
+        """
+        logger.info(f"Starting batch classification from {start_date.date()} to {end_date.date()}")
+
+        # Get daily batches to process
+        batches_to_process = self._get_batches_to_process(
+            start_date, end_date, project_keys, force_reclassify
+        )
+
+        if not batches_to_process:
+            logger.info("No batches need classification")
+            return {"processed_batches": 0, "total_commits": 0}
+
+        # Group batches by repository first for better progress reporting
+        repo_batches = self._group_batches_by_repository(batches_to_process)
+
+        total_processed = 0
+        total_commits = 0
+
+        # Use centralized progress service
+        progress = get_progress_service()
+
+        # Add progress bar for repository processing
+        with progress.progress(
+            total=len(repo_batches),
+            description="AI Classification",
+            unit="repo",
+            nested=False,
+            leave=True,
+        ) as repo_ctx:
+            for repo_num, (repo_info, repo_batch_list) in enumerate(repo_batches.items(), 1):
+                project_key, repo_path = repo_info
+                repo_name = Path(repo_path).name if repo_path else project_key
+
+                # Count commits in this repository for detailed progress
+                repo_commit_count = sum(batch.commit_count for batch in repo_batch_list)
+
+                progress.set_description(
+                    repo_ctx, f"Classifying {repo_name} ({repo_commit_count} commits)"
+                )
+                logger.info(
+                    f"Processing repository {repo_num}/{len(repo_batches)}: {repo_name} ({len(repo_batch_list)} batches, {repo_commit_count} commits)"
+                )
+
+                # Process this repository's batches by week for optimal context
+                weekly_batches = self._group_batches_by_week(repo_batch_list)
+
+                repo_processed = 0
+                repo_commits_processed = 0
+
+                # Add nested progress bar for weekly processing within repository
+                with progress.progress(
+                    total=len(weekly_batches),
+                    description=" Processing weeks",
+                    unit="week",
+                    nested=True,
+                    leave=False,
+                ) as week_ctx:
+                    for week_num, (week_start, week_batches) in enumerate(
+                        weekly_batches.items(), 1
+                    ):
+                        progress.set_description(
+                            week_ctx,
+                            f" Week {week_num}/{len(weekly_batches)} ({week_start.strftime('%Y-%m-%d')})",
+                        )
+                        logger.info(
+                            f" Processing week starting {week_start}: {len(week_batches)} daily batches"
+                        )
+
+                        week_result = self._classify_weekly_batches(week_batches)
+                        repo_processed += week_result["batches_processed"]
+                        repo_commits_processed += week_result["commits_processed"]
+
+                        progress.update(week_ctx, 1)
+                        # Update description to show commits processed
+                        progress.set_description(
+                            week_ctx,
+                            f" Week {week_num}/{len(weekly_batches)} - {week_result['commits_processed']} commits",
+                        )
+
+                total_processed += repo_processed
+                total_commits += repo_commits_processed
+
+                progress.update(repo_ctx, 1)
+                # Update description to show total progress
+                progress.set_description(
+                    repo_ctx,
+                    f"AI Classification [{repo_num}/{len(repo_batches)} repos, {total_commits} commits]",
+                )
+
+                logger.info(
+                    f" Repository {repo_name} completed: {repo_processed} batches, {repo_commits_processed} commits"
+                )
+
+        # Store daily metrics from classification results
+        self._store_daily_metrics(start_date, end_date, project_keys)
+
+        logger.info(
+            f"Batch classification completed: {total_processed} batches, {total_commits} commits"
+        )
+
+        return {
+            "processed_batches": total_processed,
+            "total_commits": total_commits,
+            "date_range": {"start": start_date, "end": end_date},
+            "project_keys": project_keys or [],
+        }
+
+    def _get_batches_to_process(
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        project_keys: Optional[list[str]],
+        force_reclassify: bool,
+    ) -> list[DailyCommitBatch]:
+        """Get daily commit batches that need classification."""
+        session = self.database.get_session()
+
+        try:
+            query = session.query(DailyCommitBatch).filter(
+                DailyCommitBatch.date >= start_date.date(), DailyCommitBatch.date <= end_date.date()
+            )
+
+            if project_keys:
+                query = query.filter(DailyCommitBatch.project_key.in_(project_keys))
+
+            if not force_reclassify:
+                # Only get batches that haven't been classified or failed
+                query = query.filter(
+                    DailyCommitBatch.classification_status.in_(["pending", "failed"])
+                )
+
+            batches = query.order_by(DailyCommitBatch.date).all()
+            logger.info(f"Found {len(batches)} batches needing classification")
+
+            # Debug: Log filtering criteria
+            logger.debug(
+                f"Query criteria: start_date={start_date.date()}, end_date={end_date.date()}"
+            )
+            if project_keys:
+                logger.debug(f"Project key filter: {project_keys}")
+            logger.debug(f"Force reclassify: {force_reclassify}")
+
+            return batches
+
+        except Exception as e:
+            logger.error(f"Error getting batches to process: {e}")
+            return []
+        finally:
+            session.close()
+
+    def _group_batches_by_repository(
+        self, batches: list[DailyCommitBatch]
+    ) -> dict[tuple[str, str], list[DailyCommitBatch]]:
+        """Group daily batches by repository for granular progress reporting."""
+        repo_batches = defaultdict(list)
+
+        for batch in batches:
+            # Use (project_key, repo_path) as the key for unique repository identification
+            repo_key = (batch.project_key, batch.repo_path)
+            repo_batches[repo_key].append(batch)
+
+        # Sort each repository's batches by date
+        for batches_list in repo_batches.values():
+            batches_list.sort(key=lambda b: b.date)
+
+        return dict(repo_batches)
+
+    def _group_batches_by_week(
+        self, batches: list[DailyCommitBatch]
+    ) -> dict[datetime, list[DailyCommitBatch]]:
+        """Group daily batches by week for optimal context window."""
+        weekly_batches = defaultdict(list)
+
+        for batch in batches:
+            # Get Monday of the week
+            batch_date = datetime.combine(batch.date, datetime.min.time())
+            days_since_monday = batch_date.weekday()
+            week_start = batch_date - timedelta(days=days_since_monday)
+
+            weekly_batches[week_start].append(batch)
+
+        # Sort each week's batches by date
+        for week_batches in weekly_batches.values():
+            week_batches.sort(key=lambda b: b.date)
+
+        return dict(weekly_batches)
+
+    def _classify_weekly_batches(self, weekly_batches: list[DailyCommitBatch]) -> dict[str, Any]:
+        """Classify all batches for a single week with shared context."""
+        session = self.database.get_session()
+        batches_processed = 0
+        commits_processed = 0
+
+        try:
+            # Collect all commits for the week
+            week_commits = []
+            batch_commit_map = {}  # Maps commit hash to batch
+
+            for batch in weekly_batches:
+                # Mark batch as processing
+                batch.classification_status = "processing"
+
+                # Get commits for this day
+                daily_commits = self._get_commits_for_batch(session, batch)
+                week_commits.extend(daily_commits)
+
+                # Track which batch each commit belongs to
+                for commit in daily_commits:
+                    batch_commit_map[commit["commit_hash"]] = batch
+
+            if not week_commits:
+                logger.warning(
+                    f"No commits found for weekly batches (expected {sum(batch.commit_count for batch in weekly_batches)} commits)"
+                )
+                # Mark batches as failed due to missing commits
+                for batch in weekly_batches:
+                    batch.classification_status = "failed"
+                    batch.classified_at = datetime.utcnow()
+                session.commit()
+                return {"batches_processed": 0, "commits_processed": 0}
+
+            # Get ticket context for the week
+            week_tickets = self._get_ticket_context_for_commits(session, week_commits)
+
+            # Process commits in batches (respecting API limits)
+            classified_commits = []
+            num_batches = (len(week_commits) + self.batch_size - 1) // self.batch_size
+
+            # Use centralized progress service for batch processing
+            progress = get_progress_service()
+
+            # Add progress bar for batch processing within the week
+            with progress.progress(
+                total=num_batches,
+                description=" Processing batches",
+                unit="batch",
+                nested=True,
+                leave=False,
+            ) as batch_ctx:
+                for i in range(0, len(week_commits), self.batch_size):
+                    batch_num = i // self.batch_size + 1
+                    batch_commits = week_commits[i : i + self.batch_size]
+                    progress.set_description(
+                        batch_ctx,
+                        f" API batch {batch_num}/{num_batches} ({len(batch_commits)} commits)",
+                    )
+                    logger.info(f"Classifying batch {batch_num}: {len(batch_commits)} commits")
+
+                    # Classify this batch with LLM
+                    batch_results = self._classify_commit_batch_with_llm(
+                        batch_commits, week_tickets
+                    )
+                    classified_commits.extend(batch_results)
+
+                    progress.update(batch_ctx, 1)
+                    # Update description to show total classified commits
+                    progress.set_description(
+                        batch_ctx,
+                        f" API batch {batch_num}/{num_batches} - Total: {len(classified_commits)} commits",
+                    )
+
+            # Store classification results
+            for commit_result in classified_commits:
+                self._store_commit_classification(session, commit_result)
+                commits_processed += 1
+
+            # Mark all daily batches as completed
+            for batch in weekly_batches:
+                batch.classification_status = "completed"
+                batch.classified_at = datetime.utcnow()
+                batches_processed += 1
+
+            session.commit()
+
+            logger.info(
+                f"Week classification completed: {batches_processed} batches, {commits_processed} commits"
+            )
+
+        except Exception as e:
+            logger.error(f"Error in weekly batch classification: {e}")
+            # Mark batches as failed
+            for batch in weekly_batches:
+                batch.classification_status = "failed"
+            session.rollback()
+        finally:
+            session.close()
+
+        return {
+            "batches_processed": batches_processed,
+            "commits_processed": commits_processed,
+        }
+
+    def _get_commits_for_batch(self, session: Any, batch: DailyCommitBatch) -> list[dict[str, Any]]:
+        """Get all commits for a daily batch."""
+        try:
+            # Get cached commits for this batch
+            # CRITICAL FIX: CachedCommit.timestamp is timezone-aware UTC (from analyzer.py line 806)
+            # but we were creating timezone-naive boundaries, causing comparison to fail
+            # Create timezone-aware UTC boundaries to match CachedCommit.timestamp format
+            start_of_day = datetime.combine(batch.date, datetime.min.time(), tzinfo=timezone.utc)
+            end_of_day = datetime.combine(batch.date, datetime.max.time(), tzinfo=timezone.utc)
+
+            logger.debug(
+                f"Searching for commits in {batch.repo_path} between {start_of_day} and {end_of_day}"
+            )
+
+            commits = (
+                session.query(CachedCommit)
+                .filter(
+                    CachedCommit.repo_path == batch.repo_path,
+                    CachedCommit.timestamp >= start_of_day,
+                    CachedCommit.timestamp < end_of_day,
+                )
+                .all()
+            )
+
+            logger.debug(f"Found {len(commits)} commits for batch on {batch.date}")
+
+            commit_list = []
+            for commit in commits:
+                commit_data = {
+                    "commit_hash": commit.commit_hash,
+                    "commit_hash_short": commit.commit_hash[:7],
+                    "message": commit.message,
+                    "author_name": commit.author_name,
+                    "author_email": commit.author_email,
+                    "timestamp": commit.timestamp,
+                    "branch": commit.branch,
+                    "project_key": batch.project_key,
+                    "repo_path": commit.repo_path,
+                    "files_changed": commit.files_changed or 0,
+                    "lines_added": commit.insertions or 0,
+                    "lines_deleted": commit.deletions or 0,
+                    "story_points": commit.story_points,
+                    "ticket_references": commit.ticket_references or [],
+                }
+                commit_list.append(commit_data)
+
+            return commit_list
+
+        except Exception as e:
+            logger.error(f"Error getting commits for batch {batch.id}: {e}")
+            return []
+
+    def _get_ticket_context_for_commits(
+        self, session: Any, commits: list[dict[str, Any]]
+    ) -> dict[str, dict[str, Any]]:
+        """Get ticket context for a list of commits."""
+        # Extract all ticket references from commits
+        all_ticket_ids = set()
+        for commit in commits:
+            ticket_refs = commit.get("ticket_references", [])
+            all_ticket_ids.update(ticket_refs)
+
+        if not all_ticket_ids:
+            return {}
+
+        try:
+            # Get detailed ticket information
+            tickets = (
+                session.query(DetailedTicketData)
+                .filter(DetailedTicketData.ticket_id.in_(all_ticket_ids))
+                .all()
+            )
+
+            ticket_context = {}
+            for ticket in tickets:
+                ticket_context[ticket.ticket_id] = {
+                    "title": ticket.title,
+                    "description": (
+                        ticket.summary or ticket.description[:200] if ticket.description else ""
+                    ),
+                    "ticket_type": ticket.ticket_type,
+                    "status": ticket.status,
+                    "labels": ticket.labels or [],
+                    "classification_hints": ticket.classification_hints or [],
+                    "business_domain": ticket.business_domain,
+                }
+
+            logger.info(f"Retrieved context for {len(ticket_context)} tickets")
+            return ticket_context
+
+        except Exception as e:
+            logger.error(f"Error getting ticket context: {e}")
+            return {}
+
+    def _classify_commit_batch_with_llm(
+        self,
+        commits: list[dict[str, Any]],
+        ticket_context: dict[str, dict[str, Any]],
+    ) -> list[dict[str, Any]]:
+        """Classify a batch of commits using LLM with ticket context."""
+        batch_id = str(uuid.uuid4())
+        logger.info(f"Starting LLM classification for batch {batch_id} with {len(commits)} commits")
+
+        # Prepare batch for LLM classification
+        enhanced_commits = []
+        for commit in commits:
+            enhanced_commit = commit.copy()
+
+            # Add ticket context to commit
+            ticket_refs = commit.get("ticket_references", [])
+            relevant_tickets = []
+            for ticket_id in ticket_refs:
+                if ticket_id in ticket_context:
+                    relevant_tickets.append(ticket_context[ticket_id])
+
+            enhanced_commit["ticket_context"] = relevant_tickets
+            enhanced_commits.append(enhanced_commit)
+
+        try:
+            # Use LLM classifier with enhanced context
+            llm_results = self.llm_classifier.classify_commits_batch(
+                enhanced_commits, batch_id=batch_id, include_confidence=True
+            )
+
+            # Process LLM results and add fallbacks
+            processed_results = []
+            for commit, llm_result in zip(commits, llm_results):
+                confidence = llm_result.get("confidence", 0.0)
+                predicted_category = llm_result.get("category", "other")
+
+                # Apply confidence threshold and fallback
+                if confidence < self.confidence_threshold and self.fallback_enabled:
+                    fallback_category = self._fallback_classify_commit(commit)
+                    processed_results.append(
+                        {
+                            "commit_hash": commit["commit_hash"],
+                            "category": fallback_category,
+                            "confidence": 0.5,  # Medium confidence for rule-based
+                            "method": "fallback",
+                            "llm_category": predicted_category,
+                            "llm_confidence": confidence,
+                            "batch_id": batch_id,
+                        }
+                    )
+                else:
+                    processed_results.append(
+                        {
+                            "commit_hash": commit["commit_hash"],
+                            "category": predicted_category,
+                            "confidence": confidence,
+                            "method": "llm",
+                            "batch_id": batch_id,
+                        }
+                    )
+
+            logger.info(
+                f"LLM classification completed for batch {batch_id}: {len(processed_results)} commits"
+            )
+            return processed_results
+
+        except Exception as e:
+            logger.error(f"LLM classification failed for batch {batch_id}: {e}")
+
+            # Fall back to rule-based classification for entire batch
+            if self.fallback_enabled:
+                fallback_results = []
+                for commit in commits:
+                    category = self._fallback_classify_commit(commit)
+                    fallback_results.append(
+                        {
+                            "commit_hash": commit["commit_hash"],
+                            "category": category,
+                            "confidence": 0.3,  # Low confidence for fallback
+                            "method": "fallback_only",
+                            "error": str(e),
+                            "batch_id": batch_id,
+                        }
+                    )
+
+                logger.info(f"Fallback classification completed for batch {batch_id}")
+                return fallback_results
+
+            return []
+
+    def _fallback_classify_commit(self, commit: dict[str, Any]) -> str:
+        """Classify commit using rule-based patterns."""
+        import re
+
+        message = commit.get("message", "").lower()
+
+        # Check patterns in order of specificity
+        for category, patterns in self.fallback_patterns.items():
+            for pattern in patterns:
+                if re.search(pattern, message, re.IGNORECASE):
+                    return category
+
+        # Default category
+        return "other"
+
+    def _store_commit_classification(
+        self, session: Any, classification_result: dict[str, Any]
+    ) -> None:
+        """Store classification result in cached commit record."""
+        try:
+            commit_hash = classification_result["commit_hash"]
+
+            # Find the cached commit record
+            cached_commit = (
+                session.query(CachedCommit).filter(CachedCommit.commit_hash == commit_hash).first()
+            )
+
+            if cached_commit:
+                # Store classification in ticket_references as a temporary solution
+                # In production, this would go in a separate classification table
+                if not hasattr(cached_commit, "classification_data"):
+                    cached_commit.ticket_references = cached_commit.ticket_references or []
+
+                # Build the classification record for this commit
+                # Note: This is a simplified approach - in production you'd want a separate
+                # table; the record is assembled here but only logged, not yet persisted
+                classification_record = {
+                    "category": classification_result["category"],
+                    "confidence": classification_result["confidence"],
+                    "method": classification_result["method"],
+                    "classified_at": datetime.utcnow().isoformat(),
+                    "batch_id": classification_result.get("batch_id"),
+                }
+
+                # Store in a JSON field or separate table in production
+                logger.debug(
+                    f"Classified commit {commit_hash[:7]} as {classification_record['category']}"
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Error storing classification for {classification_result.get('commit_hash', 'unknown')}: {e}"
+            )
+
+    def _store_daily_metrics(
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        project_keys: Optional[list[str]],
+    ) -> None:
+        """Store aggregated daily metrics from classification results."""
+        from ..core.metrics_storage import DailyMetricsStorage
+
+        try:
+            # Placeholder wiring: instantiate the storage backend but leave it unused
+            _metrics_storage = DailyMetricsStorage(self.cache_dir / "gitflow_cache.db")
+
+            # This would typically aggregate from the classification results
+            # For now, we'll let the existing system handle this
+            logger.info("Daily metrics storage integration placeholder")
+
+        except Exception as e:
+            logger.error(f"Error storing daily metrics: {e}")
+
+    def get_classification_status(
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        project_keys: Optional[list[str]] = None,
+    ) -> dict[str, Any]:
+        """Get classification status for a date range."""
+        session = self.database.get_session()
+
+        try:
+            query = session.query(DailyCommitBatch).filter(
+                DailyCommitBatch.date >= start_date.date(), DailyCommitBatch.date <= end_date.date()
+            )
+
+            if project_keys:
+                query = query.filter(DailyCommitBatch.project_key.in_(project_keys))
+
+            batches = query.all()
+
+            status_counts = defaultdict(int)
+            total_commits = 0
+
+            for batch in batches:
+                status_counts[batch.classification_status] += 1
+                total_commits += batch.commit_count
+
+            return {
+                "total_batches": len(batches),
+                "total_commits": total_commits,
+                "status_breakdown": dict(status_counts),
+                "completion_rate": status_counts["completed"] / len(batches) if batches else 0.0,
+                "date_range": {"start": start_date, "end": end_date},
+            }
+
+        except Exception as e:
+            logger.error(f"Error getting classification status: {e}")
+            return {}
+        finally:
+            session.close()
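
For orientation, here is a minimal usage sketch of the new classification entry point, assembled only from the signatures visible in this diff (BatchCommitClassifier.__init__, classify_date_range, get_classification_status). It is a sketch, not code shipped in the package: the cache directory, project key, date range, and API key below are illustrative placeholders, and it assumes the fetch step of the two-step process has already populated gitflow_cache.db under the cache directory.

from datetime import datetime, timezone
from pathlib import Path

from gitflow_analytics.classification.batch_classifier import BatchCommitClassifier

# Hypothetical cache directory, populated beforehand by the fetch step.
cache_dir = Path.home() / ".gitflow" / "cache"

classifier = BatchCommitClassifier(
    cache_dir=cache_dir,
    llm_config={"api_key": "sk-..."},  # remaining keys fall back to LLMConfig defaults
    batch_size=50,                     # clamped to 50 internally for token limits
    confidence_threshold=0.7,          # below this, the rule-based fallback is used
    fallback_enabled=True,
)

start = datetime(2024, 1, 1, tzinfo=timezone.utc)
end = datetime(2024, 1, 31, tzinfo=timezone.utc)

# Classify one month of cached commits for a single (hypothetical) project key.
results = classifier.classify_date_range(start, end, project_keys=["PROJ"])
print(results["processed_batches"], results["total_commits"])

# Inspect completion afterwards without forcing reclassification.
status = classifier.get_classification_status(start, end, project_keys=["PROJ"])
print(status.get("status_breakdown"), status.get("completion_rate"))

Note the design visible in the diff: even with an API key configured, low-confidence LLM answers are replaced by the regex fallback (method "fallback"), and a failed API call downgrades the whole batch to "fallback_only" at confidence 0.3, so downstream reports can distinguish how each category was assigned.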