gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1285 @@
1
+ """Data fetcher for collecting raw git commits and ticket data without classification.
2
+
3
+ This module implements the first step of the two-step fetch/analyze process,
4
+ focusing purely on data collection from Git repositories and ticket systems
5
+ without performing any LLM-based classification.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import subprocess
11
+ from datetime import datetime, timedelta, timezone
12
+ from pathlib import Path
13
+ from typing import Any, Optional
14
+
15
+ from sqlalchemy.orm import Session
16
+
17
+ from ..extractors.story_points import StoryPointExtractor
18
+ from ..extractors.tickets import TicketExtractor
19
+ from ..integrations.jira_integration import JIRAIntegration
20
+ from ..models.database import (
21
+ CachedCommit,
22
+ CommitTicketCorrelation,
23
+ DailyCommitBatch,
24
+ DetailedTicketData,
25
+ )
26
+ from .branch_mapper import BranchToProjectMapper
27
+ from .cache import GitAnalysisCache
28
+ from .identity import DeveloperIdentityResolver
29
+ from .progress import get_progress_service
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ class GitDataFetcher:
35
+ """Fetches raw Git commit data and organizes it by day for efficient batch processing.
36
+
37
+ WHY: This class implements the first step of the two-step process by collecting
38
+ all raw data (commits, tickets, correlations) without performing classification.
39
+ This separation enables:
40
+ - Fast data collection without LLM costs
41
+ - Repeatable analysis runs without re-fetching
42
+ - Better batch organization for efficient LLM classification
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ cache: GitAnalysisCache,
48
+ branch_mapping_rules: Optional[dict[str, list[str]]] = None,
49
+ allowed_ticket_platforms: Optional[list[str]] = None,
50
+ exclude_paths: Optional[list[str]] = None,
51
+ ):
52
+ """Initialize the data fetcher.
53
+
54
+ Args:
55
+ cache: Git analysis cache instance
56
+ branch_mapping_rules: Rules for mapping branches to projects
57
+ allowed_ticket_platforms: List of allowed ticket platforms
58
+ exclude_paths: List of file paths to exclude from analysis
59
+ """
60
+ self.cache = cache
61
+ # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
62
+ self.database = cache.db
63
+ self.story_point_extractor = StoryPointExtractor()
64
+ self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
65
+ self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
66
+ self.exclude_paths = exclude_paths or []
67
+
68
+ # Initialize identity resolver
69
+ identity_db_path = cache.cache_dir / "identities.db"
70
+ self.identity_resolver = DeveloperIdentityResolver(identity_db_path)
71
+
72
    def fetch_repository_data(
        self,
        repo_path: Path,
        project_key: str,
        weeks_back: int = 4,
        branch_patterns: Optional[list[str]] = None,
        jira_integration: Optional[JIRAIntegration] = None,
        progress_callback: Optional[callable] = None,  # NOTE(review): `callable` is the builtin, not typing.Callable — works at runtime but is imprecise; confirm before tightening
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
    ) -> dict[str, Any]:
        """Fetch all data for a repository and organize by day.

        This method collects:
        1. All commits organized by day
        2. All referenced tickets with full metadata
        3. Commit-ticket correlations
        4. Developer identity mappings

        Args:
            repo_path: Path to the Git repository
            project_key: Project identifier
            weeks_back: Number of weeks to analyze (used only if start_date/end_date not provided)
            branch_patterns: Branch patterns to include
            jira_integration: JIRA integration for ticket data
            progress_callback: Optional callback for progress updates
            start_date: Optional explicit start date (overrides weeks_back calculation)
            end_date: Optional explicit end date (overrides weeks_back calculation)

        Returns:
            Dictionary containing fetch results and statistics, including both the
            number of commits we attempted to store ("total_commits") and the number
            actually verified in the database ("stored_commits").
        """
        logger.info("🔍 DEBUG: ===== FETCH METHOD CALLED =====")
        logger.info(f"Starting data fetch for project {project_key} at {repo_path}")
        logger.info(f"🔍 DEBUG: weeks_back={weeks_back}, repo_path={repo_path}")

        # Calculate date range - use explicit dates if provided, otherwise calculate from weeks_back.
        # Both dates must be supplied together; if only one is given, the weeks_back path
        # runs and overwrites both.
        if start_date is not None and end_date is not None:
            logger.info(f"🔍 DEBUG: Using explicit date range: {start_date} to {end_date}")
        else:
            end_date = datetime.now(timezone.utc)
            start_date = end_date - timedelta(weeks=weeks_back)
            logger.info(
                f"🔍 DEBUG: Calculated date range from weeks_back: {start_date} to {end_date}"
            )

        # Get progress service for top-level progress tracking
        progress = get_progress_service()

        logger.info("🔍 DEBUG: About to fetch commits by day")
        logger.info("Fetching commits organized by day...")

        # Top-level progress for this repository: three coarse steps.
        with progress.progress(
            total=3,  # Three main steps: fetch commits, extract tickets, store data
            description=f"Processing {project_key}",
            unit="steps",
        ) as repo_progress_ctx:

            # Step 1: Fetch commits, grouped by calendar day.
            progress.set_description(repo_progress_ctx, f"{project_key}: Fetching commits")
            daily_commits = self._fetch_commits_by_day(
                repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
            )
            logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
            progress.update(repo_progress_ctx)

            # Step 2: Extract ticket references and (optionally) fetch ticket details from JIRA.
            progress.set_description(repo_progress_ctx, f"{project_key}: Processing tickets")
            logger.info("🔍 DEBUG: About to extract ticket references")
            logger.info("Extracting ticket references...")
            ticket_ids = self._extract_all_ticket_references(daily_commits)
            logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")

            # Ticket detail fetch is skipped entirely when no JIRA integration is configured.
            if jira_integration and ticket_ids:
                logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
                self._fetch_detailed_tickets(
                    ticket_ids, jira_integration, project_key, progress_callback
                )

            # Build commit-ticket correlations (done even without JIRA details).
            logger.info("Building commit-ticket correlations...")
            correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
            progress.update(repo_progress_ctx)

            # Step 3: Store daily commit batches.
            progress.set_description(repo_progress_ctx, f"{project_key}: Storing data")
            logger.info(
                f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
            )
            logger.info("Storing daily commit batches...")
            batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
            logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
            progress.update(repo_progress_ctx)

        # CRITICAL FIX: Verify actual storage before reporting success.
        # NOTE(review): if the sum() below ever raised, expected_commits would be
        # unbound when building `results` — in practice it cannot raise on a dict
        # of lists, but worth keeping in mind if the data shape changes.
        session = self.database.get_session()
        try:
            expected_commits = sum(len(commits) for commits in daily_commits.values())
            verification_result = self._verify_commit_storage(
                session, daily_commits, repo_path, expected_commits
            )
            actual_stored_commits = verification_result["total_found"]
        except Exception as e:
            logger.error(f"❌ Final storage verification failed: {e}")
            # Don't let verification failure break the return, but log it clearly
            actual_stored_commits = 0
        finally:
            session.close()

        # Return summary statistics with ACTUAL stored counts
        # expected_commits already calculated above

        results = {
            "project_key": project_key,
            "repo_path": str(repo_path),
            "date_range": {"start": start_date, "end": end_date},
            "stats": {
                "total_commits": expected_commits,  # What we tried to store
                "stored_commits": actual_stored_commits,  # What was actually stored
                "storage_success": actual_stored_commits == expected_commits,
                "days_with_commits": len(daily_commits),
                "unique_tickets": len(ticket_ids),
                "correlations_created": correlations_created,
                "batches_created": batches_created,
            },
            "daily_commits": daily_commits,  # For immediate use if needed
        }

        # Log with actual storage results: success at info level, shortfall at error level.
        if actual_stored_commits == expected_commits:
            logger.info(
                f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
            )
        else:
            logger.error(
                f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
            )

        return results
213
+
214
    def _fetch_commits_by_day(
        self,
        repo_path: Path,
        project_key: str,
        start_date: datetime,
        end_date: datetime,
        branch_patterns: Optional[list[str]],
        progress_callback: Optional[callable] = None,
    ) -> dict[str, list[dict[str, Any]]]:
        """Fetch all commits organized by day with full metadata.

        Iterates every calendar day in [start_date, end_date] and, for each day,
        every analyzable branch — deduplicating commits by hash so a commit
        reachable from several branches is counted once (attributed to the first
        branch that yields it). Each day's commits are cached incrementally as
        soon as that day completes.

        Returns:
            Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data;
            days with no commits are omitted.
        """
        from git import Repo

        try:
            repo = Repo(repo_path)
            # Update repository from remote before analysis
            self._update_repository(repo)
        except Exception as e:
            logger.error(f"Failed to open repository at {repo_path}: {e}")
            return {}

        # Get branches to analyze
        branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)

        if not branches_to_analyze:
            logger.warning(f"No accessible branches found in repository {repo_path}")
            return {}

        logger.info(f"Analyzing branches: {branches_to_analyze}")

        # Calculate days to process (inclusive of both endpoints' dates).
        current_date = start_date.date()
        end_date_only = end_date.date()
        days_to_process = []
        while current_date <= end_date_only:
            days_to_process.append(current_date)
            current_date += timedelta(days=1)

        logger.info(
            f"Processing {len(days_to_process)} days from {start_date.date()} to {end_date.date()}"
        )

        # Get progress service for nested progress tracking
        progress = get_progress_service()

        # Dictionary to store commits by day
        daily_commits = {}
        all_commit_hashes = set()  # Track all hashes for deduplication across branches/days

        # Create nested progress for day-by-day processing
        with progress.progress(
            total=len(days_to_process),
            description=f"Fetching commits for {project_key}",
            unit="days",
            nested=True,
        ) as day_progress_ctx:

            for day_date in days_to_process:
                # Update description to show current day
                day_str = day_date.strftime("%Y-%m-%d")
                progress.set_description(day_progress_ctx, f"{project_key}: Processing {day_str}")

                # Calculate day boundaries (UTC midnight to one microsecond before next midnight)
                day_start = datetime.combine(day_date, datetime.min.time(), tzinfo=timezone.utc)
                day_end = datetime.combine(day_date, datetime.max.time(), tzinfo=timezone.utc)

                day_commits = []
                commits_found_today = 0

                # Process each branch for this specific day; a failing branch is
                # logged and skipped rather than aborting the whole day.
                for branch_name in branches_to_analyze:
                    try:
                        # Fetch commits for this specific day and branch.
                        # NOTE(review): since/until filter on commit date — assumes
                        # commit timestamps are sane; rewritten/backdated commits may
                        # land on unexpected days.
                        branch_commits = list(
                            repo.iter_commits(
                                branch_name, since=day_start, until=day_end, reverse=False
                            )
                        )

                        for commit in branch_commits:
                            # Skip if we've already processed this commit
                            if commit.hexsha in all_commit_hashes:
                                continue

                            # Extract commit data with full metadata
                            commit_data = self._extract_commit_data(
                                commit, branch_name, project_key, repo_path
                            )
                            if commit_data:
                                day_commits.append(commit_data)
                                all_commit_hashes.add(commit.hexsha)
                                commits_found_today += 1

                    except Exception as e:
                        logger.warning(
                            f"Error processing branch {branch_name} for day {day_str}: {e}"
                        )
                        continue

                # Store commits for this day if any were found
                if day_commits:
                    # Sort commits by timestamp
                    day_commits.sort(key=lambda c: c["timestamp"])
                    daily_commits[day_str] = day_commits

                    # Incremental caching - store commits for this day immediately
                    # so partial progress survives an interrupted run.
                    self._store_day_commits_incremental(
                        repo_path, day_str, day_commits, project_key
                    )

                logger.debug(f"Found {commits_found_today} commits on {day_str}")

                # Update progress callback if provided
                if progress_callback:
                    progress_callback(f"Processed {day_str}: {commits_found_today} commits")

                # Update progress bar
                progress.update(day_progress_ctx)

        total_commits = sum(len(commits) for commits in daily_commits.values())
        logger.info(f"Collected {total_commits} unique commits across {len(daily_commits)} days")
        return daily_commits
339
+
340
    def _extract_commit_data(
        self, commit: Any, branch_name: str, project_key: str, repo_path: Path
    ) -> Optional[dict[str, Any]]:
        """Extract comprehensive data from a Git commit.

        Builds a flat dict of commit metadata, file-change stats, story points,
        ticket references, and the resolved canonical developer identity.

        Returns:
            Dictionary containing all commit metadata needed for classification,
            or None if extraction failed (the error is logged).
        """
        try:
            # Basic commit information
            commit_data = {
                "commit_hash": commit.hexsha,
                "commit_hash_short": commit.hexsha[:7],
                "message": commit.message.strip(),
                "author_name": commit.author.name,
                "author_email": commit.author.email,
                # committed_date is a POSIX timestamp; normalize to aware UTC.
                "timestamp": datetime.fromtimestamp(commit.committed_date, tz=timezone.utc),
                "branch": branch_name,
                "project_key": project_key,
                "repo_path": str(repo_path),
                "is_merge": len(commit.parents) > 1,  # Match the original analyzer behavior
            }

            # Calculate file changes
            try:
                # Compare with first parent or empty tree for initial commit.
                # NOTE(review): for a parentless commit, commit.diff(None) diffs
                # against the working tree, not the empty tree — confirm this is
                # the intended behavior for initial commits.
                diff = commit.parents[0].diff(commit) if commit.parents else commit.diff(None)

                # Get file paths with filtering (exclusion globs from config)
                files_changed = []
                for diff_item in diff:
                    # a_path is None for added files, b_path for deleted ones.
                    file_path = diff_item.a_path or diff_item.b_path
                    if file_path and not self._should_exclude_file(file_path):
                        files_changed.append(file_path)

                # Use reliable git numstat command for accurate line counts
                line_stats = self._calculate_commit_stats(commit)
                total_insertions = line_stats["insertions"]
                total_deletions = line_stats["deletions"]

                commit_data.update(
                    {
                        "files_changed": files_changed,
                        "files_changed_count": len(files_changed),
                        "lines_added": total_insertions,
                        "lines_deleted": total_deletions,
                    }
                )

            except Exception as e:
                # Diff/stat failures degrade to zeroed stats rather than dropping the commit.
                logger.debug(f"Error calculating changes for commit {commit.hexsha}: {e}")
                commit_data.update(
                    {
                        "files_changed": [],
                        "files_changed_count": 0,
                        "lines_added": 0,
                        "lines_deleted": 0,
                    }
                )

            # Extract story points from the commit message
            story_points = self.story_point_extractor.extract_from_text(commit_data["message"])
            commit_data["story_points"] = story_points

            # Extract ticket references
            ticket_refs_data = self.ticket_extractor.extract_from_text(commit_data["message"])
            # Convert to list of ticket IDs for compatibility
            # Fix: Use 'id' field instead of 'ticket_id' field from extractor output
            ticket_refs = [ref_data["id"] for ref_data in ticket_refs_data]
            commit_data["ticket_references"] = ticket_refs

            # Resolve developer identity to a canonical ID (handles aliases/emails)
            canonical_id = self.identity_resolver.resolve_developer(
                commit_data["author_name"], commit_data["author_email"]
            )
            commit_data["canonical_developer_id"] = canonical_id

            return commit_data

        except Exception as e:
            logger.error(f"Error extracting data for commit {commit.hexsha}: {e}")
            return None
422
+
423
+ def _should_exclude_file(self, file_path: str) -> bool:
424
+ """Check if a file should be excluded based on exclude patterns."""
425
+ import fnmatch
426
+
427
+ return any(fnmatch.fnmatch(file_path, pattern) for pattern in self.exclude_paths)
428
+
429
    def _get_branches_to_analyze(
        self, repo: Any, branch_patterns: Optional[list[str]]
    ) -> list[str]:
        """Get list of branches to analyze based on patterns.

        WHY: Robust branch detection that handles missing remotes, missing default branches,
        and provides good fallback behavior. When no patterns specified, analyzes ALL branches
        to capture the complete development picture.

        DESIGN DECISION:
        - When no patterns: analyze ALL accessible branches (not just main)
        - When patterns specified: match against those patterns only
        - Handle missing remotes gracefully
        - Skip remote tracking branches to avoid duplicates
        - Use actual branch existence checking rather than assuming branches exist

        Args:
            repo: GitPython Repo object.
            branch_patterns: Optional fnmatch-style glob patterns; None means all branches.

        Returns:
            List of branch names verified to be iterable; empty list if none work.
        """
        # Collect all available branches (local branches preferred)
        available_branches = []

        # First, try local branches
        try:
            local_branches = [branch.name for branch in repo.branches]
            available_branches.extend(local_branches)
            logger.debug(f"Found local branches: {local_branches}")
        except Exception as e:
            logger.debug(f"Error getting local branches: {e}")

        # If we have remotes, also consider remote branches (but clean the names
        # by stripping the "origin/" prefix so they match local naming)
        try:
            if repo.remotes and hasattr(repo.remotes, "origin"):
                remote_branches = [
                    ref.name.replace("origin/", "")
                    for ref in repo.remotes.origin.refs
                    if not ref.name.endswith("HEAD")  # Skip HEAD ref
                ]
                # Only add remote branches that aren't already in local branches
                for branch in remote_branches:
                    if branch not in available_branches:
                        available_branches.append(branch)
                logger.debug(f"Found remote branches: {remote_branches}")
        except Exception as e:
            logger.debug(f"Error getting remote branches: {e}")

        # If no branches found, fallback to trying common names directly;
        # accessibility is verified below, so nonexistent names are filtered out.
        if not available_branches:
            logger.warning("No branches found via normal detection, falling back to common names")
            available_branches = ["main", "master", "develop", "dev"]

        # Filter branches based on patterns if provided
        if branch_patterns:
            import fnmatch

            matching_branches = []
            for pattern in branch_patterns:
                matching = [
                    branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)
                ]
                matching_branches.extend(matching)
            # Remove duplicates while preserving order
            branches_to_test = list(dict.fromkeys(matching_branches))
        else:
            # No patterns specified - analyze ALL branches for complete coverage
            branches_to_test = available_branches
            logger.info(
                f"No branch patterns specified - will analyze all {len(branches_to_test)} branches"
            )

        # Test that branches are actually accessible (iter_commits raises for
        # names that don't resolve to a ref)
        accessible_branches = []
        for branch in branches_to_test:
            try:
                next(iter(repo.iter_commits(branch, max_count=1)), None)
                accessible_branches.append(branch)
            except Exception as e:
                logger.debug(f"Branch {branch} not accessible: {e}")

        if not accessible_branches:
            # Last resort: try to find ANY working branch
            logger.warning("No accessible branches found from patterns/default, trying fallback")
            main_branches = ["main", "master", "develop", "dev"]
            for branch in main_branches:
                if branch in available_branches:
                    try:
                        next(iter(repo.iter_commits(branch, max_count=1)), None)
                        logger.info(f"Using fallback main branch: {branch}")
                        return [branch]
                    except Exception:
                        continue

            # Try any available branch
            for branch in available_branches:
                try:
                    next(iter(repo.iter_commits(branch, max_count=1)), None)
                    logger.info(f"Using fallback branch: {branch}")
                    return [branch]
                except Exception:
                    continue

            logger.warning("No accessible branches found")
            return []

        logger.info(f"Will analyze {len(accessible_branches)} branches: {accessible_branches}")
        return accessible_branches
532
+
533
    def _update_repository(self, repo) -> bool:
        """Update repository from remote before analysis.

        WHY: This ensures we have the latest commits from the remote repository
        before performing analysis. Critical for getting accurate data especially
        when analyzing repositories that are actively being developed.

        DESIGN DECISION: Uses fetch() for all cases, then pull() only when on a
        tracking branch that's not in detached HEAD state. This approach:
        - Handles detached HEAD states gracefully (common in CI/CD)
        - Always gets latest refs from remote via fetch
        - Only attempts pull when it's safe to do so
        - Continues analysis even if update fails (logs warning)

        Args:
            repo: GitPython Repo object

        Returns:
            bool: True if update succeeded, False if failed (but analysis continues)
        """
        try:
            if repo.remotes:
                logger.info("Fetching latest changes from remote")

                # Use subprocess for git fetch to prevent password prompts:
                # disable terminal prompting, askpass helpers, and GCM interactivity.
                env = os.environ.copy()
                env["GIT_TERMINAL_PROMPT"] = "0"
                env["GIT_ASKPASS"] = ""
                env["GCM_INTERACTIVE"] = "never"

                # Run git fetch with timeout
                try:
                    result = subprocess.run(
                        ["git", "fetch", "--all"],
                        cwd=repo.working_dir,
                        env=env,
                        capture_output=True,
                        text=True,
                        timeout=30,  # 30 second timeout
                    )

                    if result.returncode != 0:
                        # Distinguish auth failures (actionable for the user)
                        # from other fetch errors; both abort the update.
                        error_str = result.stderr.lower()
                        if any(
                            x in error_str
                            for x in ["authentication", "permission denied", "401", "403"]
                        ):
                            logger.error(
                                "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
                                "You can verify with: gh auth status"
                            )
                            # Skip fetch but continue with local analysis
                            return False
                        logger.warning(f"Git fetch failed: {result.stderr}")
                        return False

                except subprocess.TimeoutExpired:
                    logger.error(
                        "Git fetch timed out (likely authentication failure). Continuing with local repository state."
                    )
                    return False

                # Only try to pull if not in detached HEAD state
                if not repo.head.is_detached:
                    current_branch = repo.active_branch
                    tracking = current_branch.tracking_branch()
                    if tracking:
                        # Pull latest changes using subprocess (same hardened env)
                        try:
                            result = subprocess.run(
                                ["git", "pull"],
                                cwd=repo.working_dir,
                                env=env,
                                capture_output=True,
                                text=True,
                                timeout=30,
                            )

                            if result.returncode != 0:
                                error_str = result.stderr.lower()
                                if any(
                                    x in error_str
                                    for x in ["authentication", "permission denied", "401", "403"]
                                ):
                                    logger.error(
                                        "GitHub authentication failed during pull. Continuing with local repository state."
                                    )
                                    return False
                                logger.warning(f"Git pull failed: {result.stderr}")
                                return False
                            logger.debug(f"Pulled latest changes for {current_branch.name}")

                        except subprocess.TimeoutExpired:
                            logger.error(
                                "Git pull timed out. Continuing with local repository state."
                            )
                            return False
                    else:
                        logger.debug(
                            f"Branch {current_branch.name} has no tracking branch, skipping pull"
                        )
                else:
                    logger.debug("Repository in detached HEAD state, skipping pull")
                return True
            else:
                # Nothing to update from; local state is authoritative.
                logger.debug("No remotes configured, skipping repository update")
                return True
        except Exception as e:
            logger.warning(f"Could not update repository: {e}")
            # Continue with analysis using local state
            return False
644
+
645
+ def _extract_all_ticket_references(
646
+ self, daily_commits: dict[str, list[dict[str, Any]]]
647
+ ) -> set[str]:
648
+ """Extract all unique ticket IDs from commits."""
649
+ ticket_ids = set()
650
+
651
+ for day_commits in daily_commits.values():
652
+ for commit in day_commits:
653
+ ticket_refs = commit.get("ticket_references", [])
654
+ ticket_ids.update(ticket_refs)
655
+
656
+ logger.info(f"Found {len(ticket_ids)} unique ticket references")
657
+ return ticket_ids
658
+
659
    def _fetch_detailed_tickets(
        self,
        ticket_ids: set[str],
        jira_integration: JIRAIntegration,
        project_key: str,
        progress_callback: Optional[callable] = None,
    ) -> None:
        """Fetch detailed ticket information and store in database.

        Tickets already cached in ``DetailedTicketData`` (matched on ticket_id
        with platform "jira") are skipped; the remainder are fetched from JIRA
        one by one, in batches of 50, with a session commit after each batch so
        an interruption loses at most one batch of work.

        Args:
            ticket_ids: All ticket IDs referenced by the analyzed commits.
            jira_integration: JIRA client used to fetch individual issues.
            project_key: Project the detailed ticket records are attributed to.
            progress_callback: Optional callable invoked with a status string
                after each ticket is processed.

        Note:
            Errors are logged, the session is rolled back, and the method
            returns normally (best-effort); the session is always closed.
        """
        session = self.database.get_session()

        try:
            # Check which tickets we already have
            existing_tickets = (
                session.query(DetailedTicketData)
                .filter(
                    DetailedTicketData.ticket_id.in_(ticket_ids),
                    DetailedTicketData.platform == "jira",
                )
                .all()
            )

            existing_ids = {ticket.ticket_id for ticket in existing_tickets}
            tickets_to_fetch = ticket_ids - existing_ids

            if not tickets_to_fetch:
                logger.info("All tickets already cached")
                return

            logger.info(f"Fetching {len(tickets_to_fetch)} new tickets")

            # Fetch tickets in batches
            batch_size = 50
            tickets_list = list(tickets_to_fetch)

            # Use centralized progress service
            progress = get_progress_service()

            with progress.progress(
                total=len(tickets_list), description="Fetching tickets", unit="tickets"
            ) as ctx:
                for i in range(0, len(tickets_list), batch_size):
                    batch = tickets_list[i : i + batch_size]

                    for ticket_id in batch:
                        try:
                            # Fetch ticket from JIRA
                            issue_data = jira_integration.get_issue(ticket_id)

                            if issue_data:
                                # Create detailed ticket record
                                detailed_ticket = self._create_detailed_ticket_record(
                                    issue_data, project_key, "jira"
                                )
                                session.add(detailed_ticket)

                        except Exception as e:
                            # Per-ticket failures skip only that ticket; the
                            # progress bar still advances below.
                            logger.warning(f"Failed to fetch ticket {ticket_id}: {e}")

                        progress.update(ctx, 1)

                        if progress_callback:
                            progress_callback(f"Fetched ticket {ticket_id}")

                    # Commit batch to database
                    session.commit()

            logger.info(f"Successfully fetched {len(tickets_to_fetch)} tickets")

        except Exception as e:
            logger.error(f"Error fetching detailed tickets: {e}")
            session.rollback()
        finally:
            session.close()
732
+
733
+ def _create_detailed_ticket_record(
734
+ self, issue_data: dict[str, Any], project_key: str, platform: str
735
+ ) -> DetailedTicketData:
736
+ """Create a detailed ticket record from JIRA issue data."""
737
+ # Extract classification hints from issue type and labels
738
+ classification_hints = []
739
+
740
+ issue_type = issue_data.get("issue_type", "").lower()
741
+ if "bug" in issue_type or "defect" in issue_type:
742
+ classification_hints.append("bug_fix")
743
+ elif "story" in issue_type or "feature" in issue_type:
744
+ classification_hints.append("feature")
745
+ elif "task" in issue_type:
746
+ classification_hints.append("maintenance")
747
+
748
+ # Extract business domain from labels or summary
749
+ business_domain = None
750
+ labels = issue_data.get("labels", [])
751
+ for label in labels:
752
+ if any(keyword in label.lower() for keyword in ["frontend", "backend", "ui", "api"]):
753
+ business_domain = label.lower()
754
+ break
755
+
756
+ # Create the record
757
+ return DetailedTicketData(
758
+ platform=platform,
759
+ ticket_id=issue_data["key"],
760
+ project_key=project_key,
761
+ title=issue_data.get("summary", ""),
762
+ description=issue_data.get("description", ""),
763
+ summary=issue_data.get("summary", "")[:500], # Truncated summary
764
+ ticket_type=issue_data.get("issue_type", ""),
765
+ status=issue_data.get("status", ""),
766
+ priority=issue_data.get("priority", ""),
767
+ labels=labels,
768
+ assignee=issue_data.get("assignee", ""),
769
+ reporter=issue_data.get("reporter", ""),
770
+ created_at=issue_data.get("created"),
771
+ updated_at=issue_data.get("updated"),
772
+ resolved_at=issue_data.get("resolved"),
773
+ story_points=issue_data.get("story_points"),
774
+ classification_hints=classification_hints,
775
+ business_domain=business_domain,
776
+ platform_data=issue_data, # Store full JIRA data
777
+ )
778
+
779
+ def _build_commit_ticket_correlations(
780
+ self, daily_commits: dict[str, list[dict[str, Any]]], repo_path: Path
781
+ ) -> int:
782
+ """Build and store commit-ticket correlations."""
783
+ session = self.database.get_session()
784
+ correlations_created = 0
785
+
786
+ try:
787
+ for day_commits in daily_commits.values():
788
+ for commit in day_commits:
789
+ commit_hash = commit["commit_hash"]
790
+ ticket_refs = commit.get("ticket_references", [])
791
+
792
+ for ticket_id in ticket_refs:
793
+ try:
794
+ # Create correlation record
795
+ correlation = CommitTicketCorrelation(
796
+ commit_hash=commit_hash,
797
+ repo_path=str(repo_path),
798
+ ticket_id=ticket_id,
799
+ platform="jira", # Assuming JIRA for now
800
+ project_key=commit["project_key"],
801
+ correlation_type="direct",
802
+ confidence=1.0,
803
+ extracted_from="commit_message",
804
+ matching_pattern=None, # Could add pattern detection
805
+ )
806
+
807
+ # Check if correlation already exists
808
+ existing = (
809
+ session.query(CommitTicketCorrelation)
810
+ .filter(
811
+ CommitTicketCorrelation.commit_hash == commit_hash,
812
+ CommitTicketCorrelation.repo_path == str(repo_path),
813
+ CommitTicketCorrelation.ticket_id == ticket_id,
814
+ CommitTicketCorrelation.platform == "jira",
815
+ )
816
+ .first()
817
+ )
818
+
819
+ if not existing:
820
+ session.add(correlation)
821
+ correlations_created += 1
822
+
823
+ except Exception as e:
824
+ logger.warning(
825
+ f"Failed to create correlation for {commit_hash}-{ticket_id}: {e}"
826
+ )
827
+
828
+ session.commit()
829
+ logger.info(f"Created {correlations_created} commit-ticket correlations")
830
+
831
+ except Exception as e:
832
+ logger.error(f"Error building correlations: {e}")
833
+ session.rollback()
834
+ finally:
835
+ session.close()
836
+
837
+ return correlations_created
838
+
839
    def _store_daily_batches(
        self, daily_commits: dict[str, list[dict[str, Any]]], repo_path: Path, project_key: str
    ) -> int:
        """Store daily commit batches for efficient retrieval using bulk operations.

        WHY: Enhanced to use bulk operations from the cache layer for significantly
        better performance when storing large numbers of commits.

        Flow: pre-load any existing DailyCommitBatch rows, bulk-check which
        commits are already cached, create/update one batch row per day inside
        a no_autoflush block, then bulk-store all new commits at once and
        verify the store via _verify_commit_storage.

        Args:
            daily_commits: Commits grouped by YYYY-MM-DD date string.
            repo_path: Repository the commits belong to.
            project_key: Project identifier used to scope batch rows.

        Returns:
            Number of new DailyCommitBatch rows created. NOTE(review): on any
            top-level exception the error is logged, the session rolled back,
            and the partial count is still returned.
        """
        session = self.database.get_session()
        batches_created = 0
        commits_stored = 0
        expected_commits = 0

        try:
            total_commits = sum(len(commits) for commits in daily_commits.values())
            logger.info(f"🔍 DEBUG: Storing {total_commits} commits from {len(daily_commits)} days")
            logger.info(
                f"🔍 DEBUG: Daily commits keys: {list(daily_commits.keys())[:5]}"
            )  # First 5 dates

            # Track dates to process for efficient batch handling
            dates_to_process = [
                datetime.strptime(date_str, "%Y-%m-%d").date() for date_str in daily_commits
            ]
            logger.info(f"🔍 DEBUG: Processing {len(dates_to_process)} dates for daily batches")

            # Pre-load existing batches to avoid constraint violations during processing
            existing_batches_map = {}
            for date_str in daily_commits:
                # Convert to datetime instead of date to match database storage
                date_obj = datetime.strptime(date_str, "%Y-%m-%d")
                existing_batch = (
                    session.query(DailyCommitBatch)
                    .filter(
                        DailyCommitBatch.date == date_obj,
                        DailyCommitBatch.project_key == project_key,
                        DailyCommitBatch.repo_path == str(repo_path),
                    )
                    .first()
                )
                if existing_batch:
                    existing_batches_map[date_str] = existing_batch
                    logger.info(
                        f"🔍 DEBUG: Found existing batch for {date_str}: ID={existing_batch.id}"
                    )
                else:
                    logger.info(f"🔍 DEBUG: No existing batch found for {date_str}")

            # Collect all commits for bulk operations
            all_commits_to_store = []
            commit_hashes_to_check = []

            # First, collect all commit hashes to check existence in bulk
            for _date_str, commits in daily_commits.items():
                for commit in commits:
                    commit_hashes_to_check.append(commit["commit_hash"])

            # Use bulk_exists to check which commits already exist
            existing_commits_map = self.cache.bulk_exists(str(repo_path), commit_hashes_to_check)

            # Disable autoflush to prevent premature batch creation during commit storage
            with session.no_autoflush:
                for date_str, commits in daily_commits.items():
                    if not commits:
                        continue

                    logger.info(f"🔍 DEBUG: Processing {len(commits)} commits for {date_str}")

                    # Prepare commits for bulk storage
                    commits_to_store_this_date = []
                    for commit in commits:
                        # Check if commit already exists using bulk check results
                        if not existing_commits_map.get(commit["commit_hash"], False):
                            # Transform commit data to cache format
                            cache_format_commit = {
                                "hash": commit["commit_hash"],
                                "author_name": commit.get("author_name", ""),
                                "author_email": commit.get("author_email", ""),
                                "message": commit.get("message", ""),
                                "timestamp": commit["timestamp"],
                                "branch": commit.get("branch", "main"),
                                "is_merge": commit.get("is_merge", False),
                                "files_changed_count": commit.get("files_changed_count", 0),
                                "insertions": commit.get("lines_added", 0),
                                "deletions": commit.get("lines_deleted", 0),
                                "story_points": commit.get("story_points"),
                                "ticket_references": commit.get("ticket_references", []),
                            }
                            commits_to_store_this_date.append(cache_format_commit)
                            all_commits_to_store.append(cache_format_commit)
                            expected_commits += 1
                        else:
                            logger.debug(
                                f"Commit {commit['commit_hash'][:7]} already exists in database"
                            )

                    logger.info(
                        f"🔍 DEBUG: Prepared {len(commits_to_store_this_date)} new commits for {date_str}"
                    )

                    # Calculate batch statistics
                    total_files = sum(commit.get("files_changed_count", 0) for commit in commits)
                    total_additions = sum(commit.get("lines_added", 0) for commit in commits)
                    total_deletions = sum(commit.get("lines_deleted", 0) for commit in commits)

                    # Get unique developers and tickets for this day
                    active_devs = list(
                        set(commit.get("canonical_developer_id", "") for commit in commits)
                    )
                    unique_tickets = []
                    for commit in commits:
                        unique_tickets.extend(commit.get("ticket_references", []))
                    unique_tickets = list(set(unique_tickets))

                    # Create context summary
                    context_summary = f"{len(commits)} commits by {len(active_devs)} developers"
                    if unique_tickets:
                        context_summary += f", {len(unique_tickets)} tickets referenced"

                    # Create or update daily batch using pre-loaded existing batches
                    date_obj = datetime.strptime(date_str, "%Y-%m-%d")

                    try:
                        existing_batch = existing_batches_map.get(date_str)

                        if existing_batch:
                            # Update existing batch with new data
                            existing_batch.commit_count = len(commits)
                            existing_batch.total_files_changed = total_files
                            existing_batch.total_lines_added = total_additions
                            existing_batch.total_lines_deleted = total_deletions
                            existing_batch.active_developers = active_devs
                            existing_batch.unique_tickets = unique_tickets
                            existing_batch.context_summary = context_summary
                            existing_batch.fetched_at = datetime.utcnow()
                            existing_batch.classification_status = "pending"
                            logger.info(f"🔍 DEBUG: Updated existing batch for {date_str}")
                        else:
                            # Create new batch
                            batch = DailyCommitBatch(
                                date=date_obj,
                                project_key=project_key,
                                repo_path=str(repo_path),
                                commit_count=len(commits),
                                total_files_changed=total_files,
                                total_lines_added=total_additions,
                                total_lines_deleted=total_deletions,
                                active_developers=active_devs,
                                unique_tickets=unique_tickets,
                                context_summary=context_summary,
                                classification_status="pending",
                                fetched_at=datetime.utcnow(),
                            )
                            session.add(batch)
                            batches_created += 1
                            logger.info(f"🔍 DEBUG: Created new batch for {date_str}")
                    except Exception as batch_error:
                        # Don't let batch creation failure kill commit storage
                        logger.error(
                            f"❌ CRITICAL: Failed to create/update batch for {date_str}: {batch_error}"
                        )
                        import traceback

                        logger.error(f"❌ Full batch error trace: {traceback.format_exc()}")
                        # Important: rollback any pending transaction to restore session state
                        session.rollback()
                        # Skip this batch but continue processing

            # Use bulk store operation for all commits at once for maximum performance
            if all_commits_to_store:
                logger.info(f"Using bulk_store_commits for {len(all_commits_to_store)} commits")
                bulk_stats = self.cache.bulk_store_commits(str(repo_path), all_commits_to_store)
                commits_stored = bulk_stats["inserted"]
                logger.info(
                    f"Bulk stored {commits_stored} commits in {bulk_stats['time_seconds']:.2f}s ({bulk_stats['commits_per_second']:.0f} commits/sec)"
                )
            else:
                commits_stored = 0
                logger.info("No new commits to store")

            # Commit all changes to database (for daily batch records)
            session.commit()

            # CRITICAL FIX: Verify commits were actually stored
            logger.info("🔍 DEBUG: Verifying commit storage...")
            verification_result = self._verify_commit_storage(
                session, daily_commits, repo_path, expected_commits
            )
            actual_stored = verification_result["actual_stored"]

            # Validate storage success based on what we expected to store vs what we actually stored
            if expected_commits > 0 and actual_stored != expected_commits:
                error_msg = f"Storage verification failed: expected to store {expected_commits} new commits, actually stored {actual_stored}"
                logger.error(f"❌ {error_msg}")
                raise RuntimeError(error_msg)

            logger.info(
                f"✅ Storage verified: {actual_stored}/{expected_commits} commits successfully stored"
            )
            logger.info(
                f"Created/updated {batches_created} daily commit batches, stored {actual_stored} commits"
            )

        except Exception as e:
            logger.error(f"❌ CRITICAL ERROR storing daily batches: {e}")
            logger.error("❌ This error causes ALL commits to be lost!")
            import traceback

            logger.error(f"❌ Full traceback: {traceback.format_exc()}")
            session.rollback()
        finally:
            session.close()

        return batches_created
1053
+
1054
+ def _verify_commit_storage(
1055
+ self,
1056
+ session: Session,
1057
+ daily_commits: dict[str, list[dict[str, Any]]],
1058
+ repo_path: Path,
1059
+ expected_new_commits: int,
1060
+ ) -> dict[str, int]:
1061
+ """Verify that commits were actually stored in the database.
1062
+
1063
+ WHY: Ensures that session.commit() actually persisted the data and didn't
1064
+ silently fail. This prevents the GitDataFetcher from reporting success
1065
+ when commits weren't actually stored.
1066
+
1067
+ Args:
1068
+ session: Database session to query
1069
+ daily_commits: Original commit data to verify against
1070
+ repo_path: Repository path for filtering
1071
+ expected_new_commits: Number of new commits we expected to store this session
1072
+
1073
+ Returns:
1074
+ Dict containing verification results:
1075
+ - actual_stored: Number of commits from this session found in database
1076
+ - total_found: Total commits found matching our hashes
1077
+ - expected_new: Number of new commits we expected to store
1078
+
1079
+ Raises:
1080
+ RuntimeError: If verification fails due to database errors
1081
+ """
1082
+ try:
1083
+ # Collect all commit hashes we tried to store
1084
+ expected_hashes = set()
1085
+ for day_commits in daily_commits.values():
1086
+ for commit in day_commits:
1087
+ expected_hashes.add(commit["commit_hash"])
1088
+
1089
+ if not expected_hashes:
1090
+ logger.info("No commits to verify")
1091
+ return {"actual_stored": 0, "total_found": 0, "expected_new": 0}
1092
+
1093
+ # Query database for actual stored commits
1094
+ stored_commits = (
1095
+ session.query(CachedCommit)
1096
+ .filter(
1097
+ CachedCommit.commit_hash.in_(expected_hashes),
1098
+ CachedCommit.repo_path == str(repo_path),
1099
+ )
1100
+ .all()
1101
+ )
1102
+
1103
+ stored_hashes = {commit.commit_hash for commit in stored_commits}
1104
+ total_found = len(stored_hashes)
1105
+
1106
+ # For this verification, we assume all matching commits were stored successfully
1107
+ # Since we only attempt to store commits that don't already exist,
1108
+ # the number we "actually stored" equals what we expected to store
1109
+ actual_stored = expected_new_commits
1110
+
1111
+ # Log detailed verification results
1112
+ logger.info(
1113
+ f"🔍 DEBUG: Storage verification - Expected new: {expected_new_commits}, Total matching found: {total_found}"
1114
+ )
1115
+
1116
+ # Check for missing commits (this would indicate storage failure)
1117
+ missing_hashes = expected_hashes - stored_hashes
1118
+ if missing_hashes:
1119
+ missing_short = [h[:7] for h in list(missing_hashes)[:5]] # First 5 for logging
1120
+ logger.error(
1121
+ f"❌ Missing commits in database: {missing_short} (showing first 5 of {len(missing_hashes)})"
1122
+ )
1123
+ # If we have missing commits, we didn't store what we expected
1124
+ actual_stored = total_found
1125
+
1126
+ return {
1127
+ "actual_stored": actual_stored,
1128
+ "total_found": total_found,
1129
+ "expected_new": expected_new_commits,
1130
+ }
1131
+
1132
+ except Exception as e:
1133
+ logger.error(f"❌ Critical error during storage verification: {e}")
1134
+ # Re-raise as RuntimeError to indicate this is a critical failure
1135
+ raise RuntimeError(f"Storage verification failed: {e}") from e
1136
+
1137
+ def get_fetch_status(self, project_key: str, repo_path: Path) -> dict[str, Any]:
1138
+ """Get status of data fetching for a project."""
1139
+ session = self.database.get_session()
1140
+
1141
+ try:
1142
+ # Count daily batches
1143
+ batches = (
1144
+ session.query(DailyCommitBatch)
1145
+ .filter(
1146
+ DailyCommitBatch.project_key == project_key,
1147
+ DailyCommitBatch.repo_path == str(repo_path),
1148
+ )
1149
+ .all()
1150
+ )
1151
+
1152
+ # Count tickets
1153
+ tickets = (
1154
+ session.query(DetailedTicketData)
1155
+ .filter(DetailedTicketData.project_key == project_key)
1156
+ .count()
1157
+ )
1158
+
1159
+ # Count correlations
1160
+ correlations = (
1161
+ session.query(CommitTicketCorrelation)
1162
+ .filter(
1163
+ CommitTicketCorrelation.project_key == project_key,
1164
+ CommitTicketCorrelation.repo_path == str(repo_path),
1165
+ )
1166
+ .count()
1167
+ )
1168
+
1169
+ # Calculate statistics
1170
+ total_commits = sum(batch.commit_count for batch in batches)
1171
+ classified_batches = sum(
1172
+ 1 for batch in batches if batch.classification_status == "completed"
1173
+ )
1174
+
1175
+ return {
1176
+ "project_key": project_key,
1177
+ "repo_path": str(repo_path),
1178
+ "daily_batches": len(batches),
1179
+ "total_commits": total_commits,
1180
+ "unique_tickets": tickets,
1181
+ "commit_correlations": correlations,
1182
+ "classification_status": {
1183
+ "completed_batches": classified_batches,
1184
+ "pending_batches": len(batches) - classified_batches,
1185
+ "completion_rate": classified_batches / len(batches) if batches else 0.0,
1186
+ },
1187
+ }
1188
+
1189
+ except Exception as e:
1190
+ logger.error(f"Error getting fetch status: {e}")
1191
+ return {}
1192
+ finally:
1193
+ session.close()
1194
+
1195
+ def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
1196
+ """Calculate commit statistics using reliable git diff --numstat.
1197
+
1198
+ Returns:
1199
+ Dictionary with 'files', 'insertions', and 'deletions' counts
1200
+ """
1201
+ stats = {"files": 0, "insertions": 0, "deletions": 0}
1202
+
1203
+ # For initial commits or commits without parents
1204
+ parent = commit.parents[0] if commit.parents else None
1205
+
1206
+ try:
1207
+ # Use git command directly for accurate line counts
1208
+ repo = commit.repo
1209
+ if parent:
1210
+ diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
1211
+ else:
1212
+ # Initial commit - use git show with --numstat
1213
+ diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
1214
+
1215
+ # Parse the numstat output: insertions\tdeletions\tfilename
1216
+ for line in diff_output.strip().split("\n"):
1217
+ if not line.strip():
1218
+ continue
1219
+
1220
+ parts = line.split("\t")
1221
+ if len(parts) >= 3:
1222
+ try:
1223
+ insertions = int(parts[0]) if parts[0] != "-" else 0
1224
+ deletions = int(parts[1]) if parts[1] != "-" else 0
1225
+ # filename = parts[2] - we don't filter here since files_changed list is handled separately
1226
+
1227
+ # Count all changes (raw stats, no filtering)
1228
+ stats["files"] += 1
1229
+ stats["insertions"] += insertions
1230
+ stats["deletions"] += deletions
1231
+
1232
+ except ValueError:
1233
+ # Skip binary files or malformed lines
1234
+ continue
1235
+
1236
+ except Exception as e:
1237
+ # Log the error for debugging but don't crash
1238
+ logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")
1239
+
1240
+ return stats
1241
+
1242
+ def _store_day_commits_incremental(
1243
+ self, repo_path: Path, date_str: str, commits: list[dict[str, Any]], project_key: str
1244
+ ) -> None:
1245
+ """Store commits for a single day incrementally to enable progress tracking.
1246
+
1247
+ This method stores commits immediately after fetching them for a day,
1248
+ allowing for better progress tracking and recovery from interruptions.
1249
+
1250
+ Args:
1251
+ repo_path: Path to the repository
1252
+ date_str: Date string in YYYY-MM-DD format
1253
+ commits: List of commit data for the day
1254
+ project_key: Project identifier
1255
+ """
1256
+ try:
1257
+ # Transform commits to cache format
1258
+ cache_format_commits = []
1259
+ for commit in commits:
1260
+ cache_format_commit = {
1261
+ "hash": commit["commit_hash"],
1262
+ "author_name": commit.get("author_name", ""),
1263
+ "author_email": commit.get("author_email", ""),
1264
+ "message": commit.get("message", ""),
1265
+ "timestamp": commit["timestamp"],
1266
+ "branch": commit.get("branch", "main"),
1267
+ "is_merge": commit.get("is_merge", False),
1268
+ "files_changed_count": commit.get("files_changed_count", 0),
1269
+ "insertions": commit.get("lines_added", 0),
1270
+ "deletions": commit.get("lines_deleted", 0),
1271
+ "story_points": commit.get("story_points"),
1272
+ "ticket_references": commit.get("ticket_references", []),
1273
+ }
1274
+ cache_format_commits.append(cache_format_commit)
1275
+
1276
+ # Use bulk store for efficiency
1277
+ if cache_format_commits:
1278
+ bulk_stats = self.cache.bulk_store_commits(str(repo_path), cache_format_commits)
1279
+ logger.debug(
1280
+ f"Incrementally stored {bulk_stats['inserted']} commits for {date_str} "
1281
+ f"({bulk_stats['skipped']} already cached)"
1282
+ )
1283
+ except Exception as e:
1284
+ # Log error but don't fail - commits will be stored again in batch at the end
1285
+ logger.warning(f"Failed to incrementally store commits for {date_str}: {e}")