gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1193 @@
1
+ """Data fetcher for collecting raw git commits and ticket data without classification.
2
+
3
+ This module implements the first step of the two-step fetch/analyze process,
4
+ focusing purely on data collection from Git repositories and ticket systems
5
+ without performing any LLM-based classification.
6
+ """
7
+
8
+ import contextlib
9
+ import logging
10
+ import os
11
+ import subprocess
12
+ from collections import defaultdict
13
+ from datetime import datetime, timedelta, timezone
14
+ from pathlib import Path
15
+ from typing import Any, Optional
16
+
17
+ from sqlalchemy.orm import Session
18
+
19
+ from ..extractors.story_points import StoryPointExtractor
20
+ from ..extractors.tickets import TicketExtractor
21
+ from ..integrations.jira_integration import JIRAIntegration
22
+ from ..models.database import (
23
+ CachedCommit,
24
+ CommitTicketCorrelation,
25
+ DailyCommitBatch,
26
+ DetailedTicketData,
27
+ )
28
+ from .branch_mapper import BranchToProjectMapper
29
+ from .cache import GitAnalysisCache
30
+ from .identity import DeveloperIdentityResolver
31
+ from .progress import get_progress_service
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ class GitDataFetcher:
37
+ """Fetches raw Git commit data and organizes it by day for efficient batch processing.
38
+
39
+ WHY: This class implements the first step of the two-step process by collecting
40
+ all raw data (commits, tickets, correlations) without performing classification.
41
+ This separation enables:
42
+ - Fast data collection without LLM costs
43
+ - Repeatable analysis runs without re-fetching
44
+ - Better batch organization for efficient LLM classification
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ cache: GitAnalysisCache,
50
+ branch_mapping_rules: Optional[dict[str, list[str]]] = None,
51
+ allowed_ticket_platforms: Optional[list[str]] = None,
52
+ exclude_paths: Optional[list[str]] = None,
53
+ ):
54
+ """Initialize the data fetcher.
55
+
56
+ Args:
57
+ cache: Git analysis cache instance
58
+ branch_mapping_rules: Rules for mapping branches to projects
59
+ allowed_ticket_platforms: List of allowed ticket platforms
60
+ exclude_paths: List of file paths to exclude from analysis
61
+ """
62
+ self.cache = cache
63
+ # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
64
+ self.database = cache.db
65
+ self.story_point_extractor = StoryPointExtractor()
66
+ self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
67
+ self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
68
+ self.exclude_paths = exclude_paths or []
69
+
70
+ # Initialize identity resolver
71
+ identity_db_path = cache.cache_dir / "identities.db"
72
+ self.identity_resolver = DeveloperIdentityResolver(identity_db_path)
73
+
74
+ def fetch_repository_data(
75
+ self,
76
+ repo_path: Path,
77
+ project_key: str,
78
+ weeks_back: int = 4,
79
+ branch_patterns: Optional[list[str]] = None,
80
+ jira_integration: Optional[JIRAIntegration] = None,
81
+ progress_callback: Optional[callable] = None,
82
+ start_date: Optional[datetime] = None,
83
+ end_date: Optional[datetime] = None,
84
+ ) -> dict[str, Any]:
85
+ """Fetch all data for a repository and organize by day.
86
+
87
+ This method collects:
88
+ 1. All commits organized by day
89
+ 2. All referenced tickets with full metadata
90
+ 3. Commit-ticket correlations
91
+ 4. Developer identity mappings
92
+
93
+ Args:
94
+ repo_path: Path to the Git repository
95
+ project_key: Project identifier
96
+ weeks_back: Number of weeks to analyze (used only if start_date/end_date not provided)
97
+ branch_patterns: Branch patterns to include
98
+ jira_integration: JIRA integration for ticket data
99
+ progress_callback: Optional callback for progress updates
100
+ start_date: Optional explicit start date (overrides weeks_back calculation)
101
+ end_date: Optional explicit end date (overrides weeks_back calculation)
102
+
103
+ Returns:
104
+ Dictionary containing fetch results and statistics
105
+ """
106
+ logger.info("🔍 DEBUG: ===== FETCH METHOD CALLED =====")
107
+ logger.info(f"Starting data fetch for project {project_key} at {repo_path}")
108
+ logger.info(f"🔍 DEBUG: weeks_back={weeks_back}, repo_path={repo_path}")
109
+
110
+ # Calculate date range - use explicit dates if provided, otherwise calculate from weeks_back
111
+ if start_date is not None and end_date is not None:
112
+ logger.info(f"🔍 DEBUG: Using explicit date range: {start_date} to {end_date}")
113
+ else:
114
+ end_date = datetime.now(timezone.utc)
115
+ start_date = end_date - timedelta(weeks=weeks_back)
116
+ logger.info(
117
+ f"🔍 DEBUG: Calculated date range from weeks_back: {start_date} to {end_date}"
118
+ )
119
+
120
+ # Step 1: Collect all commits organized by day
121
+ logger.info("🔍 DEBUG: About to fetch commits by day")
122
+ logger.info("Fetching commits organized by day...")
123
+ daily_commits = self._fetch_commits_by_day(
124
+ repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
125
+ )
126
+ logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
127
+
128
+ # Step 2: Extract and fetch all referenced tickets
129
+ logger.info("🔍 DEBUG: About to extract ticket references")
130
+ logger.info("Extracting ticket references...")
131
+ ticket_ids = self._extract_all_ticket_references(daily_commits)
132
+ logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")
133
+
134
+ if jira_integration and ticket_ids:
135
+ logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
136
+ self._fetch_detailed_tickets(
137
+ ticket_ids, jira_integration, project_key, progress_callback
138
+ )
139
+
140
+ # Step 3: Store commit-ticket correlations
141
+ logger.info("Building commit-ticket correlations...")
142
+ correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
143
+
144
+ # Step 4: Store daily commit batches
145
+ logger.info(
146
+ f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
147
+ )
148
+ logger.info("Storing daily commit batches...")
149
+ batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
150
+ logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
151
+
152
+ # CRITICAL FIX: Verify actual storage before reporting success
153
+ session = self.database.get_session()
154
+ try:
155
+ expected_commits = sum(len(commits) for commits in daily_commits.values())
156
+ verification_result = self._verify_commit_storage(
157
+ session, daily_commits, repo_path, expected_commits
158
+ )
159
+ actual_stored_commits = verification_result["total_found"]
160
+ except Exception as e:
161
+ logger.error(f"❌ Final storage verification failed: {e}")
162
+ # Don't let verification failure break the return, but log it clearly
163
+ actual_stored_commits = 0
164
+ finally:
165
+ session.close()
166
+
167
+ # Return summary statistics with ACTUAL stored counts
168
+ # expected_commits already calculated above
169
+
170
+ results = {
171
+ "project_key": project_key,
172
+ "repo_path": str(repo_path),
173
+ "date_range": {"start": start_date, "end": end_date},
174
+ "stats": {
175
+ "total_commits": expected_commits, # What we tried to store
176
+ "stored_commits": actual_stored_commits, # What was actually stored
177
+ "storage_success": actual_stored_commits == expected_commits,
178
+ "days_with_commits": len(daily_commits),
179
+ "unique_tickets": len(ticket_ids),
180
+ "correlations_created": correlations_created,
181
+ "batches_created": batches_created,
182
+ },
183
+ "daily_commits": daily_commits, # For immediate use if needed
184
+ }
185
+
186
+ # Log with actual storage results
187
+ if actual_stored_commits == expected_commits:
188
+ logger.info(
189
+ f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
190
+ )
191
+ else:
192
+ logger.error(
193
+ f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
194
+ )
195
+
196
+ return results
197
+
198
+ def _fetch_commits_by_day(
199
+ self,
200
+ repo_path: Path,
201
+ project_key: str,
202
+ start_date: datetime,
203
+ end_date: datetime,
204
+ branch_patterns: Optional[list[str]],
205
+ progress_callback: Optional[callable] = None,
206
+ ) -> dict[str, list[dict[str, Any]]]:
207
+ """Fetch all commits organized by day with full metadata.
208
+
209
+ Returns:
210
+ Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
211
+ """
212
+ from git import Repo
213
+
214
+ try:
215
+ repo = Repo(repo_path)
216
+ # Update repository from remote before analysis
217
+ self._update_repository(repo)
218
+ except Exception as e:
219
+ logger.error(f"Failed to open repository at {repo_path}: {e}")
220
+ return {}
221
+
222
+ # Collect commits from all relevant branches
223
+ all_commits = []
224
+ branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)
225
+
226
+ if not branches_to_analyze:
227
+ logger.warning(f"No accessible branches found in repository {repo_path}")
228
+ return {}
229
+
230
+ logger.info(f"Analyzing branches: {branches_to_analyze}")
231
+
232
+ for branch_name in branches_to_analyze:
233
+ try:
234
+ branch_commits = list(
235
+ repo.iter_commits(branch_name, since=start_date, until=end_date, reverse=False)
236
+ )
237
+
238
+ logger.debug(
239
+ f"Found {len(branch_commits)} commits in branch {branch_name} for date range"
240
+ )
241
+
242
+ for commit in branch_commits:
243
+ # Include merge commits like the original analyzer
244
+ # The original analyzer marks merge commits with is_merge=True but doesn't skip them
245
+
246
+ # Extract commit data with full metadata
247
+ commit_data = self._extract_commit_data(
248
+ commit, branch_name, project_key, repo_path
249
+ )
250
+ if commit_data:
251
+ all_commits.append(commit_data)
252
+
253
+ except Exception as e:
254
+ logger.warning(f"Error processing branch {branch_name}: {e}")
255
+ continue
256
+
257
+ # Deduplicate commits (same commit may appear in multiple branches)
258
+ seen_hashes = set()
259
+ unique_commits = []
260
+ for commit_data in all_commits:
261
+ commit_hash = commit_data["commit_hash"]
262
+ if commit_hash not in seen_hashes:
263
+ seen_hashes.add(commit_hash)
264
+ unique_commits.append(commit_data)
265
+
266
+ # Organize commits by day
267
+ daily_commits = defaultdict(list)
268
+ for commit_data in unique_commits:
269
+ # Convert timestamp to date key
270
+ commit_date = commit_data["timestamp"].date()
271
+ date_key = commit_date.strftime("%Y-%m-%d")
272
+ daily_commits[date_key].append(commit_data)
273
+
274
+ # Sort commits within each day by timestamp
275
+ for date_key in daily_commits:
276
+ daily_commits[date_key].sort(key=lambda c: c["timestamp"])
277
+
278
+ logger.info(f"Collected {len(unique_commits)} commits across {len(daily_commits)} days")
279
+ return dict(daily_commits)
280
+
281
+ def _extract_commit_data(
282
+ self, commit: Any, branch_name: str, project_key: str, repo_path: Path
283
+ ) -> Optional[dict[str, Any]]:
284
+ """Extract comprehensive data from a Git commit.
285
+
286
+ Returns:
287
+ Dictionary containing all commit metadata needed for classification
288
+ """
289
+ try:
290
+ # Basic commit information
291
+ commit_data = {
292
+ "commit_hash": commit.hexsha,
293
+ "commit_hash_short": commit.hexsha[:7],
294
+ "message": commit.message.strip(),
295
+ "author_name": commit.author.name,
296
+ "author_email": commit.author.email,
297
+ "timestamp": datetime.fromtimestamp(commit.committed_date, tz=timezone.utc),
298
+ "branch": branch_name,
299
+ "project_key": project_key,
300
+ "repo_path": str(repo_path),
301
+ "is_merge": len(commit.parents) > 1, # Match the original analyzer behavior
302
+ }
303
+
304
+ # Calculate file changes
305
+ try:
306
+ # Compare with first parent or empty tree for initial commit
307
+ diff = commit.parents[0].diff(commit) if commit.parents else commit.diff(None)
308
+
309
+ # Get file paths with filtering
310
+ files_changed = []
311
+ for diff_item in diff:
312
+ file_path = diff_item.a_path or diff_item.b_path
313
+ if file_path and not self._should_exclude_file(file_path):
314
+ files_changed.append(file_path)
315
+
316
+ # Use reliable git numstat command for accurate line counts
317
+ line_stats = self._calculate_commit_stats(commit)
318
+ total_insertions = line_stats["insertions"]
319
+ total_deletions = line_stats["deletions"]
320
+
321
+ commit_data.update(
322
+ {
323
+ "files_changed": files_changed,
324
+ "files_changed_count": len(files_changed),
325
+ "lines_added": total_insertions,
326
+ "lines_deleted": total_deletions,
327
+ }
328
+ )
329
+
330
+ except Exception as e:
331
+ logger.debug(f"Error calculating changes for commit {commit.hexsha}: {e}")
332
+ commit_data.update(
333
+ {
334
+ "files_changed": [],
335
+ "files_changed_count": 0,
336
+ "lines_added": 0,
337
+ "lines_deleted": 0,
338
+ }
339
+ )
340
+
341
+ # Extract story points
342
+ story_points = self.story_point_extractor.extract_from_text(commit_data["message"])
343
+ commit_data["story_points"] = story_points
344
+
345
+ # Extract ticket references
346
+ ticket_refs_data = self.ticket_extractor.extract_from_text(commit_data["message"])
347
+ # Convert to list of ticket IDs for compatibility
348
+ # Fix: Use 'id' field instead of 'ticket_id' field from extractor output
349
+ ticket_refs = [ref_data["id"] for ref_data in ticket_refs_data]
350
+ commit_data["ticket_references"] = ticket_refs
351
+
352
+ # Resolve developer identity
353
+ canonical_id = self.identity_resolver.resolve_developer(
354
+ commit_data["author_name"], commit_data["author_email"]
355
+ )
356
+ commit_data["canonical_developer_id"] = canonical_id
357
+
358
+ return commit_data
359
+
360
+ except Exception as e:
361
+ logger.error(f"Error extracting data for commit {commit.hexsha}: {e}")
362
+ return None
363
+
364
+ def _should_exclude_file(self, file_path: str) -> bool:
365
+ """Check if a file should be excluded based on exclude patterns."""
366
+ import fnmatch
367
+
368
+ return any(fnmatch.fnmatch(file_path, pattern) for pattern in self.exclude_paths)
369
+
370
+ def _get_branches_to_analyze(
371
+ self, repo: Any, branch_patterns: Optional[list[str]]
372
+ ) -> list[str]:
373
+ """Get list of branches to analyze based on patterns.
374
+
375
+ WHY: Robust branch detection that handles missing remotes, missing default branches,
376
+ and provides good fallback behavior. Based on the approach used in the existing analyzer.
377
+
378
+ DESIGN DECISION:
379
+ - Try default branches first, fall back to all available branches
380
+ - Handle missing remotes gracefully
381
+ - Skip remote tracking branches to avoid duplicates
382
+ - Use actual branch existence checking rather than assuming branches exist
383
+ """
384
+ if not branch_patterns:
385
+ # Get all available branches (local branches preferred)
386
+ available_branches = []
387
+
388
+ # First, try local branches
389
+ try:
390
+ local_branches = [branch.name for branch in repo.branches]
391
+ available_branches.extend(local_branches)
392
+ logger.debug(f"Found local branches: {local_branches}")
393
+ except Exception as e:
394
+ logger.debug(f"Error getting local branches: {e}")
395
+
396
+ # If we have remotes, also consider remote branches (but clean the names)
397
+ try:
398
+ if repo.remotes and hasattr(repo.remotes, "origin"):
399
+ remote_branches = [
400
+ ref.name.replace("origin/", "")
401
+ for ref in repo.remotes.origin.refs
402
+ if not ref.name.endswith("HEAD") # Skip HEAD ref
403
+ ]
404
+ # Only add remote branches that aren't already in local branches
405
+ for branch in remote_branches:
406
+ if branch not in available_branches:
407
+ available_branches.append(branch)
408
+ logger.debug(f"Found remote branches: {remote_branches}")
409
+ except Exception as e:
410
+ logger.debug(f"Error getting remote branches: {e}")
411
+
412
+ # If no branches found, fallback to trying common names directly
413
+ if not available_branches:
414
+ logger.warning(
415
+ "No branches found via normal detection, falling back to common names"
416
+ )
417
+ available_branches = ["main", "master", "develop", "dev"]
418
+
419
+ # Try default main branches first, in order of preference
420
+ main_branches = ["main", "master", "develop", "dev"]
421
+ for branch in main_branches:
422
+ if branch in available_branches:
423
+ # Test that we can actually access this branch
424
+ try:
425
+ # Just try to get the commit object to verify branch exists and is accessible
426
+ next(iter(repo.iter_commits(branch, max_count=1)), None)
427
+ logger.info(f"Using main branch: {branch}")
428
+ return [branch]
429
+ except Exception as e:
430
+ logger.debug(f"Branch {branch} exists but not accessible: {e}")
431
+ continue
432
+
433
+ # If no main branches work, try the first available branch that actually works
434
+ for branch in available_branches:
435
+ try:
436
+ next(iter(repo.iter_commits(branch, max_count=1)), None)
437
+ logger.info(f"Using fallback branch: {branch}")
438
+ return [branch]
439
+ except Exception as e:
440
+ logger.debug(f"Branch {branch} not accessible: {e}")
441
+ continue
442
+
443
+ # Last resort: return empty list (will be handled gracefully by caller)
444
+ logger.warning("No accessible branches found")
445
+ return []
446
+
447
+ # Use specified patterns - match against all available branches
448
+ import fnmatch
449
+
450
+ available_branches = []
451
+
452
+ # Collect all branches (local and remote)
453
+ with contextlib.suppress(Exception):
454
+ available_branches.extend([branch.name for branch in repo.branches])
455
+
456
+ try:
457
+ if repo.remotes and hasattr(repo.remotes, "origin"):
458
+ remote_branches = [
459
+ ref.name.replace("origin/", "")
460
+ for ref in repo.remotes.origin.refs
461
+ if not ref.name.endswith("HEAD")
462
+ ]
463
+ for branch in remote_branches:
464
+ if branch not in available_branches:
465
+ available_branches.append(branch)
466
+ except Exception:
467
+ pass
468
+
469
+ # Match patterns against available branches
470
+ matching_branches = []
471
+ for pattern in branch_patterns:
472
+ matching = [branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)]
473
+ matching_branches.extend(matching)
474
+
475
+ # Test that matched branches are actually accessible
476
+ accessible_branches = []
477
+ for branch in list(set(matching_branches)): # Remove duplicates
478
+ try:
479
+ next(iter(repo.iter_commits(branch, max_count=1)), None)
480
+ accessible_branches.append(branch)
481
+ except Exception as e:
482
+ logger.debug(f"Matched branch {branch} not accessible: {e}")
483
+
484
+ return accessible_branches
485
+
486
+ def _update_repository(self, repo) -> bool:
487
+ """Update repository from remote before analysis.
488
+
489
+ WHY: This ensures we have the latest commits from the remote repository
490
+ before performing analysis. Critical for getting accurate data especially
491
+ when analyzing repositories that are actively being developed.
492
+
493
+ DESIGN DECISION: Uses fetch() for all cases, then pull() only when on a
494
+ tracking branch that's not in detached HEAD state. This approach:
495
+ - Handles detached HEAD states gracefully (common in CI/CD)
496
+ - Always gets latest refs from remote via fetch
497
+ - Only attempts pull when it's safe to do so
498
+ - Continues analysis even if update fails (logs warning)
499
+
500
+ Args:
501
+ repo: GitPython Repo object
502
+
503
+ Returns:
504
+ bool: True if update succeeded, False if failed (but analysis continues)
505
+ """
506
+ try:
507
+ if repo.remotes:
508
+ logger.info("Fetching latest changes from remote")
509
+
510
+ # Use subprocess for git fetch to prevent password prompts
511
+ env = os.environ.copy()
512
+ env["GIT_TERMINAL_PROMPT"] = "0"
513
+ env["GIT_ASKPASS"] = ""
514
+ env["GCM_INTERACTIVE"] = "never"
515
+
516
+ # Run git fetch with timeout
517
+ try:
518
+ result = subprocess.run(
519
+ ["git", "fetch", "--all", "--config", "credential.helper="],
520
+ cwd=repo.working_dir,
521
+ env=env,
522
+ capture_output=True,
523
+ text=True,
524
+ timeout=30, # 30 second timeout
525
+ )
526
+
527
+ if result.returncode != 0:
528
+ error_str = result.stderr.lower()
529
+ if any(
530
+ x in error_str
531
+ for x in ["authentication", "permission denied", "401", "403"]
532
+ ):
533
+ logger.error(
534
+ "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
535
+ "You can verify with: gh auth status"
536
+ )
537
+ # Skip fetch but continue with local analysis
538
+ return False
539
+ logger.warning(f"Git fetch failed: {result.stderr}")
540
+ return False
541
+
542
+ except subprocess.TimeoutExpired:
543
+ logger.error(
544
+ "Git fetch timed out (likely authentication failure). Continuing with local repository state."
545
+ )
546
+ return False
547
+
548
+ # Only try to pull if not in detached HEAD state
549
+ if not repo.head.is_detached:
550
+ current_branch = repo.active_branch
551
+ tracking = current_branch.tracking_branch()
552
+ if tracking:
553
+ # Pull latest changes using subprocess
554
+ try:
555
+ result = subprocess.run(
556
+ ["git", "pull", "--config", "credential.helper="],
557
+ cwd=repo.working_dir,
558
+ env=env,
559
+ capture_output=True,
560
+ text=True,
561
+ timeout=30,
562
+ )
563
+
564
+ if result.returncode != 0:
565
+ error_str = result.stderr.lower()
566
+ if any(
567
+ x in error_str
568
+ for x in ["authentication", "permission denied", "401", "403"]
569
+ ):
570
+ logger.error(
571
+ "GitHub authentication failed during pull. Continuing with local repository state."
572
+ )
573
+ return False
574
+ logger.warning(f"Git pull failed: {result.stderr}")
575
+ return False
576
+ logger.debug(f"Pulled latest changes for {current_branch.name}")
577
+
578
+ except subprocess.TimeoutExpired:
579
+ logger.error(
580
+ "Git pull timed out. Continuing with local repository state."
581
+ )
582
+ return False
583
+ else:
584
+ logger.debug(
585
+ f"Branch {current_branch.name} has no tracking branch, skipping pull"
586
+ )
587
+ else:
588
+ logger.debug("Repository in detached HEAD state, skipping pull")
589
+ return True
590
+ else:
591
+ logger.debug("No remotes configured, skipping repository update")
592
+ return True
593
+ except Exception as e:
594
+ logger.warning(f"Could not update repository: {e}")
595
+ # Continue with analysis using local state
596
+ return False
597
+
598
+ def _extract_all_ticket_references(
599
+ self, daily_commits: dict[str, list[dict[str, Any]]]
600
+ ) -> set[str]:
601
+ """Extract all unique ticket IDs from commits."""
602
+ ticket_ids = set()
603
+
604
+ for day_commits in daily_commits.values():
605
+ for commit in day_commits:
606
+ ticket_refs = commit.get("ticket_references", [])
607
+ ticket_ids.update(ticket_refs)
608
+
609
+ logger.info(f"Found {len(ticket_ids)} unique ticket references")
610
+ return ticket_ids
611
+
612
    def _fetch_detailed_tickets(
        self,
        ticket_ids: set[str],
        jira_integration: JIRAIntegration,
        project_key: str,
        progress_callback: Optional[callable] = None,
    ) -> None:
        """Fetch detailed ticket information from JIRA and store it in the database.

        Tickets already cached in DetailedTicketData for the "jira" platform are
        skipped; only the remainder is fetched, in batches of 50, with one
        database commit per batch so earlier batches survive a late failure.
        Individual fetch failures are logged and skipped (best-effort).

        Args:
            ticket_ids: Unique ticket identifiers referenced by commits.
            jira_integration: JIRA client used to retrieve issue details.
            project_key: Project key stored on each created ticket record.
            progress_callback: Optional callable invoked with a status string
                after each ticket is processed.
        """
        session = self.database.get_session()

        try:
            # Check which tickets we already have cached for this platform
            existing_tickets = (
                session.query(DetailedTicketData)
                .filter(
                    DetailedTicketData.ticket_id.in_(ticket_ids),
                    DetailedTicketData.platform == "jira",
                )
                .all()
            )

            existing_ids = {ticket.ticket_id for ticket in existing_tickets}
            tickets_to_fetch = ticket_ids - existing_ids

            if not tickets_to_fetch:
                logger.info("All tickets already cached")
                return

            logger.info(f"Fetching {len(tickets_to_fetch)} new tickets")

            # Fetch tickets in batches so each batch can be committed separately
            batch_size = 50
            tickets_list = list(tickets_to_fetch)

            # Use centralized progress service
            progress = get_progress_service()

            with progress.progress(
                total=len(tickets_list), description="Fetching tickets", unit="tickets"
            ) as ctx:
                for i in range(0, len(tickets_list), batch_size):
                    batch = tickets_list[i : i + batch_size]

                    for ticket_id in batch:
                        try:
                            # Fetch ticket from JIRA
                            issue_data = jira_integration.get_issue(ticket_id)

                            if issue_data:
                                # Create detailed ticket record
                                detailed_ticket = self._create_detailed_ticket_record(
                                    issue_data, project_key, "jira"
                                )
                                session.add(detailed_ticket)

                        except Exception as e:
                            # A single bad ticket must not abort the whole run
                            logger.warning(f"Failed to fetch ticket {ticket_id}: {e}")

                        progress.update(ctx, 1)

                        if progress_callback:
                            progress_callback(f"Fetched ticket {ticket_id}")

                    # Commit batch to database
                    session.commit()

            # NOTE(review): this counts attempted tickets, not confirmed successes
            logger.info(f"Successfully fetched {len(tickets_to_fetch)} tickets")

        except Exception as e:
            logger.error(f"Error fetching detailed tickets: {e}")
            session.rollback()
        finally:
            session.close()
685
+
686
+ def _create_detailed_ticket_record(
687
+ self, issue_data: dict[str, Any], project_key: str, platform: str
688
+ ) -> DetailedTicketData:
689
+ """Create a detailed ticket record from JIRA issue data."""
690
+ # Extract classification hints from issue type and labels
691
+ classification_hints = []
692
+
693
+ issue_type = issue_data.get("issue_type", "").lower()
694
+ if "bug" in issue_type or "defect" in issue_type:
695
+ classification_hints.append("bug_fix")
696
+ elif "story" in issue_type or "feature" in issue_type:
697
+ classification_hints.append("feature")
698
+ elif "task" in issue_type:
699
+ classification_hints.append("maintenance")
700
+
701
+ # Extract business domain from labels or summary
702
+ business_domain = None
703
+ labels = issue_data.get("labels", [])
704
+ for label in labels:
705
+ if any(keyword in label.lower() for keyword in ["frontend", "backend", "ui", "api"]):
706
+ business_domain = label.lower()
707
+ break
708
+
709
+ # Create the record
710
+ return DetailedTicketData(
711
+ platform=platform,
712
+ ticket_id=issue_data["key"],
713
+ project_key=project_key,
714
+ title=issue_data.get("summary", ""),
715
+ description=issue_data.get("description", ""),
716
+ summary=issue_data.get("summary", "")[:500], # Truncated summary
717
+ ticket_type=issue_data.get("issue_type", ""),
718
+ status=issue_data.get("status", ""),
719
+ priority=issue_data.get("priority", ""),
720
+ labels=labels,
721
+ assignee=issue_data.get("assignee", ""),
722
+ reporter=issue_data.get("reporter", ""),
723
+ created_at=issue_data.get("created"),
724
+ updated_at=issue_data.get("updated"),
725
+ resolved_at=issue_data.get("resolved"),
726
+ story_points=issue_data.get("story_points"),
727
+ classification_hints=classification_hints,
728
+ business_domain=business_domain,
729
+ platform_data=issue_data, # Store full JIRA data
730
+ )
731
+
732
+ def _build_commit_ticket_correlations(
733
+ self, daily_commits: dict[str, list[dict[str, Any]]], repo_path: Path
734
+ ) -> int:
735
+ """Build and store commit-ticket correlations."""
736
+ session = self.database.get_session()
737
+ correlations_created = 0
738
+
739
+ try:
740
+ for day_commits in daily_commits.values():
741
+ for commit in day_commits:
742
+ commit_hash = commit["commit_hash"]
743
+ ticket_refs = commit.get("ticket_references", [])
744
+
745
+ for ticket_id in ticket_refs:
746
+ try:
747
+ # Create correlation record
748
+ correlation = CommitTicketCorrelation(
749
+ commit_hash=commit_hash,
750
+ repo_path=str(repo_path),
751
+ ticket_id=ticket_id,
752
+ platform="jira", # Assuming JIRA for now
753
+ project_key=commit["project_key"],
754
+ correlation_type="direct",
755
+ confidence=1.0,
756
+ extracted_from="commit_message",
757
+ matching_pattern=None, # Could add pattern detection
758
+ )
759
+
760
+ # Check if correlation already exists
761
+ existing = (
762
+ session.query(CommitTicketCorrelation)
763
+ .filter(
764
+ CommitTicketCorrelation.commit_hash == commit_hash,
765
+ CommitTicketCorrelation.repo_path == str(repo_path),
766
+ CommitTicketCorrelation.ticket_id == ticket_id,
767
+ CommitTicketCorrelation.platform == "jira",
768
+ )
769
+ .first()
770
+ )
771
+
772
+ if not existing:
773
+ session.add(correlation)
774
+ correlations_created += 1
775
+
776
+ except Exception as e:
777
+ logger.warning(
778
+ f"Failed to create correlation for {commit_hash}-{ticket_id}: {e}"
779
+ )
780
+
781
+ session.commit()
782
+ logger.info(f"Created {correlations_created} commit-ticket correlations")
783
+
784
+ except Exception as e:
785
+ logger.error(f"Error building correlations: {e}")
786
+ session.rollback()
787
+ finally:
788
+ session.close()
789
+
790
+ return correlations_created
791
+
792
    def _store_daily_batches(
        self, daily_commits: dict[str, list[dict[str, Any]]], repo_path: Path, project_key: str
    ) -> int:
        """Store daily commit batches for efficient retrieval using bulk operations.

        WHY: Enhanced to use bulk operations from the cache layer for significantly
        better performance when storing large numbers of commits.

        Flow: pre-load existing DailyCommitBatch rows (to avoid unique-constraint
        violations), bulk-check which commits already exist via the cache,
        create/update one batch row per date under no_autoflush, bulk-store all
        new commits through the cache, commit, then verify the commits actually
        landed in the database.

        Args:
            daily_commits: Mapping of "YYYY-MM-DD" date string -> list of commit dicts.
            repo_path: Repository the commits came from (scopes batch rows).
            project_key: Project key used to scope DailyCommitBatch rows.

        Returns:
            Number of DailyCommitBatch rows newly created (updates of existing
            batches are not counted). On a storage error the transaction is
            rolled back and the returned count may not reflect persisted rows.

        Raises:
            Nothing directly; the internal RuntimeError from failed verification
            is caught by the outer handler, logged, and rolled back.
        """
        session = self.database.get_session()
        batches_created = 0
        commits_stored = 0
        expected_commits = 0  # count of commits we actually attempt to store

        try:
            total_commits = sum(len(commits) for commits in daily_commits.values())
            logger.info(f"🔍 DEBUG: Storing {total_commits} commits from {len(daily_commits)} days")
            logger.info(
                f"🔍 DEBUG: Daily commits keys: {list(daily_commits.keys())[:5]}"
            )  # First 5 dates

            # Track dates to process for efficient batch handling
            dates_to_process = [
                datetime.strptime(date_str, "%Y-%m-%d").date() for date_str in daily_commits
            ]
            logger.info(f"🔍 DEBUG: Processing {len(dates_to_process)} dates for daily batches")

            # Pre-load existing batches to avoid constraint violations during processing
            existing_batches_map = {}
            for date_str in daily_commits:
                # Convert to datetime instead of date to match database storage
                date_obj = datetime.strptime(date_str, "%Y-%m-%d")
                existing_batch = (
                    session.query(DailyCommitBatch)
                    .filter(
                        DailyCommitBatch.date == date_obj,
                        DailyCommitBatch.project_key == project_key,
                        DailyCommitBatch.repo_path == str(repo_path),
                    )
                    .first()
                )
                if existing_batch:
                    existing_batches_map[date_str] = existing_batch
                    logger.info(
                        f"🔍 DEBUG: Found existing batch for {date_str}: ID={existing_batch.id}"
                    )
                else:
                    logger.info(f"🔍 DEBUG: No existing batch found for {date_str}")

            # Collect all commits for bulk operations
            all_commits_to_store = []
            commit_hashes_to_check = []

            # First, collect all commit hashes to check existence in bulk
            for _date_str, commits in daily_commits.items():
                for commit in commits:
                    commit_hashes_to_check.append(commit["commit_hash"])

            # Use bulk_exists to check which commits already exist
            existing_commits_map = self.cache.bulk_exists(str(repo_path), commit_hashes_to_check)

            # Disable autoflush to prevent premature batch creation during commit storage
            with session.no_autoflush:
                for date_str, commits in daily_commits.items():
                    if not commits:
                        continue

                    logger.info(f"🔍 DEBUG: Processing {len(commits)} commits for {date_str}")

                    # Prepare commits for bulk storage
                    commits_to_store_this_date = []
                    for commit in commits:
                        # Check if commit already exists using bulk check results
                        if not existing_commits_map.get(commit["commit_hash"], False):
                            # Transform commit data to cache format
                            cache_format_commit = {
                                "hash": commit["commit_hash"],
                                "author_name": commit.get("author_name", ""),
                                "author_email": commit.get("author_email", ""),
                                "message": commit.get("message", ""),
                                "timestamp": commit["timestamp"],
                                "branch": commit.get("branch", "main"),
                                "is_merge": commit.get("is_merge", False),
                                "files_changed_count": commit.get("files_changed_count", 0),
                                "insertions": commit.get("lines_added", 0),
                                "deletions": commit.get("lines_deleted", 0),
                                "story_points": commit.get("story_points"),
                                "ticket_references": commit.get("ticket_references", []),
                            }
                            commits_to_store_this_date.append(cache_format_commit)
                            all_commits_to_store.append(cache_format_commit)
                            expected_commits += 1
                        else:
                            logger.debug(
                                f"Commit {commit['commit_hash'][:7]} already exists in database"
                            )

                    logger.info(
                        f"🔍 DEBUG: Prepared {len(commits_to_store_this_date)} new commits for {date_str}"
                    )

                    # Calculate batch statistics
                    # NOTE: totals cover ALL commits for the day, including ones
                    # already stored, so batch rows reflect the full day
                    total_files = sum(commit.get("files_changed_count", 0) for commit in commits)
                    total_additions = sum(commit.get("lines_added", 0) for commit in commits)
                    total_deletions = sum(commit.get("lines_deleted", 0) for commit in commits)

                    # Get unique developers and tickets for this day
                    active_devs = list(
                        set(commit.get("canonical_developer_id", "") for commit in commits)
                    )
                    unique_tickets = []
                    for commit in commits:
                        unique_tickets.extend(commit.get("ticket_references", []))
                    unique_tickets = list(set(unique_tickets))

                    # Create context summary
                    context_summary = f"{len(commits)} commits by {len(active_devs)} developers"
                    if unique_tickets:
                        context_summary += f", {len(unique_tickets)} tickets referenced"

                    # Create or update daily batch using pre-loaded existing batches
                    date_obj = datetime.strptime(date_str, "%Y-%m-%d")

                    try:
                        existing_batch = existing_batches_map.get(date_str)

                        if existing_batch:
                            # Update existing batch with new data
                            existing_batch.commit_count = len(commits)
                            existing_batch.total_files_changed = total_files
                            existing_batch.total_lines_added = total_additions
                            existing_batch.total_lines_deleted = total_deletions
                            existing_batch.active_developers = active_devs
                            existing_batch.unique_tickets = unique_tickets
                            existing_batch.context_summary = context_summary
                            existing_batch.fetched_at = datetime.utcnow()
                            existing_batch.classification_status = "pending"
                            logger.info(f"🔍 DEBUG: Updated existing batch for {date_str}")
                        else:
                            # Create new batch
                            batch = DailyCommitBatch(
                                date=date_obj,
                                project_key=project_key,
                                repo_path=str(repo_path),
                                commit_count=len(commits),
                                total_files_changed=total_files,
                                total_lines_added=total_additions,
                                total_lines_deleted=total_deletions,
                                active_developers=active_devs,
                                unique_tickets=unique_tickets,
                                context_summary=context_summary,
                                classification_status="pending",
                                fetched_at=datetime.utcnow(),
                            )
                            session.add(batch)
                            batches_created += 1
                            logger.info(f"🔍 DEBUG: Created new batch for {date_str}")
                    except Exception as batch_error:
                        # Don't let batch creation failure kill commit storage
                        logger.error(
                            f"❌ CRITICAL: Failed to create/update batch for {date_str}: {batch_error}"
                        )
                        import traceback

                        logger.error(f"❌ Full batch error trace: {traceback.format_exc()}")
                        # Important: rollback any pending transaction to restore session state
                        session.rollback()
                        # Skip this batch but continue processing

            # Use bulk store operation for all commits at once for maximum performance
            if all_commits_to_store:
                logger.info(f"Using bulk_store_commits for {len(all_commits_to_store)} commits")
                bulk_stats = self.cache.bulk_store_commits(str(repo_path), all_commits_to_store)
                commits_stored = bulk_stats["inserted"]
                logger.info(
                    f"Bulk stored {commits_stored} commits in {bulk_stats['time_seconds']:.2f}s ({bulk_stats['commits_per_second']:.0f} commits/sec)"
                )
            else:
                commits_stored = 0
                logger.info("No new commits to store")

            # Commit all changes to database (for daily batch records)
            session.commit()

            # CRITICAL FIX: Verify commits were actually stored
            logger.info("🔍 DEBUG: Verifying commit storage...")
            verification_result = self._verify_commit_storage(
                session, daily_commits, repo_path, expected_commits
            )
            actual_stored = verification_result["actual_stored"]

            # Validate storage success based on what we expected to store vs what we actually stored
            if expected_commits > 0 and actual_stored != expected_commits:
                error_msg = f"Storage verification failed: expected to store {expected_commits} new commits, actually stored {actual_stored}"
                logger.error(f"❌ {error_msg}")
                raise RuntimeError(error_msg)

            logger.info(
                f"✅ Storage verified: {actual_stored}/{expected_commits} commits successfully stored"
            )
            logger.info(
                f"Created/updated {batches_created} daily commit batches, stored {actual_stored} commits"
            )

        except Exception as e:
            logger.error(f"❌ CRITICAL ERROR storing daily batches: {e}")
            logger.error("❌ This error causes ALL commits to be lost!")
            import traceback

            logger.error(f"❌ Full traceback: {traceback.format_exc()}")
            session.rollback()
        finally:
            session.close()

        return batches_created
1006
+
1007
    def _verify_commit_storage(
        self,
        session: Session,
        daily_commits: dict[str, list[dict[str, Any]]],
        repo_path: Path,
        expected_new_commits: int,
    ) -> dict[str, int]:
        """Verify that commits were actually stored in the database.

        WHY: Ensures that session.commit() actually persisted the data and didn't
        silently fail. This prevents the GitDataFetcher from reporting success
        when commits weren't actually stored.

        Args:
            session: Database session to query
            daily_commits: Original commit data to verify against
            repo_path: Repository path for filtering
            expected_new_commits: Number of new commits we expected to store this session

        Returns:
            Dict containing verification results:
            - actual_stored: Number of commits from this session found in database
            - total_found: Total commits found matching our hashes
            - expected_new: Number of new commits we expected to store

        Raises:
            RuntimeError: If verification fails due to database errors
        """
        try:
            # Collect all commit hashes we tried to store
            expected_hashes = set()
            for day_commits in daily_commits.values():
                for commit in day_commits:
                    expected_hashes.add(commit["commit_hash"])

            if not expected_hashes:
                logger.info("No commits to verify")
                return {"actual_stored": 0, "total_found": 0, "expected_new": 0}

            # Query database for actual stored commits
            stored_commits = (
                session.query(CachedCommit)
                .filter(
                    CachedCommit.commit_hash.in_(expected_hashes),
                    CachedCommit.repo_path == str(repo_path),
                )
                .all()
            )

            stored_hashes = {commit.commit_hash for commit in stored_commits}
            total_found = len(stored_hashes)

            # For this verification, we assume all matching commits were stored successfully
            # Since we only attempt to store commits that don't already exist,
            # the number we "actually stored" equals what we expected to store
            # NOTE(review): this optimistically trusts the bulk store; it is
            # only corrected below when hashes are missing from the database
            actual_stored = expected_new_commits

            # Log detailed verification results
            logger.info(
                f"🔍 DEBUG: Storage verification - Expected new: {expected_new_commits}, Total matching found: {total_found}"
            )

            # Check for missing commits (this would indicate storage failure)
            missing_hashes = expected_hashes - stored_hashes
            if missing_hashes:
                missing_short = [h[:7] for h in list(missing_hashes)[:5]]  # First 5 for logging
                logger.error(
                    f"❌ Missing commits in database: {missing_short} (showing first 5 of {len(missing_hashes)})"
                )
                # If we have missing commits, we didn't store what we expected
                actual_stored = total_found

            return {
                "actual_stored": actual_stored,
                "total_found": total_found,
                "expected_new": expected_new_commits,
            }

        except Exception as e:
            logger.error(f"❌ Critical error during storage verification: {e}")
            # Re-raise as RuntimeError to indicate this is a critical failure
            raise RuntimeError(f"Storage verification failed: {e}") from e
1089
+
1090
+ def get_fetch_status(self, project_key: str, repo_path: Path) -> dict[str, Any]:
1091
+ """Get status of data fetching for a project."""
1092
+ session = self.database.get_session()
1093
+
1094
+ try:
1095
+ # Count daily batches
1096
+ batches = (
1097
+ session.query(DailyCommitBatch)
1098
+ .filter(
1099
+ DailyCommitBatch.project_key == project_key,
1100
+ DailyCommitBatch.repo_path == str(repo_path),
1101
+ )
1102
+ .all()
1103
+ )
1104
+
1105
+ # Count tickets
1106
+ tickets = (
1107
+ session.query(DetailedTicketData)
1108
+ .filter(DetailedTicketData.project_key == project_key)
1109
+ .count()
1110
+ )
1111
+
1112
+ # Count correlations
1113
+ correlations = (
1114
+ session.query(CommitTicketCorrelation)
1115
+ .filter(
1116
+ CommitTicketCorrelation.project_key == project_key,
1117
+ CommitTicketCorrelation.repo_path == str(repo_path),
1118
+ )
1119
+ .count()
1120
+ )
1121
+
1122
+ # Calculate statistics
1123
+ total_commits = sum(batch.commit_count for batch in batches)
1124
+ classified_batches = sum(
1125
+ 1 for batch in batches if batch.classification_status == "completed"
1126
+ )
1127
+
1128
+ return {
1129
+ "project_key": project_key,
1130
+ "repo_path": str(repo_path),
1131
+ "daily_batches": len(batches),
1132
+ "total_commits": total_commits,
1133
+ "unique_tickets": tickets,
1134
+ "commit_correlations": correlations,
1135
+ "classification_status": {
1136
+ "completed_batches": classified_batches,
1137
+ "pending_batches": len(batches) - classified_batches,
1138
+ "completion_rate": classified_batches / len(batches) if batches else 0.0,
1139
+ },
1140
+ }
1141
+
1142
+ except Exception as e:
1143
+ logger.error(f"Error getting fetch status: {e}")
1144
+ return {}
1145
+ finally:
1146
+ session.close()
1147
+
1148
+ def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
1149
+ """Calculate commit statistics using reliable git diff --numstat.
1150
+
1151
+ Returns:
1152
+ Dictionary with 'files', 'insertions', and 'deletions' counts
1153
+ """
1154
+ stats = {"files": 0, "insertions": 0, "deletions": 0}
1155
+
1156
+ # For initial commits or commits without parents
1157
+ parent = commit.parents[0] if commit.parents else None
1158
+
1159
+ try:
1160
+ # Use git command directly for accurate line counts
1161
+ repo = commit.repo
1162
+ if parent:
1163
+ diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
1164
+ else:
1165
+ # Initial commit - use git show with --numstat
1166
+ diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
1167
+
1168
+ # Parse the numstat output: insertions\tdeletions\tfilename
1169
+ for line in diff_output.strip().split("\n"):
1170
+ if not line.strip():
1171
+ continue
1172
+
1173
+ parts = line.split("\t")
1174
+ if len(parts) >= 3:
1175
+ try:
1176
+ insertions = int(parts[0]) if parts[0] != "-" else 0
1177
+ deletions = int(parts[1]) if parts[1] != "-" else 0
1178
+ # filename = parts[2] - we don't filter here since files_changed list is handled separately
1179
+
1180
+ # Count all changes (raw stats, no filtering)
1181
+ stats["files"] += 1
1182
+ stats["insertions"] += insertions
1183
+ stats["deletions"] += deletions
1184
+
1185
+ except ValueError:
1186
+ # Skip binary files or malformed lines
1187
+ continue
1188
+
1189
+ except Exception as e:
1190
+ # Log the error for debugging but don't crash
1191
+ logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")
1192
+
1193
+ return stats