gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4108 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/data_fetcher.py (new file)
@@ -0,0 +1,1193 @@
"""Data fetcher for collecting raw git commits and ticket data without classification.

This module implements the first step of the two-step fetch/analyze process,
focusing purely on data collection from Git repositories and ticket systems
without performing any LLM-based classification.
"""

import contextlib
import logging
import os
import subprocess
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Optional

from sqlalchemy.orm import Session

from ..extractors.story_points import StoryPointExtractor
from ..extractors.tickets import TicketExtractor
from ..integrations.jira_integration import JIRAIntegration
from ..models.database import (
    CachedCommit,
    CommitTicketCorrelation,
    DailyCommitBatch,
    DetailedTicketData,
)
from .branch_mapper import BranchToProjectMapper
from .cache import GitAnalysisCache
from .identity import DeveloperIdentityResolver
from .progress import get_progress_service

logger = logging.getLogger(__name__)


class GitDataFetcher:
    """Fetches raw Git commit data and organizes it by day for efficient batch processing.

    WHY: This class implements the first step of the two-step process by collecting
    all raw data (commits, tickets, correlations) without performing classification.
    This separation enables:
    - Fast data collection without LLM costs
    - Repeatable analysis runs without re-fetching
    - Better batch organization for efficient LLM classification
    """

    def __init__(
        self,
        cache: GitAnalysisCache,
        branch_mapping_rules: Optional[dict[str, list[str]]] = None,
        allowed_ticket_platforms: Optional[list[str]] = None,
        exclude_paths: Optional[list[str]] = None,
    ):
        """Initialize the data fetcher.

        Args:
            cache: Git analysis cache instance
            branch_mapping_rules: Rules for mapping branches to projects
            allowed_ticket_platforms: List of allowed ticket platforms
            exclude_paths: List of file paths to exclude from analysis
        """
        self.cache = cache
        # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
        self.database = cache.db
        self.story_point_extractor = StoryPointExtractor()
        self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
        self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
        self.exclude_paths = exclude_paths or []

        # Initialize identity resolver
        identity_db_path = cache.cache_dir / "identities.db"
        self.identity_resolver = DeveloperIdentityResolver(identity_db_path)

    def fetch_repository_data(
        self,
        repo_path: Path,
        project_key: str,
        weeks_back: int = 4,
        branch_patterns: Optional[list[str]] = None,
        jira_integration: Optional[JIRAIntegration] = None,
        progress_callback: Optional[callable] = None,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
    ) -> dict[str, Any]:
        """Fetch all data for a repository and organize by day.

        This method collects:
        1. All commits organized by day
        2. All referenced tickets with full metadata
        3. Commit-ticket correlations
        4. Developer identity mappings

        Args:
            repo_path: Path to the Git repository
            project_key: Project identifier
            weeks_back: Number of weeks to analyze (used only if start_date/end_date not provided)
            branch_patterns: Branch patterns to include
            jira_integration: JIRA integration for ticket data
            progress_callback: Optional callback for progress updates
            start_date: Optional explicit start date (overrides weeks_back calculation)
            end_date: Optional explicit end date (overrides weeks_back calculation)

        Returns:
            Dictionary containing fetch results and statistics
        """
        logger.info("🔍 DEBUG: ===== FETCH METHOD CALLED =====")
        logger.info(f"Starting data fetch for project {project_key} at {repo_path}")
        logger.info(f"🔍 DEBUG: weeks_back={weeks_back}, repo_path={repo_path}")

        # Calculate date range - use explicit dates if provided, otherwise calculate from weeks_back
        if start_date is not None and end_date is not None:
            logger.info(f"🔍 DEBUG: Using explicit date range: {start_date} to {end_date}")
        else:
            end_date = datetime.now(timezone.utc)
            start_date = end_date - timedelta(weeks=weeks_back)
            logger.info(
                f"🔍 DEBUG: Calculated date range from weeks_back: {start_date} to {end_date}"
            )

        # Step 1: Collect all commits organized by day
        logger.info("🔍 DEBUG: About to fetch commits by day")
        logger.info("Fetching commits organized by day...")
        daily_commits = self._fetch_commits_by_day(
            repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
        )
        logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")

        # Step 2: Extract and fetch all referenced tickets
        logger.info("🔍 DEBUG: About to extract ticket references")
        logger.info("Extracting ticket references...")
        ticket_ids = self._extract_all_ticket_references(daily_commits)
        logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")

        if jira_integration and ticket_ids:
            logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
            self._fetch_detailed_tickets(
                ticket_ids, jira_integration, project_key, progress_callback
            )

        # Step 3: Store commit-ticket correlations
        logger.info("Building commit-ticket correlations...")
        correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)

        # Step 4: Store daily commit batches
        logger.info(
            f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
        )
        logger.info("Storing daily commit batches...")
        batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
        logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")

        # CRITICAL FIX: Verify actual storage before reporting success
        session = self.database.get_session()
        try:
            expected_commits = sum(len(commits) for commits in daily_commits.values())
            verification_result = self._verify_commit_storage(
                session, daily_commits, repo_path, expected_commits
            )
            actual_stored_commits = verification_result["total_found"]
        except Exception as e:
            logger.error(f"❌ Final storage verification failed: {e}")
            # Don't let verification failure break the return, but log it clearly
            actual_stored_commits = 0
        finally:
            session.close()

        # Return summary statistics with ACTUAL stored counts
        # expected_commits already calculated above

        results = {
            "project_key": project_key,
            "repo_path": str(repo_path),
            "date_range": {"start": start_date, "end": end_date},
            "stats": {
                "total_commits": expected_commits,  # What we tried to store
                "stored_commits": actual_stored_commits,  # What was actually stored
                "storage_success": actual_stored_commits == expected_commits,
                "days_with_commits": len(daily_commits),
                "unique_tickets": len(ticket_ids),
                "correlations_created": correlations_created,
                "batches_created": batches_created,
            },
            "daily_commits": daily_commits,  # For immediate use if needed
        }

        # Log with actual storage results
        if actual_stored_commits == expected_commits:
            logger.info(
                f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
            )
        else:
            logger.error(
                f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
            )

        return results

    def _fetch_commits_by_day(
        self,
        repo_path: Path,
        project_key: str,
        start_date: datetime,
        end_date: datetime,
        branch_patterns: Optional[list[str]],
        progress_callback: Optional[callable] = None,
    ) -> dict[str, list[dict[str, Any]]]:
        """Fetch all commits organized by day with full metadata.

        Returns:
            Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
        """
        from git import Repo

        try:
            repo = Repo(repo_path)
            # Update repository from remote before analysis
            self._update_repository(repo)
        except Exception as e:
            logger.error(f"Failed to open repository at {repo_path}: {e}")
            return {}

        # Collect commits from all relevant branches
        all_commits = []
        branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)

        if not branches_to_analyze:
            logger.warning(f"No accessible branches found in repository {repo_path}")
            return {}

        logger.info(f"Analyzing branches: {branches_to_analyze}")

        for branch_name in branches_to_analyze:
            try:
                branch_commits = list(
                    repo.iter_commits(branch_name, since=start_date, until=end_date, reverse=False)
                )

                logger.debug(
                    f"Found {len(branch_commits)} commits in branch {branch_name} for date range"
                )

                for commit in branch_commits:
                    # Include merge commits like the original analyzer
                    # The original analyzer marks merge commits with is_merge=True but doesn't skip them

                    # Extract commit data with full metadata
                    commit_data = self._extract_commit_data(
                        commit, branch_name, project_key, repo_path
                    )
                    if commit_data:
                        all_commits.append(commit_data)

            except Exception as e:
                logger.warning(f"Error processing branch {branch_name}: {e}")
                continue

        # Deduplicate commits (same commit may appear in multiple branches)
        seen_hashes = set()
        unique_commits = []
        for commit_data in all_commits:
            commit_hash = commit_data["commit_hash"]
            if commit_hash not in seen_hashes:
                seen_hashes.add(commit_hash)
                unique_commits.append(commit_data)

        # Organize commits by day
        daily_commits = defaultdict(list)
        for commit_data in unique_commits:
            # Convert timestamp to date key
            commit_date = commit_data["timestamp"].date()
            date_key = commit_date.strftime("%Y-%m-%d")
            daily_commits[date_key].append(commit_data)

        # Sort commits within each day by timestamp
        for date_key in daily_commits:
            daily_commits[date_key].sort(key=lambda c: c["timestamp"])

        logger.info(f"Collected {len(unique_commits)} commits across {len(daily_commits)} days")
        return dict(daily_commits)

    def _extract_commit_data(
        self, commit: Any, branch_name: str, project_key: str, repo_path: Path
    ) -> Optional[dict[str, Any]]:
        """Extract comprehensive data from a Git commit.

        Returns:
            Dictionary containing all commit metadata needed for classification
        """
        try:
            # Basic commit information
            commit_data = {
                "commit_hash": commit.hexsha,
                "commit_hash_short": commit.hexsha[:7],
                "message": commit.message.strip(),
                "author_name": commit.author.name,
                "author_email": commit.author.email,
                "timestamp": datetime.fromtimestamp(commit.committed_date, tz=timezone.utc),
                "branch": branch_name,
                "project_key": project_key,
                "repo_path": str(repo_path),
                "is_merge": len(commit.parents) > 1,  # Match the original analyzer behavior
            }

            # Calculate file changes
            try:
                # Compare with first parent or empty tree for initial commit
                diff = commit.parents[0].diff(commit) if commit.parents else commit.diff(None)

                # Get file paths with filtering
                files_changed = []
                for diff_item in diff:
                    file_path = diff_item.a_path or diff_item.b_path
                    if file_path and not self._should_exclude_file(file_path):
                        files_changed.append(file_path)

                # Use reliable git numstat command for accurate line counts
                line_stats = self._calculate_commit_stats(commit)
                total_insertions = line_stats["insertions"]
                total_deletions = line_stats["deletions"]

                commit_data.update(
                    {
                        "files_changed": files_changed,
                        "files_changed_count": len(files_changed),
                        "lines_added": total_insertions,
                        "lines_deleted": total_deletions,
                    }
                )

            except Exception as e:
                logger.debug(f"Error calculating changes for commit {commit.hexsha}: {e}")
                commit_data.update(
                    {
                        "files_changed": [],
                        "files_changed_count": 0,
                        "lines_added": 0,
                        "lines_deleted": 0,
                    }
                )

            # Extract story points
            story_points = self.story_point_extractor.extract_from_text(commit_data["message"])
            commit_data["story_points"] = story_points

            # Extract ticket references
            ticket_refs_data = self.ticket_extractor.extract_from_text(commit_data["message"])
            # Convert to list of ticket IDs for compatibility
            # Fix: Use 'id' field instead of 'ticket_id' field from extractor output
            ticket_refs = [ref_data["id"] for ref_data in ticket_refs_data]
            commit_data["ticket_references"] = ticket_refs

            # Resolve developer identity
            canonical_id = self.identity_resolver.resolve_developer(
                commit_data["author_name"], commit_data["author_email"]
            )
            commit_data["canonical_developer_id"] = canonical_id

            return commit_data

        except Exception as e:
            logger.error(f"Error extracting data for commit {commit.hexsha}: {e}")
            return None

    def _should_exclude_file(self, file_path: str) -> bool:
        """Check if a file should be excluded based on exclude patterns."""
        import fnmatch

        return any(fnmatch.fnmatch(file_path, pattern) for pattern in self.exclude_paths)

    def _get_branches_to_analyze(
        self, repo: Any, branch_patterns: Optional[list[str]]
    ) -> list[str]:
        """Get list of branches to analyze based on patterns.

        WHY: Robust branch detection that handles missing remotes, missing default branches,
        and provides good fallback behavior. Based on the approach used in the existing analyzer.

        DESIGN DECISION:
        - Try default branches first, fall back to all available branches
        - Handle missing remotes gracefully
        - Skip remote tracking branches to avoid duplicates
        - Use actual branch existence checking rather than assuming branches exist
        """
        if not branch_patterns:
            # Get all available branches (local branches preferred)
            available_branches = []

            # First, try local branches
            try:
                local_branches = [branch.name for branch in repo.branches]
                available_branches.extend(local_branches)
                logger.debug(f"Found local branches: {local_branches}")
            except Exception as e:
                logger.debug(f"Error getting local branches: {e}")

            # If we have remotes, also consider remote branches (but clean the names)
            try:
                if repo.remotes and hasattr(repo.remotes, "origin"):
                    remote_branches = [
                        ref.name.replace("origin/", "")
                        for ref in repo.remotes.origin.refs
                        if not ref.name.endswith("HEAD")  # Skip HEAD ref
                    ]
                    # Only add remote branches that aren't already in local branches
                    for branch in remote_branches:
                        if branch not in available_branches:
                            available_branches.append(branch)
                    logger.debug(f"Found remote branches: {remote_branches}")
            except Exception as e:
                logger.debug(f"Error getting remote branches: {e}")

            # If no branches found, fallback to trying common names directly
            if not available_branches:
                logger.warning(
                    "No branches found via normal detection, falling back to common names"
                )
                available_branches = ["main", "master", "develop", "dev"]

            # Try default main branches first, in order of preference
            main_branches = ["main", "master", "develop", "dev"]
            for branch in main_branches:
                if branch in available_branches:
                    # Test that we can actually access this branch
                    try:
                        # Just try to get the commit object to verify branch exists and is accessible
                        next(iter(repo.iter_commits(branch, max_count=1)), None)
                        logger.info(f"Using main branch: {branch}")
                        return [branch]
                    except Exception as e:
                        logger.debug(f"Branch {branch} exists but not accessible: {e}")
                        continue

            # If no main branches work, try the first available branch that actually works
            for branch in available_branches:
                try:
                    next(iter(repo.iter_commits(branch, max_count=1)), None)
                    logger.info(f"Using fallback branch: {branch}")
                    return [branch]
                except Exception as e:
                    logger.debug(f"Branch {branch} not accessible: {e}")
                    continue

            # Last resort: return empty list (will be handled gracefully by caller)
            logger.warning("No accessible branches found")
            return []

        # Use specified patterns - match against all available branches
        import fnmatch

        available_branches = []

        # Collect all branches (local and remote)
        with contextlib.suppress(Exception):
            available_branches.extend([branch.name for branch in repo.branches])

        try:
            if repo.remotes and hasattr(repo.remotes, "origin"):
                remote_branches = [
                    ref.name.replace("origin/", "")
                    for ref in repo.remotes.origin.refs
                    if not ref.name.endswith("HEAD")
                ]
                for branch in remote_branches:
                    if branch not in available_branches:
                        available_branches.append(branch)
        except Exception:
            pass

        # Match patterns against available branches
        matching_branches = []
        for pattern in branch_patterns:
            matching = [branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)]
            matching_branches.extend(matching)

        # Test that matched branches are actually accessible
        accessible_branches = []
        for branch in list(set(matching_branches)):  # Remove duplicates
            try:
                next(iter(repo.iter_commits(branch, max_count=1)), None)
                accessible_branches.append(branch)
            except Exception as e:
                logger.debug(f"Matched branch {branch} not accessible: {e}")

        return accessible_branches

    def _update_repository(self, repo) -> bool:
        """Update repository from remote before analysis.

        WHY: This ensures we have the latest commits from the remote repository
        before performing analysis. Critical for getting accurate data especially
        when analyzing repositories that are actively being developed.

        DESIGN DECISION: Uses fetch() for all cases, then pull() only when on a
        tracking branch that's not in detached HEAD state. This approach:
        - Handles detached HEAD states gracefully (common in CI/CD)
        - Always gets latest refs from remote via fetch
        - Only attempts pull when it's safe to do so
        - Continues analysis even if update fails (logs warning)

        Args:
            repo: GitPython Repo object

        Returns:
            bool: True if update succeeded, False if failed (but analysis continues)
        """
        try:
            if repo.remotes:
                logger.info("Fetching latest changes from remote")

                # Use subprocess for git fetch to prevent password prompts
                env = os.environ.copy()
                env["GIT_TERMINAL_PROMPT"] = "0"
                env["GIT_ASKPASS"] = ""
                env["GCM_INTERACTIVE"] = "never"

                # Run git fetch with timeout
                try:
                    result = subprocess.run(
                        ["git", "fetch", "--all", "--config", "credential.helper="],
                        cwd=repo.working_dir,
                        env=env,
                        capture_output=True,
                        text=True,
                        timeout=30,  # 30 second timeout
                    )

                    if result.returncode != 0:
                        error_str = result.stderr.lower()
                        if any(
                            x in error_str
                            for x in ["authentication", "permission denied", "401", "403"]
                        ):
                            logger.error(
                                "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
                                "You can verify with: gh auth status"
                            )
                            # Skip fetch but continue with local analysis
                            return False
                        logger.warning(f"Git fetch failed: {result.stderr}")
                        return False

                except subprocess.TimeoutExpired:
                    logger.error(
                        "Git fetch timed out (likely authentication failure). Continuing with local repository state."
                    )
                    return False

                # Only try to pull if not in detached HEAD state
                if not repo.head.is_detached:
                    current_branch = repo.active_branch
                    tracking = current_branch.tracking_branch()
                    if tracking:
                        # Pull latest changes using subprocess
                        try:
                            result = subprocess.run(
                                ["git", "pull", "--config", "credential.helper="],
                                cwd=repo.working_dir,
                                env=env,
                                capture_output=True,
                                text=True,
                                timeout=30,
                            )

                            if result.returncode != 0:
                                error_str = result.stderr.lower()
                                if any(
                                    x in error_str
                                    for x in ["authentication", "permission denied", "401", "403"]
                                ):
                                    logger.error(
                                        "GitHub authentication failed during pull. Continuing with local repository state."
                                    )
                                    return False
                                logger.warning(f"Git pull failed: {result.stderr}")
                                return False
                            logger.debug(f"Pulled latest changes for {current_branch.name}")

                        except subprocess.TimeoutExpired:
                            logger.error(
                                "Git pull timed out. Continuing with local repository state."
                            )
                            return False
                    else:
                        logger.debug(
                            f"Branch {current_branch.name} has no tracking branch, skipping pull"
                        )
                else:
                    logger.debug("Repository in detached HEAD state, skipping pull")
                return True
            else:
                logger.debug("No remotes configured, skipping repository update")
                return True
        except Exception as e:
            logger.warning(f"Could not update repository: {e}")
            # Continue with analysis using local state
            return False

    def _extract_all_ticket_references(
        self, daily_commits: dict[str, list[dict[str, Any]]]
    ) -> set[str]:
        """Extract all unique ticket IDs from commits."""
        ticket_ids = set()

        for day_commits in daily_commits.values():
            for commit in day_commits:
                ticket_refs = commit.get("ticket_references", [])
                ticket_ids.update(ticket_refs)

        logger.info(f"Found {len(ticket_ids)} unique ticket references")
        return ticket_ids

    def _fetch_detailed_tickets(
        self,
        ticket_ids: set[str],
        jira_integration: JIRAIntegration,
        project_key: str,
        progress_callback: Optional[callable] = None,
    ) -> None:
        """Fetch detailed ticket information and store in database."""
        session = self.database.get_session()

        try:
            # Check which tickets we already have
            existing_tickets = (
                session.query(DetailedTicketData)
                .filter(
                    DetailedTicketData.ticket_id.in_(ticket_ids),
                    DetailedTicketData.platform == "jira",
                )
                .all()
            )

            existing_ids = {ticket.ticket_id for ticket in existing_tickets}
            tickets_to_fetch = ticket_ids - existing_ids

            if not tickets_to_fetch:
                logger.info("All tickets already cached")
                return

            logger.info(f"Fetching {len(tickets_to_fetch)} new tickets")

            # Fetch tickets in batches
            batch_size = 50
            tickets_list = list(tickets_to_fetch)

            # Use centralized progress service
            progress = get_progress_service()

            with progress.progress(
                total=len(tickets_list), description="Fetching tickets", unit="tickets"
            ) as ctx:
                for i in range(0, len(tickets_list), batch_size):
                    batch = tickets_list[i : i + batch_size]

                    for ticket_id in batch:
                        try:
                            # Fetch ticket from JIRA
                            issue_data = jira_integration.get_issue(ticket_id)

                            if issue_data:
                                # Create detailed ticket record
                                detailed_ticket = self._create_detailed_ticket_record(
                                    issue_data, project_key, "jira"
                                )
                                session.add(detailed_ticket)

                        except Exception as e:
                            logger.warning(f"Failed to fetch ticket {ticket_id}: {e}")

                        progress.update(ctx, 1)

                        if progress_callback:
                            progress_callback(f"Fetched ticket {ticket_id}")

                    # Commit batch to database
                    session.commit()

            logger.info(f"Successfully fetched {len(tickets_to_fetch)} tickets")

        except Exception as e:
            logger.error(f"Error fetching detailed tickets: {e}")
            session.rollback()
        finally:
            session.close()

    def _create_detailed_ticket_record(
        self, issue_data: dict[str, Any], project_key: str, platform: str
    ) -> DetailedTicketData:
        """Create a detailed ticket record from JIRA issue data."""
        # Extract classification hints from issue type and labels
        classification_hints = []

        issue_type = issue_data.get("issue_type", "").lower()
        if "bug" in issue_type or "defect" in issue_type:
            classification_hints.append("bug_fix")
        elif "story" in issue_type or "feature" in issue_type:
            classification_hints.append("feature")
        elif "task" in issue_type:
            classification_hints.append("maintenance")

        # Extract business domain from labels or summary
        business_domain = None
        labels = issue_data.get("labels", [])
        for label in labels:
            if any(keyword in label.lower() for keyword in ["frontend", "backend", "ui", "api"]):
                business_domain = label.lower()
                break

        # Create the record
        return DetailedTicketData(
            platform=platform,
            ticket_id=issue_data["key"],
            project_key=project_key,
            title=issue_data.get("summary", ""),
            description=issue_data.get("description", ""),
            summary=issue_data.get("summary", "")[:500],  # Truncated summary
            ticket_type=issue_data.get("issue_type", ""),
            status=issue_data.get("status", ""),
            priority=issue_data.get("priority", ""),
            labels=labels,
            assignee=issue_data.get("assignee", ""),
            reporter=issue_data.get("reporter", ""),
            created_at=issue_data.get("created"),
            updated_at=issue_data.get("updated"),
            resolved_at=issue_data.get("resolved"),
            story_points=issue_data.get("story_points"),
            classification_hints=classification_hints,
            business_domain=business_domain,
            platform_data=issue_data,  # Store full JIRA data
        )

    def _build_commit_ticket_correlations(
        self, daily_commits: dict[str, list[dict[str, Any]]], repo_path: Path
    ) -> int:
        """Build and store commit-ticket correlations."""
        session = self.database.get_session()
        correlations_created = 0

        try:
            for day_commits in daily_commits.values():
                for commit in day_commits:
                    commit_hash = commit["commit_hash"]
                    ticket_refs = commit.get("ticket_references", [])

                    for ticket_id in ticket_refs:
                        try:
                            # Create correlation record
                            correlation = CommitTicketCorrelation(
                                commit_hash=commit_hash,
                                repo_path=str(repo_path),
                                ticket_id=ticket_id,
                                platform="jira",  # Assuming JIRA for now
                                project_key=commit["project_key"],
                                correlation_type="direct",
                                confidence=1.0,
                                extracted_from="commit_message",
                                matching_pattern=None,  # Could add pattern detection
                            )

                            # Check if correlation already exists
                            existing = (
                                session.query(CommitTicketCorrelation)
                                .filter(
                                    CommitTicketCorrelation.commit_hash == commit_hash,
                                    CommitTicketCorrelation.repo_path == str(repo_path),
                                    CommitTicketCorrelation.ticket_id == ticket_id,
                                    CommitTicketCorrelation.platform == "jira",
                                )
                                .first()
                            )

                            if not existing:
                                session.add(correlation)
                                correlations_created += 1

                        except Exception as e:
                            logger.warning(
                                f"Failed to create correlation for {commit_hash}-{ticket_id}: {e}"
                            )

            session.commit()
            logger.info(f"Created {correlations_created} commit-ticket correlations")

        except Exception as e:
            logger.error(f"Error building correlations: {e}")
            session.rollback()
        finally:
            session.close()

        return correlations_created

    def _store_daily_batches(
        self, daily_commits: dict[str, list[dict[str, Any]]], repo_path: Path, project_key: str
    ) -> int:
        """Store daily commit batches for efficient retrieval using bulk operations.

        WHY: Enhanced to use bulk operations from the cache layer for significantly
        better performance when storing large numbers of commits.
        """
        session = self.database.get_session()
        batches_created = 0
        commits_stored = 0
        expected_commits = 0

        try:
            total_commits = sum(len(commits) for commits in daily_commits.values())
            logger.info(f"🔍 DEBUG: Storing {total_commits} commits from {len(daily_commits)} days")
            logger.info(
                f"🔍 DEBUG: Daily commits keys: {list(daily_commits.keys())[:5]}"
            )  # First 5 dates

            # Track dates to process for efficient batch handling
            dates_to_process = [
                datetime.strptime(date_str, "%Y-%m-%d").date() for date_str in daily_commits
            ]
            logger.info(f"🔍 DEBUG: Processing {len(dates_to_process)} dates for daily batches")

            # Pre-load existing batches to avoid constraint violations during processing
            existing_batches_map = {}
            for date_str in daily_commits:
                # Convert to datetime instead of date to match database storage
                date_obj = datetime.strptime(date_str, "%Y-%m-%d")
                existing_batch = (
                    session.query(DailyCommitBatch)
                    .filter(
                        DailyCommitBatch.date == date_obj,
                        DailyCommitBatch.project_key == project_key,
                        DailyCommitBatch.repo_path == str(repo_path),
                    )
                    .first()
                )
                if existing_batch:
                    existing_batches_map[date_str] = existing_batch
                    logger.info(
                        f"🔍 DEBUG: Found existing batch for {date_str}: ID={existing_batch.id}"
                    )
                else:
                    logger.info(f"🔍 DEBUG: No existing batch found for {date_str}")

            # Collect all commits for bulk operations
            all_commits_to_store = []
            commit_hashes_to_check = []

            # First, collect all commit hashes to check existence in bulk
            for _date_str, commits in daily_commits.items():
                for commit in commits:
                    commit_hashes_to_check.append(commit["commit_hash"])

            # Use bulk_exists to check which commits already exist
            existing_commits_map = self.cache.bulk_exists(str(repo_path), commit_hashes_to_check)

            # Disable autoflush to prevent premature batch creation during commit storage
            with session.no_autoflush:
                for date_str, commits in daily_commits.items():
                    if not commits:
                        continue

                    logger.info(f"🔍 DEBUG: Processing {len(commits)} commits for {date_str}")

                    # Prepare commits for bulk storage
                    commits_to_store_this_date = []
                    for commit in commits:
                        # Check if commit already exists using bulk check results
                        if not existing_commits_map.get(commit["commit_hash"], False):
                            # Transform commit data to cache format
                            cache_format_commit = {
                                "hash": commit["commit_hash"],
                                "author_name": commit.get("author_name", ""),
                                "author_email": commit.get("author_email", ""),
                                "message": commit.get("message", ""),
                                "timestamp": commit["timestamp"],
                                "branch": commit.get("branch", "main"),
                                "is_merge": commit.get("is_merge", False),
                                "files_changed_count": commit.get("files_changed_count", 0),
                                "insertions": commit.get("lines_added", 0),
                                "deletions": commit.get("lines_deleted", 0),
                                "story_points": commit.get("story_points"),
                                "ticket_references": commit.get("ticket_references", []),
                            }
                            commits_to_store_this_date.append(cache_format_commit)
                            all_commits_to_store.append(cache_format_commit)
                            expected_commits += 1
                        else:
                            logger.debug(
                                f"Commit {commit['commit_hash'][:7]} already exists in database"
                            )

                    logger.info(
                        f"🔍 DEBUG: Prepared {len(commits_to_store_this_date)} new commits for {date_str}"
                    )

                    # Calculate batch statistics
                    total_files = sum(commit.get("files_changed_count", 0) for commit in commits)
                    total_additions = sum(commit.get("lines_added", 0) for commit in commits)
                    total_deletions = sum(commit.get("lines_deleted", 0) for commit in commits)

                    # Get unique developers and tickets for this day
                    active_devs = list(
                        set(commit.get("canonical_developer_id", "") for commit in commits)
                    )
                    unique_tickets = []
                    for commit in commits:
                        unique_tickets.extend(commit.get("ticket_references", []))
                    unique_tickets = list(set(unique_tickets))

                    # Create context summary
                    context_summary = f"{len(commits)} commits by {len(active_devs)} developers"
                    if unique_tickets:
                        context_summary += f", {len(unique_tickets)} tickets referenced"

                    # Create or update daily batch using pre-loaded existing batches
                    date_obj = datetime.strptime(date_str, "%Y-%m-%d")

                    try:
                        existing_batch = existing_batches_map.get(date_str)

                        if existing_batch:
                            # Update existing batch with new data
                            existing_batch.commit_count = len(commits)
                            existing_batch.total_files_changed = total_files
                            existing_batch.total_lines_added = total_additions
                            existing_batch.total_lines_deleted = total_deletions
                            existing_batch.active_developers = active_devs
                            existing_batch.unique_tickets = unique_tickets
                            existing_batch.context_summary = context_summary
                            existing_batch.fetched_at = datetime.utcnow()
                            existing_batch.classification_status = "pending"
                            logger.info(f"🔍 DEBUG: Updated existing batch for {date_str}")
                        else:
                            # Create new batch
                            batch = DailyCommitBatch(
                                date=date_obj,
                                project_key=project_key,
                                repo_path=str(repo_path),
                                commit_count=len(commits),
                                total_files_changed=total_files,
                                total_lines_added=total_additions,
                                total_lines_deleted=total_deletions,
                                active_developers=active_devs,
                                unique_tickets=unique_tickets,
                                context_summary=context_summary,
                                classification_status="pending",
                                fetched_at=datetime.utcnow(),
                            )
                            session.add(batch)
                            batches_created += 1
                            logger.info(f"🔍 DEBUG: Created new batch for {date_str}")
                    except Exception as batch_error:
                        # Don't let batch creation failure kill commit storage
                        logger.error(
                            f"❌ CRITICAL: Failed to create/update batch for {date_str}: {batch_error}"
                        )
                        import traceback

                        logger.error(f"❌ Full batch error trace: {traceback.format_exc()}")
                        # Important: rollback any pending transaction to restore session state
                        session.rollback()
                        # Skip this batch but continue processing

            # Use bulk store operation for all commits at once for maximum performance
            if all_commits_to_store:
                logger.info(f"Using bulk_store_commits for {len(all_commits_to_store)} commits")
                bulk_stats = self.cache.bulk_store_commits(str(repo_path), all_commits_to_store)
                commits_stored = bulk_stats["inserted"]
                logger.info(
                    f"Bulk stored {commits_stored} commits in {bulk_stats['time_seconds']:.2f}s ({bulk_stats['commits_per_second']:.0f} commits/sec)"
                )
            else:
                commits_stored = 0
                logger.info("No new commits to store")

            # Commit all changes to database (for daily batch records)
            session.commit()

            # CRITICAL FIX: Verify commits were actually stored
            logger.info("🔍 DEBUG: Verifying commit storage...")
            verification_result = self._verify_commit_storage(
                session, daily_commits, repo_path, expected_commits
            )
            actual_stored = verification_result["actual_stored"]

            # Validate storage success based on what we expected to store vs what we actually stored
            if expected_commits > 0 and actual_stored != expected_commits:
                error_msg = f"Storage verification failed: expected to store {expected_commits} new commits, actually stored {actual_stored}"
                logger.error(f"❌ {error_msg}")
                raise RuntimeError(error_msg)

            logger.info(
                f"✅ Storage verified: {actual_stored}/{expected_commits} commits successfully stored"
            )
            logger.info(
                f"Created/updated {batches_created} daily commit batches, stored {actual_stored} commits"
            )

        except Exception as e:
            logger.error(f"❌ CRITICAL ERROR storing daily batches: {e}")
            logger.error("❌ This error causes ALL commits to be lost!")
            import traceback

            logger.error(f"❌ Full traceback: {traceback.format_exc()}")
            session.rollback()
        finally:
            session.close()

        return batches_created

    def _verify_commit_storage(
        self,
        session: Session,
        daily_commits: dict[str, list[dict[str, Any]]],
        repo_path: Path,
        expected_new_commits: int,
    ) -> dict[str, int]:
        """Verify that commits were actually stored in the database.

        WHY: Ensures that session.commit() actually persisted the data and didn't
        silently fail. This prevents the GitDataFetcher from reporting success
        when commits weren't actually stored.

        Args:
            session: Database session to query
            daily_commits: Original commit data to verify against
            repo_path: Repository path for filtering
            expected_new_commits: Number of new commits we expected to store this session

        Returns:
            Dict containing verification results:
            - actual_stored: Number of commits from this session found in database
            - total_found: Total commits found matching our hashes
            - expected_new: Number of new commits we expected to store

        Raises:
            RuntimeError: If verification fails due to database errors
        """
        try:
            # Collect all commit hashes we tried to store
            expected_hashes = set()
            for day_commits in daily_commits.values():
                for commit in day_commits:
                    expected_hashes.add(commit["commit_hash"])

            if not expected_hashes:
                logger.info("No commits to verify")
                return {"actual_stored": 0, "total_found": 0, "expected_new": 0}

            # Query database for actual stored commits
            stored_commits = (
                session.query(CachedCommit)
                .filter(
                    CachedCommit.commit_hash.in_(expected_hashes),
                    CachedCommit.repo_path == str(repo_path),
                )
                .all()
            )

            stored_hashes = {commit.commit_hash for commit in stored_commits}
            total_found = len(stored_hashes)

            # For this verification, we assume all matching commits were stored successfully
            # Since we only attempt to store commits that don't already exist,
            # the number we "actually stored" equals what we expected to store
            actual_stored = expected_new_commits

            # Log detailed verification results
            logger.info(
                f"🔍 DEBUG: Storage verification - Expected new: {expected_new_commits}, Total matching found: {total_found}"
            )

            # Check for missing commits (this would indicate storage failure)
            missing_hashes = expected_hashes - stored_hashes
            if missing_hashes:
                missing_short = [h[:7] for h in list(missing_hashes)[:5]]  # First 5 for logging
                logger.error(
                    f"❌ Missing commits in database: {missing_short} (showing first 5 of {len(missing_hashes)})"
                )
                # If we have missing commits, we didn't store what we expected
                actual_stored = total_found

            return {
                "actual_stored": actual_stored,
                "total_found": total_found,
                "expected_new": expected_new_commits,
            }

        except Exception as e:
            logger.error(f"❌ Critical error during storage verification: {e}")
            # Re-raise as RuntimeError to indicate this is a critical failure
            raise RuntimeError(f"Storage verification failed: {e}") from e

    def get_fetch_status(self, project_key: str, repo_path: Path) -> dict[str, Any]:
        """Get status of data fetching for a project."""
        session = self.database.get_session()

        try:
            # Count daily batches
            batches = (
                session.query(DailyCommitBatch)
                .filter(
                    DailyCommitBatch.project_key == project_key,
                    DailyCommitBatch.repo_path == str(repo_path),
                )
                .all()
            )

            # Count tickets
            tickets = (
                session.query(DetailedTicketData)
                .filter(DetailedTicketData.project_key == project_key)
                .count()
            )

            # Count correlations
            correlations = (
                session.query(CommitTicketCorrelation)
                .filter(
                    CommitTicketCorrelation.project_key == project_key,
                    CommitTicketCorrelation.repo_path == str(repo_path),
                )
                .count()
            )

            # Calculate statistics
            total_commits = sum(batch.commit_count for batch in batches)
            classified_batches = sum(
                1 for batch in batches if batch.classification_status == "completed"
            )

            return {
                "project_key": project_key,
                "repo_path": str(repo_path),
                "daily_batches": len(batches),
                "total_commits": total_commits,
                "unique_tickets": tickets,
                "commit_correlations": correlations,
                "classification_status": {
                    "completed_batches": classified_batches,
                    "pending_batches": len(batches) - classified_batches,
                    "completion_rate": classified_batches / len(batches) if batches else 0.0,
                },
            }

        except Exception as e:
            logger.error(f"Error getting fetch status: {e}")
            return {}
        finally:
            session.close()

    def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
        """Calculate commit statistics using reliable git diff --numstat.

        Returns:
            Dictionary with 'files', 'insertions', and 'deletions' counts
        """
        stats = {"files": 0, "insertions": 0, "deletions": 0}

        # For initial commits or commits without parents
        parent = commit.parents[0] if commit.parents else None

        try:
            # Use git command directly for accurate line counts
            repo = commit.repo
            if parent:
                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
            else:
                # Initial commit - use git show with --numstat
                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")

            # Parse the numstat output: insertions\tdeletions\tfilename
            for line in diff_output.strip().split("\n"):
                if not line.strip():
                    continue

                parts = line.split("\t")
                if len(parts) >= 3:
                    try:
                        insertions = int(parts[0]) if parts[0] != "-" else 0
                        deletions = int(parts[1]) if parts[1] != "-" else 0
                        # filename = parts[2] - we don't filter here since files_changed list is handled separately

                        # Count all changes (raw stats, no filtering)
                        stats["files"] += 1
                        stats["insertions"] += insertions
                        stats["deletions"] += deletions

                    except ValueError:
                        # Skip binary files or malformed lines
                        continue

        except Exception as e:
            # Log the error for debugging but don't crash
            logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")

        return stats