gitflow-analytics 1.3.6__py3-none-any.whl → 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/batch_classifier.py +156 -4
- gitflow_analytics/cli.py +897 -179
- gitflow_analytics/config/loader.py +40 -1
- gitflow_analytics/config/schema.py +4 -0
- gitflow_analytics/core/cache.py +20 -0
- gitflow_analytics/core/data_fetcher.py +1254 -228
- gitflow_analytics/core/git_auth.py +169 -0
- gitflow_analytics/core/git_timeout_wrapper.py +347 -0
- gitflow_analytics/core/metrics_storage.py +12 -3
- gitflow_analytics/core/progress.py +219 -18
- gitflow_analytics/core/subprocess_git.py +145 -0
- gitflow_analytics/extractors/ml_tickets.py +3 -2
- gitflow_analytics/extractors/tickets.py +93 -8
- gitflow_analytics/integrations/jira_integration.py +1 -1
- gitflow_analytics/integrations/orchestrator.py +47 -29
- gitflow_analytics/metrics/branch_health.py +3 -2
- gitflow_analytics/models/database.py +72 -1
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +12 -5
- gitflow_analytics/pm_framework/orchestrator.py +8 -3
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +24 -4
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +3 -1
- gitflow_analytics/qualitative/core/llm_fallback.py +34 -2
- gitflow_analytics/reports/narrative_writer.py +118 -74
- gitflow_analytics/security/__init__.py +11 -0
- gitflow_analytics/security/config.py +189 -0
- gitflow_analytics/security/extractors/__init__.py +7 -0
- gitflow_analytics/security/extractors/dependency_checker.py +379 -0
- gitflow_analytics/security/extractors/secret_detector.py +197 -0
- gitflow_analytics/security/extractors/vulnerability_scanner.py +333 -0
- gitflow_analytics/security/llm_analyzer.py +347 -0
- gitflow_analytics/security/reports/__init__.py +5 -0
- gitflow_analytics/security/reports/security_report.py +358 -0
- gitflow_analytics/security/security_analyzer.py +414 -0
- gitflow_analytics/tui/app.py +3 -1
- gitflow_analytics/tui/progress_adapter.py +313 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +407 -46
- gitflow_analytics/tui/screens/results_screen.py +219 -206
- gitflow_analytics/ui/__init__.py +21 -0
- gitflow_analytics/ui/progress_display.py +1477 -0
- gitflow_analytics/verify_activity.py +697 -0
- {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/METADATA +2 -1
- {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/RECORD +47 -31
- gitflow_analytics/cli_rich.py +0 -503
- {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/data_fetcher.py
@@ -5,15 +5,17 @@ focusing purely on data collection from Git repositories and ticket systems
 without performing any LLM-based classification.
 """

-import contextlib
 import logging
 import os
 import subprocess
-
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any, Optional

+from sqlalchemy import func
 from sqlalchemy.orm import Session

 from ..extractors.story_points import StoryPointExtractor
@@ -27,11 +29,17 @@ from ..models.database import (
 )
 from .branch_mapper import BranchToProjectMapper
 from .cache import GitAnalysisCache
+from .git_timeout_wrapper import GitOperationTimeout, GitTimeoutWrapper, HeartbeatLogger
 from .identity import DeveloperIdentityResolver
 from .progress import get_progress_service

 logger = logging.getLogger(__name__)

+# THREAD SAFETY: Module-level thread-local storage for repository instances
+# Each thread gets its own isolated storage to prevent thread-safety issues
+# when GitDataFetcher is called from ThreadPoolExecutor
+_thread_local = threading.local()
+

 class GitDataFetcher:
     """Fetches raw Git commit data and organizes it by day for efficient batch processing.
@@ -50,6 +58,7 @@ class GitDataFetcher:
         branch_mapping_rules: Optional[dict[str, list[str]]] = None,
         allowed_ticket_platforms: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
+        skip_remote_fetch: bool = False,
     ):
         """Initialize the data fetcher.

@@ -58,8 +67,11 @@ class GitDataFetcher:
             branch_mapping_rules: Rules for mapping branches to projects
             allowed_ticket_platforms: List of allowed ticket platforms
             exclude_paths: List of file paths to exclude from analysis
+            skip_remote_fetch: If True, skip git fetch/pull operations
         """
         self.cache = cache
+        self.skip_remote_fetch = skip_remote_fetch
+        self.repository_status = {}  # Track status of each repository
         # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
         self.database = cache.db
         self.story_point_extractor = StoryPointExtractor()
@@ -67,10 +79,35 @@ class GitDataFetcher:
         self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
         self.exclude_paths = exclude_paths or []

+        # Log exclusion configuration
+        if self.exclude_paths:
+            logger.info(
+                f"GitDataFetcher initialized with {len(self.exclude_paths)} exclusion patterns:"
+            )
+            for pattern in self.exclude_paths[:5]:  # Show first 5 patterns
+                logger.debug(f"  - {pattern}")
+            if len(self.exclude_paths) > 5:
+                logger.debug(f"  ... and {len(self.exclude_paths) - 5} more patterns")
+        else:
+            logger.info("GitDataFetcher initialized with no file exclusions")
+
         # Initialize identity resolver
         identity_db_path = cache.cache_dir / "identities.db"
         self.identity_resolver = DeveloperIdentityResolver(identity_db_path)

+        # Initialize git timeout wrapper for safe operations
+        self.git_wrapper = GitTimeoutWrapper(default_timeout=30)
+
+        # Statistics for tracking repository processing
+        self.processing_stats = {
+            "total": 0,
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
     def fetch_repository_data(
         self,
         repo_path: Path,
@@ -117,37 +154,75 @@ class GitDataFetcher:
             f"🔍 DEBUG: Calculated date range from weeks_back: {start_date} to {end_date}"
         )

-        #
+        # Get progress service for top-level progress tracking
+        progress = get_progress_service()
+
+        # Start Rich display for this repository if enabled
+        if hasattr(progress, "_use_rich") and progress._use_rich:
+            # Count total commits for progress estimation
+            try:
+                import git
+
+                git.Repo(repo_path)
+                # Check if we need to clone or pull
+                if not repo_path.exists() or not (repo_path / ".git").exists():
+                    logger.info(f"📥 Repository {project_key} needs cloning")
+                    progress.start_repository(f"{project_key} (cloning)", 0)
+                else:
+                    # Rough estimate based on weeks
+                    estimated_commits = weeks_back * 50  # Estimate ~50 commits per week
+                    progress.start_repository(project_key, estimated_commits)
+            except Exception:
+                progress.start_repository(project_key, 100)  # Default estimate
+
+        # Step 1: Collect all commits organized by day with enhanced progress tracking
         logger.info("🔍 DEBUG: About to fetch commits by day")
-        logger.info("Fetching commits organized by day
-
-
-
-
-
-
-
-
-
-
-
-
-            logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
-            self._fetch_detailed_tickets(
-                ticket_ids, jira_integration, project_key, progress_callback
+        logger.info(f"Fetching commits organized by day for repository: {project_key}")
+
+        # Create top-level progress for this repository
+        with progress.progress(
+            total=3,  # Three main steps: fetch commits, extract tickets, store data
+            description=f"📊 Processing repository: {project_key}",
+            unit="steps",
+        ) as repo_progress_ctx:
+
+            # Step 1: Fetch commits
+            progress.set_description(repo_progress_ctx, f"🔍 {project_key}: Fetching commits")
+            daily_commits = self._fetch_commits_by_day(
+                repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
             )
+            logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
+            progress.update(repo_progress_ctx)

-
-
-
+            # Step 2: Extract and fetch all referenced tickets
+            progress.set_description(repo_progress_ctx, f"🎫 {project_key}: Processing tickets")
+            logger.info("🔍 DEBUG: About to extract ticket references")
+            logger.info(f"Extracting ticket references for {project_key}...")
+            ticket_ids = self._extract_all_ticket_references(daily_commits)
+            logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")

-
-
-
-
-
-
-
+            if jira_integration and ticket_ids:
+                logger.info(
+                    f"Fetching {len(ticket_ids)} unique tickets from JIRA for {project_key}..."
+                )
+                self._fetch_detailed_tickets(
+                    ticket_ids, jira_integration, project_key, progress_callback
+                )
+
+            # Build commit-ticket correlations
+            logger.info(f"Building commit-ticket correlations for {project_key}...")
+            correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
+            progress.update(repo_progress_ctx)
+
+            # Step 3: Store daily commit batches
+            progress.set_description(repo_progress_ctx, f"💾 {project_key}: Storing data")
+            logger.info(
+                f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
+            )
+            logger.info("Storing daily commit batches...")
+            batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
+            logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
+            progress.update(repo_progress_ctx)

         # CRITICAL FIX: Verify actual storage before reporting success
         session = self.database.get_session()
@@ -167,6 +242,46 @@ class GitDataFetcher:
         # Return summary statistics with ACTUAL stored counts
         # expected_commits already calculated above

+        # Calculate exclusion impact summary if exclusions are configured
+        exclusion_stats = {
+            "patterns_applied": len(self.exclude_paths),
+            "enabled": bool(self.exclude_paths),
+        }
+
+        if self.exclude_paths and expected_commits > 0:
+            # Get aggregate stats from daily_commit_batches
+            session = self.database.get_session()
+            try:
+                batch_stats = (
+                    session.query(
+                        func.sum(DailyCommitBatch.total_lines_added).label("total_added"),
+                        func.sum(DailyCommitBatch.total_lines_deleted).label("total_deleted"),
+                    )
+                    .filter(
+                        DailyCommitBatch.project_key == project_key,
+                        DailyCommitBatch.repo_path == str(repo_path),
+                    )
+                    .first()
+                )
+
+                if batch_stats and batch_stats.total_added:
+                    total_lines = (batch_stats.total_added or 0) + (batch_stats.total_deleted or 0)
+                    exclusion_stats["total_lines_after_filtering"] = total_lines
+                    exclusion_stats["lines_added"] = batch_stats.total_added or 0
+                    exclusion_stats["lines_deleted"] = batch_stats.total_deleted or 0
+
+                    logger.info(
+                        f"📊 Exclusion Impact Summary for {project_key}:\n"
+                        f"  - Exclusion patterns applied: {len(self.exclude_paths)}\n"
+                        f"  - Total lines after filtering: {total_lines:,}\n"
+                        f"  - Lines added: {batch_stats.total_added:,}\n"
+                        f"  - Lines deleted: {batch_stats.total_deleted:,}"
+                    )
+            except Exception as e:
+                logger.debug(f"Could not calculate exclusion impact summary: {e}")
+            finally:
+                session.close()
+
         results = {
             "project_key": project_key,
             "repo_path": str(repo_path),
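The exclusion-impact summary above aggregates stored line counts with SQLAlchemy's `func.sum`. Below is a minimal, self-contained sketch of that query shape against a throwaway in-memory table; the model here is a hypothetical stand-in, not the package's real `DailyCommitBatch` schema.

```python
# Hedged sketch of the func.sum() aggregation pattern; DemoBatch is illustrative only.
from sqlalchemy import Column, Integer, String, create_engine, func
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class DemoBatch(Base):  # hypothetical stand-in for DailyCommitBatch
    __tablename__ = "demo_batches"
    id = Column(Integer, primary_key=True)
    project_key = Column(String)
    total_lines_added = Column(Integer)
    total_lines_deleted = Column(Integer)


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all(
        [
            DemoBatch(project_key="PROJ", total_lines_added=120, total_lines_deleted=30),
            DemoBatch(project_key="PROJ", total_lines_added=80, total_lines_deleted=10),
        ]
    )
    session.commit()

    stats = (
        session.query(
            func.sum(DemoBatch.total_lines_added).label("total_added"),
            func.sum(DemoBatch.total_lines_deleted).label("total_deleted"),
        )
        .filter(DemoBatch.project_key == "PROJ")
        .first()
    )
    print(stats.total_added, stats.total_deleted)  # 200 40
```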
@@ -180,6 +295,7 @@ class GitDataFetcher:
                 "correlations_created": correlations_created,
                 "batches_created": batches_created,
             },
+            "exclusions": exclusion_stats,
             "daily_commits": daily_commits,  # For immediate use if needed
         }

@@ -188,10 +304,20 @@ class GitDataFetcher:
             logger.info(
                 f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
             )
+            # Finish repository in Rich display with success
+            if hasattr(progress, "_use_rich") and progress._use_rich:
+                progress.finish_repository(project_key, success=True)
         else:
             logger.error(
                 f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
             )
+            # Finish repository in Rich display with error
+            if hasattr(progress, "_use_rich") and progress._use_rich:
+                progress.finish_repository(
+                    project_key,
+                    success=False,
+                    error_message=f"Storage issue: {actual_stored_commits}/{expected_commits} commits",
+                )

         return results

@@ -209,74 +335,296 @@ class GitDataFetcher:
         Returns:
             Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
         """
-
+
+        # THREAD SAFETY: Use the module-level thread-local storage to ensure each thread
+        # gets its own Repo instance. This prevents thread-safety issues when called from ThreadPoolExecutor
+
+        # Set environment variables to prevent ANY password prompts before opening repo
+        original_env = {}
+        env_vars = {
+            "GIT_TERMINAL_PROMPT": "0",
+            "GIT_ASKPASS": "/bin/echo",  # Use full path to echo
+            "SSH_ASKPASS": "/bin/echo",
+            "GCM_INTERACTIVE": "never",
+            "GIT_SSH_COMMAND": "ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o PasswordAuthentication=no",
+            "DISPLAY": "",
+            "GIT_CREDENTIAL_HELPER": "",  # Disable credential helper
+            "GCM_PROVIDER": "none",  # Disable Git Credential Manager
+            "GIT_CREDENTIALS": "",  # Clear any cached credentials
+            "GIT_CONFIG_NOSYSTEM": "1",  # Don't use system config
+        }
+
+        # Save original environment and set our values
+        for key, value in env_vars.items():
+            original_env[key] = os.environ.get(key)
+            os.environ[key] = value

         try:
-
+            # THREAD SAFETY: Create a fresh Repo instance for this thread
+            # Do NOT reuse Repo instances across threads
+            from git import Repo
+
+            # Check for security issues in repository configuration
+            self._check_repository_security(repo_path, project_key)
+
+            # When skip_remote_fetch is enabled, use a more restricted repository access
+            if self.skip_remote_fetch:
+                # Use a special git configuration that completely disables credential helpers
+                import tempfile
+
+                # Create a secure temporary directory for our config
+                temp_dir = tempfile.mkdtemp(prefix="gitflow_")
+
+                # Create a temporary git config that disables all authentication
+                tmp_config_path = os.path.join(temp_dir, ".gitconfig")
+                with open(tmp_config_path, "w") as tmp_config:
+                    tmp_config.write("[credential]\n")
+                    tmp_config.write(" helper = \n")
+                    tmp_config.write("[core]\n")
+                    tmp_config.write(" askpass = \n")
+
+                # Set GIT_CONFIG to use our temporary config
+                os.environ["GIT_CONFIG_GLOBAL"] = tmp_config_path
+                os.environ["GIT_CONFIG_SYSTEM"] = "/dev/null"
+
+                # Store temp_dir in thread-local storage for cleanup
+                _thread_local.temp_dir = temp_dir
+
+                try:
+                    # Open repository with our restricted configuration
+                    # THREAD SAFETY: Each thread gets its own Repo instance
+                    repo = Repo(repo_path)
+                finally:
+                    # Clean up temporary config directory
+                    try:
+                        import shutil
+
+                        if hasattr(_thread_local, "temp_dir"):
+                            shutil.rmtree(_thread_local.temp_dir, ignore_errors=True)
+                            delattr(_thread_local, "temp_dir")
+                    except:
+                        pass
+
+                try:
+                    # Configure git to never prompt for credentials
+                    with repo.config_writer() as git_config:
+                        git_config.set_value("core", "askpass", "")
+                        git_config.set_value("credential", "helper", "")
+                except Exception as e:
+                    logger.debug(f"Could not update git config: {e}")
+
+                # Note: We can't monkey-patch remotes as it's a property without setter
+                # The skip_remote_fetch flag will prevent remote operations elsewhere
+
+                logger.debug(
+                    f"Opened repository {project_key} in offline mode (skip_remote_fetch=true)"
+                )
+            else:
+                # THREAD SAFETY: Each thread gets its own Repo instance
+                repo = Repo(repo_path)
+            # Track repository status
+            self.repository_status[project_key] = {
+                "path": str(repo_path),
+                "remote_update": "skipped" if self.skip_remote_fetch else "pending",
+                "authentication_issues": False,
+                "error": None,
+            }
+
             # Update repository from remote before analysis
-            self.
+            if not self.skip_remote_fetch:
+                logger.info(f"📥 Updating repository {project_key} from remote...")
+
+            update_success = self._update_repository(repo)
+            if not self.skip_remote_fetch:
+                self.repository_status[project_key]["remote_update"] = (
+                    "success" if update_success else "failed"
+                )
+                if not update_success:
+                    logger.warning(
+                        f"⚠️ {project_key}: Continuing with local repository state (remote update failed)"
+                    )
+
         except Exception as e:
-            logger.error(f"Failed to open repository at {repo_path}: {e}")
+            logger.error(f"Failed to open repository {project_key} at {repo_path}: {e}")
+            self.repository_status[project_key] = {
+                "path": str(repo_path),
+                "remote_update": "error",
+                "authentication_issues": "authentication" in str(e).lower()
+                or "password" in str(e).lower(),
+                "error": str(e),
+            }
+            # Restore original environment variables before returning
+            for key, value in original_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
            return {}

-        #
-        all_commits = []
+        # Get branches to analyze
        branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)

        if not branches_to_analyze:
-            logger.warning(
+            logger.warning(
+                f"No accessible branches found in repository {project_key} at {repo_path}"
+            )
+            # Restore original environment variables before returning
+            for key, value in original_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
            return {}

-        logger.info(f"Analyzing branches: {branches_to_analyze}")
+        logger.info(f"🌿 {project_key}: Analyzing branches: {branches_to_analyze}")

-
-
-
-
-
+        # Calculate days to process
+        current_date = start_date.date()
+        end_date_only = end_date.date()
+        days_to_process = []
+        while current_date <= end_date_only:
+            days_to_process.append(current_date)
+            current_date += timedelta(days=1)

-
-
+        logger.info(
+            f"Processing {len(days_to_process)} days from {start_date.date()} to {end_date.date()}"
+        )
+
+        # Get progress service for nested progress tracking
+        progress = get_progress_service()
+
+        # Dictionary to store commits by day
+        daily_commits = {}
+        all_commit_hashes = set()  # Track all hashes for deduplication
+
+        # Count total commits first for Rich display
+        try:
+            for branch_name in branches_to_analyze[:1]:  # Sample from first branch
+                sample_commits = list(
+                    repo.iter_commits(branch_name, since=start_date, until=end_date)
                )
+                # Estimate based on first branch (multiply by number of branches for rough estimate)
+                len(sample_commits) * len(branches_to_analyze)
+                break
+        except:
+            len(days_to_process) * 50  # Default estimate
+
+        # Update repository in Rich display with estimated commit count
+        if hasattr(progress, "_use_rich") and progress._use_rich:
+            progress.update_repository(project_key, 0, 0.0)
+
+        # Create nested progress for day-by-day processing
+        with progress.progress(
+            total=len(days_to_process),
+            description=f"📅 Fetching commits for repository: {project_key}",
+            unit="days",
+            nested=True,
+        ) as day_progress_ctx:
+
+            for day_date in days_to_process:
+                # Update description to show current repository and day clearly
+                day_str = day_date.strftime("%Y-%m-%d")
+                progress.set_description(day_progress_ctx, f"🔍 {project_key}: Analyzing {day_str}")
+
+                # Calculate day boundaries
+                day_start = datetime.combine(day_date, datetime.min.time(), tzinfo=timezone.utc)
+                day_end = datetime.combine(day_date, datetime.max.time(), tzinfo=timezone.utc)
+
+                day_commits = []
+                commits_found_today = 0
+
+                # Process each branch for this specific day
+                for branch_name in branches_to_analyze:
+                    try:
+                        # Fetch commits for this specific day and branch with timeout protection
+                        def fetch_branch_commits():
+                            return list(
+                                repo.iter_commits(
+                                    branch_name, since=day_start, until=day_end, reverse=False
+                                )
+                            )
+
+                        # Use timeout wrapper to prevent hanging on iter_commits
+                        try:
+                            branch_commits = self.git_wrapper.run_with_timeout(
+                                fetch_branch_commits,
+                                timeout=15,  # 15 seconds per branch/day combination
+                                operation_name=f"iter_commits_{branch_name}_{day_str}",
+                            )
+                        except GitOperationTimeout:
+                            logger.warning(
+                                f"⏱️ Timeout fetching commits for branch {branch_name} on {day_str}, skipping"
+                            )
+                            continue

-
-
-
+                        for commit in branch_commits:
+                            # Skip if we've already processed this commit
+                            if commit.hexsha in all_commit_hashes:
+                                continue

-
-
-
+                            # Extract commit data with full metadata
+                            commit_data = self._extract_commit_data(
+                                commit, branch_name, project_key, repo_path
+                            )
+                            if commit_data:
+                                day_commits.append(commit_data)
+                                all_commit_hashes.add(commit.hexsha)
+                                commits_found_today += 1
+
+                    except GitOperationTimeout as e:
+                        logger.warning(
+                            f"⏱️ Timeout processing branch {branch_name} for day {day_str}: {e}"
+                        )
+                        continue
+                    except Exception as e:
+                        logger.warning(
+                            f"Error processing branch {branch_name} for day {day_str}: {e}"
+                        )
+                        continue
+
+                # Store commits for this day if any were found
+                if day_commits:
+                    # Sort commits by timestamp
+                    day_commits.sort(key=lambda c: c["timestamp"])
+                    daily_commits[day_str] = day_commits
+
+                    # Incremental caching - store commits for this day immediately
+                    self._store_day_commits_incremental(
+                        repo_path, day_str, day_commits, project_key
                    )
-                    if commit_data:
-                        all_commits.append(commit_data)

-
-
-
+                logger.debug(f"Found {commits_found_today} commits on {day_str}")
+
+                # Update progress callback if provided
+                if progress_callback:
+                    progress_callback(f"Processed {day_str}: {commits_found_today} commits")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Update progress bar
+                progress.update(day_progress_ctx)
+
+                # Update Rich display with current commit count and speed
+                if hasattr(progress, "_use_rich") and progress._use_rich:
+                    total_processed = len(all_commit_hashes)
+                    # Calculate speed (commits per second) based on elapsed time
+                    import time
+
+                    if not hasattr(self, "_fetch_start_time"):
+                        self._fetch_start_time = time.time()
+                    elapsed = time.time() - self._fetch_start_time
+                    speed = total_processed / elapsed if elapsed > 0 else 0
+                    progress.update_repository(project_key, total_processed, speed)
+
+        total_commits = sum(len(commits) for commits in daily_commits.values())
+        logger.info(f"Collected {total_commits} unique commits across {len(daily_commits)} days")
+
+        # Restore original environment variables
+        for key, value in original_env.items():
+            if value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = value
+
+        return daily_commits

     def _extract_commit_data(
         self, commit: Any, branch_name: str, project_key: str, repo_path: Path
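The per-branch `iter_commits` calls above are wrapped in `GitTimeoutWrapper.run_with_timeout` so a hung git operation cannot stall the whole fetch. Below is a generic sketch of that pattern built on `concurrent.futures`; it is not the package's `GitTimeoutWrapper`, and note that a timed-out worker thread is abandoned rather than killed.

```python
# Hedged sketch: run a blocking callable with a timeout in a worker thread.
import concurrent.futures
import time


class OperationTimeout(Exception):
    """Raised when the wrapped callable does not finish in time."""


def run_with_timeout(fn, timeout: float, operation_name: str = "operation"):
    """Run fn() in a worker thread and give up after `timeout` seconds."""
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = pool.submit(fn)
    try:
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError as exc:
        # The worker keeps running in the background; we only stop waiting for it.
        raise OperationTimeout(f"{operation_name} exceeded {timeout}s") from exc
    finally:
        pool.shutdown(wait=False)


if __name__ == "__main__":
    try:
        run_with_timeout(lambda: time.sleep(2), timeout=0.5, operation_name="demo_sleep")
    except OperationTimeout as e:
        print(e)  # demo_sleep exceeded 0.5s
```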
@@ -317,13 +665,19 @@ class GitDataFetcher:
            line_stats = self._calculate_commit_stats(commit)
            total_insertions = line_stats["insertions"]
            total_deletions = line_stats["deletions"]
+            raw_insertions = line_stats.get("raw_insertions", total_insertions)
+            raw_deletions = line_stats.get("raw_deletions", total_deletions)

            commit_data.update(
                {
                    "files_changed": files_changed,
                    "files_changed_count": len(files_changed),
-                    "lines_added": total_insertions,
+                    "lines_added": total_insertions,  # Filtered counts for backward compatibility
                    "lines_deleted": total_deletions,
+                    "filtered_insertions": total_insertions,  # Explicitly filtered counts
+                    "filtered_deletions": total_deletions,
+                    "raw_insertions": raw_insertions,  # Raw unfiltered counts
+                    "raw_deletions": raw_deletions,
                }
            )

@@ -335,6 +689,10 @@ class GitDataFetcher:
                    "files_changed_count": 0,
                    "lines_added": 0,
                    "lines_deleted": 0,
+                    "filtered_insertions": 0,
+                    "filtered_deletions": 0,
+                    "raw_insertions": 0,
+                    "raw_deletions": 0,
                }
            )

@@ -363,9 +721,166 @@ class GitDataFetcher:

     def _should_exclude_file(self, file_path: str) -> bool:
         """Check if a file should be excluded based on exclude patterns."""
+        return any(self._matches_glob_pattern(file_path, pattern) for pattern in self.exclude_paths)
+
+    def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
+        """Check if a file path matches a glob pattern, handling ** recursion correctly.
+
+        This method properly handles different glob pattern types:
+        - **/vendor/** : matches files inside vendor directories at any level
+        - **/*.min.js : matches files with specific suffix anywhere in directory tree
+        - vendor/** : matches files inside vendor directory at root level only
+        - **pattern** : handles other complex patterns with pathlib.match()
+        - simple patterns : uses fnmatch for basic wildcards
+
+        Args:
+            filepath: The file path to check
+            pattern: The glob pattern to match against
+
+        Returns:
+            True if the file path matches the pattern, False otherwise
+        """
+        import fnmatch
+        import re
+        from pathlib import PurePath
+
+        # Handle empty or invalid inputs
+        if not filepath or not pattern:
+            return False
+
+        path = PurePath(filepath)
+
+        # Check for multiple ** patterns first (most complex)
+        if "**" in pattern and pattern.count("**") > 1:
+            # Multiple ** patterns - use custom recursive matching for complex patterns
+            return self._match_recursive_pattern(filepath, pattern)
+
+        # Then handle simple ** patterns
+        elif pattern.startswith("**/") and pattern.endswith("/**"):
+            # Pattern like **/vendor/** - matches files inside vendor directories at any level
+            dir_name = pattern[3:-3]  # Extract 'vendor' from '**/vendor/**'
+            if not dir_name:  # Handle edge case of '**/**'
+                return True
+            return dir_name in path.parts
+
+        elif pattern.startswith("**/"):
+            # Pattern like **/*.min.js - matches files with specific suffix anywhere
+            suffix_pattern = pattern[3:]
+            if not suffix_pattern:  # Handle edge case of '**/'
+                return True
+            # Check against filename for file patterns, or any path part for directory patterns
+            if suffix_pattern.endswith("/"):
+                # Directory pattern like **/build/
+                dir_name = suffix_pattern[:-1]
+                return dir_name in path.parts
+            else:
+                # File pattern like *.min.js
+                return fnmatch.fnmatch(path.name, suffix_pattern)
+
+        elif pattern.endswith("/**"):
+            # Pattern like vendor/** or docs/build/** - matches files inside directory at root level
+            dir_name = pattern[:-3]
+            if not dir_name:  # Handle edge case of '/**'
+                return True
+
+            # Handle both single directory names and nested paths
+            expected_parts = PurePath(dir_name).parts
+            return (
+                len(path.parts) >= len(expected_parts)
+                and path.parts[: len(expected_parts)] == expected_parts
+            )
+
+        elif "**" in pattern:
+            # Single ** pattern - use pathlib matching with fallback
+            try:
+                return path.match(pattern)
+            except (ValueError, TypeError):
+                # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
+                try:
+                    return fnmatch.fnmatch(filepath, pattern)
+                except re.error:
+                    # Invalid regex pattern - return False to be safe
+                    return False
+        else:
+            # Simple pattern - use fnmatch for basic wildcards
+            try:
+                # Try matching the full path first
+                if fnmatch.fnmatch(filepath, pattern):
+                    return True
+                # Also try matching just the filename for simple patterns
+                # This allows "package-lock.json" to match "src/package-lock.json"
+                return fnmatch.fnmatch(path.name, pattern)
+            except re.error:
+                # Invalid regex pattern - return False to be safe
+                return False
+
+    def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
+        """Handle complex patterns with multiple ** wildcards.
+
+        Args:
+            filepath: The file path to check
+            pattern: The pattern with multiple ** wildcards
+
+        Returns:
+            True if the path matches the pattern, False otherwise
+        """
         import fnmatch
+        from pathlib import PurePath
+
+        # Split pattern by ** to handle each segment
+        parts = pattern.split("**")
+
+        # Validate that we have actual segments
+        if not parts:
+            return False
+
+        # Convert filepath to parts for easier matching
+        path_parts = list(PurePath(filepath).parts)
+
+        # Start matching from the beginning
+        path_index = 0
+
+        for i, part in enumerate(parts):
+            if not part:
+                # Empty part (e.g., from leading or trailing **)
+                if i == 0 or i == len(parts) - 1:
+                    # Leading or trailing ** - continue
+                    continue
+                # Middle empty part (consecutive **) - match any number of path components
+                continue
+
+            # Clean the part (remove leading/trailing slashes)
+            part = part.strip("/")

-
+            if not part:
+                continue
+
+            # Find where this part matches in the remaining path
+            found = False
+            for j in range(path_index, len(path_parts)):
+                # Check if the current path part matches the pattern part
+                if "/" in part:
+                    # Part contains multiple path components
+                    sub_parts = part.split("/")
+                    if j + len(sub_parts) <= len(path_parts) and all(
+                        fnmatch.fnmatch(path_parts[j + k], sub_parts[k])
+                        for k in range(len(sub_parts))
+                    ):
+                        path_index = j + len(sub_parts)
+                        found = True
+                        break
+                else:
+                    # Single component part
+                    if fnmatch.fnmatch(path_parts[j], part):
+                        path_index = j + 1
+                        found = True
+                        break
+
+            if not found and part:
+                # Required part not found in path
+                return False
+
+        return True

     def _get_branches_to_analyze(
         self, repo: Any, branch_patterns: Optional[list[str]]
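The pattern families documented in `_matches_glob_pattern` above reduce to a handful of `fnmatch`/`PurePath` checks. Below is a minimal sketch of the three most common families; it is not the package's full implementation (which also handles multiple `**` segments and invalid patterns), just an illustration of the rules.

```python
# Hedged sketch: simplified matcher for **/dir/**, **/*.ext, and dir/** patterns.
import fnmatch
from pathlib import PurePath


def matches(filepath: str, pattern: str) -> bool:
    path = PurePath(filepath)
    if pattern.startswith("**/") and pattern.endswith("/**"):
        return pattern[3:-3] in path.parts              # **/vendor/** -> any level
    if pattern.startswith("**/"):
        return fnmatch.fnmatch(path.name, pattern[3:])  # **/*.min.js -> by filename
    if pattern.endswith("/**"):
        prefix = PurePath(pattern[:-3]).parts
        return path.parts[: len(prefix)] == prefix      # vendor/** -> root level only
    # Simple pattern: try the full path, then just the filename
    return fnmatch.fnmatch(filepath, pattern) or fnmatch.fnmatch(path.name, pattern)


assert matches("src/vendor/lib.js", "**/vendor/**")
assert matches("assets/app.min.js", "**/*.min.js")
assert matches("vendor/lib.js", "vendor/**")
assert not matches("src/vendor/lib.js", "vendor/**")
assert matches("src/package-lock.json", "package-lock.json")
```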
@@ -373,114 +888,115 @@ class GitDataFetcher:
         """Get list of branches to analyze based on patterns.

         WHY: Robust branch detection that handles missing remotes, missing default branches,
-        and provides good fallback behavior.
+        and provides good fallback behavior. When no patterns specified, analyzes ALL branches
+        to capture the complete development picture.

         DESIGN DECISION:
-        -
+        - When no patterns: analyze ALL accessible branches (not just main)
+        - When patterns specified: match against those patterns only
         - Handle missing remotes gracefully
         - Skip remote tracking branches to avoid duplicates
         - Use actual branch existence checking rather than assuming branches exist
+
+        THREAD SAFETY: This method is thread-safe as it doesn't modify shared state
+        and works with a repo instance passed as a parameter.
         """
-
-
-        available_branches = []
+        # Collect all available branches (local branches preferred)
+        available_branches = []

-
-
-
-
-
-
-
+        # First, try local branches
+        try:
+            # THREAD SAFETY: Create a new list to avoid sharing references
+            local_branches = list([branch.name for branch in repo.branches])
+            available_branches.extend(local_branches)
+            logger.debug(f"Found local branches: {local_branches}")
+        except Exception as e:
+            logger.debug(f"Error getting local branches: {e}")

-
+        # If we have remotes, also consider remote branches (but clean the names)
+        # Skip remote branch checking if skip_remote_fetch is enabled to avoid auth prompts
+        if not self.skip_remote_fetch:
            try:
                if repo.remotes and hasattr(repo.remotes, "origin"):
-
-
-
-
-
+                    # THREAD SAFETY: Create a new list to avoid sharing references
+                    remote_branches = list(
+                        [
+                            ref.name.replace("origin/", "")
+                            for ref in repo.remotes.origin.refs
+                            if not ref.name.endswith("HEAD")  # Skip HEAD ref
+                        ]
+                    )
                    # Only add remote branches that aren't already in local branches
                    for branch in remote_branches:
                        if branch not in available_branches:
                            available_branches.append(branch)
                    logger.debug(f"Found remote branches: {remote_branches}")
            except Exception as e:
-                logger.debug(f"Error getting remote branches: {e}")
+                logger.debug(f"Error getting remote branches (may require authentication): {e}")
+                # Continue with local branches only
+        else:
+            logger.debug("Skipping remote branch enumeration (skip_remote_fetch=true)")

-
-
-
-
-
-
+        # If no branches found, fallback to trying common names directly
+        if not available_branches:
+            logger.warning("No branches found via normal detection, falling back to common names")
+            available_branches = ["main", "master", "develop", "dev"]
+
+        # Filter branches based on patterns if provided
+        if branch_patterns:
+            import fnmatch
+
+            matching_branches = []
+            for pattern in branch_patterns:
+                matching = [
+                    branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)
+                ]
+                matching_branches.extend(matching)
+            # Remove duplicates while preserving order
+            branches_to_test = list(dict.fromkeys(matching_branches))
+        else:
+            # No patterns specified - analyze ALL branches for complete coverage
+            branches_to_test = available_branches
+            logger.info(
+                f"No branch patterns specified - will analyze all {len(branches_to_test)} branches"
+            )

-
+        # Test that branches are actually accessible
+        accessible_branches = []
+        for branch in branches_to_test:
+            try:
+                # THREAD SAFETY: Use iterator without storing intermediate results
+                next(iter(repo.iter_commits(branch, max_count=1)), None)
+                accessible_branches.append(branch)
+            except Exception as e:
+                logger.debug(f"Branch {branch} not accessible: {e}")
+
+        if not accessible_branches:
+            # Last resort: try to find ANY working branch
+            logger.warning("No accessible branches found from patterns/default, trying fallback")
            main_branches = ["main", "master", "develop", "dev"]
            for branch in main_branches:
                if branch in available_branches:
-                    # Test that we can actually access this branch
                    try:
-                        # Just try to get the commit object to verify branch exists and is accessible
                        next(iter(repo.iter_commits(branch, max_count=1)), None)
-                        logger.info(f"Using main branch: {branch}")
+                        logger.info(f"Using fallback main branch: {branch}")
                        return [branch]
-                    except Exception
-                        logger.debug(f"Branch {branch} exists but not accessible: {e}")
+                    except Exception:
                        continue

-            #
+            # Try any available branch
            for branch in available_branches:
                try:
                    next(iter(repo.iter_commits(branch, max_count=1)), None)
                    logger.info(f"Using fallback branch: {branch}")
                    return [branch]
-                except Exception
-                    logger.debug(f"Branch {branch} not accessible: {e}")
+                except Exception:
                    continue

-        # Last resort: return empty list (will be handled gracefully by caller)
        logger.warning("No accessible branches found")
        return []

-
-        import fnmatch
-
-        available_branches = []
-
-        # Collect all branches (local and remote)
-        with contextlib.suppress(Exception):
-            available_branches.extend([branch.name for branch in repo.branches])
-
-        try:
-            if repo.remotes and hasattr(repo.remotes, "origin"):
-                remote_branches = [
-                    ref.name.replace("origin/", "")
-                    for ref in repo.remotes.origin.refs
-                    if not ref.name.endswith("HEAD")
-                ]
-                for branch in remote_branches:
-                    if branch not in available_branches:
-                        available_branches.append(branch)
-        except Exception:
-            pass
-
-        # Match patterns against available branches
-        matching_branches = []
-        for pattern in branch_patterns:
-            matching = [branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)]
-            matching_branches.extend(matching)
-
-        # Test that matched branches are actually accessible
-        accessible_branches = []
-        for branch in list(set(matching_branches)):  # Remove duplicates
-            try:
-                next(iter(repo.iter_commits(branch, max_count=1)), None)
-                accessible_branches.append(branch)
-            except Exception as e:
-                logger.debug(f"Matched branch {branch} not accessible: {e}")
-
+        logger.info(f"Will analyze {len(accessible_branches)} branches: {accessible_branches}")
        return accessible_branches

     def _update_repository(self, repo) -> bool:
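Branch selection above filters the available branch names with `fnmatch` patterns and deduplicates with `dict.fromkeys` to preserve order. A small standalone illustration follows; the branch names are made up.

```python
# Hedged sketch of the branch-pattern filtering and order-preserving dedup shown above.
import fnmatch

available = ["main", "develop", "feature/login", "feature/search", "release/1.0"]
patterns = ["main", "feature/*"]

matching = []
for pattern in patterns:
    matching.extend(b for b in available if fnmatch.fnmatch(b, pattern))

branches_to_test = list(dict.fromkeys(matching))  # dedupe, keep first-seen order
print(branches_to_test)  # ['main', 'feature/login', 'feature/search']
```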
@@ -503,82 +1019,62 @@ class GitDataFetcher:
         Returns:
             bool: True if update succeeded, False if failed (but analysis continues)
         """
-
-
-
+        # Skip remote operations if configured
+        if self.skip_remote_fetch:
+            logger.info("🚫 Skipping remote fetch (skip_remote_fetch=true)")
+            return True

-
-
-        env["GIT_TERMINAL_PROMPT"] = "0"
-        env["GIT_ASKPASS"] = ""
-        env["GCM_INTERACTIVE"] = "never"
+        # Check for stale repository (last fetch > 1 hour ago)
+        self._check_repository_staleness(repo)

-
-
-
-
-
-
-
-
-                timeout=30,  # 30 second timeout
-            )
+        try:
+            # Check if we have remotes without triggering authentication
+            has_remotes = False
+            try:
+                has_remotes = bool(repo.remotes)
+            except Exception as e:
+                logger.debug(f"Could not check for remotes (may require authentication): {e}")
+                return True  # Continue with local analysis

-
-
-            if any(
-                x in error_str
-                for x in ["authentication", "permission denied", "401", "403"]
-            ):
-                logger.error(
-                    "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
-                    "You can verify with: gh auth status"
-                )
-                # Skip fetch but continue with local analysis
-                return False
-            logger.warning(f"Git fetch failed: {result.stderr}")
-            return False
+            if has_remotes:
+                logger.info("Fetching latest changes from remote")

-
-
-
+                # Use our timeout wrapper for safe git operations
+                repo_path = Path(repo.working_dir)
+
+                # Try to fetch with timeout protection
+                fetch_success = self.git_wrapper.fetch_with_timeout(repo_path, timeout=30)
+
+                if not fetch_success:
+                    # Mark this repository as having authentication issues if applicable
+                    if hasattr(self, "repository_status"):
+                        for key in self.repository_status:
+                            if repo.working_dir.endswith(key) or key in repo.working_dir:
+                                self.repository_status[key]["remote_update"] = "failed"
+                                break
+
+                    # Explicit warning to user about stale data
+                    logger.warning(
+                        f"❌ Failed to fetch updates for {repo_path.name}. "
+                        f"Analysis will use potentially stale local data. "
+                        f"Check authentication or network connectivity."
                    )
                    return False
+                else:
+                    # Explicit success confirmation
+                    logger.info(f"✅ Successfully fetched updates for {repo_path.name}")

                # Only try to pull if not in detached HEAD state
                if not repo.head.is_detached:
                    current_branch = repo.active_branch
                    tracking = current_branch.tracking_branch()
                    if tracking:
-                        # Pull latest changes using
-
-
-                            ["git", "pull", "--config", "credential.helper="],
-                            cwd=repo.working_dir,
-                            env=env,
-                            capture_output=True,
-                            text=True,
-                            timeout=30,
-                        )
-
-                        if result.returncode != 0:
-                            error_str = result.stderr.lower()
-                            if any(
-                                x in error_str
-                                for x in ["authentication", "permission denied", "401", "403"]
-                            ):
-                                logger.error(
-                                    "GitHub authentication failed during pull. Continuing with local repository state."
-                                )
-                                return False
-                            logger.warning(f"Git pull failed: {result.stderr}")
-                            return False
+                        # Pull latest changes using timeout wrapper
+                        pull_success = self.git_wrapper.pull_with_timeout(repo_path, timeout=30)
+                        if pull_success:
                            logger.debug(f"Pulled latest changes for {current_branch.name}")
-
-
-                        logger.error(
-                            "Git pull timed out. Continuing with local repository state."
-                        )
+                        else:
+                            logger.warning("Git pull failed, continuing with fetched state")
                            return False
                    else:
                        logger.debug(
@@ -595,6 +1091,114 @@ class GitDataFetcher:
             # Continue with analysis using local state
             return False

+    def _check_repository_staleness(self, repo) -> None:
+        """Check if repository hasn't been fetched recently and warn user.
+
+        Args:
+            repo: GitPython Repo object
+        """
+        try:
+            repo_path = Path(repo.working_dir)
+            fetch_head_path = repo_path / ".git" / "FETCH_HEAD"
+
+            if fetch_head_path.exists():
+                # Get last fetch time from FETCH_HEAD modification time
+                last_fetch_time = datetime.fromtimestamp(
+                    fetch_head_path.stat().st_mtime, tz=timezone.utc
+                )
+                now = datetime.now(timezone.utc)
+                hours_since_fetch = (now - last_fetch_time).total_seconds() / 3600
+
+                if hours_since_fetch > 1:
+                    logger.warning(
+                        f"⏰ Repository {repo_path.name} last fetched {hours_since_fetch:.1f} hours ago. "
+                        f"Data may be stale."
+                    )
+            else:
+                logger.warning(
+                    f"⚠️ Repository {repo_path.name} has never been fetched. "
+                    f"Will attempt to fetch now."
+                )
+        except Exception as e:
+            logger.debug(f"Could not check repository staleness: {e}")
+
+    def _check_repository_security(self, repo_path: Path, project_key: str) -> None:
+        """Check for security issues in repository configuration.
+
+        Warns about:
+        - Exposed tokens in remote URLs
+        - Insecure credential storage
+        """
+
+        try:
+            # Check for tokens in remote URLs
+            result = subprocess.run(
+                ["git", "remote", "-v"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                timeout=2,
+                env={"GIT_TERMINAL_PROMPT": "0"},
+            )
+
+            if result.returncode == 0:
+                output = result.stdout
+                # Check for various token patterns in URLs
+                token_patterns = [
+                    r"https://[^@]*@",  # Any HTTPS URL with embedded credentials
+                    r"ghp_[a-zA-Z0-9]+",  # GitHub Personal Access Token
+                    r"ghs_[a-zA-Z0-9]+",  # GitHub Server Token
+                    r"github_pat_[a-zA-Z0-9]+",  # New GitHub PAT format
+                ]
+
+                for pattern in token_patterns:
+                    import re
+
+                    if re.search(pattern, output):
+                        logger.warning(
+                            f"⚠️ SECURITY WARNING for {project_key}: "
+                            f"Repository appears to have credentials in remote URL. "
+                            f"This is a security risk! Consider using: "
+                            f"1) GitHub CLI (gh auth login), "
+                            f"2) SSH keys, or "
+                            f"3) Git credential manager instead."
+                        )
+                        break
+        except:
+            # Don't fail analysis due to security check
+            pass
+
+    def get_repository_status_summary(self) -> dict[str, Any]:
+        """Get a summary of repository fetch status.
+
+        Returns:
+            Dictionary with status summary including any repositories with issues
+        """
+        summary = {
+            "total_repositories": len(self.repository_status),
+            "successful_updates": 0,
+            "failed_updates": 0,
+            "skipped_updates": 0,
+            "authentication_issues": [],
+            "errors": [],
+        }
+
+        for project_key, status in self.repository_status.items():
+            if status["remote_update"] == "success":
+                summary["successful_updates"] += 1
+            elif status["remote_update"] == "failed":
+                summary["failed_updates"] += 1
+                if status.get("authentication_issues"):
+                    summary["authentication_issues"].append(project_key)
+            elif status["remote_update"] == "skipped":
+                summary["skipped_updates"] += 1
+            elif status["remote_update"] == "error":
+                summary["errors"].append(
+                    {"repository": project_key, "error": status.get("error", "Unknown error")}
+                )
+
+        return summary
+
     def _extract_all_ticket_references(
         self, daily_commits: dict[str, list[dict[str, Any]]]
     ) -> set[str]:
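The staleness check added above approximates the time of the last `git fetch` from the mtime of `.git/FETCH_HEAD`. A standalone sketch of the same heuristic:

```python
# Hedged sketch: hours since the last fetch, inferred from .git/FETCH_HEAD mtime.
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional


def hours_since_last_fetch(repo_path: Path) -> Optional[float]:
    """Return hours since the last fetch, or None if the repo was never fetched."""
    fetch_head = repo_path / ".git" / "FETCH_HEAD"
    if not fetch_head.exists():
        return None
    last_fetch = datetime.fromtimestamp(fetch_head.stat().st_mtime, tz=timezone.utc)
    return (datetime.now(timezone.utc) - last_fetch).total_seconds() / 3600


if __name__ == "__main__":
    age = hours_since_last_fetch(Path("."))  # assumes the working directory is a git checkout
    if age is None:
        print("never fetched")
    elif age > 1:
        print(f"stale: last fetched {age:.1f}h ago")
    else:
        print("fresh")
```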
@@ -872,8 +1476,20 @@ class GitDataFetcher:
                    "branch": commit.get("branch", "main"),
                    "is_merge": commit.get("is_merge", False),
                    "files_changed_count": commit.get("files_changed_count", 0),
-
-                    "
+                    # Store raw unfiltered values in insertions/deletions
+                    "insertions": commit.get(
+                        "raw_insertions", commit.get("lines_added", 0)
+                    ),
+                    "deletions": commit.get(
+                        "raw_deletions", commit.get("lines_deleted", 0)
+                    ),
+                    # Store filtered values separately
+                    "filtered_insertions": commit.get(
+                        "filtered_insertions", commit.get("lines_added", 0)
+                    ),
+                    "filtered_deletions": commit.get(
+                        "filtered_deletions", commit.get("lines_deleted", 0)
+                    ),
                    "story_points": commit.get("story_points"),
                    "ticket_references": commit.get("ticket_references", []),
                }
@@ -1146,24 +1762,48 @@ class GitDataFetcher:
             session.close()
 
     def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
-        """Calculate commit statistics using reliable git diff --numstat.
+        """Calculate commit statistics using reliable git diff --numstat with exclude_paths filtering.
 
         Returns:
-            Dictionary with
+            Dictionary with both raw and filtered statistics:
+            - 'files', 'insertions', 'deletions': filtered counts
+            - 'raw_insertions', 'raw_deletions': unfiltered counts
+
+        THREAD SAFETY: This method is thread-safe as it works with commit objects
+        that have their own repo references.
         """
         stats = {"files": 0, "insertions": 0, "deletions": 0}
 
+        # Track raw stats for storage
+        raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
+        excluded_stats = {"files": 0, "insertions": 0, "deletions": 0}
+
         # For initial commits or commits without parents
         parent = commit.parents[0] if commit.parents else None
 
         try:
-            # Use
+            # THREAD SAFETY: Use the repo reference from the commit object
+            # Each thread has its own commit object with its own repo reference
             repo = commit.repo
-
-
-
-
-
+            Path(repo.working_dir)
+
+            def get_diff_output():
+                if parent:
+                    return repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+                else:
+                    # Initial commit - use git show with --numstat
+                    return repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Use timeout wrapper for git diff operations
+            try:
+                diff_output = self.git_wrapper.run_with_timeout(
+                    get_diff_output,
+                    timeout=10,  # 10 seconds for diff operations
+                    operation_name=f"diff_{commit.hexsha[:8]}",
+                )
+            except GitOperationTimeout:
+                logger.warning(f"⏱️ Timeout calculating stats for commit {commit.hexsha[:8]}")
+                return stats  # Return zeros
 
             # Parse the numstat output: insertions\tdeletions\tfilename
             for line in diff_output.strip().split("\n"):
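The diff retrieval above goes through `self.git_wrapper.run_with_timeout(...)` so a hung `git diff` cannot stall the fetch; the actual wrapper lives in the new `core/git_timeout_wrapper.py` module, which this hunk does not show. A rough, thread-pool-based sketch of the contract used here (`run_with_timeout` and `GitOperationTimeout` names come from the diff; the body is an illustrative assumption, not the package's implementation):

    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout
    from typing import Any, Callable


    class GitOperationTimeout(Exception):
        """Raised when a git operation exceeds its time budget."""


    def run_with_timeout(func: Callable[[], Any], timeout: float, operation_name: str = "") -> Any:
        """Wait at most `timeout` seconds for func running in a worker thread.

        The worker thread itself cannot be killed; the caller simply stops
        waiting for it, which is enough to keep the fetch loop moving.
        """
        pool = ThreadPoolExecutor(max_workers=1)
        future = pool.submit(func)
        try:
            return future.result(timeout=timeout)
        except FutureTimeout as exc:
            raise GitOperationTimeout(
                f"{operation_name or 'git operation'} exceeded {timeout}s"
            ) from exc
        finally:
            pool.shutdown(wait=False)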
@@ -1175,9 +1815,22 @@ class GitDataFetcher:
                 try:
                     insertions = int(parts[0]) if parts[0] != "-" else 0
                     deletions = int(parts[1]) if parts[1] != "-" else 0
-
-
-                    #
+                    filename = parts[2]
+
+                    # Always count raw stats
+                    raw_stats["files"] += 1
+                    raw_stats["insertions"] += insertions
+                    raw_stats["deletions"] += deletions
+
+                    # Skip excluded files based on exclude_paths patterns
+                    if self._should_exclude_file(filename):
+                        logger.debug(f"Excluding file from line counts: {filename}")
+                        excluded_stats["files"] += 1
+                        excluded_stats["insertions"] += insertions
+                        excluded_stats["deletions"] += deletions
+                        continue
+
+                    # Count only non-excluded files and their changes
                     stats["files"] += 1
                     stats["insertions"] += insertions
                     stats["deletions"] += deletions
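The per-file loop above relies on `self._should_exclude_file(filename)`, whose body is not part of this hunk. A plausible glob-based sketch of such a check, together with parsing one `--numstat` row (both the pattern list and the sample row are made up for illustration):

    from fnmatch import fnmatch

    # Hypothetical patterns of the kind exclude_paths might hold.
    EXCLUDE_PATHS = ["*.min.js", "dist/*", "*package-lock.json"]


    def should_exclude_file(filename: str, patterns: list[str] = EXCLUDE_PATHS) -> bool:
        """Return True when the path matches any exclusion glob."""
        return any(fnmatch(filename, pattern) for pattern in patterns)


    # One line of `git diff --numstat` output: insertions<TAB>deletions<TAB>path
    line = "4821\t13\tdist/bundle.min.js"
    insertions, deletions, filename = line.split("\t", 2)
    if should_exclude_file(filename):
        print(f"excluded: {filename}")  # this sample path is filtered out
    else:
        print(int(insertions), int(deletions), filename)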
@@ -1186,8 +1839,381 @@ class GitDataFetcher:
                     # Skip binary files or malformed lines
                     continue
 
+            # Log exclusion statistics if significant
+            if excluded_stats["files"] > 0 or (
+                raw_stats["insertions"] > 0 and stats["insertions"] < raw_stats["insertions"]
+            ):
+                reduction_pct = (
+                    100 * (1 - stats["insertions"] / raw_stats["insertions"])
+                    if raw_stats["insertions"] > 0
+                    else 0
+                )
+                logger.info(
+                    f"Commit {commit.hexsha[:8]}: Excluded {excluded_stats['files']} files, "
+                    f"{excluded_stats['insertions']} insertions, {excluded_stats['deletions']} deletions "
+                    f"({reduction_pct:.1f}% reduction)"
+                )
+
+            # Log if exclusions are configured
+            if self.exclude_paths and raw_stats["files"] > 0:
+                logger.debug(
+                    f"Commit {commit.hexsha[:8]}: Applied {len(self.exclude_paths)} exclusion patterns. "
+                    f"Raw: {raw_stats['files']} files, +{raw_stats['insertions']} -{raw_stats['deletions']}. "
+                    f"Filtered: {stats['files']} files, +{stats['insertions']} -{stats['deletions']}"
+                )
+
         except Exception as e:
             # Log the error for debugging but don't crash
             logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")
 
+        # Return both raw and filtered stats
+        stats["raw_insertions"] = raw_stats["insertions"]
+        stats["raw_deletions"] = raw_stats["deletions"]
         return stats
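With this change `_calculate_commit_stats` keeps the filtered counts under the original keys and adds the unfiltered totals under `raw_*` keys. An illustrative return value (the numbers are invented) and the kind of delta a consumer can derive from it:

    # Illustrative result; real values come from parsing git diff --numstat.
    stats = {
        "files": 12,             # files counted after exclude_paths filtering
        "insertions": 340,       # filtered line additions
        "deletions": 95,         # filtered line deletions
        "raw_insertions": 5120,  # everything, including lockfiles and vendored code
        "raw_deletions": 130,
    }

    excluded_insertions = stats["raw_insertions"] - stats["insertions"]
    reduction_pct = 100 * excluded_insertions / stats["raw_insertions"]
    print(f"{excluded_insertions} insertions excluded ({reduction_pct:.1f}%)")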
+
+    def _store_day_commits_incremental(
+        self, repo_path: Path, date_str: str, commits: list[dict[str, Any]], project_key: str
+    ) -> None:
+        """Store commits for a single day incrementally to enable progress tracking.
+
+        This method stores commits immediately after fetching them for a day,
+        allowing for better progress tracking and recovery from interruptions.
+
+        Args:
+            repo_path: Path to the repository
+            date_str: Date string in YYYY-MM-DD format
+            commits: List of commit data for the day
+            project_key: Project identifier
+        """
+        try:
+            # Transform commits to cache format
+            cache_format_commits = []
+            for commit in commits:
+                cache_format_commit = {
+                    "hash": commit["commit_hash"],
+                    "author_name": commit.get("author_name", ""),
+                    "author_email": commit.get("author_email", ""),
+                    "message": commit.get("message", ""),
+                    "timestamp": commit["timestamp"],
+                    "branch": commit.get("branch", "main"),
+                    "is_merge": commit.get("is_merge", False),
+                    "files_changed_count": commit.get("files_changed_count", 0),
+                    # Store raw unfiltered values
+                    "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
+                    "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
+                    # Store filtered values
+                    "filtered_insertions": commit.get(
+                        "filtered_insertions", commit.get("lines_added", 0)
+                    ),
+                    "filtered_deletions": commit.get(
+                        "filtered_deletions", commit.get("lines_deleted", 0)
+                    ),
+                    "story_points": commit.get("story_points"),
+                    "ticket_references": commit.get("ticket_references", []),
+                }
+                cache_format_commits.append(cache_format_commit)
+
+            # Use bulk store for efficiency
+            if cache_format_commits:
+                bulk_stats = self.cache.bulk_store_commits(str(repo_path), cache_format_commits)
+                logger.debug(
+                    f"Incrementally stored {bulk_stats['inserted']} commits for {date_str} "
+                    f"({bulk_stats['skipped']} already cached)"
+                )
+        except Exception as e:
+            # Log error but don't fail - commits will be stored again in batch at the end
+            logger.warning(f"Failed to incrementally store commits for {date_str}: {e}")
+
+    def process_repositories_parallel(
+        self,
+        repositories: list[dict],
+        weeks_back: int = 4,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        max_workers: int = 3,
+    ) -> dict[str, Any]:
+        """Process multiple repositories in parallel with proper timeout protection.
+
+        Args:
+            repositories: List of repository configurations
+            weeks_back: Number of weeks to analyze
+            jira_integration: Optional JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            max_workers: Maximum number of parallel workers
+
+        Returns:
+            Dictionary containing processing results and statistics
+        """
+        logger.info(
+            f"🚀 Starting parallel processing of {len(repositories)} repositories with {max_workers} workers"
+        )
+
+        # Initialize statistics
+        self.processing_stats = {
+            "total": len(repositories),
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
+        # Get progress service for updates
+        progress = get_progress_service()
+
+        # Start heartbeat logger for monitoring
+        with HeartbeatLogger(interval=5):
+            results = {}
+
+            # Use ThreadPoolExecutor for parallel processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all repository processing tasks
+                future_to_repo = {}
+
+                for repo_config in repositories:
+                    repo_path = Path(repo_config.get("path", ""))
+                    project_key = repo_config.get("project_key", repo_path.name)
+                    branch_patterns = repo_config.get("branch_patterns")
+
+                    # Submit task with timeout wrapper
+                    future = executor.submit(
+                        self._process_repository_with_timeout,
+                        repo_path,
+                        project_key,
+                        weeks_back,
+                        branch_patterns,
+                        jira_integration,
+                        start_date,
+                        end_date,
+                    )
+                    future_to_repo[future] = {
+                        "path": repo_path,
+                        "project_key": project_key,
+                        "start_time": time.time(),
+                    }
+
+                    logger.info(f"📋 Submitted {project_key} for processing")
+
+                # Process results as they complete
+                for future in as_completed(future_to_repo):
+                    repo_info = future_to_repo[future]
+                    project_key = repo_info["project_key"]
+                    elapsed_time = time.time() - repo_info["start_time"]
+
+                    try:
+                        result = future.result(timeout=5)  # Short timeout to get result
+
+                        if result:
+                            self.processing_stats["success"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "success",
+                                "elapsed_time": elapsed_time,
+                                "commits": result.get("stats", {}).get("total_commits", 0),
+                                "tickets": result.get("stats", {}).get("unique_tickets", 0),
+                            }
+                            results[project_key] = result
+
+                            logger.info(
+                                f"✅ {project_key}: Successfully processed "
+                                f"{result['stats']['total_commits']} commits in {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(project_key, success=True)
+                                else:
+                                    progress.finish_repository(project_key, success=True)
+                        else:
+                            self.processing_stats["failed"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "failed",
+                                "elapsed_time": elapsed_time,
+                                "error": "Processing returned no result",
+                            }
+
+                            logger.error(
+                                f"❌ {project_key}: Processing failed after {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+                                else:
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+
+                    except GitOperationTimeout:
+                        self.processing_stats["timeout"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "timeout",
+                            "elapsed_time": elapsed_time,
+                            "error": "Operation timed out",
+                        }
+
+                        logger.error(f"⏱️ {project_key}: Timed out after {elapsed_time:.1f}s")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+
+                    except Exception as e:
+                        self.processing_stats["failed"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "failed",
+                            "elapsed_time": elapsed_time,
+                            "error": str(e),
+                        }
+
+                        logger.error(f"❌ {project_key}: Error after {elapsed_time:.1f}s - {e}")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+
+                    finally:
+                        # Update processed counter BEFORE logging and progress updates
+                        self.processing_stats["processed"] += 1
+
+                        # Update progress service with actual processing stats
+                        if hasattr(progress, "update_stats"):
+                            progress.update_stats(
+                                processed=self.processing_stats["processed"],
+                                success=self.processing_stats["success"],
+                                failed=self.processing_stats["failed"],
+                                timeout=self.processing_stats["timeout"],
+                                total=self.processing_stats["total"],
+                            )
+
+                        # Log progress
+                        logger.info(
+                            f"📊 Progress: {self.processing_stats['processed']}/{self.processing_stats['total']} repositories "
+                            f"(✅ {self.processing_stats['success']} | ❌ {self.processing_stats['failed']} | ⏱️ {self.processing_stats['timeout']})"
+                        )
+
+        # Final summary
+        logger.info("=" * 60)
+        logger.info("📈 PARALLEL PROCESSING SUMMARY")
+        logger.info(f" Total repositories: {self.processing_stats['total']}")
+        logger.info(f" Successfully processed: {self.processing_stats['success']}")
+        logger.info(f" Failed: {self.processing_stats['failed']}")
+        logger.info(f" Timed out: {self.processing_stats['timeout']}")
+        logger.info("=" * 60)
+
+        return {"results": results, "statistics": self.processing_stats}
+
+    def _process_repository_with_timeout(
+        self,
+        repo_path: Path,
+        project_key: str,
+        weeks_back: int = 4,
+        branch_patterns: Optional[list[str]] = None,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        timeout_per_operation: int = 30,
+    ) -> Optional[dict[str, Any]]:
+        """Process a single repository with comprehensive timeout protection.
+
+        Args:
+            repo_path: Path to the repository
+            project_key: Project identifier
+            weeks_back: Number of weeks to analyze
+            branch_patterns: Branch patterns to include
+            jira_integration: JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            timeout_per_operation: Timeout for individual git operations
+
+        Returns:
+            Repository processing results or None if failed
+        """
+        try:
+            # Track this repository in progress
+            progress = get_progress_service()
+            if hasattr(progress, "start_repository"):
+                progress.start_repository(project_key, 0)
+
+            logger.info(f"🔍 Processing repository: {project_key} at {repo_path}")
+
+            # Use the regular fetch method but with timeout wrapper active
+            with self.git_wrapper.operation_tracker("fetch_repository_data", repo_path):
+                result = self.fetch_repository_data(
+                    repo_path=repo_path,
+                    project_key=project_key,
+                    weeks_back=weeks_back,
+                    branch_patterns=branch_patterns,
+                    jira_integration=jira_integration,
+                    progress_callback=None,  # We handle progress at a higher level
+                    start_date=start_date,
+                    end_date=end_date,
+                )
+
+            return result
+
+        except GitOperationTimeout as e:
+            logger.error(f"⏱️ Repository {project_key} processing timed out: {e}")
+            raise
+
+        except Exception as e:
+            logger.error(f"❌ Error processing repository {project_key}: {e}")
+            import traceback
+
+            logger.debug(f"Full traceback: {traceback.format_exc()}")
+            return None