gitflow-analytics 1.3.11__py3-none-any.whl → 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/batch_classifier.py +156 -4
- gitflow_analytics/cli.py +803 -135
- gitflow_analytics/config/loader.py +39 -1
- gitflow_analytics/config/schema.py +1 -0
- gitflow_analytics/core/cache.py +20 -0
- gitflow_analytics/core/data_fetcher.py +1051 -117
- gitflow_analytics/core/git_auth.py +169 -0
- gitflow_analytics/core/git_timeout_wrapper.py +347 -0
- gitflow_analytics/core/metrics_storage.py +12 -3
- gitflow_analytics/core/progress.py +219 -18
- gitflow_analytics/core/subprocess_git.py +145 -0
- gitflow_analytics/extractors/ml_tickets.py +3 -2
- gitflow_analytics/extractors/tickets.py +93 -8
- gitflow_analytics/integrations/jira_integration.py +1 -1
- gitflow_analytics/integrations/orchestrator.py +47 -29
- gitflow_analytics/metrics/branch_health.py +3 -2
- gitflow_analytics/models/database.py +72 -1
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +12 -5
- gitflow_analytics/pm_framework/orchestrator.py +8 -3
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +24 -4
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +3 -1
- gitflow_analytics/qualitative/core/llm_fallback.py +34 -2
- gitflow_analytics/reports/narrative_writer.py +118 -74
- gitflow_analytics/security/__init__.py +11 -0
- gitflow_analytics/security/config.py +189 -0
- gitflow_analytics/security/extractors/__init__.py +7 -0
- gitflow_analytics/security/extractors/dependency_checker.py +379 -0
- gitflow_analytics/security/extractors/secret_detector.py +197 -0
- gitflow_analytics/security/extractors/vulnerability_scanner.py +333 -0
- gitflow_analytics/security/llm_analyzer.py +347 -0
- gitflow_analytics/security/reports/__init__.py +5 -0
- gitflow_analytics/security/reports/security_report.py +358 -0
- gitflow_analytics/security/security_analyzer.py +414 -0
- gitflow_analytics/tui/app.py +3 -1
- gitflow_analytics/tui/progress_adapter.py +313 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +407 -46
- gitflow_analytics/tui/screens/results_screen.py +219 -206
- gitflow_analytics/ui/__init__.py +21 -0
- gitflow_analytics/ui/progress_display.py +1477 -0
- gitflow_analytics/verify_activity.py +697 -0
- {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/METADATA +2 -1
- {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/RECORD +47 -31
- gitflow_analytics/cli_rich.py +0 -503
- {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/data_fetcher.py

@@ -8,10 +8,14 @@ without performing any LLM-based classification.
 import logging
 import os
 import subprocess
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any, Optional

+from sqlalchemy import func
 from sqlalchemy.orm import Session

 from ..extractors.story_points import StoryPointExtractor
@@ -25,11 +29,17 @@ from ..models.database import (
 )
 from .branch_mapper import BranchToProjectMapper
 from .cache import GitAnalysisCache
+from .git_timeout_wrapper import GitOperationTimeout, GitTimeoutWrapper, HeartbeatLogger
 from .identity import DeveloperIdentityResolver
 from .progress import get_progress_service

 logger = logging.getLogger(__name__)

+# THREAD SAFETY: Module-level thread-local storage for repository instances
+# Each thread gets its own isolated storage to prevent thread-safety issues
+# when GitDataFetcher is called from ThreadPoolExecutor
+_thread_local = threading.local()
+

 class GitDataFetcher:
     """Fetches raw Git commit data and organizes it by day for efficient batch processing.
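The module-level `_thread_local` added above is what lets each worker thread keep private state (later in this diff it holds a per-thread temporary config directory) without any locking. A minimal, self-contained sketch of the pattern — `_get_counter` and `worker` are illustrative names, not part of gitflow-analytics:

```python
import threading
from concurrent.futures import ThreadPoolExecutor

_thread_local = threading.local()

def _get_counter() -> dict:
    # Each thread sees its own `counter` attribute, so no locks are needed.
    if not hasattr(_thread_local, "counter"):
        _thread_local.counter = {}
    return _thread_local.counter

def worker(repo_name: str) -> int:
    counts = _get_counter()
    counts[repo_name] = counts.get(repo_name, 0) + 1
    return counts[repo_name]

with ThreadPoolExecutor(max_workers=3) as pool:
    print(list(pool.map(worker, ["repo-a", "repo-b", "repo-c"])))  # [1, 1, 1]
```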
@@ -48,6 +58,7 @@ class GitDataFetcher:
         branch_mapping_rules: Optional[dict[str, list[str]]] = None,
         allowed_ticket_platforms: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
+        skip_remote_fetch: bool = False,
     ):
         """Initialize the data fetcher.

@@ -56,8 +67,11 @@ class GitDataFetcher:
             branch_mapping_rules: Rules for mapping branches to projects
             allowed_ticket_platforms: List of allowed ticket platforms
             exclude_paths: List of file paths to exclude from analysis
+            skip_remote_fetch: If True, skip git fetch/pull operations
         """
         self.cache = cache
+        self.skip_remote_fetch = skip_remote_fetch
+        self.repository_status = {}  # Track status of each repository
         # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
         self.database = cache.db
         self.story_point_extractor = StoryPointExtractor()
@@ -65,10 +79,35 @@ class GitDataFetcher:
         self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
         self.exclude_paths = exclude_paths or []

+        # Log exclusion configuration
+        if self.exclude_paths:
+            logger.info(
+                f"GitDataFetcher initialized with {len(self.exclude_paths)} exclusion patterns:"
+            )
+            for pattern in self.exclude_paths[:5]:  # Show first 5 patterns
+                logger.debug(f"  - {pattern}")
+            if len(self.exclude_paths) > 5:
+                logger.debug(f"  ... and {len(self.exclude_paths) - 5} more patterns")
+        else:
+            logger.info("GitDataFetcher initialized with no file exclusions")
+
         # Initialize identity resolver
         identity_db_path = cache.cache_dir / "identities.db"
         self.identity_resolver = DeveloperIdentityResolver(identity_db_path)

+        # Initialize git timeout wrapper for safe operations
+        self.git_wrapper = GitTimeoutWrapper(default_timeout=30)
+
+        # Statistics for tracking repository processing
+        self.processing_stats = {
+            "total": 0,
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
     def fetch_repository_data(
         self,
         repo_path: Path,
@@ -118,19 +157,37 @@ class GitDataFetcher:
         # Get progress service for top-level progress tracking
         progress = get_progress_service()

+        # Start Rich display for this repository if enabled
+        if hasattr(progress, "_use_rich") and progress._use_rich:
+            # Count total commits for progress estimation
+            try:
+                import git
+
+                git.Repo(repo_path)
+                # Check if we need to clone or pull
+                if not repo_path.exists() or not (repo_path / ".git").exists():
+                    logger.info(f"📥 Repository {project_key} needs cloning")
+                    progress.start_repository(f"{project_key} (cloning)", 0)
+                else:
+                    # Rough estimate based on weeks
+                    estimated_commits = weeks_back * 50  # Estimate ~50 commits per week
+                    progress.start_repository(project_key, estimated_commits)
+            except Exception:
+                progress.start_repository(project_key, 100)  # Default estimate
+
         # Step 1: Collect all commits organized by day with enhanced progress tracking
         logger.info("🔍 DEBUG: About to fetch commits by day")
-        logger.info("Fetching commits organized by day
+        logger.info(f"Fetching commits organized by day for repository: {project_key}")

         # Create top-level progress for this repository
         with progress.progress(
             total=3,  # Three main steps: fetch commits, extract tickets, store data
-            description=f"Processing {project_key}",
+            description=f"📊 Processing repository: {project_key}",
             unit="steps",
         ) as repo_progress_ctx:

             # Step 1: Fetch commits
-            progress.set_description(repo_progress_ctx, f"{project_key}: Fetching commits")
+            progress.set_description(repo_progress_ctx, f"🔍 {project_key}: Fetching commits")
             daily_commits = self._fetch_commits_by_day(
                 repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
             )
@@ -138,25 +195,27 @@ class GitDataFetcher:
             progress.update(repo_progress_ctx)

             # Step 2: Extract and fetch all referenced tickets
-            progress.set_description(repo_progress_ctx, f"{project_key}: Processing tickets")
+            progress.set_description(repo_progress_ctx, f"🎫 {project_key}: Processing tickets")
             logger.info("🔍 DEBUG: About to extract ticket references")
-            logger.info("Extracting ticket references...")
+            logger.info(f"Extracting ticket references for {project_key}...")
             ticket_ids = self._extract_all_ticket_references(daily_commits)
             logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")

             if jira_integration and ticket_ids:
-                logger.info(
+                logger.info(
+                    f"Fetching {len(ticket_ids)} unique tickets from JIRA for {project_key}..."
+                )
                 self._fetch_detailed_tickets(
                     ticket_ids, jira_integration, project_key, progress_callback
                 )

             # Build commit-ticket correlations
-            logger.info("Building commit-ticket correlations...")
+            logger.info(f"Building commit-ticket correlations for {project_key}...")
             correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
             progress.update(repo_progress_ctx)

             # Step 3: Store daily commit batches
-            progress.set_description(repo_progress_ctx, f"{project_key}: Storing data")
+            progress.set_description(repo_progress_ctx, f"💾 {project_key}: Storing data")
             logger.info(
                 f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
             )
@@ -183,6 +242,46 @@ class GitDataFetcher:
         # Return summary statistics with ACTUAL stored counts
         # expected_commits already calculated above

+        # Calculate exclusion impact summary if exclusions are configured
+        exclusion_stats = {
+            "patterns_applied": len(self.exclude_paths),
+            "enabled": bool(self.exclude_paths),
+        }
+
+        if self.exclude_paths and expected_commits > 0:
+            # Get aggregate stats from daily_commit_batches
+            session = self.database.get_session()
+            try:
+                batch_stats = (
+                    session.query(
+                        func.sum(DailyCommitBatch.total_lines_added).label("total_added"),
+                        func.sum(DailyCommitBatch.total_lines_deleted).label("total_deleted"),
+                    )
+                    .filter(
+                        DailyCommitBatch.project_key == project_key,
+                        DailyCommitBatch.repo_path == str(repo_path),
+                    )
+                    .first()
+                )
+
+                if batch_stats and batch_stats.total_added:
+                    total_lines = (batch_stats.total_added or 0) + (batch_stats.total_deleted or 0)
+                    exclusion_stats["total_lines_after_filtering"] = total_lines
+                    exclusion_stats["lines_added"] = batch_stats.total_added or 0
+                    exclusion_stats["lines_deleted"] = batch_stats.total_deleted or 0

+                    logger.info(
+                        f"📊 Exclusion Impact Summary for {project_key}:\n"
+                        f"   - Exclusion patterns applied: {len(self.exclude_paths)}\n"
+                        f"   - Total lines after filtering: {total_lines:,}\n"
+                        f"   - Lines added: {batch_stats.total_added:,}\n"
+                        f"   - Lines deleted: {batch_stats.total_deleted:,}"
+                    )
+            except Exception as e:
+                logger.debug(f"Could not calculate exclusion impact summary: {e}")
+            finally:
+                session.close()
+
         results = {
             "project_key": project_key,
             "repo_path": str(repo_path),
@@ -196,6 +295,7 @@ class GitDataFetcher:
                 "correlations_created": correlations_created,
                 "batches_created": batches_created,
             },
+            "exclusions": exclusion_stats,
             "daily_commits": daily_commits,  # For immediate use if needed
         }

@@ -204,10 +304,20 @@ class GitDataFetcher:
             logger.info(
                 f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
             )
+            # Finish repository in Rich display with success
+            if hasattr(progress, "_use_rich") and progress._use_rich:
+                progress.finish_repository(project_key, success=True)
         else:
             logger.error(
                 f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
             )
+            # Finish repository in Rich display with error
+            if hasattr(progress, "_use_rich") and progress._use_rich:
+                progress.finish_repository(
+                    project_key,
+                    success=False,
+                    error_message=f"Storage issue: {actual_stored_commits}/{expected_commits} commits",
+                )

         return results

@@ -225,24 +335,148 @@ class GitDataFetcher:
         Returns:
             Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
         """
-
+
+        # THREAD SAFETY: Use the module-level thread-local storage to ensure each thread
+        # gets its own Repo instance. This prevents thread-safety issues when called from ThreadPoolExecutor
+
+        # Set environment variables to prevent ANY password prompts before opening repo
+        original_env = {}
+        env_vars = {
+            "GIT_TERMINAL_PROMPT": "0",
+            "GIT_ASKPASS": "/bin/echo",  # Use full path to echo
+            "SSH_ASKPASS": "/bin/echo",
+            "GCM_INTERACTIVE": "never",
+            "GIT_SSH_COMMAND": "ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o PasswordAuthentication=no",
+            "DISPLAY": "",
+            "GIT_CREDENTIAL_HELPER": "",  # Disable credential helper
+            "GCM_PROVIDER": "none",  # Disable Git Credential Manager
+            "GIT_CREDENTIALS": "",  # Clear any cached credentials
+            "GIT_CONFIG_NOSYSTEM": "1",  # Don't use system config
+        }
+
+        # Save original environment and set our values
+        for key, value in env_vars.items():
+            original_env[key] = os.environ.get(key)
+            os.environ[key] = value

         try:
-
+            # THREAD SAFETY: Create a fresh Repo instance for this thread
+            # Do NOT reuse Repo instances across threads
+            from git import Repo
+
+            # Check for security issues in repository configuration
+            self._check_repository_security(repo_path, project_key)
+
+            # When skip_remote_fetch is enabled, use a more restricted repository access
+            if self.skip_remote_fetch:
+                # Use a special git configuration that completely disables credential helpers
+                import tempfile
+
+                # Create a secure temporary directory for our config
+                temp_dir = tempfile.mkdtemp(prefix="gitflow_")
+
+                # Create a temporary git config that disables all authentication
+                tmp_config_path = os.path.join(temp_dir, ".gitconfig")
+                with open(tmp_config_path, "w") as tmp_config:
+                    tmp_config.write("[credential]\n")
+                    tmp_config.write("    helper = \n")
+                    tmp_config.write("[core]\n")
+                    tmp_config.write("    askpass = \n")
+
+                # Set GIT_CONFIG to use our temporary config
+                os.environ["GIT_CONFIG_GLOBAL"] = tmp_config_path
+                os.environ["GIT_CONFIG_SYSTEM"] = "/dev/null"
+
+                # Store temp_dir in thread-local storage for cleanup
+                _thread_local.temp_dir = temp_dir
+
+                try:
+                    # Open repository with our restricted configuration
+                    # THREAD SAFETY: Each thread gets its own Repo instance
+                    repo = Repo(repo_path)
+                finally:
+                    # Clean up temporary config directory
+                    try:
+                        import shutil
+
+                        if hasattr(_thread_local, "temp_dir"):
+                            shutil.rmtree(_thread_local.temp_dir, ignore_errors=True)
+                            delattr(_thread_local, "temp_dir")
+                    except:
+                        pass
+
+                try:
+                    # Configure git to never prompt for credentials
+                    with repo.config_writer() as git_config:
+                        git_config.set_value("core", "askpass", "")
+                        git_config.set_value("credential", "helper", "")
+                except Exception as e:
+                    logger.debug(f"Could not update git config: {e}")
+
+                # Note: We can't monkey-patch remotes as it's a property without setter
+                # The skip_remote_fetch flag will prevent remote operations elsewhere
+
+                logger.debug(
+                    f"Opened repository {project_key} in offline mode (skip_remote_fetch=true)"
+                )
+            else:
+                # THREAD SAFETY: Each thread gets its own Repo instance
+                repo = Repo(repo_path)
+            # Track repository status
+            self.repository_status[project_key] = {
+                "path": str(repo_path),
+                "remote_update": "skipped" if self.skip_remote_fetch else "pending",
+                "authentication_issues": False,
+                "error": None,
+            }
+
             # Update repository from remote before analysis
-            self.
+            if not self.skip_remote_fetch:
+                logger.info(f"📥 Updating repository {project_key} from remote...")
+
+            update_success = self._update_repository(repo)
+            if not self.skip_remote_fetch:
+                self.repository_status[project_key]["remote_update"] = (
+                    "success" if update_success else "failed"
+                )
+                if not update_success:
+                    logger.warning(
+                        f"⚠️ {project_key}: Continuing with local repository state (remote update failed)"
+                    )
+
         except Exception as e:
-            logger.error(f"Failed to open repository at {repo_path}: {e}")
+            logger.error(f"Failed to open repository {project_key} at {repo_path}: {e}")
+            self.repository_status[project_key] = {
+                "path": str(repo_path),
+                "remote_update": "error",
+                "authentication_issues": "authentication" in str(e).lower()
+                or "password" in str(e).lower(),
+                "error": str(e),
+            }
+            # Restore original environment variables before returning
+            for key, value in original_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
             return {}

         # Get branches to analyze
         branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)

         if not branches_to_analyze:
-            logger.warning(
+            logger.warning(
+                f"No accessible branches found in repository {project_key} at {repo_path}"
+            )
+            # Restore original environment variables before returning
+            for key, value in original_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
             return {}

-        logger.info(f"Analyzing branches: {branches_to_analyze}")
+        logger.info(f"🌿 {project_key}: Analyzing branches: {branches_to_analyze}")

         # Calculate days to process
         current_date = start_date.date()
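The hunk above saves the current values of several credential-related environment variables, overrides them, and restores them on every exit path. The data fetcher does this inline; expressing the same save/set/restore dance as a context manager looks roughly like the sketch below (`scoped_env` is an illustrative name, not part of the package):

```python
import os
from contextlib import contextmanager

@contextmanager
def scoped_env(**overrides):
    # Save current values, apply the overrides, restore on exit.
    saved = {key: os.environ.get(key) for key in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

with scoped_env(GIT_TERMINAL_PROMPT="0", GIT_ASKPASS="/bin/echo"):
    pass  # run non-interactive git operations here
```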
@@ -263,18 +497,34 @@ class GitDataFetcher:
         daily_commits = {}
         all_commit_hashes = set()  # Track all hashes for deduplication

+        # Count total commits first for Rich display
+        try:
+            for branch_name in branches_to_analyze[:1]:  # Sample from first branch
+                sample_commits = list(
+                    repo.iter_commits(branch_name, since=start_date, until=end_date)
+                )
+                # Estimate based on first branch (multiply by number of branches for rough estimate)
+                len(sample_commits) * len(branches_to_analyze)
+                break
+        except:
+            len(days_to_process) * 50  # Default estimate
+
+        # Update repository in Rich display with estimated commit count
+        if hasattr(progress, "_use_rich") and progress._use_rich:
+            progress.update_repository(project_key, 0, 0.0)
+
         # Create nested progress for day-by-day processing
         with progress.progress(
             total=len(days_to_process),
-            description=f"Fetching commits for {project_key}",
+            description=f"📅 Fetching commits for repository: {project_key}",
             unit="days",
             nested=True,
         ) as day_progress_ctx:

             for day_date in days_to_process:
-                # Update description to show current day
+                # Update description to show current repository and day clearly
                 day_str = day_date.strftime("%Y-%m-%d")
-                progress.set_description(day_progress_ctx, f"{project_key}:
+                progress.set_description(day_progress_ctx, f"🔍 {project_key}: Analyzing {day_str}")

                 # Calculate day boundaries
                 day_start = datetime.combine(day_date, datetime.min.time(), tzinfo=timezone.utc)
@@ -286,12 +536,26 @@ class GitDataFetcher:
                 # Process each branch for this specific day
                 for branch_name in branches_to_analyze:
                     try:
-                        # Fetch commits for this specific day and branch
-
-
-
+                        # Fetch commits for this specific day and branch with timeout protection
+                        def fetch_branch_commits():
+                            return list(
+                                repo.iter_commits(
+                                    branch_name, since=day_start, until=day_end, reverse=False
+                                )
                             )
-
+
+                        # Use timeout wrapper to prevent hanging on iter_commits
+                        try:
+                            branch_commits = self.git_wrapper.run_with_timeout(
+                                fetch_branch_commits,
+                                timeout=15,  # 15 seconds per branch/day combination
+                                operation_name=f"iter_commits_{branch_name}_{day_str}",
+                            )
+                        except GitOperationTimeout:
+                            logger.warning(
+                                f"⏱️ Timeout fetching commits for branch {branch_name} on {day_str}, skipping"
+                            )
+                            continue

                         for commit in branch_commits:
                             # Skip if we've already processed this commit
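Per-branch, per-day `iter_commits` calls are now bounded by `GitTimeoutWrapper.run_with_timeout` from the new `core/git_timeout_wrapper.py` module, whose implementation is not shown in this diff. The sketch below is only a generic, thread-based stand-in for that calling pattern; `OperationTimeout` and `run_with_timeout` here are illustrative names, not the package's API:

```python
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout


class OperationTimeout(Exception):
    """Raised when a wrapped call exceeds its time budget."""


def run_with_timeout(fn, timeout: float, operation_name: str = "operation"):
    # Run fn() on a worker thread and stop waiting after `timeout` seconds.
    # Note: the worker is not killed; it is simply abandoned.
    pool = ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(fn)
        return future.result(timeout=timeout)
    except FutureTimeout as exc:
        raise OperationTimeout(f"{operation_name} exceeded {timeout}s") from exc
    finally:
        pool.shutdown(wait=False)  # do not block on the abandoned worker


if __name__ == "__main__":
    import time

    try:
        run_with_timeout(lambda: time.sleep(3), timeout=1, operation_name="demo")
    except OperationTimeout as err:
        print(err)  # the abandoned sleep still finishes in the background
```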
@@ -307,6 +571,11 @@ class GitDataFetcher:
                             all_commit_hashes.add(commit.hexsha)
                             commits_found_today += 1

+                    except GitOperationTimeout as e:
+                        logger.warning(
+                            f"⏱️ Timeout processing branch {branch_name} for day {day_str}: {e}"
+                        )
+                        continue
                     except Exception as e:
                         logger.warning(
                             f"Error processing branch {branch_name} for day {day_str}: {e}"
@@ -333,8 +602,28 @@ class GitDataFetcher:
                 # Update progress bar
                 progress.update(day_progress_ctx)

+                # Update Rich display with current commit count and speed
+                if hasattr(progress, "_use_rich") and progress._use_rich:
+                    total_processed = len(all_commit_hashes)
+                    # Calculate speed (commits per second) based on elapsed time
+                    import time
+
+                    if not hasattr(self, "_fetch_start_time"):
+                        self._fetch_start_time = time.time()
+                    elapsed = time.time() - self._fetch_start_time
+                    speed = total_processed / elapsed if elapsed > 0 else 0
+                    progress.update_repository(project_key, total_processed, speed)
+
         total_commits = sum(len(commits) for commits in daily_commits.values())
         logger.info(f"Collected {total_commits} unique commits across {len(daily_commits)} days")
+
+        # Restore original environment variables
+        for key, value in original_env.items():
+            if value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = value
+
         return daily_commits

     def _extract_commit_data(
@@ -376,13 +665,19 @@ class GitDataFetcher:
             line_stats = self._calculate_commit_stats(commit)
             total_insertions = line_stats["insertions"]
             total_deletions = line_stats["deletions"]
+            raw_insertions = line_stats.get("raw_insertions", total_insertions)
+            raw_deletions = line_stats.get("raw_deletions", total_deletions)

             commit_data.update(
                 {
                     "files_changed": files_changed,
                     "files_changed_count": len(files_changed),
-                    "lines_added": total_insertions,
+                    "lines_added": total_insertions,  # Filtered counts for backward compatibility
                     "lines_deleted": total_deletions,
+                    "filtered_insertions": total_insertions,  # Explicitly filtered counts
+                    "filtered_deletions": total_deletions,
+                    "raw_insertions": raw_insertions,  # Raw unfiltered counts
+                    "raw_deletions": raw_deletions,
                 }
             )

@@ -394,6 +689,10 @@ class GitDataFetcher:
                     "files_changed_count": 0,
                     "lines_added": 0,
                     "lines_deleted": 0,
+                    "filtered_insertions": 0,
+                    "filtered_deletions": 0,
+                    "raw_insertions": 0,
+                    "raw_deletions": 0,
                 }
             )

@@ -422,9 +721,166 @@ class GitDataFetcher:

     def _should_exclude_file(self, file_path: str) -> bool:
         """Check if a file should be excluded based on exclude patterns."""
+        return any(self._matches_glob_pattern(file_path, pattern) for pattern in self.exclude_paths)
+
+    def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
+        """Check if a file path matches a glob pattern, handling ** recursion correctly.
+
+        This method properly handles different glob pattern types:
+        - **/vendor/** : matches files inside vendor directories at any level
+        - **/*.min.js : matches files with specific suffix anywhere in directory tree
+        - vendor/** : matches files inside vendor directory at root level only
+        - **pattern** : handles other complex patterns with pathlib.match()
+        - simple patterns : uses fnmatch for basic wildcards
+
+        Args:
+            filepath: The file path to check
+            pattern: The glob pattern to match against
+
+        Returns:
+            True if the file path matches the pattern, False otherwise
+        """
+        import fnmatch
+        import re
+        from pathlib import PurePath
+
+        # Handle empty or invalid inputs
+        if not filepath or not pattern:
+            return False
+
+        path = PurePath(filepath)
+
+        # Check for multiple ** patterns first (most complex)
+        if "**" in pattern and pattern.count("**") > 1:
+            # Multiple ** patterns - use custom recursive matching for complex patterns
+            return self._match_recursive_pattern(filepath, pattern)
+
+        # Then handle simple ** patterns
+        elif pattern.startswith("**/") and pattern.endswith("/**"):
+            # Pattern like **/vendor/** - matches files inside vendor directories at any level
+            dir_name = pattern[3:-3]  # Extract 'vendor' from '**/vendor/**'
+            if not dir_name:  # Handle edge case of '**/**'
+                return True
+            return dir_name in path.parts
+
+        elif pattern.startswith("**/"):
+            # Pattern like **/*.min.js - matches files with specific suffix anywhere
+            suffix_pattern = pattern[3:]
+            if not suffix_pattern:  # Handle edge case of '**/'
+                return True
+            # Check against filename for file patterns, or any path part for directory patterns
+            if suffix_pattern.endswith("/"):
+                # Directory pattern like **/build/
+                dir_name = suffix_pattern[:-1]
+                return dir_name in path.parts
+            else:
+                # File pattern like *.min.js
+                return fnmatch.fnmatch(path.name, suffix_pattern)
+
+        elif pattern.endswith("/**"):
+            # Pattern like vendor/** or docs/build/** - matches files inside directory at root level
+            dir_name = pattern[:-3]
+            if not dir_name:  # Handle edge case of '/**'
+                return True
+
+            # Handle both single directory names and nested paths
+            expected_parts = PurePath(dir_name).parts
+            return (
+                len(path.parts) >= len(expected_parts)
+                and path.parts[: len(expected_parts)] == expected_parts
+            )
+
+        elif "**" in pattern:
+            # Single ** pattern - use pathlib matching with fallback
+            try:
+                return path.match(pattern)
+            except (ValueError, TypeError):
+                # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
+                try:
+                    return fnmatch.fnmatch(filepath, pattern)
+                except re.error:
+                    # Invalid regex pattern - return False to be safe
+                    return False
+        else:
+            # Simple pattern - use fnmatch for basic wildcards
+            try:
+                # Try matching the full path first
+                if fnmatch.fnmatch(filepath, pattern):
+                    return True
+                # Also try matching just the filename for simple patterns
+                # This allows "package-lock.json" to match "src/package-lock.json"
+                return fnmatch.fnmatch(path.name, pattern)
+            except re.error:
+                # Invalid regex pattern - return False to be safe
+                return False
+
+    def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
+        """Handle complex patterns with multiple ** wildcards.
+
+        Args:
+            filepath: The file path to check
+            pattern: The pattern with multiple ** wildcards
+
+        Returns:
+            True if the path matches the pattern, False otherwise
+        """
         import fnmatch
+        from pathlib import PurePath
+
+        # Split pattern by ** to handle each segment
+        parts = pattern.split("**")
+
+        # Validate that we have actual segments
+        if not parts:
+            return False

-
+        # Convert filepath to parts for easier matching
+        path_parts = list(PurePath(filepath).parts)
+
+        # Start matching from the beginning
+        path_index = 0
+
+        for i, part in enumerate(parts):
+            if not part:
+                # Empty part (e.g., from leading or trailing **)
+                if i == 0 or i == len(parts) - 1:
+                    # Leading or trailing ** - continue
+                    continue
+                # Middle empty part (consecutive **) - match any number of path components
+                continue
+
+            # Clean the part (remove leading/trailing slashes)
+            part = part.strip("/")
+
+            if not part:
+                continue
+
+            # Find where this part matches in the remaining path
+            found = False
+            for j in range(path_index, len(path_parts)):
+                # Check if the current path part matches the pattern part
+                if "/" in part:
+                    # Part contains multiple path components
+                    sub_parts = part.split("/")
+                    if j + len(sub_parts) <= len(path_parts) and all(
+                        fnmatch.fnmatch(path_parts[j + k], sub_parts[k])
+                        for k in range(len(sub_parts))
+                    ):
+                        path_index = j + len(sub_parts)
+                        found = True
+                        break
+                else:
+                    # Single component part
+                    if fnmatch.fnmatch(path_parts[j], part):
+                        path_index = j + 1
+                        found = True
+                        break
+
+            if not found and part:
+                # Required part not found in path
+                return False
+
+        return True

     def _get_branches_to_analyze(
         self, repo: Any, branch_patterns: Optional[list[str]]
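A quick worked example of the pattern semantics described in the new `_matches_glob_pattern` docstring above (the sample paths are made up for illustration):

```python
import fnmatch
from pathlib import PurePath

# "**/vendor/**": a file is excluded if "vendor" appears anywhere in its parts.
print("vendor" in PurePath("src/vendor/lib.js").parts)         # True
print("vendor" in PurePath("src/app/main.js").parts)           # False

# "**/*.min.js": matched against the file name only.
print(fnmatch.fnmatch(PurePath("assets/app.min.js").name, "*.min.js"))  # True

# "vendor/**": anchored at the repository root.
print(PurePath("vendor/lib.js").parts[:1] == ("vendor",))       # True
print(PurePath("src/vendor/lib.js").parts[:1] == ("vendor",))   # False

# Bare file names fall through to fnmatch against the basename, so
# "package-lock.json" also excludes "src/package-lock.json".
print(fnmatch.fnmatch(PurePath("src/package-lock.json").name, "package-lock.json"))  # True
```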
@@ -441,33 +897,45 @@ class GitDataFetcher:
        - Handle missing remotes gracefully
        - Skip remote tracking branches to avoid duplicates
        - Use actual branch existence checking rather than assuming branches exist
+
+        THREAD SAFETY: This method is thread-safe as it doesn't modify shared state
+        and works with a repo instance passed as a parameter.
         """
         # Collect all available branches (local branches preferred)
         available_branches = []

         # First, try local branches
         try:
-
+            # THREAD SAFETY: Create a new list to avoid sharing references
+            local_branches = list([branch.name for branch in repo.branches])
             available_branches.extend(local_branches)
             logger.debug(f"Found local branches: {local_branches}")
         except Exception as e:
             logger.debug(f"Error getting local branches: {e}")

         # If we have remotes, also consider remote branches (but clean the names)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Skip remote branch checking if skip_remote_fetch is enabled to avoid auth prompts
+        if not self.skip_remote_fetch:
+            try:
+                if repo.remotes and hasattr(repo.remotes, "origin"):
+                    # THREAD SAFETY: Create a new list to avoid sharing references
+                    remote_branches = list(
+                        [
+                            ref.name.replace("origin/", "")
+                            for ref in repo.remotes.origin.refs
+                            if not ref.name.endswith("HEAD")  # Skip HEAD ref
+                        ]
+                    )
+                    # Only add remote branches that aren't already in local branches
+                    for branch in remote_branches:
+                        if branch not in available_branches:
+                            available_branches.append(branch)
+                    logger.debug(f"Found remote branches: {remote_branches}")
+            except Exception as e:
+                logger.debug(f"Error getting remote branches (may require authentication): {e}")
+                # Continue with local branches only
+        else:
+            logger.debug("Skipping remote branch enumeration (skip_remote_fetch=true)")

         # If no branches found, fallback to trying common names directly
         if not available_branches:
@@ -497,6 +965,7 @@ class GitDataFetcher:
         accessible_branches = []
         for branch in branches_to_test:
             try:
+                # THREAD SAFETY: Use iterator without storing intermediate results
                 next(iter(repo.iter_commits(branch, max_count=1)), None)
                 accessible_branches.append(branch)
             except Exception as e:
@@ -550,82 +1019,62 @@ class GitDataFetcher:
         Returns:
             bool: True if update succeeded, False if failed (but analysis continues)
         """
-
-
-
+        # Skip remote operations if configured
+        if self.skip_remote_fetch:
+            logger.info("🚫 Skipping remote fetch (skip_remote_fetch=true)")
+            return True

-
-
-        env["GIT_TERMINAL_PROMPT"] = "0"
-        env["GIT_ASKPASS"] = ""
-        env["GCM_INTERACTIVE"] = "never"
+        # Check for stale repository (last fetch > 1 hour ago)
+        self._check_repository_staleness(repo)

-
-
-
-
-
-
-
-
-                timeout=30,  # 30 second timeout
-            )
+        try:
+            # Check if we have remotes without triggering authentication
+            has_remotes = False
+            try:
+                has_remotes = bool(repo.remotes)
+            except Exception as e:
+                logger.debug(f"Could not check for remotes (may require authentication): {e}")
+                return True  # Continue with local analysis

-
-
-            if any(
-                x in error_str
-                for x in ["authentication", "permission denied", "401", "403"]
-            ):
-                logger.error(
-                    "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
-                    "You can verify with: gh auth status"
-                )
-                # Skip fetch but continue with local analysis
-                return False
-            logger.warning(f"Git fetch failed: {result.stderr}")
-            return False
+            if has_remotes:
+                logger.info("Fetching latest changes from remote")

-
-
-
+                # Use our timeout wrapper for safe git operations
+                repo_path = Path(repo.working_dir)
+
+                # Try to fetch with timeout protection
+                fetch_success = self.git_wrapper.fetch_with_timeout(repo_path, timeout=30)
+
+                if not fetch_success:
+                    # Mark this repository as having authentication issues if applicable
+                    if hasattr(self, "repository_status"):
+                        for key in self.repository_status:
+                            if repo.working_dir.endswith(key) or key in repo.working_dir:
+                                self.repository_status[key]["remote_update"] = "failed"
+                                break
+
+                    # Explicit warning to user about stale data
+                    logger.warning(
+                        f"❌ Failed to fetch updates for {repo_path.name}. "
+                        f"Analysis will use potentially stale local data. "
+                        f"Check authentication or network connectivity."
                    )
                    return False
+                else:
+                    # Explicit success confirmation
+                    logger.info(f"✅ Successfully fetched updates for {repo_path.name}")

                 # Only try to pull if not in detached HEAD state
                 if not repo.head.is_detached:
                     current_branch = repo.active_branch
                     tracking = current_branch.tracking_branch()
                     if tracking:
-                        # Pull latest changes using
-
-
-                            ["git", "pull"],
-                            cwd=repo.working_dir,
-                            env=env,
-                            capture_output=True,
-                            text=True,
-                            timeout=30,
-                        )
-
-                        if result.returncode != 0:
-                            error_str = result.stderr.lower()
-                            if any(
-                                x in error_str
-                                for x in ["authentication", "permission denied", "401", "403"]
-                            ):
-                                logger.error(
-                                    "GitHub authentication failed during pull. Continuing with local repository state."
-                                )
-                                return False
-                            logger.warning(f"Git pull failed: {result.stderr}")
-                            return False
+                        # Pull latest changes using timeout wrapper
+                        pull_success = self.git_wrapper.pull_with_timeout(repo_path, timeout=30)
+                        if pull_success:
                             logger.debug(f"Pulled latest changes for {current_branch.name}")
-
-
-                            logger.error(
-                                "Git pull timed out. Continuing with local repository state."
-                            )
+                        else:
+                            logger.warning("Git pull failed, continuing with fetched state")
                             return False
                     else:
                         logger.debug(
@@ -642,6 +1091,114 @@ class GitDataFetcher:
             # Continue with analysis using local state
             return False

+    def _check_repository_staleness(self, repo) -> None:
+        """Check if repository hasn't been fetched recently and warn user.
+
+        Args:
+            repo: GitPython Repo object
+        """
+        try:
+            repo_path = Path(repo.working_dir)
+            fetch_head_path = repo_path / ".git" / "FETCH_HEAD"
+
+            if fetch_head_path.exists():
+                # Get last fetch time from FETCH_HEAD modification time
+                last_fetch_time = datetime.fromtimestamp(
+                    fetch_head_path.stat().st_mtime, tz=timezone.utc
+                )
+                now = datetime.now(timezone.utc)
+                hours_since_fetch = (now - last_fetch_time).total_seconds() / 3600
+
+                if hours_since_fetch > 1:
+                    logger.warning(
+                        f"⏰ Repository {repo_path.name} last fetched {hours_since_fetch:.1f} hours ago. "
+                        f"Data may be stale."
+                    )
+            else:
+                logger.warning(
+                    f"⚠️ Repository {repo_path.name} has never been fetched. "
+                    f"Will attempt to fetch now."
+                )
+        except Exception as e:
+            logger.debug(f"Could not check repository staleness: {e}")
+
+    def _check_repository_security(self, repo_path: Path, project_key: str) -> None:
+        """Check for security issues in repository configuration.
+
+        Warns about:
+        - Exposed tokens in remote URLs
+        - Insecure credential storage
+        """
+
+        try:
+            # Check for tokens in remote URLs
+            result = subprocess.run(
+                ["git", "remote", "-v"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                timeout=2,
+                env={"GIT_TERMINAL_PROMPT": "0"},
+            )
+
+            if result.returncode == 0:
+                output = result.stdout
+                # Check for various token patterns in URLs
+                token_patterns = [
+                    r"https://[^@]*@",  # Any HTTPS URL with embedded credentials
+                    r"ghp_[a-zA-Z0-9]+",  # GitHub Personal Access Token
+                    r"ghs_[a-zA-Z0-9]+",  # GitHub Server Token
+                    r"github_pat_[a-zA-Z0-9]+",  # New GitHub PAT format
+                ]
+
+                for pattern in token_patterns:
+                    import re
+
+                    if re.search(pattern, output):
+                        logger.warning(
+                            f"⚠️ SECURITY WARNING for {project_key}: "
+                            f"Repository appears to have credentials in remote URL. "
+                            f"This is a security risk! Consider using: "
+                            f"1) GitHub CLI (gh auth login), "
+                            f"2) SSH keys, or "
+                            f"3) Git credential manager instead."
+                        )
+                        break
+        except:
+            # Don't fail analysis due to security check
+            pass
+
+    def get_repository_status_summary(self) -> dict[str, Any]:
+        """Get a summary of repository fetch status.
+
+        Returns:
+            Dictionary with status summary including any repositories with issues
+        """
+        summary = {
+            "total_repositories": len(self.repository_status),
+            "successful_updates": 0,
+            "failed_updates": 0,
+            "skipped_updates": 0,
+            "authentication_issues": [],
+            "errors": [],
+        }
+
+        for project_key, status in self.repository_status.items():
+            if status["remote_update"] == "success":
+                summary["successful_updates"] += 1
+            elif status["remote_update"] == "failed":
+                summary["failed_updates"] += 1
+                if status.get("authentication_issues"):
+                    summary["authentication_issues"].append(project_key)
+            elif status["remote_update"] == "skipped":
+                summary["skipped_updates"] += 1
+            elif status["remote_update"] == "error":
+                summary["errors"].append(
+                    {"repository": project_key, "error": status.get("error", "Unknown error")}
+                )
+
+        return summary
+
     def _extract_all_ticket_references(
         self, daily_commits: dict[str, list[dict[str, Any]]]
     ) -> set[str]:
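The security check above scans `git remote -v` output with a small set of token regexes. Applying the same patterns outside the class (the sample remote lines below are fabricated for illustration):

```python
import re

# The same token patterns the new _check_repository_security looks for.
TOKEN_PATTERNS = [
    r"https://[^@]*@",            # any HTTPS remote with embedded credentials
    r"ghp_[a-zA-Z0-9]+",          # classic GitHub personal access token
    r"ghs_[a-zA-Z0-9]+",          # GitHub server-to-server token
    r"github_pat_[a-zA-Z0-9]+",   # fine-grained GitHub PAT
]

def has_exposed_credentials(remote_output: str) -> bool:
    return any(re.search(p, remote_output) for p in TOKEN_PATTERNS)

print(has_exposed_credentials("origin  https://user:ghp_abc123@github.com/org/repo.git (fetch)"))  # True
print(has_exposed_credentials("origin  git@github.com:org/repo.git (fetch)"))                      # False
```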
@@ -919,8 +1476,20 @@ class GitDataFetcher:
                 "branch": commit.get("branch", "main"),
                 "is_merge": commit.get("is_merge", False),
                 "files_changed_count": commit.get("files_changed_count", 0),
-
-                "
+                # Store raw unfiltered values in insertions/deletions
+                "insertions": commit.get(
+                    "raw_insertions", commit.get("lines_added", 0)
+                ),
+                "deletions": commit.get(
+                    "raw_deletions", commit.get("lines_deleted", 0)
+                ),
+                # Store filtered values separately
+                "filtered_insertions": commit.get(
+                    "filtered_insertions", commit.get("lines_added", 0)
+                ),
+                "filtered_deletions": commit.get(
+                    "filtered_deletions", commit.get("lines_deleted", 0)
+                ),
                 "story_points": commit.get("story_points"),
                 "ticket_references": commit.get("ticket_references", []),
             }
@@ -1193,24 +1762,48 @@ class GitDataFetcher:
             session.close()

     def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
-        """Calculate commit statistics using reliable git diff --numstat.
+        """Calculate commit statistics using reliable git diff --numstat with exclude_paths filtering.

         Returns:
-            Dictionary with
+            Dictionary with both raw and filtered statistics:
+            - 'files', 'insertions', 'deletions': filtered counts
+            - 'raw_insertions', 'raw_deletions': unfiltered counts
+
+        THREAD SAFETY: This method is thread-safe as it works with commit objects
+        that have their own repo references.
         """
         stats = {"files": 0, "insertions": 0, "deletions": 0}

+        # Track raw stats for storage
+        raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
+        excluded_stats = {"files": 0, "insertions": 0, "deletions": 0}
+
         # For initial commits or commits without parents
         parent = commit.parents[0] if commit.parents else None

         try:
-            # Use
+            # THREAD SAFETY: Use the repo reference from the commit object
+            # Each thread has its own commit object with its own repo reference
             repo = commit.repo
-
-
-
-
-
+            Path(repo.working_dir)
+
+            def get_diff_output():
+                if parent:
+                    return repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+                else:
+                    # Initial commit - use git show with --numstat
+                    return repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Use timeout wrapper for git diff operations
+            try:
+                diff_output = self.git_wrapper.run_with_timeout(
+                    get_diff_output,
+                    timeout=10,  # 10 seconds for diff operations
+                    operation_name=f"diff_{commit.hexsha[:8]}",
+                )
+            except GitOperationTimeout:
+                logger.warning(f"⏱️ Timeout calculating stats for commit {commit.hexsha[:8]}")
+                return stats  # Return zeros

             # Parse the numstat output: insertions\tdeletions\tfilename
             for line in diff_output.strip().split("\n"):
@@ -1222,9 +1815,22 @@ class GitDataFetcher:
                 try:
                     insertions = int(parts[0]) if parts[0] != "-" else 0
                     deletions = int(parts[1]) if parts[1] != "-" else 0
-
-
-                    #
+                    filename = parts[2]
+
+                    # Always count raw stats
+                    raw_stats["files"] += 1
+                    raw_stats["insertions"] += insertions
+                    raw_stats["deletions"] += deletions
+
+                    # Skip excluded files based on exclude_paths patterns
+                    if self._should_exclude_file(filename):
+                        logger.debug(f"Excluding file from line counts: {filename}")
+                        excluded_stats["files"] += 1
+                        excluded_stats["insertions"] += insertions
+                        excluded_stats["deletions"] += deletions
+                        continue
+
+                    # Count only non-excluded files and their changes
                     stats["files"] += 1
                     stats["insertions"] += insertions
                     stats["deletions"] += deletions
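For context, `git diff --numstat` emits one `insertions<TAB>deletions<TAB>path` record per file, with `-` in the numeric columns for binary files. A standalone sketch of the parsing step above (the tab split itself is outside the hunk, so treat that detail as an assumption):

```python
# Parsing --numstat output the same way the loop above does.
sample = "12\t3\tsrc/app.py\n-\t-\tassets/logo.png\n250\t0\tpackage-lock.json"

for line in sample.strip().split("\n"):
    parts = line.split("\t")
    if len(parts) != 3:
        continue
    insertions = int(parts[0]) if parts[0] != "-" else 0   # "-" marks binary files
    deletions = int(parts[1]) if parts[1] != "-" else 0
    filename = parts[2]
    print(filename, insertions, deletions)
```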
@@ -1233,10 +1839,36 @@ class GitDataFetcher:
                     # Skip binary files or malformed lines
                     continue

+            # Log exclusion statistics if significant
+            if excluded_stats["files"] > 0 or (
+                raw_stats["insertions"] > 0 and stats["insertions"] < raw_stats["insertions"]
+            ):
+                reduction_pct = (
+                    100 * (1 - stats["insertions"] / raw_stats["insertions"])
+                    if raw_stats["insertions"] > 0
+                    else 0
+                )
+                logger.info(
+                    f"Commit {commit.hexsha[:8]}: Excluded {excluded_stats['files']} files, "
+                    f"{excluded_stats['insertions']} insertions, {excluded_stats['deletions']} deletions "
+                    f"({reduction_pct:.1f}% reduction)"
+                )
+
+            # Log if exclusions are configured
+            if self.exclude_paths and raw_stats["files"] > 0:
+                logger.debug(
+                    f"Commit {commit.hexsha[:8]}: Applied {len(self.exclude_paths)} exclusion patterns. "
+                    f"Raw: {raw_stats['files']} files, +{raw_stats['insertions']} -{raw_stats['deletions']}. "
+                    f"Filtered: {stats['files']} files, +{stats['insertions']} -{stats['deletions']}"
+                )
+
         except Exception as e:
             # Log the error for debugging but don't crash
             logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")

+        # Return both raw and filtered stats
+        stats["raw_insertions"] = raw_stats["insertions"]
+        stats["raw_deletions"] = raw_stats["deletions"]
         return stats

     def _store_day_commits_incremental(
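The exclusion logging above reports the reduction as `100 * (1 - filtered / raw)`. A worked example with made-up numbers:

```python
# A commit adding 1,000 raw lines of which only 300 survive path filtering
# is reported as a 70% reduction.
raw_insertions, filtered_insertions = 1000, 300
reduction_pct = 100 * (1 - filtered_insertions / raw_insertions) if raw_insertions else 0
print(f"{reduction_pct:.1f}% reduction")  # 70.0% reduction
```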
@@ -1266,8 +1898,16 @@ class GitDataFetcher:
                 "branch": commit.get("branch", "main"),
                 "is_merge": commit.get("is_merge", False),
                 "files_changed_count": commit.get("files_changed_count", 0),
-
-                "
+                # Store raw unfiltered values
+                "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
+                "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
+                # Store filtered values
+                "filtered_insertions": commit.get(
+                    "filtered_insertions", commit.get("lines_added", 0)
+                ),
+                "filtered_deletions": commit.get(
+                    "filtered_deletions", commit.get("lines_deleted", 0)
+                ),
                 "story_points": commit.get("story_points"),
                 "ticket_references": commit.get("ticket_references", []),
             }
@@ -1283,3 +1923,297 @@ class GitDataFetcher:
         except Exception as e:
             # Log error but don't fail - commits will be stored again in batch at the end
             logger.warning(f"Failed to incrementally store commits for {date_str}: {e}")
+
+    def process_repositories_parallel(
+        self,
+        repositories: list[dict],
+        weeks_back: int = 4,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        max_workers: int = 3,
+    ) -> dict[str, Any]:
+        """Process multiple repositories in parallel with proper timeout protection.
+
+        Args:
+            repositories: List of repository configurations
+            weeks_back: Number of weeks to analyze
+            jira_integration: Optional JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            max_workers: Maximum number of parallel workers
+
+        Returns:
+            Dictionary containing processing results and statistics
+        """
+        logger.info(
+            f"🚀 Starting parallel processing of {len(repositories)} repositories with {max_workers} workers"
+        )
+
+        # Initialize statistics
+        self.processing_stats = {
+            "total": len(repositories),
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
+        # Get progress service for updates
+        progress = get_progress_service()
+
+        # Start heartbeat logger for monitoring
+        with HeartbeatLogger(interval=5):
+            results = {}
+
+            # Use ThreadPoolExecutor for parallel processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all repository processing tasks
+                future_to_repo = {}
+
+                for repo_config in repositories:
+                    repo_path = Path(repo_config.get("path", ""))
+                    project_key = repo_config.get("project_key", repo_path.name)
+                    branch_patterns = repo_config.get("branch_patterns")
+
+                    # Submit task with timeout wrapper
+                    future = executor.submit(
+                        self._process_repository_with_timeout,
+                        repo_path,
+                        project_key,
+                        weeks_back,
+                        branch_patterns,
+                        jira_integration,
+                        start_date,
+                        end_date,
+                    )
+                    future_to_repo[future] = {
+                        "path": repo_path,
+                        "project_key": project_key,
+                        "start_time": time.time(),
+                    }
+
+                    logger.info(f"📋 Submitted {project_key} for processing")
+
+                # Process results as they complete
+                for future in as_completed(future_to_repo):
+                    repo_info = future_to_repo[future]
+                    project_key = repo_info["project_key"]
+                    elapsed_time = time.time() - repo_info["start_time"]
+
+                    try:
+                        result = future.result(timeout=5)  # Short timeout to get result
+
+                        if result:
+                            self.processing_stats["success"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "success",
+                                "elapsed_time": elapsed_time,
+                                "commits": result.get("stats", {}).get("total_commits", 0),
+                                "tickets": result.get("stats", {}).get("unique_tickets", 0),
+                            }
+                            results[project_key] = result
+
+                            logger.info(
+                                f"✅ {project_key}: Successfully processed "
+                                f"{result['stats']['total_commits']} commits in {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(project_key, success=True)
+                                else:
+                                    progress.finish_repository(project_key, success=True)
+                        else:
+                            self.processing_stats["failed"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "failed",
+                                "elapsed_time": elapsed_time,
+                                "error": "Processing returned no result",
+                            }
+
+                            logger.error(
+                                f"❌ {project_key}: Processing failed after {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+                                else:
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+
+                    except GitOperationTimeout:
+                        self.processing_stats["timeout"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "timeout",
+                            "elapsed_time": elapsed_time,
|
|
2077
|
+
"error": "Operation timed out",
|
|
2078
|
+
}
|
|
2079
|
+
|
|
2080
|
+
logger.error(f"⏱️ {project_key}: Timed out after {elapsed_time:.1f}s")
|
|
2081
|
+
|
|
2082
|
+
# Update progress
|
|
2083
|
+
if hasattr(progress, "finish_repository"):
|
|
2084
|
+
# Check if progress adapter supports stats parameter
|
|
2085
|
+
if hasattr(progress, "update_stats"):
|
|
2086
|
+
progress.update_stats(
|
|
2087
|
+
processed=self.processing_stats["processed"],
|
|
2088
|
+
success=self.processing_stats["success"],
|
|
2089
|
+
failed=self.processing_stats["failed"],
|
|
2090
|
+
timeout=self.processing_stats["timeout"],
|
|
2091
|
+
total=self.processing_stats["total"],
|
|
2092
|
+
)
|
|
2093
|
+
progress.finish_repository(
|
|
2094
|
+
project_key, success=False, error_message="Timeout"
|
|
2095
|
+
)
|
|
2096
|
+
else:
|
|
2097
|
+
progress.finish_repository(
|
|
2098
|
+
project_key, success=False, error_message="Timeout"
|
|
2099
|
+
)
|
|
2100
|
+
|
|
2101
|
+
except Exception as e:
|
|
2102
|
+
self.processing_stats["failed"] += 1
|
|
2103
|
+
self.processing_stats["repositories"][project_key] = {
|
|
2104
|
+
"status": "failed",
|
|
2105
|
+
"elapsed_time": elapsed_time,
|
|
2106
|
+
"error": str(e),
|
|
2107
|
+
}
|
|
2108
|
+
|
|
2109
|
+
logger.error(f"❌ {project_key}: Error after {elapsed_time:.1f}s - {e}")
|
|
2110
|
+
|
|
2111
|
+
# Update progress
|
|
2112
|
+
if hasattr(progress, "finish_repository"):
|
|
2113
|
+
# Check if progress adapter supports stats parameter
|
|
2114
|
+
if hasattr(progress, "update_stats"):
|
|
2115
|
+
progress.update_stats(
|
|
2116
|
+
processed=self.processing_stats["processed"],
|
|
2117
|
+
success=self.processing_stats["success"],
|
|
2118
|
+
failed=self.processing_stats["failed"],
|
|
2119
|
+
timeout=self.processing_stats["timeout"],
|
|
2120
|
+
total=self.processing_stats["total"],
|
|
2121
|
+
)
|
|
2122
|
+
progress.finish_repository(
|
|
2123
|
+
project_key, success=False, error_message=str(e)
|
|
2124
|
+
)
|
|
2125
|
+
else:
|
|
2126
|
+
progress.finish_repository(
|
|
2127
|
+
project_key, success=False, error_message=str(e)
|
|
2128
|
+
)
|
|
2129
|
+
|
|
2130
|
+
finally:
|
|
2131
|
+
# Update processed counter BEFORE logging and progress updates
|
|
2132
|
+
self.processing_stats["processed"] += 1
|
|
2133
|
+
|
|
2134
|
+
# Update progress service with actual processing stats
|
|
2135
|
+
if hasattr(progress, "update_stats"):
|
|
2136
|
+
progress.update_stats(
|
|
2137
|
+
processed=self.processing_stats["processed"],
|
|
2138
|
+
success=self.processing_stats["success"],
|
|
2139
|
+
failed=self.processing_stats["failed"],
|
|
2140
|
+
timeout=self.processing_stats["timeout"],
|
|
2141
|
+
total=self.processing_stats["total"],
|
|
2142
|
+
)
|
|
2143
|
+
|
|
2144
|
+
# Log progress
|
|
2145
|
+
logger.info(
|
|
2146
|
+
f"📊 Progress: {self.processing_stats['processed']}/{self.processing_stats['total']} repositories "
|
|
2147
|
+
f"(✅ {self.processing_stats['success']} | ❌ {self.processing_stats['failed']} | ⏱️ {self.processing_stats['timeout']})"
|
|
2148
|
+
)
|
|
2149
|
+
|
|
2150
|
+
# Final summary
|
|
2151
|
+
logger.info("=" * 60)
|
|
2152
|
+
logger.info("📈 PARALLEL PROCESSING SUMMARY")
|
|
2153
|
+
logger.info(f" Total repositories: {self.processing_stats['total']}")
|
|
2154
|
+
logger.info(f" Successfully processed: {self.processing_stats['success']}")
|
|
2155
|
+
logger.info(f" Failed: {self.processing_stats['failed']}")
|
|
2156
|
+
logger.info(f" Timed out: {self.processing_stats['timeout']}")
|
|
2157
|
+
logger.info("=" * 60)
|
|
2158
|
+
|
|
2159
|
+
return {"results": results, "statistics": self.processing_stats}
|
|
2160
|
+
|
|
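
Read at face value, the method above takes a list of repository configuration dicts (each with at least a `path`, plus optional `project_key` and `branch_patterns`) and returns the per-repository results alongside the accumulated `processing_stats`. A rough usage sketch; `fetcher` and the repository paths are placeholders, since the class constructor is not part of this hunk:

```python
# Sketch only: `fetcher` is assumed to be an already-configured GitDataFetcher
# instance, and the paths below are invented for illustration.
repositories = [
    {"path": "/srv/repos/service-a", "project_key": "SVC-A"},
    {"path": "/srv/repos/service-b", "branch_patterns": ["main", "release/*"]},
]

outcome = fetcher.process_repositories_parallel(repositories, weeks_back=8, max_workers=3)

stats = outcome["statistics"]
print(
    f"{stats['success']}/{stats['total']} repositories succeeded, "
    f"{stats['failed']} failed, {stats['timeout']} timed out"
)
```

Because every submission goes through `_process_repository_with_timeout`, a stalled git operation in one repository is recorded under `timeout` in these statistics rather than blocking the remaining workers.
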
+    def _process_repository_with_timeout(
+        self,
+        repo_path: Path,
+        project_key: str,
+        weeks_back: int = 4,
+        branch_patterns: Optional[list[str]] = None,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        timeout_per_operation: int = 30,
+    ) -> Optional[dict[str, Any]]:
+        """Process a single repository with comprehensive timeout protection.
+
+        Args:
+            repo_path: Path to the repository
+            project_key: Project identifier
+            weeks_back: Number of weeks to analyze
+            branch_patterns: Branch patterns to include
+            jira_integration: JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            timeout_per_operation: Timeout for individual git operations
+
+        Returns:
+            Repository processing results or None if failed
+        """
+        try:
+            # Track this repository in progress
+            progress = get_progress_service()
+            if hasattr(progress, "start_repository"):
+                progress.start_repository(project_key, 0)
+
+            logger.info(f"🔍 Processing repository: {project_key} at {repo_path}")
+
+            # Use the regular fetch method but with timeout wrapper active
+            with self.git_wrapper.operation_tracker("fetch_repository_data", repo_path):
+                result = self.fetch_repository_data(
+                    repo_path=repo_path,
+                    project_key=project_key,
+                    weeks_back=weeks_back,
+                    branch_patterns=branch_patterns,
+                    jira_integration=jira_integration,
+                    progress_callback=None,  # We handle progress at a higher level
+                    start_date=start_date,
+                    end_date=end_date,
+                )
+
+            return result
+
+        except GitOperationTimeout as e:
+            logger.error(f"⏱️ Repository {project_key} processing timed out: {e}")
+            raise
+
+        except Exception as e:
+            logger.error(f"❌ Error processing repository {project_key}: {e}")
+            import traceback
+
+            logger.debug(f"Full traceback: {traceback.format_exc()}")
+            return None
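
`_process_repository_with_timeout` relies on `self.git_wrapper.operation_tracker(...)` to surface stalled git work as `GitOperationTimeout`, which the parallel loop above counts separately from ordinary failures. The wrapper itself is outside this hunk, so the following is only a rough sketch of the pattern, assuming a context manager with a per-operation time budget; every name except `GitOperationTimeout` is illustrative:

```python
import logging
import threading
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)


class GitOperationTimeout(Exception):
    """Raised when a tracked git operation exceeds its time budget."""


@contextmanager
def operation_tracker(operation: str, repo_path, timeout: int = 30):
    """Illustrative stand-in: warn from a watchdog thread while the block runs,
    then raise GitOperationTimeout if the budget was exceeded.
    (The real wrapper may instead enforce limits on the git subprocess itself.)
    """
    start = time.monotonic()
    finished = threading.Event()

    def watchdog() -> None:
        # Fires only if the tracked block is still running when the budget lapses.
        if not finished.wait(timeout):
            logger.warning("%s on %s still running after %ss", operation, repo_path, timeout)

    threading.Thread(target=watchdog, daemon=True).start()
    try:
        yield
    finally:
        finished.set()
    if time.monotonic() - start > timeout:
        raise GitOperationTimeout(f"{operation} on {repo_path} exceeded {timeout}s")
```

Used the way the hunk uses it (`with self.git_wrapper.operation_tracker("fetch_repository_data", repo_path): ...`), a blown budget becomes a `GitOperationTimeout`, which `_process_repository_with_timeout` re-raises so the executor loop can classify the repository as timed out rather than failed.
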