gitflow-analytics 1.3.6-py3-none-any.whl → 3.3.0-py3-none-any.whl

This diff shows the changes between package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
Files changed (48)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/batch_classifier.py +156 -4
  3. gitflow_analytics/cli.py +897 -179
  4. gitflow_analytics/config/loader.py +40 -1
  5. gitflow_analytics/config/schema.py +4 -0
  6. gitflow_analytics/core/cache.py +20 -0
  7. gitflow_analytics/core/data_fetcher.py +1254 -228
  8. gitflow_analytics/core/git_auth.py +169 -0
  9. gitflow_analytics/core/git_timeout_wrapper.py +347 -0
  10. gitflow_analytics/core/metrics_storage.py +12 -3
  11. gitflow_analytics/core/progress.py +219 -18
  12. gitflow_analytics/core/subprocess_git.py +145 -0
  13. gitflow_analytics/extractors/ml_tickets.py +3 -2
  14. gitflow_analytics/extractors/tickets.py +93 -8
  15. gitflow_analytics/integrations/jira_integration.py +1 -1
  16. gitflow_analytics/integrations/orchestrator.py +47 -29
  17. gitflow_analytics/metrics/branch_health.py +3 -2
  18. gitflow_analytics/models/database.py +72 -1
  19. gitflow_analytics/pm_framework/adapters/jira_adapter.py +12 -5
  20. gitflow_analytics/pm_framework/orchestrator.py +8 -3
  21. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +24 -4
  22. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +3 -1
  23. gitflow_analytics/qualitative/core/llm_fallback.py +34 -2
  24. gitflow_analytics/reports/narrative_writer.py +118 -74
  25. gitflow_analytics/security/__init__.py +11 -0
  26. gitflow_analytics/security/config.py +189 -0
  27. gitflow_analytics/security/extractors/__init__.py +7 -0
  28. gitflow_analytics/security/extractors/dependency_checker.py +379 -0
  29. gitflow_analytics/security/extractors/secret_detector.py +197 -0
  30. gitflow_analytics/security/extractors/vulnerability_scanner.py +333 -0
  31. gitflow_analytics/security/llm_analyzer.py +347 -0
  32. gitflow_analytics/security/reports/__init__.py +5 -0
  33. gitflow_analytics/security/reports/security_report.py +358 -0
  34. gitflow_analytics/security/security_analyzer.py +414 -0
  35. gitflow_analytics/tui/app.py +3 -1
  36. gitflow_analytics/tui/progress_adapter.py +313 -0
  37. gitflow_analytics/tui/screens/analysis_progress_screen.py +407 -46
  38. gitflow_analytics/tui/screens/results_screen.py +219 -206
  39. gitflow_analytics/ui/__init__.py +21 -0
  40. gitflow_analytics/ui/progress_display.py +1477 -0
  41. gitflow_analytics/verify_activity.py +697 -0
  42. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/METADATA +2 -1
  43. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/RECORD +47 -31
  44. gitflow_analytics/cli_rich.py +0 -503
  45. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/WHEEL +0 -0
  46. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/entry_points.txt +0 -0
  47. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/licenses/LICENSE +0 -0
  48. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/top_level.txt +0 -0
@@ -5,15 +5,17 @@ focusing purely on data collection from Git repositories and ticket systems
5
5
  without performing any LLM-based classification.
6
6
  """
7
7
 
8
- import contextlib
9
8
  import logging
10
9
  import os
11
10
  import subprocess
12
- from collections import defaultdict
11
+ import threading
12
+ import time
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
14
  from datetime import datetime, timedelta, timezone
14
15
  from pathlib import Path
15
16
  from typing import Any, Optional
16
17
 
18
+ from sqlalchemy import func
17
19
  from sqlalchemy.orm import Session
18
20
 
19
21
  from ..extractors.story_points import StoryPointExtractor
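The new threading and ThreadPoolExecutor imports above support the per-thread Repo handling added throughout this file (see the THREAD SAFETY notes in the following hunks): each worker thread opens and caches its own GitPython Repo instead of sharing one across threads. The sketch below only illustrates that pattern; open_repo, count_recent_commits, and repo_paths are illustrative names, not part of the package.

import threading
from concurrent.futures import ThreadPoolExecutor

from git import Repo  # GitPython, as used elsewhere in data_fetcher.py

_local = threading.local()

def open_repo(path: str) -> Repo:
    # Lazily create one Repo per (thread, path); instances never cross threads.
    if not hasattr(_local, "repos"):
        _local.repos = {}
    if path not in _local.repos:
        _local.repos[path] = Repo(path)
    return _local.repos[path]

def count_recent_commits(path: str) -> int:
    return sum(1 for _ in open_repo(path).iter_commits(max_count=100))

# repo_paths would be a list of local clone paths:
# with ThreadPoolExecutor(max_workers=4) as pool:
#     totals = dict(zip(repo_paths, pool.map(count_recent_commits, repo_paths)))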
@@ -27,11 +29,17 @@ from ..models.database import (
27
29
  )
28
30
  from .branch_mapper import BranchToProjectMapper
29
31
  from .cache import GitAnalysisCache
32
+ from .git_timeout_wrapper import GitOperationTimeout, GitTimeoutWrapper, HeartbeatLogger
30
33
  from .identity import DeveloperIdentityResolver
31
34
  from .progress import get_progress_service
32
35
 
33
36
  logger = logging.getLogger(__name__)
34
37
 
38
+ # THREAD SAFETY: Module-level thread-local storage for repository instances
39
+ # Each thread gets its own isolated storage to prevent thread-safety issues
40
+ # when GitDataFetcher is called from ThreadPoolExecutor
41
+ _thread_local = threading.local()
42
+
35
43
 
36
44
  class GitDataFetcher:
37
45
  """Fetches raw Git commit data and organizes it by day for efficient batch processing.
@@ -50,6 +58,7 @@ class GitDataFetcher:
50
58
  branch_mapping_rules: Optional[dict[str, list[str]]] = None,
51
59
  allowed_ticket_platforms: Optional[list[str]] = None,
52
60
  exclude_paths: Optional[list[str]] = None,
61
+ skip_remote_fetch: bool = False,
53
62
  ):
54
63
  """Initialize the data fetcher.
55
64
 
@@ -58,8 +67,11 @@ class GitDataFetcher:
58
67
  branch_mapping_rules: Rules for mapping branches to projects
59
68
  allowed_ticket_platforms: List of allowed ticket platforms
60
69
  exclude_paths: List of file paths to exclude from analysis
70
+ skip_remote_fetch: If True, skip git fetch/pull operations
61
71
  """
62
72
  self.cache = cache
73
+ self.skip_remote_fetch = skip_remote_fetch
74
+ self.repository_status = {} # Track status of each repository
63
75
  # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
64
76
  self.database = cache.db
65
77
  self.story_point_extractor = StoryPointExtractor()
@@ -67,10 +79,35 @@ class GitDataFetcher:
67
79
  self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
68
80
  self.exclude_paths = exclude_paths or []
69
81
 
82
+ # Log exclusion configuration
83
+ if self.exclude_paths:
84
+ logger.info(
85
+ f"GitDataFetcher initialized with {len(self.exclude_paths)} exclusion patterns:"
86
+ )
87
+ for pattern in self.exclude_paths[:5]: # Show first 5 patterns
88
+ logger.debug(f" - {pattern}")
89
+ if len(self.exclude_paths) > 5:
90
+ logger.debug(f" ... and {len(self.exclude_paths) - 5} more patterns")
91
+ else:
92
+ logger.info("GitDataFetcher initialized with no file exclusions")
93
+
70
94
  # Initialize identity resolver
71
95
  identity_db_path = cache.cache_dir / "identities.db"
72
96
  self.identity_resolver = DeveloperIdentityResolver(identity_db_path)
73
97
 
98
+ # Initialize git timeout wrapper for safe operations
99
+ self.git_wrapper = GitTimeoutWrapper(default_timeout=30)
100
+
101
+ # Statistics for tracking repository processing
102
+ self.processing_stats = {
103
+ "total": 0,
104
+ "processed": 0,
105
+ "success": 0,
106
+ "failed": 0,
107
+ "timeout": 0,
108
+ "repositories": {},
109
+ }
110
+
74
111
  def fetch_repository_data(
75
112
  self,
76
113
  repo_path: Path,
@@ -117,37 +154,75 @@ class GitDataFetcher:
117
154
  f"🔍 DEBUG: Calculated date range from weeks_back: {start_date} to {end_date}"
118
155
  )
119
156
 
120
- # Step 1: Collect all commits organized by day
157
+ # Get progress service for top-level progress tracking
158
+ progress = get_progress_service()
159
+
160
+ # Start Rich display for this repository if enabled
161
+ if hasattr(progress, "_use_rich") and progress._use_rich:
162
+ # Count total commits for progress estimation
163
+ try:
164
+ import git
165
+
166
+ git.Repo(repo_path)
167
+ # Check if we need to clone or pull
168
+ if not repo_path.exists() or not (repo_path / ".git").exists():
169
+ logger.info(f"📥 Repository {project_key} needs cloning")
170
+ progress.start_repository(f"{project_key} (cloning)", 0)
171
+ else:
172
+ # Rough estimate based on weeks
173
+ estimated_commits = weeks_back * 50 # Estimate ~50 commits per week
174
+ progress.start_repository(project_key, estimated_commits)
175
+ except Exception:
176
+ progress.start_repository(project_key, 100) # Default estimate
177
+
178
+ # Step 1: Collect all commits organized by day with enhanced progress tracking
121
179
  logger.info("🔍 DEBUG: About to fetch commits by day")
122
- logger.info("Fetching commits organized by day...")
123
- daily_commits = self._fetch_commits_by_day(
124
- repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
125
- )
126
- logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
127
-
128
- # Step 2: Extract and fetch all referenced tickets
129
- logger.info("🔍 DEBUG: About to extract ticket references")
130
- logger.info("Extracting ticket references...")
131
- ticket_ids = self._extract_all_ticket_references(daily_commits)
132
- logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")
133
-
134
- if jira_integration and ticket_ids:
135
- logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
136
- self._fetch_detailed_tickets(
137
- ticket_ids, jira_integration, project_key, progress_callback
180
+ logger.info(f"Fetching commits organized by day for repository: {project_key}")
181
+
182
+ # Create top-level progress for this repository
183
+ with progress.progress(
184
+ total=3, # Three main steps: fetch commits, extract tickets, store data
185
+ description=f"📊 Processing repository: {project_key}",
186
+ unit="steps",
187
+ ) as repo_progress_ctx:
188
+
189
+ # Step 1: Fetch commits
190
+ progress.set_description(repo_progress_ctx, f"🔍 {project_key}: Fetching commits")
191
+ daily_commits = self._fetch_commits_by_day(
192
+ repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
138
193
  )
194
+ logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
195
+ progress.update(repo_progress_ctx)
139
196
 
140
- # Step 3: Store commit-ticket correlations
141
- logger.info("Building commit-ticket correlations...")
142
- correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
197
+ # Step 2: Extract and fetch all referenced tickets
198
+ progress.set_description(repo_progress_ctx, f"🎫 {project_key}: Processing tickets")
199
+ logger.info("🔍 DEBUG: About to extract ticket references")
200
+ logger.info(f"Extracting ticket references for {project_key}...")
201
+ ticket_ids = self._extract_all_ticket_references(daily_commits)
202
+ logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")
143
203
 
144
- # Step 4: Store daily commit batches
145
- logger.info(
146
- f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
147
- )
148
- logger.info("Storing daily commit batches...")
149
- batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
150
- logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
204
+ if jira_integration and ticket_ids:
205
+ logger.info(
206
+ f"Fetching {len(ticket_ids)} unique tickets from JIRA for {project_key}..."
207
+ )
208
+ self._fetch_detailed_tickets(
209
+ ticket_ids, jira_integration, project_key, progress_callback
210
+ )
211
+
212
+ # Build commit-ticket correlations
213
+ logger.info(f"Building commit-ticket correlations for {project_key}...")
214
+ correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
215
+ progress.update(repo_progress_ctx)
216
+
217
+ # Step 3: Store daily commit batches
218
+ progress.set_description(repo_progress_ctx, f"💾 {project_key}: Storing data")
219
+ logger.info(
220
+ f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
221
+ )
222
+ logger.info("Storing daily commit batches...")
223
+ batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
224
+ logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
225
+ progress.update(repo_progress_ctx)
151
226
 
152
227
  # CRITICAL FIX: Verify actual storage before reporting success
153
228
  session = self.database.get_session()
@@ -167,6 +242,46 @@ class GitDataFetcher:
167
242
  # Return summary statistics with ACTUAL stored counts
168
243
  # expected_commits already calculated above
169
244
 
245
+ # Calculate exclusion impact summary if exclusions are configured
246
+ exclusion_stats = {
247
+ "patterns_applied": len(self.exclude_paths),
248
+ "enabled": bool(self.exclude_paths),
249
+ }
250
+
251
+ if self.exclude_paths and expected_commits > 0:
252
+ # Get aggregate stats from daily_commit_batches
253
+ session = self.database.get_session()
254
+ try:
255
+ batch_stats = (
256
+ session.query(
257
+ func.sum(DailyCommitBatch.total_lines_added).label("total_added"),
258
+ func.sum(DailyCommitBatch.total_lines_deleted).label("total_deleted"),
259
+ )
260
+ .filter(
261
+ DailyCommitBatch.project_key == project_key,
262
+ DailyCommitBatch.repo_path == str(repo_path),
263
+ )
264
+ .first()
265
+ )
266
+
267
+ if batch_stats and batch_stats.total_added:
268
+ total_lines = (batch_stats.total_added or 0) + (batch_stats.total_deleted or 0)
269
+ exclusion_stats["total_lines_after_filtering"] = total_lines
270
+ exclusion_stats["lines_added"] = batch_stats.total_added or 0
271
+ exclusion_stats["lines_deleted"] = batch_stats.total_deleted or 0
272
+
273
+ logger.info(
274
+ f"📊 Exclusion Impact Summary for {project_key}:\n"
275
+ f" - Exclusion patterns applied: {len(self.exclude_paths)}\n"
276
+ f" - Total lines after filtering: {total_lines:,}\n"
277
+ f" - Lines added: {batch_stats.total_added:,}\n"
278
+ f" - Lines deleted: {batch_stats.total_deleted:,}"
279
+ )
280
+ except Exception as e:
281
+ logger.debug(f"Could not calculate exclusion impact summary: {e}")
282
+ finally:
283
+ session.close()
284
+
170
285
  results = {
171
286
  "project_key": project_key,
172
287
  "repo_path": str(repo_path),
@@ -180,6 +295,7 @@ class GitDataFetcher:
180
295
  "correlations_created": correlations_created,
181
296
  "batches_created": batches_created,
182
297
  },
298
+ "exclusions": exclusion_stats,
183
299
  "daily_commits": daily_commits, # For immediate use if needed
184
300
  }
185
301
 
@@ -188,10 +304,20 @@ class GitDataFetcher:
188
304
  logger.info(
189
305
  f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
190
306
  )
307
+ # Finish repository in Rich display with success
308
+ if hasattr(progress, "_use_rich") and progress._use_rich:
309
+ progress.finish_repository(project_key, success=True)
191
310
  else:
192
311
  logger.error(
193
312
  f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
194
313
  )
314
+ # Finish repository in Rich display with error
315
+ if hasattr(progress, "_use_rich") and progress._use_rich:
316
+ progress.finish_repository(
317
+ project_key,
318
+ success=False,
319
+ error_message=f"Storage issue: {actual_stored_commits}/{expected_commits} commits",
320
+ )
195
321
 
196
322
  return results
197
323
 
@@ -209,74 +335,296 @@ class GitDataFetcher:
209
335
  Returns:
210
336
  Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
211
337
  """
212
- from git import Repo
338
+
339
+ # THREAD SAFETY: Use the module-level thread-local storage to ensure each thread
340
+ # gets its own Repo instance. This prevents thread-safety issues when called from ThreadPoolExecutor
341
+
342
+ # Set environment variables to prevent ANY password prompts before opening repo
343
+ original_env = {}
344
+ env_vars = {
345
+ "GIT_TERMINAL_PROMPT": "0",
346
+ "GIT_ASKPASS": "/bin/echo", # Use full path to echo
347
+ "SSH_ASKPASS": "/bin/echo",
348
+ "GCM_INTERACTIVE": "never",
349
+ "GIT_SSH_COMMAND": "ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o PasswordAuthentication=no",
350
+ "DISPLAY": "",
351
+ "GIT_CREDENTIAL_HELPER": "", # Disable credential helper
352
+ "GCM_PROVIDER": "none", # Disable Git Credential Manager
353
+ "GIT_CREDENTIALS": "", # Clear any cached credentials
354
+ "GIT_CONFIG_NOSYSTEM": "1", # Don't use system config
355
+ }
356
+
357
+ # Save original environment and set our values
358
+ for key, value in env_vars.items():
359
+ original_env[key] = os.environ.get(key)
360
+ os.environ[key] = value
213
361
 
214
362
  try:
215
- repo = Repo(repo_path)
363
+ # THREAD SAFETY: Create a fresh Repo instance for this thread
364
+ # Do NOT reuse Repo instances across threads
365
+ from git import Repo
366
+
367
+ # Check for security issues in repository configuration
368
+ self._check_repository_security(repo_path, project_key)
369
+
370
+ # When skip_remote_fetch is enabled, use a more restricted repository access
371
+ if self.skip_remote_fetch:
372
+ # Use a special git configuration that completely disables credential helpers
373
+ import tempfile
374
+
375
+ # Create a secure temporary directory for our config
376
+ temp_dir = tempfile.mkdtemp(prefix="gitflow_")
377
+
378
+ # Create a temporary git config that disables all authentication
379
+ tmp_config_path = os.path.join(temp_dir, ".gitconfig")
380
+ with open(tmp_config_path, "w") as tmp_config:
381
+ tmp_config.write("[credential]\n")
382
+ tmp_config.write(" helper = \n")
383
+ tmp_config.write("[core]\n")
384
+ tmp_config.write(" askpass = \n")
385
+
386
+ # Set GIT_CONFIG to use our temporary config
387
+ os.environ["GIT_CONFIG_GLOBAL"] = tmp_config_path
388
+ os.environ["GIT_CONFIG_SYSTEM"] = "/dev/null"
389
+
390
+ # Store temp_dir in thread-local storage for cleanup
391
+ _thread_local.temp_dir = temp_dir
392
+
393
+ try:
394
+ # Open repository with our restricted configuration
395
+ # THREAD SAFETY: Each thread gets its own Repo instance
396
+ repo = Repo(repo_path)
397
+ finally:
398
+ # Clean up temporary config directory
399
+ try:
400
+ import shutil
401
+
402
+ if hasattr(_thread_local, "temp_dir"):
403
+ shutil.rmtree(_thread_local.temp_dir, ignore_errors=True)
404
+ delattr(_thread_local, "temp_dir")
405
+ except:
406
+ pass
407
+
408
+ try:
409
+ # Configure git to never prompt for credentials
410
+ with repo.config_writer() as git_config:
411
+ git_config.set_value("core", "askpass", "")
412
+ git_config.set_value("credential", "helper", "")
413
+ except Exception as e:
414
+ logger.debug(f"Could not update git config: {e}")
415
+
416
+ # Note: We can't monkey-patch remotes as it's a property without setter
417
+ # The skip_remote_fetch flag will prevent remote operations elsewhere
418
+
419
+ logger.debug(
420
+ f"Opened repository {project_key} in offline mode (skip_remote_fetch=true)"
421
+ )
422
+ else:
423
+ # THREAD SAFETY: Each thread gets its own Repo instance
424
+ repo = Repo(repo_path)
425
+ # Track repository status
426
+ self.repository_status[project_key] = {
427
+ "path": str(repo_path),
428
+ "remote_update": "skipped" if self.skip_remote_fetch else "pending",
429
+ "authentication_issues": False,
430
+ "error": None,
431
+ }
432
+
216
433
  # Update repository from remote before analysis
217
- self._update_repository(repo)
434
+ if not self.skip_remote_fetch:
435
+ logger.info(f"📥 Updating repository {project_key} from remote...")
436
+
437
+ update_success = self._update_repository(repo)
438
+ if not self.skip_remote_fetch:
439
+ self.repository_status[project_key]["remote_update"] = (
440
+ "success" if update_success else "failed"
441
+ )
442
+ if not update_success:
443
+ logger.warning(
444
+ f"⚠️ {project_key}: Continuing with local repository state (remote update failed)"
445
+ )
446
+
218
447
  except Exception as e:
219
- logger.error(f"Failed to open repository at {repo_path}: {e}")
448
+ logger.error(f"Failed to open repository {project_key} at {repo_path}: {e}")
449
+ self.repository_status[project_key] = {
450
+ "path": str(repo_path),
451
+ "remote_update": "error",
452
+ "authentication_issues": "authentication" in str(e).lower()
453
+ or "password" in str(e).lower(),
454
+ "error": str(e),
455
+ }
456
+ # Restore original environment variables before returning
457
+ for key, value in original_env.items():
458
+ if value is None:
459
+ os.environ.pop(key, None)
460
+ else:
461
+ os.environ[key] = value
220
462
  return {}
221
463
 
222
- # Collect commits from all relevant branches
223
- all_commits = []
464
+ # Get branches to analyze
224
465
  branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)
225
466
 
226
467
  if not branches_to_analyze:
227
- logger.warning(f"No accessible branches found in repository {repo_path}")
468
+ logger.warning(
469
+ f"No accessible branches found in repository {project_key} at {repo_path}"
470
+ )
471
+ # Restore original environment variables before returning
472
+ for key, value in original_env.items():
473
+ if value is None:
474
+ os.environ.pop(key, None)
475
+ else:
476
+ os.environ[key] = value
228
477
  return {}
229
478
 
230
- logger.info(f"Analyzing branches: {branches_to_analyze}")
479
+ logger.info(f"🌿 {project_key}: Analyzing branches: {branches_to_analyze}")
231
480
 
232
- for branch_name in branches_to_analyze:
233
- try:
234
- branch_commits = list(
235
- repo.iter_commits(branch_name, since=start_date, until=end_date, reverse=False)
236
- )
481
+ # Calculate days to process
482
+ current_date = start_date.date()
483
+ end_date_only = end_date.date()
484
+ days_to_process = []
485
+ while current_date <= end_date_only:
486
+ days_to_process.append(current_date)
487
+ current_date += timedelta(days=1)
237
488
 
238
- logger.debug(
239
- f"Found {len(branch_commits)} commits in branch {branch_name} for date range"
489
+ logger.info(
490
+ f"Processing {len(days_to_process)} days from {start_date.date()} to {end_date.date()}"
491
+ )
492
+
493
+ # Get progress service for nested progress tracking
494
+ progress = get_progress_service()
495
+
496
+ # Dictionary to store commits by day
497
+ daily_commits = {}
498
+ all_commit_hashes = set() # Track all hashes for deduplication
499
+
500
+ # Count total commits first for Rich display
501
+ try:
502
+ for branch_name in branches_to_analyze[:1]: # Sample from first branch
503
+ sample_commits = list(
504
+ repo.iter_commits(branch_name, since=start_date, until=end_date)
240
505
  )
506
+ # Estimate based on first branch (multiply by number of branches for rough estimate)
507
+ len(sample_commits) * len(branches_to_analyze)
508
+ break
509
+ except:
510
+ len(days_to_process) * 50 # Default estimate
511
+
512
+ # Update repository in Rich display with estimated commit count
513
+ if hasattr(progress, "_use_rich") and progress._use_rich:
514
+ progress.update_repository(project_key, 0, 0.0)
515
+
516
+ # Create nested progress for day-by-day processing
517
+ with progress.progress(
518
+ total=len(days_to_process),
519
+ description=f"📅 Fetching commits for repository: {project_key}",
520
+ unit="days",
521
+ nested=True,
522
+ ) as day_progress_ctx:
523
+
524
+ for day_date in days_to_process:
525
+ # Update description to show current repository and day clearly
526
+ day_str = day_date.strftime("%Y-%m-%d")
527
+ progress.set_description(day_progress_ctx, f"🔍 {project_key}: Analyzing {day_str}")
528
+
529
+ # Calculate day boundaries
530
+ day_start = datetime.combine(day_date, datetime.min.time(), tzinfo=timezone.utc)
531
+ day_end = datetime.combine(day_date, datetime.max.time(), tzinfo=timezone.utc)
532
+
533
+ day_commits = []
534
+ commits_found_today = 0
535
+
536
+ # Process each branch for this specific day
537
+ for branch_name in branches_to_analyze:
538
+ try:
539
+ # Fetch commits for this specific day and branch with timeout protection
540
+ def fetch_branch_commits():
541
+ return list(
542
+ repo.iter_commits(
543
+ branch_name, since=day_start, until=day_end, reverse=False
544
+ )
545
+ )
546
+
547
+ # Use timeout wrapper to prevent hanging on iter_commits
548
+ try:
549
+ branch_commits = self.git_wrapper.run_with_timeout(
550
+ fetch_branch_commits,
551
+ timeout=15, # 15 seconds per branch/day combination
552
+ operation_name=f"iter_commits_{branch_name}_{day_str}",
553
+ )
554
+ except GitOperationTimeout:
555
+ logger.warning(
556
+ f"⏱️ Timeout fetching commits for branch {branch_name} on {day_str}, skipping"
557
+ )
558
+ continue
241
559
 
242
- for commit in branch_commits:
243
- # Include merge commits like the original analyzer
244
- # The original analyzer marks merge commits with is_merge=True but doesn't skip them
560
+ for commit in branch_commits:
561
+ # Skip if we've already processed this commit
562
+ if commit.hexsha in all_commit_hashes:
563
+ continue
245
564
 
246
- # Extract commit data with full metadata
247
- commit_data = self._extract_commit_data(
248
- commit, branch_name, project_key, repo_path
565
+ # Extract commit data with full metadata
566
+ commit_data = self._extract_commit_data(
567
+ commit, branch_name, project_key, repo_path
568
+ )
569
+ if commit_data:
570
+ day_commits.append(commit_data)
571
+ all_commit_hashes.add(commit.hexsha)
572
+ commits_found_today += 1
573
+
574
+ except GitOperationTimeout as e:
575
+ logger.warning(
576
+ f"⏱️ Timeout processing branch {branch_name} for day {day_str}: {e}"
577
+ )
578
+ continue
579
+ except Exception as e:
580
+ logger.warning(
581
+ f"Error processing branch {branch_name} for day {day_str}: {e}"
582
+ )
583
+ continue
584
+
585
+ # Store commits for this day if any were found
586
+ if day_commits:
587
+ # Sort commits by timestamp
588
+ day_commits.sort(key=lambda c: c["timestamp"])
589
+ daily_commits[day_str] = day_commits
590
+
591
+ # Incremental caching - store commits for this day immediately
592
+ self._store_day_commits_incremental(
593
+ repo_path, day_str, day_commits, project_key
249
594
  )
250
- if commit_data:
251
- all_commits.append(commit_data)
252
595
 
253
- except Exception as e:
254
- logger.warning(f"Error processing branch {branch_name}: {e}")
255
- continue
596
+ logger.debug(f"Found {commits_found_today} commits on {day_str}")
597
+
598
+ # Update progress callback if provided
599
+ if progress_callback:
600
+ progress_callback(f"Processed {day_str}: {commits_found_today} commits")
256
601
 
257
- # Deduplicate commits (same commit may appear in multiple branches)
258
- seen_hashes = set()
259
- unique_commits = []
260
- for commit_data in all_commits:
261
- commit_hash = commit_data["commit_hash"]
262
- if commit_hash not in seen_hashes:
263
- seen_hashes.add(commit_hash)
264
- unique_commits.append(commit_data)
265
-
266
- # Organize commits by day
267
- daily_commits = defaultdict(list)
268
- for commit_data in unique_commits:
269
- # Convert timestamp to date key
270
- commit_date = commit_data["timestamp"].date()
271
- date_key = commit_date.strftime("%Y-%m-%d")
272
- daily_commits[date_key].append(commit_data)
273
-
274
- # Sort commits within each day by timestamp
275
- for date_key in daily_commits:
276
- daily_commits[date_key].sort(key=lambda c: c["timestamp"])
277
-
278
- logger.info(f"Collected {len(unique_commits)} commits across {len(daily_commits)} days")
279
- return dict(daily_commits)
602
+ # Update progress bar
603
+ progress.update(day_progress_ctx)
604
+
605
+ # Update Rich display with current commit count and speed
606
+ if hasattr(progress, "_use_rich") and progress._use_rich:
607
+ total_processed = len(all_commit_hashes)
608
+ # Calculate speed (commits per second) based on elapsed time
609
+ import time
610
+
611
+ if not hasattr(self, "_fetch_start_time"):
612
+ self._fetch_start_time = time.time()
613
+ elapsed = time.time() - self._fetch_start_time
614
+ speed = total_processed / elapsed if elapsed > 0 else 0
615
+ progress.update_repository(project_key, total_processed, speed)
616
+
617
+ total_commits = sum(len(commits) for commits in daily_commits.values())
618
+ logger.info(f"Collected {total_commits} unique commits across {len(daily_commits)} days")
619
+
620
+ # Restore original environment variables
621
+ for key, value in original_env.items():
622
+ if value is None:
623
+ os.environ.pop(key, None)
624
+ else:
625
+ os.environ[key] = value
626
+
627
+ return daily_commits
280
628
 
281
629
  def _extract_commit_data(
282
630
  self, commit: Any, branch_name: str, project_key: str, repo_path: Path
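The hunk above sets a block of GIT_*/SSH_* environment variables up front and then restores them by hand at every exit path. One compact way to express that save/set/restore pattern is a context manager; this sketch is illustrative only and is not code from the package.

import os
from contextlib import contextmanager

@contextmanager
def scoped_git_env(overrides: dict[str, str]):
    saved = {key: os.environ.get(key) for key in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        # Restore exactly what was there before, dropping keys that were unset.
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

# with scoped_git_env({"GIT_TERMINAL_PROMPT": "0", "GIT_ASKPASS": "/bin/echo"}):
#     ...  # open the repository and iterate commits with prompts disabled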
@@ -317,13 +665,19 @@ class GitDataFetcher:
317
665
  line_stats = self._calculate_commit_stats(commit)
318
666
  total_insertions = line_stats["insertions"]
319
667
  total_deletions = line_stats["deletions"]
668
+ raw_insertions = line_stats.get("raw_insertions", total_insertions)
669
+ raw_deletions = line_stats.get("raw_deletions", total_deletions)
320
670
 
321
671
  commit_data.update(
322
672
  {
323
673
  "files_changed": files_changed,
324
674
  "files_changed_count": len(files_changed),
325
- "lines_added": total_insertions,
675
+ "lines_added": total_insertions, # Filtered counts for backward compatibility
326
676
  "lines_deleted": total_deletions,
677
+ "filtered_insertions": total_insertions, # Explicitly filtered counts
678
+ "filtered_deletions": total_deletions,
679
+ "raw_insertions": raw_insertions, # Raw unfiltered counts
680
+ "raw_deletions": raw_deletions,
327
681
  }
328
682
  )
329
683
 
@@ -335,6 +689,10 @@ class GitDataFetcher:
335
689
  "files_changed_count": 0,
336
690
  "lines_added": 0,
337
691
  "lines_deleted": 0,
692
+ "filtered_insertions": 0,
693
+ "filtered_deletions": 0,
694
+ "raw_insertions": 0,
695
+ "raw_deletions": 0,
338
696
  }
339
697
  )
340
698
 
@@ -363,9 +721,166 @@ class GitDataFetcher:
363
721
 
364
722
  def _should_exclude_file(self, file_path: str) -> bool:
365
723
  """Check if a file should be excluded based on exclude patterns."""
724
+ return any(self._matches_glob_pattern(file_path, pattern) for pattern in self.exclude_paths)
725
+
726
+ def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
727
+ """Check if a file path matches a glob pattern, handling ** recursion correctly.
728
+
729
+ This method properly handles different glob pattern types:
730
+ - **/vendor/** : matches files inside vendor directories at any level
731
+ - **/*.min.js : matches files with specific suffix anywhere in directory tree
732
+ - vendor/** : matches files inside vendor directory at root level only
733
+ - **pattern** : handles other complex patterns with pathlib.match()
734
+ - simple patterns : uses fnmatch for basic wildcards
735
+
736
+ Args:
737
+ filepath: The file path to check
738
+ pattern: The glob pattern to match against
739
+
740
+ Returns:
741
+ True if the file path matches the pattern, False otherwise
742
+ """
743
+ import fnmatch
744
+ import re
745
+ from pathlib import PurePath
746
+
747
+ # Handle empty or invalid inputs
748
+ if not filepath or not pattern:
749
+ return False
750
+
751
+ path = PurePath(filepath)
752
+
753
+ # Check for multiple ** patterns first (most complex)
754
+ if "**" in pattern and pattern.count("**") > 1:
755
+ # Multiple ** patterns - use custom recursive matching for complex patterns
756
+ return self._match_recursive_pattern(filepath, pattern)
757
+
758
+ # Then handle simple ** patterns
759
+ elif pattern.startswith("**/") and pattern.endswith("/**"):
760
+ # Pattern like **/vendor/** - matches files inside vendor directories at any level
761
+ dir_name = pattern[3:-3] # Extract 'vendor' from '**/vendor/**'
762
+ if not dir_name: # Handle edge case of '**/**'
763
+ return True
764
+ return dir_name in path.parts
765
+
766
+ elif pattern.startswith("**/"):
767
+ # Pattern like **/*.min.js - matches files with specific suffix anywhere
768
+ suffix_pattern = pattern[3:]
769
+ if not suffix_pattern: # Handle edge case of '**/'
770
+ return True
771
+ # Check against filename for file patterns, or any path part for directory patterns
772
+ if suffix_pattern.endswith("/"):
773
+ # Directory pattern like **/build/
774
+ dir_name = suffix_pattern[:-1]
775
+ return dir_name in path.parts
776
+ else:
777
+ # File pattern like *.min.js
778
+ return fnmatch.fnmatch(path.name, suffix_pattern)
779
+
780
+ elif pattern.endswith("/**"):
781
+ # Pattern like vendor/** or docs/build/** - matches files inside directory at root level
782
+ dir_name = pattern[:-3]
783
+ if not dir_name: # Handle edge case of '/**'
784
+ return True
785
+
786
+ # Handle both single directory names and nested paths
787
+ expected_parts = PurePath(dir_name).parts
788
+ return (
789
+ len(path.parts) >= len(expected_parts)
790
+ and path.parts[: len(expected_parts)] == expected_parts
791
+ )
792
+
793
+ elif "**" in pattern:
794
+ # Single ** pattern - use pathlib matching with fallback
795
+ try:
796
+ return path.match(pattern)
797
+ except (ValueError, TypeError):
798
+ # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
799
+ try:
800
+ return fnmatch.fnmatch(filepath, pattern)
801
+ except re.error:
802
+ # Invalid regex pattern - return False to be safe
803
+ return False
804
+ else:
805
+ # Simple pattern - use fnmatch for basic wildcards
806
+ try:
807
+ # Try matching the full path first
808
+ if fnmatch.fnmatch(filepath, pattern):
809
+ return True
810
+ # Also try matching just the filename for simple patterns
811
+ # This allows "package-lock.json" to match "src/package-lock.json"
812
+ return fnmatch.fnmatch(path.name, pattern)
813
+ except re.error:
814
+ # Invalid regex pattern - return False to be safe
815
+ return False
816
+
817
+ def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
818
+ """Handle complex patterns with multiple ** wildcards.
819
+
820
+ Args:
821
+ filepath: The file path to check
822
+ pattern: The pattern with multiple ** wildcards
823
+
824
+ Returns:
825
+ True if the path matches the pattern, False otherwise
826
+ """
366
827
  import fnmatch
828
+ from pathlib import PurePath
829
+
830
+ # Split pattern by ** to handle each segment
831
+ parts = pattern.split("**")
832
+
833
+ # Validate that we have actual segments
834
+ if not parts:
835
+ return False
836
+
837
+ # Convert filepath to parts for easier matching
838
+ path_parts = list(PurePath(filepath).parts)
839
+
840
+ # Start matching from the beginning
841
+ path_index = 0
842
+
843
+ for i, part in enumerate(parts):
844
+ if not part:
845
+ # Empty part (e.g., from leading or trailing **)
846
+ if i == 0 or i == len(parts) - 1:
847
+ # Leading or trailing ** - continue
848
+ continue
849
+ # Middle empty part (consecutive **) - match any number of path components
850
+ continue
851
+
852
+ # Clean the part (remove leading/trailing slashes)
853
+ part = part.strip("/")
367
854
 
368
- return any(fnmatch.fnmatch(file_path, pattern) for pattern in self.exclude_paths)
855
+ if not part:
856
+ continue
857
+
858
+ # Find where this part matches in the remaining path
859
+ found = False
860
+ for j in range(path_index, len(path_parts)):
861
+ # Check if the current path part matches the pattern part
862
+ if "/" in part:
863
+ # Part contains multiple path components
864
+ sub_parts = part.split("/")
865
+ if j + len(sub_parts) <= len(path_parts) and all(
866
+ fnmatch.fnmatch(path_parts[j + k], sub_parts[k])
867
+ for k in range(len(sub_parts))
868
+ ):
869
+ path_index = j + len(sub_parts)
870
+ found = True
871
+ break
872
+ else:
873
+ # Single component part
874
+ if fnmatch.fnmatch(path_parts[j], part):
875
+ path_index = j + 1
876
+ found = True
877
+ break
878
+
879
+ if not found and part:
880
+ # Required part not found in path
881
+ return False
882
+
883
+ return True
369
884
 
370
885
  def _get_branches_to_analyze(
371
886
  self, repo: Any, branch_patterns: Optional[list[str]]
@@ -373,114 +888,115 @@ class GitDataFetcher:
373
888
  """Get list of branches to analyze based on patterns.
374
889
 
375
890
  WHY: Robust branch detection that handles missing remotes, missing default branches,
376
- and provides good fallback behavior. Based on the approach used in the existing analyzer.
891
+ and provides good fallback behavior. When no patterns specified, analyzes ALL branches
892
+ to capture the complete development picture.
377
893
 
378
894
  DESIGN DECISION:
379
- - Try default branches first, fall back to all available branches
895
+ - When no patterns: analyze ALL accessible branches (not just main)
896
+ - When patterns specified: match against those patterns only
380
897
  - Handle missing remotes gracefully
381
898
  - Skip remote tracking branches to avoid duplicates
382
899
  - Use actual branch existence checking rather than assuming branches exist
900
+
901
+ THREAD SAFETY: This method is thread-safe as it doesn't modify shared state
902
+ and works with a repo instance passed as a parameter.
383
903
  """
384
- if not branch_patterns:
385
- # Get all available branches (local branches preferred)
386
- available_branches = []
904
+ # Collect all available branches (local branches preferred)
905
+ available_branches = []
387
906
 
388
- # First, try local branches
389
- try:
390
- local_branches = [branch.name for branch in repo.branches]
391
- available_branches.extend(local_branches)
392
- logger.debug(f"Found local branches: {local_branches}")
393
- except Exception as e:
394
- logger.debug(f"Error getting local branches: {e}")
907
+ # First, try local branches
908
+ try:
909
+ # THREAD SAFETY: Create a new list to avoid sharing references
910
+ local_branches = list([branch.name for branch in repo.branches])
911
+ available_branches.extend(local_branches)
912
+ logger.debug(f"Found local branches: {local_branches}")
913
+ except Exception as e:
914
+ logger.debug(f"Error getting local branches: {e}")
395
915
 
396
- # If we have remotes, also consider remote branches (but clean the names)
916
+ # If we have remotes, also consider remote branches (but clean the names)
917
+ # Skip remote branch checking if skip_remote_fetch is enabled to avoid auth prompts
918
+ if not self.skip_remote_fetch:
397
919
  try:
398
920
  if repo.remotes and hasattr(repo.remotes, "origin"):
399
- remote_branches = [
400
- ref.name.replace("origin/", "")
401
- for ref in repo.remotes.origin.refs
402
- if not ref.name.endswith("HEAD") # Skip HEAD ref
403
- ]
921
+ # THREAD SAFETY: Create a new list to avoid sharing references
922
+ remote_branches = list(
923
+ [
924
+ ref.name.replace("origin/", "")
925
+ for ref in repo.remotes.origin.refs
926
+ if not ref.name.endswith("HEAD") # Skip HEAD ref
927
+ ]
928
+ )
404
929
  # Only add remote branches that aren't already in local branches
405
930
  for branch in remote_branches:
406
931
  if branch not in available_branches:
407
932
  available_branches.append(branch)
408
933
  logger.debug(f"Found remote branches: {remote_branches}")
409
934
  except Exception as e:
410
- logger.debug(f"Error getting remote branches: {e}")
935
+ logger.debug(f"Error getting remote branches (may require authentication): {e}")
936
+ # Continue with local branches only
937
+ else:
938
+ logger.debug("Skipping remote branch enumeration (skip_remote_fetch=true)")
411
939
 
412
- # If no branches found, fallback to trying common names directly
413
- if not available_branches:
414
- logger.warning(
415
- "No branches found via normal detection, falling back to common names"
416
- )
417
- available_branches = ["main", "master", "develop", "dev"]
940
+ # If no branches found, fallback to trying common names directly
941
+ if not available_branches:
942
+ logger.warning("No branches found via normal detection, falling back to common names")
943
+ available_branches = ["main", "master", "develop", "dev"]
944
+
945
+ # Filter branches based on patterns if provided
946
+ if branch_patterns:
947
+ import fnmatch
948
+
949
+ matching_branches = []
950
+ for pattern in branch_patterns:
951
+ matching = [
952
+ branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)
953
+ ]
954
+ matching_branches.extend(matching)
955
+ # Remove duplicates while preserving order
956
+ branches_to_test = list(dict.fromkeys(matching_branches))
957
+ else:
958
+ # No patterns specified - analyze ALL branches for complete coverage
959
+ branches_to_test = available_branches
960
+ logger.info(
961
+ f"No branch patterns specified - will analyze all {len(branches_to_test)} branches"
962
+ )
418
963
 
419
- # Try default main branches first, in order of preference
964
+ # Test that branches are actually accessible
965
+ accessible_branches = []
966
+ for branch in branches_to_test:
967
+ try:
968
+ # THREAD SAFETY: Use iterator without storing intermediate results
969
+ next(iter(repo.iter_commits(branch, max_count=1)), None)
970
+ accessible_branches.append(branch)
971
+ except Exception as e:
972
+ logger.debug(f"Branch {branch} not accessible: {e}")
973
+
974
+ if not accessible_branches:
975
+ # Last resort: try to find ANY working branch
976
+ logger.warning("No accessible branches found from patterns/default, trying fallback")
420
977
  main_branches = ["main", "master", "develop", "dev"]
421
978
  for branch in main_branches:
422
979
  if branch in available_branches:
423
- # Test that we can actually access this branch
424
980
  try:
425
- # Just try to get the commit object to verify branch exists and is accessible
426
981
  next(iter(repo.iter_commits(branch, max_count=1)), None)
427
- logger.info(f"Using main branch: {branch}")
982
+ logger.info(f"Using fallback main branch: {branch}")
428
983
  return [branch]
429
- except Exception as e:
430
- logger.debug(f"Branch {branch} exists but not accessible: {e}")
984
+ except Exception:
431
985
  continue
432
986
 
433
- # If no main branches work, try the first available branch that actually works
987
+ # Try any available branch
434
988
  for branch in available_branches:
435
989
  try:
436
990
  next(iter(repo.iter_commits(branch, max_count=1)), None)
437
991
  logger.info(f"Using fallback branch: {branch}")
438
992
  return [branch]
439
- except Exception as e:
440
- logger.debug(f"Branch {branch} not accessible: {e}")
993
+ except Exception:
441
994
  continue
442
995
 
443
- # Last resort: return empty list (will be handled gracefully by caller)
444
996
  logger.warning("No accessible branches found")
445
997
  return []
446
998
 
447
- # Use specified patterns - match against all available branches
448
- import fnmatch
449
-
450
- available_branches = []
451
-
452
- # Collect all branches (local and remote)
453
- with contextlib.suppress(Exception):
454
- available_branches.extend([branch.name for branch in repo.branches])
455
-
456
- try:
457
- if repo.remotes and hasattr(repo.remotes, "origin"):
458
- remote_branches = [
459
- ref.name.replace("origin/", "")
460
- for ref in repo.remotes.origin.refs
461
- if not ref.name.endswith("HEAD")
462
- ]
463
- for branch in remote_branches:
464
- if branch not in available_branches:
465
- available_branches.append(branch)
466
- except Exception:
467
- pass
468
-
469
- # Match patterns against available branches
470
- matching_branches = []
471
- for pattern in branch_patterns:
472
- matching = [branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)]
473
- matching_branches.extend(matching)
474
-
475
- # Test that matched branches are actually accessible
476
- accessible_branches = []
477
- for branch in list(set(matching_branches)): # Remove duplicates
478
- try:
479
- next(iter(repo.iter_commits(branch, max_count=1)), None)
480
- accessible_branches.append(branch)
481
- except Exception as e:
482
- logger.debug(f"Matched branch {branch} not accessible: {e}")
483
-
999
+ logger.info(f"Will analyze {len(accessible_branches)} branches: {accessible_branches}")
484
1000
  return accessible_branches
485
1001
 
486
1002
  def _update_repository(self, repo) -> bool:
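Worked examples of the exclusion semantics that _matches_glob_pattern documents above. Here fetcher stands for any GitDataFetcher instance; the expected results follow the docstring and branch logic in this hunk rather than any test fixtures shipped with the package.

checks = [
    ("src/vendor/lib.js",     "**/vendor/**",      True),   # vendor directory at any depth
    ("vendor/lib.js",         "vendor/**",         True),   # root-level vendor only
    ("src/vendor/lib.js",     "vendor/**",         False),
    ("dist/app.min.js",       "**/*.min.js",       True),   # filename suffix, any directory
    ("src/package-lock.json", "package-lock.json", True),   # bare names also match by filename
]
for path, pattern, expected in checks:
    assert fetcher._matches_glob_pattern(path, pattern) == expected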
@@ -503,82 +1019,62 @@ class GitDataFetcher:
503
1019
  Returns:
504
1020
  bool: True if update succeeded, False if failed (but analysis continues)
505
1021
  """
506
- try:
507
- if repo.remotes:
508
- logger.info("Fetching latest changes from remote")
1022
+ # Skip remote operations if configured
1023
+ if self.skip_remote_fetch:
1024
+ logger.info("🚫 Skipping remote fetch (skip_remote_fetch=true)")
1025
+ return True
509
1026
 
510
- # Use subprocess for git fetch to prevent password prompts
511
- env = os.environ.copy()
512
- env["GIT_TERMINAL_PROMPT"] = "0"
513
- env["GIT_ASKPASS"] = ""
514
- env["GCM_INTERACTIVE"] = "never"
1027
+ # Check for stale repository (last fetch > 1 hour ago)
1028
+ self._check_repository_staleness(repo)
515
1029
 
516
- # Run git fetch with timeout
517
- try:
518
- result = subprocess.run(
519
- ["git", "fetch", "--all", "--config", "credential.helper="],
520
- cwd=repo.working_dir,
521
- env=env,
522
- capture_output=True,
523
- text=True,
524
- timeout=30, # 30 second timeout
525
- )
1030
+ try:
1031
+ # Check if we have remotes without triggering authentication
1032
+ has_remotes = False
1033
+ try:
1034
+ has_remotes = bool(repo.remotes)
1035
+ except Exception as e:
1036
+ logger.debug(f"Could not check for remotes (may require authentication): {e}")
1037
+ return True # Continue with local analysis
526
1038
 
527
- if result.returncode != 0:
528
- error_str = result.stderr.lower()
529
- if any(
530
- x in error_str
531
- for x in ["authentication", "permission denied", "401", "403"]
532
- ):
533
- logger.error(
534
- "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
535
- "You can verify with: gh auth status"
536
- )
537
- # Skip fetch but continue with local analysis
538
- return False
539
- logger.warning(f"Git fetch failed: {result.stderr}")
540
- return False
1039
+ if has_remotes:
1040
+ logger.info("Fetching latest changes from remote")
541
1041
 
542
- except subprocess.TimeoutExpired:
543
- logger.error(
544
- "Git fetch timed out (likely authentication failure). Continuing with local repository state."
1042
+ # Use our timeout wrapper for safe git operations
1043
+ repo_path = Path(repo.working_dir)
1044
+
1045
+ # Try to fetch with timeout protection
1046
+ fetch_success = self.git_wrapper.fetch_with_timeout(repo_path, timeout=30)
1047
+
1048
+ if not fetch_success:
1049
+ # Mark this repository as having authentication issues if applicable
1050
+ if hasattr(self, "repository_status"):
1051
+ for key in self.repository_status:
1052
+ if repo.working_dir.endswith(key) or key in repo.working_dir:
1053
+ self.repository_status[key]["remote_update"] = "failed"
1054
+ break
1055
+
1056
+ # Explicit warning to user about stale data
1057
+ logger.warning(
1058
+ f"❌ Failed to fetch updates for {repo_path.name}. "
1059
+ f"Analysis will use potentially stale local data. "
1060
+ f"Check authentication or network connectivity."
545
1061
  )
546
1062
  return False
1063
+ else:
1064
+ # Explicit success confirmation
1065
+ logger.info(f"✅ Successfully fetched updates for {repo_path.name}")
547
1066
 
548
1067
  # Only try to pull if not in detached HEAD state
549
1068
  if not repo.head.is_detached:
550
1069
  current_branch = repo.active_branch
551
1070
  tracking = current_branch.tracking_branch()
552
1071
  if tracking:
553
- # Pull latest changes using subprocess
554
- try:
555
- result = subprocess.run(
556
- ["git", "pull", "--config", "credential.helper="],
557
- cwd=repo.working_dir,
558
- env=env,
559
- capture_output=True,
560
- text=True,
561
- timeout=30,
562
- )
563
-
564
- if result.returncode != 0:
565
- error_str = result.stderr.lower()
566
- if any(
567
- x in error_str
568
- for x in ["authentication", "permission denied", "401", "403"]
569
- ):
570
- logger.error(
571
- "GitHub authentication failed during pull. Continuing with local repository state."
572
- )
573
- return False
574
- logger.warning(f"Git pull failed: {result.stderr}")
575
- return False
1072
+ # Pull latest changes using timeout wrapper
1073
+ pull_success = self.git_wrapper.pull_with_timeout(repo_path, timeout=30)
1074
+ if pull_success:
576
1075
  logger.debug(f"Pulled latest changes for {current_branch.name}")
577
-
578
- except subprocess.TimeoutExpired:
579
- logger.error(
580
- "Git pull timed out. Continuing with local repository state."
581
- )
1076
+ else:
1077
+ logger.warning("Git pull failed, continuing with fetched state")
582
1078
  return False
583
1079
  else:
584
1080
  logger.debug(
@@ -595,6 +1091,114 @@ class GitDataFetcher:
595
1091
  # Continue with analysis using local state
596
1092
  return False
597
1093
 
1094
+ def _check_repository_staleness(self, repo) -> None:
1095
+ """Check if repository hasn't been fetched recently and warn user.
1096
+
1097
+ Args:
1098
+ repo: GitPython Repo object
1099
+ """
1100
+ try:
1101
+ repo_path = Path(repo.working_dir)
1102
+ fetch_head_path = repo_path / ".git" / "FETCH_HEAD"
1103
+
1104
+ if fetch_head_path.exists():
1105
+ # Get last fetch time from FETCH_HEAD modification time
1106
+ last_fetch_time = datetime.fromtimestamp(
1107
+ fetch_head_path.stat().st_mtime, tz=timezone.utc
1108
+ )
1109
+ now = datetime.now(timezone.utc)
1110
+ hours_since_fetch = (now - last_fetch_time).total_seconds() / 3600
1111
+
1112
+ if hours_since_fetch > 1:
1113
+ logger.warning(
1114
+ f"⏰ Repository {repo_path.name} last fetched {hours_since_fetch:.1f} hours ago. "
1115
+ f"Data may be stale."
1116
+ )
1117
+ else:
1118
+ logger.warning(
1119
+ f"⚠️ Repository {repo_path.name} has never been fetched. "
1120
+ f"Will attempt to fetch now."
1121
+ )
1122
+ except Exception as e:
1123
+ logger.debug(f"Could not check repository staleness: {e}")
1124
+
1125
+ def _check_repository_security(self, repo_path: Path, project_key: str) -> None:
1126
+ """Check for security issues in repository configuration.
1127
+
1128
+ Warns about:
1129
+ - Exposed tokens in remote URLs
1130
+ - Insecure credential storage
1131
+ """
1132
+
1133
+ try:
1134
+ # Check for tokens in remote URLs
1135
+ result = subprocess.run(
1136
+ ["git", "remote", "-v"],
1137
+ cwd=repo_path,
1138
+ capture_output=True,
1139
+ text=True,
1140
+ timeout=2,
1141
+ env={"GIT_TERMINAL_PROMPT": "0"},
1142
+ )
1143
+
1144
+ if result.returncode == 0:
1145
+ output = result.stdout
1146
+ # Check for various token patterns in URLs
1147
+ token_patterns = [
1148
+ r"https://[^@]*@", # Any HTTPS URL with embedded credentials
1149
+ r"ghp_[a-zA-Z0-9]+", # GitHub Personal Access Token
1150
+ r"ghs_[a-zA-Z0-9]+", # GitHub Server Token
1151
+ r"github_pat_[a-zA-Z0-9]+", # New GitHub PAT format
1152
+ ]
1153
+
1154
+ for pattern in token_patterns:
1155
+ import re
1156
+
1157
+ if re.search(pattern, output):
1158
+ logger.warning(
1159
+ f"⚠️ SECURITY WARNING for {project_key}: "
1160
+ f"Repository appears to have credentials in remote URL. "
1161
+ f"This is a security risk! Consider using: "
1162
+ f"1) GitHub CLI (gh auth login), "
1163
+ f"2) SSH keys, or "
1164
+ f"3) Git credential manager instead."
1165
+ )
1166
+ break
1167
+ except:
1168
+ # Don't fail analysis due to security check
1169
+ pass
1170
+
1171
+ def get_repository_status_summary(self) -> dict[str, Any]:
1172
+ """Get a summary of repository fetch status.
1173
+
1174
+ Returns:
1175
+ Dictionary with status summary including any repositories with issues
1176
+ """
1177
+ summary = {
1178
+ "total_repositories": len(self.repository_status),
1179
+ "successful_updates": 0,
1180
+ "failed_updates": 0,
1181
+ "skipped_updates": 0,
1182
+ "authentication_issues": [],
1183
+ "errors": [],
1184
+ }
1185
+
1186
+ for project_key, status in self.repository_status.items():
1187
+ if status["remote_update"] == "success":
1188
+ summary["successful_updates"] += 1
1189
+ elif status["remote_update"] == "failed":
1190
+ summary["failed_updates"] += 1
1191
+ if status.get("authentication_issues"):
1192
+ summary["authentication_issues"].append(project_key)
1193
+ elif status["remote_update"] == "skipped":
1194
+ summary["skipped_updates"] += 1
1195
+ elif status["remote_update"] == "error":
1196
+ summary["errors"].append(
1197
+ {"repository": project_key, "error": status.get("error", "Unknown error")}
1198
+ )
1199
+
1200
+ return summary
1201
+
598
1202
  def _extract_all_ticket_references(
599
1203
  self, daily_commits: dict[str, list[dict[str, Any]]]
600
1204
  ) -> set[str]:
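The repository_status dictionary populated in this file is surfaced through get_repository_status_summary above; a caller might consume it roughly as follows. The key names come from that method, while fetcher and the print wording are illustrative.

summary = fetcher.get_repository_status_summary()
print(
    f"{summary['successful_updates']}/{summary['total_repositories']} repositories updated, "
    f"{summary['failed_updates']} failed, {summary['skipped_updates']} skipped"
)
for repo_key in summary["authentication_issues"]:
    print(f"  authentication issue: {repo_key}")
for item in summary["errors"]:
    print(f"  {item['repository']}: {item['error']}")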
@@ -872,8 +1476,20 @@ class GitDataFetcher:
872
1476
  "branch": commit.get("branch", "main"),
873
1477
  "is_merge": commit.get("is_merge", False),
874
1478
  "files_changed_count": commit.get("files_changed_count", 0),
875
- "insertions": commit.get("lines_added", 0),
876
- "deletions": commit.get("lines_deleted", 0),
1479
+ # Store raw unfiltered values in insertions/deletions
1480
+ "insertions": commit.get(
1481
+ "raw_insertions", commit.get("lines_added", 0)
1482
+ ),
1483
+ "deletions": commit.get(
1484
+ "raw_deletions", commit.get("lines_deleted", 0)
1485
+ ),
1486
+ # Store filtered values separately
1487
+ "filtered_insertions": commit.get(
1488
+ "filtered_insertions", commit.get("lines_added", 0)
1489
+ ),
1490
+ "filtered_deletions": commit.get(
1491
+ "filtered_deletions", commit.get("lines_deleted", 0)
1492
+ ),
877
1493
  "story_points": commit.get("story_points"),
878
1494
  "ticket_references": commit.get("ticket_references", []),
879
1495
  }
@@ -1146,24 +1762,48 @@ class GitDataFetcher:
1146
1762
  session.close()
1147
1763
 
1148
1764
  def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
1149
- """Calculate commit statistics using reliable git diff --numstat.
1765
+ """Calculate commit statistics using reliable git diff --numstat with exclude_paths filtering.
1150
1766
 
1151
1767
  Returns:
1152
- Dictionary with 'files', 'insertions', and 'deletions' counts
1768
+ Dictionary with both raw and filtered statistics:
1769
+ - 'files', 'insertions', 'deletions': filtered counts
1770
+ - 'raw_insertions', 'raw_deletions': unfiltered counts
1771
+
1772
+ THREAD SAFETY: This method is thread-safe as it works with commit objects
1773
+ that have their own repo references.
1153
1774
  """
1154
1775
  stats = {"files": 0, "insertions": 0, "deletions": 0}
1155
1776
 
1777
+ # Track raw stats for storage
1778
+ raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
1779
+ excluded_stats = {"files": 0, "insertions": 0, "deletions": 0}
1780
+
1156
1781
  # For initial commits or commits without parents
1157
1782
  parent = commit.parents[0] if commit.parents else None
1158
1783
 
1159
1784
  try:
1160
- # Use git command directly for accurate line counts
1785
+ # THREAD SAFETY: Use the repo reference from the commit object
1786
+ # Each thread has its own commit object with its own repo reference
1161
1787
  repo = commit.repo
1162
- if parent:
1163
- diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
1164
- else:
1165
- # Initial commit - use git show with --numstat
1166
- diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
1788
+ Path(repo.working_dir)
1789
+
1790
+ def get_diff_output():
1791
+ if parent:
1792
+ return repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
1793
+ else:
1794
+ # Initial commit - use git show with --numstat
1795
+ return repo.git.show(commit.hexsha, "--numstat", "--format=")
1796
+
1797
+ # Use timeout wrapper for git diff operations
1798
+ try:
1799
+ diff_output = self.git_wrapper.run_with_timeout(
1800
+ get_diff_output,
1801
+ timeout=10, # 10 seconds for diff operations
1802
+ operation_name=f"diff_{commit.hexsha[:8]}",
1803
+ )
1804
+ except GitOperationTimeout:
1805
+ logger.warning(f"⏱️ Timeout calculating stats for commit {commit.hexsha[:8]}")
1806
+ return stats # Return zeros
1167
1807
 
1168
1808
  # Parse the numstat output: insertions\tdeletions\tfilename
1169
1809
  for line in diff_output.strip().split("\n"):
@@ -1175,9 +1815,22 @@ class GitDataFetcher:
1175
1815
  try:
1176
1816
  insertions = int(parts[0]) if parts[0] != "-" else 0
1177
1817
  deletions = int(parts[1]) if parts[1] != "-" else 0
1178
- # filename = parts[2] - we don't filter here since files_changed list is handled separately
1179
-
1180
- # Count all changes (raw stats, no filtering)
1818
+ filename = parts[2]
1819
+
1820
+ # Always count raw stats
1821
+ raw_stats["files"] += 1
1822
+ raw_stats["insertions"] += insertions
1823
+ raw_stats["deletions"] += deletions
1824
+
1825
+ # Skip excluded files based on exclude_paths patterns
1826
+ if self._should_exclude_file(filename):
1827
+ logger.debug(f"Excluding file from line counts: {filename}")
1828
+ excluded_stats["files"] += 1
1829
+ excluded_stats["insertions"] += insertions
1830
+ excluded_stats["deletions"] += deletions
1831
+ continue
1832
+
1833
+ # Count only non-excluded files and their changes
1181
1834
  stats["files"] += 1
1182
1835
  stats["insertions"] += insertions
1183
1836
  stats["deletions"] += deletions
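A standalone illustration of the git diff --numstat parsing performed above. The sample output string is made up; the format it exercises (insertions, a tab, deletions, a tab, then the path, with "-" marking binary files) is the one this method parses.

sample = "12\t3\tsrc/app.py\n-\t-\tassets/logo.png\n410\t0\tpackage-lock.json"

totals = {"files": 0, "insertions": 0, "deletions": 0}
for line in sample.strip().split("\n"):
    parts = line.split("\t")
    if len(parts) != 3:
        continue
    insertions = int(parts[0]) if parts[0] != "-" else 0   # "-" marks a binary file
    deletions = int(parts[1]) if parts[1] != "-" else 0
    totals["files"] += 1
    totals["insertions"] += insertions
    totals["deletions"] += deletions

print(totals)  # {'files': 3, 'insertions': 422, 'deletions': 3}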
@@ -1186,8 +1839,381 @@ class GitDataFetcher:
                         # Skip binary files or malformed lines
                         continue
 
+            # Log exclusion statistics if significant
+            if excluded_stats["files"] > 0 or (
+                raw_stats["insertions"] > 0 and stats["insertions"] < raw_stats["insertions"]
+            ):
+                reduction_pct = (
+                    100 * (1 - stats["insertions"] / raw_stats["insertions"])
+                    if raw_stats["insertions"] > 0
+                    else 0
+                )
+                logger.info(
+                    f"Commit {commit.hexsha[:8]}: Excluded {excluded_stats['files']} files, "
+                    f"{excluded_stats['insertions']} insertions, {excluded_stats['deletions']} deletions "
+                    f"({reduction_pct:.1f}% reduction)"
+                )
+
+            # Log if exclusions are configured
+            if self.exclude_paths and raw_stats["files"] > 0:
+                logger.debug(
+                    f"Commit {commit.hexsha[:8]}: Applied {len(self.exclude_paths)} exclusion patterns. "
+                    f"Raw: {raw_stats['files']} files, +{raw_stats['insertions']} -{raw_stats['deletions']}. "
+                    f"Filtered: {stats['files']} files, +{stats['insertions']} -{stats['deletions']}"
+                )
+
         except Exception as e:
             # Log the error for debugging but don't crash
             logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")
 
+        # Return both raw and filtered stats
+        stats["raw_insertions"] = raw_stats["insertions"]
+        stats["raw_deletions"] = raw_stats["deletions"]
         return stats
+
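
The filtering added above boils down to one pass over `git diff --numstat` output with two sets of running totals, raw and filtered. Below is a minimal, self-contained sketch of that idea; it assumes fnmatch-style exclude patterns, whereas the package's own `_should_exclude_file` may match paths differently.

from fnmatch import fnmatch


def split_numstat(numstat_output: str, exclude_paths: list[str]) -> dict[str, int]:
    """Parse "insertions<TAB>deletions<TAB>filename" lines into raw and filtered totals."""
    totals = {"files": 0, "insertions": 0, "deletions": 0, "raw_insertions": 0, "raw_deletions": 0}
    for line in numstat_output.strip().splitlines():
        parts = line.split("\t")
        if len(parts) != 3:
            continue
        ins = int(parts[0]) if parts[0] != "-" else 0  # "-" marks binary files
        dels = int(parts[1]) if parts[1] != "-" else 0
        totals["raw_insertions"] += ins
        totals["raw_deletions"] += dels
        if any(fnmatch(parts[2], pattern) for pattern in exclude_paths):
            continue  # excluded paths are dropped from the filtered counts only
        totals["files"] += 1
        totals["insertions"] += ins
        totals["deletions"] += dels
    return totals


# A lockfile churning 5000 lines inflates the raw totals but not the filtered ones.
print(split_numstat("10\t2\tsrc/app.py\n5000\t0\tpackage-lock.json", ["package-lock.json", "vendor/*"]))

Keeping both totals lets downstream reports decide whether vendored or generated churn should count toward a developer's line counts.
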
+    def _store_day_commits_incremental(
+        self, repo_path: Path, date_str: str, commits: list[dict[str, Any]], project_key: str
+    ) -> None:
+        """Store commits for a single day incrementally to enable progress tracking.
+
+        This method stores commits immediately after fetching them for a day,
+        allowing for better progress tracking and recovery from interruptions.
+
+        Args:
+            repo_path: Path to the repository
+            date_str: Date string in YYYY-MM-DD format
+            commits: List of commit data for the day
+            project_key: Project identifier
+        """
+        try:
+            # Transform commits to cache format
+            cache_format_commits = []
+            for commit in commits:
+                cache_format_commit = {
+                    "hash": commit["commit_hash"],
+                    "author_name": commit.get("author_name", ""),
+                    "author_email": commit.get("author_email", ""),
+                    "message": commit.get("message", ""),
+                    "timestamp": commit["timestamp"],
+                    "branch": commit.get("branch", "main"),
+                    "is_merge": commit.get("is_merge", False),
+                    "files_changed_count": commit.get("files_changed_count", 0),
+                    # Store raw unfiltered values
+                    "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
+                    "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
+                    # Store filtered values
+                    "filtered_insertions": commit.get(
+                        "filtered_insertions", commit.get("lines_added", 0)
+                    ),
+                    "filtered_deletions": commit.get(
+                        "filtered_deletions", commit.get("lines_deleted", 0)
+                    ),
+                    "story_points": commit.get("story_points"),
+                    "ticket_references": commit.get("ticket_references", []),
+                }
+                cache_format_commits.append(cache_format_commit)
+
+            # Use bulk store for efficiency
+            if cache_format_commits:
+                bulk_stats = self.cache.bulk_store_commits(str(repo_path), cache_format_commits)
+                logger.debug(
+                    f"Incrementally stored {bulk_stats['inserted']} commits for {date_str} "
+                    f"({bulk_stats['skipped']} already cached)"
+                )
+        except Exception as e:
+            # Log error but don't fail - commits will be stored again in batch at the end
+            logger.warning(f"Failed to incrementally store commits for {date_str}: {e}")
+
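
The only subtle part of this cache mapping is the fallback chain for line counts: raw values fall back to the legacy lines_added/lines_deleted fields when no raw figure was recorded, and the filtered fields do the same. A trimmed sketch of just that mapping (key names as in the diff; everything else is illustrative):

from typing import Any


def to_cache_row(commit: dict[str, Any]) -> dict[str, Any]:
    # Mirrors the commit.get(...) fallback chains used above.
    return {
        "hash": commit["commit_hash"],
        "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
        "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
        "filtered_insertions": commit.get("filtered_insertions", commit.get("lines_added", 0)),
        "filtered_deletions": commit.get("filtered_deletions", commit.get("lines_deleted", 0)),
    }


# Older records that only carry lines_added/lines_deleted still produce both column sets.
row = to_cache_row({"commit_hash": "abc1234", "lines_added": 12, "lines_deleted": 3})
print(row["insertions"], row["filtered_insertions"])  # 12 12
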
+    def process_repositories_parallel(
+        self,
+        repositories: list[dict],
+        weeks_back: int = 4,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        max_workers: int = 3,
+    ) -> dict[str, Any]:
+        """Process multiple repositories in parallel with proper timeout protection.
+
+        Args:
+            repositories: List of repository configurations
+            weeks_back: Number of weeks to analyze
+            jira_integration: Optional JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            max_workers: Maximum number of parallel workers
+
+        Returns:
+            Dictionary containing processing results and statistics
+        """
+        logger.info(
+            f"🚀 Starting parallel processing of {len(repositories)} repositories with {max_workers} workers"
+        )
+
+        # Initialize statistics
+        self.processing_stats = {
+            "total": len(repositories),
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
+        # Get progress service for updates
+        progress = get_progress_service()
+
+        # Start heartbeat logger for monitoring
+        with HeartbeatLogger(interval=5):
+            results = {}
+
+            # Use ThreadPoolExecutor for parallel processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all repository processing tasks
+                future_to_repo = {}
+
+                for repo_config in repositories:
+                    repo_path = Path(repo_config.get("path", ""))
+                    project_key = repo_config.get("project_key", repo_path.name)
+                    branch_patterns = repo_config.get("branch_patterns")
+
+                    # Submit task with timeout wrapper
+                    future = executor.submit(
+                        self._process_repository_with_timeout,
+                        repo_path,
+                        project_key,
+                        weeks_back,
+                        branch_patterns,
+                        jira_integration,
+                        start_date,
+                        end_date,
+                    )
+                    future_to_repo[future] = {
+                        "path": repo_path,
+                        "project_key": project_key,
+                        "start_time": time.time(),
+                    }
+
+                    logger.info(f"📋 Submitted {project_key} for processing")
+
+                # Process results as they complete
+                for future in as_completed(future_to_repo):
+                    repo_info = future_to_repo[future]
+                    project_key = repo_info["project_key"]
+                    elapsed_time = time.time() - repo_info["start_time"]
+
+                    try:
+                        result = future.result(timeout=5)  # Short timeout to get result
+
+                        if result:
+                            self.processing_stats["success"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "success",
+                                "elapsed_time": elapsed_time,
+                                "commits": result.get("stats", {}).get("total_commits", 0),
+                                "tickets": result.get("stats", {}).get("unique_tickets", 0),
+                            }
+                            results[project_key] = result
+
+                            logger.info(
+                                f"✅ {project_key}: Successfully processed "
+                                f"{result['stats']['total_commits']} commits in {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(project_key, success=True)
+                                else:
+                                    progress.finish_repository(project_key, success=True)
+                        else:
+                            self.processing_stats["failed"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "failed",
+                                "elapsed_time": elapsed_time,
+                                "error": "Processing returned no result",
+                            }
+
+                            logger.error(
+                                f"❌ {project_key}: Processing failed after {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+                                else:
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+
+                    except GitOperationTimeout:
+                        self.processing_stats["timeout"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "timeout",
+                            "elapsed_time": elapsed_time,
+                            "error": "Operation timed out",
+                        }
+
+                        logger.error(f"⏱️ {project_key}: Timed out after {elapsed_time:.1f}s")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+
+                    except Exception as e:
+                        self.processing_stats["failed"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "failed",
+                            "elapsed_time": elapsed_time,
+                            "error": str(e),
+                        }
+
+                        logger.error(f"❌ {project_key}: Error after {elapsed_time:.1f}s - {e}")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+
+                    finally:
+                        # Update processed counter BEFORE logging and progress updates
+                        self.processing_stats["processed"] += 1
+
+                        # Update progress service with actual processing stats
+                        if hasattr(progress, "update_stats"):
+                            progress.update_stats(
+                                processed=self.processing_stats["processed"],
+                                success=self.processing_stats["success"],
+                                failed=self.processing_stats["failed"],
+                                timeout=self.processing_stats["timeout"],
+                                total=self.processing_stats["total"],
+                            )
+
+                        # Log progress
+                        logger.info(
+                            f"📊 Progress: {self.processing_stats['processed']}/{self.processing_stats['total']} repositories "
+                            f"(✅ {self.processing_stats['success']} | ❌ {self.processing_stats['failed']} | ⏱️ {self.processing_stats['timeout']})"
+                        )
+
+        # Final summary
+        logger.info("=" * 60)
+        logger.info("📈 PARALLEL PROCESSING SUMMARY")
+        logger.info(f" Total repositories: {self.processing_stats['total']}")
+        logger.info(f" Successfully processed: {self.processing_stats['success']}")
+        logger.info(f" Failed: {self.processing_stats['failed']}")
+        logger.info(f" Timed out: {self.processing_stats['timeout']}")
+        logger.info("=" * 60)
+
+        return {"results": results, "statistics": self.processing_stats}
+
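
Stripped of progress reporting, the orchestration added above is the standard submit / as_completed fan-out, with each future's outcome bucketed as success, failure, or timeout. A compact sketch of that pattern (the names below are illustrative stand-ins, not the package's API):

from concurrent.futures import ThreadPoolExecutor, as_completed


class OperationTimeout(Exception):
    """Stand-in for the package's GitOperationTimeout."""


def fan_out(tasks: dict, worker, max_workers: int = 3) -> dict:
    # One counter per outcome plus a per-task record, like processing_stats above.
    stats = {"success": 0, "failed": 0, "timeout": 0, "repositories": {}}
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(worker, cfg): name for name, cfg in tasks.items()}
        for future in as_completed(futures):
            name = futures[future]
            try:
                result = future.result()
                outcome = "success" if result else "failed"
                if result:
                    results[name] = result
            except OperationTimeout:
                outcome = "timeout"
            except Exception:
                outcome = "failed"
            stats[outcome] += 1
            stats["repositories"][name] = outcome
    return {"results": results, "statistics": stats}


print(fan_out({"repo-a": {}, "repo-b": {}}, lambda cfg: {"stats": {"total_commits": 0}}))

Because the per-repository worker below re-raises GitOperationTimeout, timeouts surface in this loop as their own bucket rather than being folded into generic failures.
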
+    def _process_repository_with_timeout(
+        self,
+        repo_path: Path,
+        project_key: str,
+        weeks_back: int = 4,
+        branch_patterns: Optional[list[str]] = None,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        timeout_per_operation: int = 30,
+    ) -> Optional[dict[str, Any]]:
+        """Process a single repository with comprehensive timeout protection.
+
+        Args:
+            repo_path: Path to the repository
+            project_key: Project identifier
+            weeks_back: Number of weeks to analyze
+            branch_patterns: Branch patterns to include
+            jira_integration: JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            timeout_per_operation: Timeout for individual git operations
+
+        Returns:
+            Repository processing results or None if failed
+        """
+        try:
+            # Track this repository in progress
+            progress = get_progress_service()
+            if hasattr(progress, "start_repository"):
+                progress.start_repository(project_key, 0)
+
+            logger.info(f"🔍 Processing repository: {project_key} at {repo_path}")
+
+            # Use the regular fetch method but with timeout wrapper active
+            with self.git_wrapper.operation_tracker("fetch_repository_data", repo_path):
+                result = self.fetch_repository_data(
+                    repo_path=repo_path,
+                    project_key=project_key,
+                    weeks_back=weeks_back,
+                    branch_patterns=branch_patterns,
+                    jira_integration=jira_integration,
+                    progress_callback=None,  # We handle progress at a higher level
+                    start_date=start_date,
+                    end_date=end_date,
+                )
+
+            return result
+
+        except GitOperationTimeout as e:
+            logger.error(f"⏱️ Repository {project_key} processing timed out: {e}")
+            raise
+
+        except Exception as e:
+            logger.error(f"❌ Error processing repository {project_key}: {e}")
+            import traceback
+
+            logger.debug(f"Full traceback: {traceback.format_exc()}")
+            return None
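
Note that the progress object in both new methods is duck-typed: the hasattr() checks mean any adapter exposing some subset of start_repository, update_stats, and finish_repository will work. A minimal compatible adapter might look like the sketch below, written against the call sites shown in this diff rather than the actual progress-service interface definition:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("progress")


class LoggingProgressAdapter:
    """Minimal duck-typed adapter: implements only the methods the fetcher probes for."""

    def start_repository(self, project_key: str, total_items: int) -> None:
        logger.info("started %s (%d items)", project_key, total_items)

    def update_stats(self, **counts: int) -> None:
        logger.info("stats: %s", counts)

    def finish_repository(self, project_key: str, success: bool, error_message: str = "") -> None:
        status = "ok" if success else f"failed: {error_message}"
        logger.info("finished %s (%s)", project_key, status)


# Any object with this shape passes the hasattr() checks; omitting update_stats simply
# routes the calling code through the plain finish_repository branch.
adapter = LoggingProgressAdapter()
adapter.update_stats(processed=1, success=1, failed=0, timeout=0, total=3)
adapter.finish_repository("repo-a", success=True)
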