gitflow-analytics 1.3.11__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/batch_classifier.py +156 -4
  3. gitflow_analytics/cli.py +803 -135
  4. gitflow_analytics/config/loader.py +39 -1
  5. gitflow_analytics/config/schema.py +1 -0
  6. gitflow_analytics/core/cache.py +20 -0
  7. gitflow_analytics/core/data_fetcher.py +1051 -117
  8. gitflow_analytics/core/git_auth.py +169 -0
  9. gitflow_analytics/core/git_timeout_wrapper.py +347 -0
  10. gitflow_analytics/core/metrics_storage.py +12 -3
  11. gitflow_analytics/core/progress.py +219 -18
  12. gitflow_analytics/core/subprocess_git.py +145 -0
  13. gitflow_analytics/extractors/ml_tickets.py +3 -2
  14. gitflow_analytics/extractors/tickets.py +93 -8
  15. gitflow_analytics/integrations/jira_integration.py +1 -1
  16. gitflow_analytics/integrations/orchestrator.py +47 -29
  17. gitflow_analytics/metrics/branch_health.py +3 -2
  18. gitflow_analytics/models/database.py +72 -1
  19. gitflow_analytics/pm_framework/adapters/jira_adapter.py +12 -5
  20. gitflow_analytics/pm_framework/orchestrator.py +8 -3
  21. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +24 -4
  22. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +3 -1
  23. gitflow_analytics/qualitative/core/llm_fallback.py +34 -2
  24. gitflow_analytics/reports/narrative_writer.py +118 -74
  25. gitflow_analytics/security/__init__.py +11 -0
  26. gitflow_analytics/security/config.py +189 -0
  27. gitflow_analytics/security/extractors/__init__.py +7 -0
  28. gitflow_analytics/security/extractors/dependency_checker.py +379 -0
  29. gitflow_analytics/security/extractors/secret_detector.py +197 -0
  30. gitflow_analytics/security/extractors/vulnerability_scanner.py +333 -0
  31. gitflow_analytics/security/llm_analyzer.py +347 -0
  32. gitflow_analytics/security/reports/__init__.py +5 -0
  33. gitflow_analytics/security/reports/security_report.py +358 -0
  34. gitflow_analytics/security/security_analyzer.py +414 -0
  35. gitflow_analytics/tui/app.py +3 -1
  36. gitflow_analytics/tui/progress_adapter.py +313 -0
  37. gitflow_analytics/tui/screens/analysis_progress_screen.py +407 -46
  38. gitflow_analytics/tui/screens/results_screen.py +219 -206
  39. gitflow_analytics/ui/__init__.py +21 -0
  40. gitflow_analytics/ui/progress_display.py +1477 -0
  41. gitflow_analytics/verify_activity.py +697 -0
  42. {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/METADATA +2 -1
  43. {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/RECORD +47 -31
  44. gitflow_analytics/cli_rich.py +0 -503
  45. {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/WHEEL +0 -0
  46. {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/entry_points.txt +0 -0
  47. {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/licenses/LICENSE +0 -0
  48. {gitflow_analytics-1.3.11.dist-info → gitflow_analytics-3.3.0.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/data_fetcher.py
@@ -8,10 +8,14 @@ without performing any LLM-based classification.
 import logging
 import os
 import subprocess
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any, Optional
 
+from sqlalchemy import func
 from sqlalchemy.orm import Session
 
 from ..extractors.story_points import StoryPointExtractor
@@ -25,11 +29,17 @@ from ..models.database import (
 )
 from .branch_mapper import BranchToProjectMapper
 from .cache import GitAnalysisCache
+from .git_timeout_wrapper import GitOperationTimeout, GitTimeoutWrapper, HeartbeatLogger
 from .identity import DeveloperIdentityResolver
 from .progress import get_progress_service
 
 logger = logging.getLogger(__name__)
 
+# THREAD SAFETY: Module-level thread-local storage for repository instances
+# Each thread gets its own isolated storage to prevent thread-safety issues
+# when GitDataFetcher is called from ThreadPoolExecutor
+_thread_local = threading.local()
+
 
 class GitDataFetcher:
     """Fetches raw Git commit data and organizes it by day for efficient batch processing.
@@ -48,6 +58,7 @@ class GitDataFetcher:
         branch_mapping_rules: Optional[dict[str, list[str]]] = None,
         allowed_ticket_platforms: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
+        skip_remote_fetch: bool = False,
     ):
         """Initialize the data fetcher.
 
@@ -56,8 +67,11 @@
             branch_mapping_rules: Rules for mapping branches to projects
             allowed_ticket_platforms: List of allowed ticket platforms
             exclude_paths: List of file paths to exclude from analysis
+            skip_remote_fetch: If True, skip git fetch/pull operations
         """
         self.cache = cache
+        self.skip_remote_fetch = skip_remote_fetch
+        self.repository_status = {}  # Track status of each repository
         # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
         self.database = cache.db
         self.story_point_extractor = StoryPointExtractor()
@@ -65,10 +79,35 @@
         self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
         self.exclude_paths = exclude_paths or []
 
+        # Log exclusion configuration
+        if self.exclude_paths:
+            logger.info(
+                f"GitDataFetcher initialized with {len(self.exclude_paths)} exclusion patterns:"
+            )
+            for pattern in self.exclude_paths[:5]:  # Show first 5 patterns
+                logger.debug(f"  - {pattern}")
+            if len(self.exclude_paths) > 5:
+                logger.debug(f"  ... and {len(self.exclude_paths) - 5} more patterns")
+        else:
+            logger.info("GitDataFetcher initialized with no file exclusions")
+
         # Initialize identity resolver
         identity_db_path = cache.cache_dir / "identities.db"
         self.identity_resolver = DeveloperIdentityResolver(identity_db_path)
 
+        # Initialize git timeout wrapper for safe operations
+        self.git_wrapper = GitTimeoutWrapper(default_timeout=30)
+
+        # Statistics for tracking repository processing
+        self.processing_stats = {
+            "total": 0,
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
     def fetch_repository_data(
         self,
         repo_path: Path,
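`GitTimeoutWrapper` itself lives in the new `core/git_timeout_wrapper.py` (+347 lines) and is not shown in this diff; the call sites below rely on `run_with_timeout(fn, timeout=..., operation_name=...)` raising `GitOperationTimeout`. A rough sketch of that contract, assuming a thread-based implementation (the shipped module may work differently):

```python
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout

class GitOperationTimeout(Exception):
    """Raised when a git operation exceeds its time budget."""

def run_with_timeout(fn, timeout: float = 30, operation_name: str = "git_op"):
    # Run the blocking callable in a single-worker pool and bound the wait.
    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(fn)
    try:
        return future.result(timeout=timeout)
    except FutureTimeout as exc:
        raise GitOperationTimeout(f"{operation_name} exceeded {timeout}s") from exc
    finally:
        pool.shutdown(wait=False)  # never block on a worker stuck in a hung git call
```

Note the usual caveat with any thread-based timeout: the underlying git call keeps running in its worker thread after the caller gives up.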
@@ -118,19 +157,37 @@
         # Get progress service for top-level progress tracking
         progress = get_progress_service()
 
+        # Start Rich display for this repository if enabled
+        if hasattr(progress, "_use_rich") and progress._use_rich:
+            # Count total commits for progress estimation
+            try:
+                import git
+
+                git.Repo(repo_path)
+                # Check if we need to clone or pull
+                if not repo_path.exists() or not (repo_path / ".git").exists():
+                    logger.info(f"📥 Repository {project_key} needs cloning")
+                    progress.start_repository(f"{project_key} (cloning)", 0)
+                else:
+                    # Rough estimate based on weeks
+                    estimated_commits = weeks_back * 50  # Estimate ~50 commits per week
+                    progress.start_repository(project_key, estimated_commits)
+            except Exception:
+                progress.start_repository(project_key, 100)  # Default estimate
+
         # Step 1: Collect all commits organized by day with enhanced progress tracking
         logger.info("🔍 DEBUG: About to fetch commits by day")
-        logger.info("Fetching commits organized by day...")
+        logger.info(f"Fetching commits organized by day for repository: {project_key}")
 
         # Create top-level progress for this repository
         with progress.progress(
             total=3,  # Three main steps: fetch commits, extract tickets, store data
-            description=f"Processing {project_key}",
+            description=f"📊 Processing repository: {project_key}",
             unit="steps",
         ) as repo_progress_ctx:
 
             # Step 1: Fetch commits
-            progress.set_description(repo_progress_ctx, f"{project_key}: Fetching commits")
+            progress.set_description(repo_progress_ctx, f"🔍 {project_key}: Fetching commits")
             daily_commits = self._fetch_commits_by_day(
                 repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
             )
@@ -138,25 +195,27 @@
             progress.update(repo_progress_ctx)
 
             # Step 2: Extract and fetch all referenced tickets
-            progress.set_description(repo_progress_ctx, f"{project_key}: Processing tickets")
+            progress.set_description(repo_progress_ctx, f"🎫 {project_key}: Processing tickets")
             logger.info("🔍 DEBUG: About to extract ticket references")
-            logger.info("Extracting ticket references...")
+            logger.info(f"Extracting ticket references for {project_key}...")
             ticket_ids = self._extract_all_ticket_references(daily_commits)
             logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")
 
             if jira_integration and ticket_ids:
-                logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
+                logger.info(
+                    f"Fetching {len(ticket_ids)} unique tickets from JIRA for {project_key}..."
+                )
                 self._fetch_detailed_tickets(
                     ticket_ids, jira_integration, project_key, progress_callback
                 )
 
             # Build commit-ticket correlations
-            logger.info("Building commit-ticket correlations...")
+            logger.info(f"Building commit-ticket correlations for {project_key}...")
             correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
             progress.update(repo_progress_ctx)
 
             # Step 3: Store daily commit batches
-            progress.set_description(repo_progress_ctx, f"{project_key}: Storing data")
+            progress.set_description(repo_progress_ctx, f"💾 {project_key}: Storing data")
             logger.info(
                 f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
             )
@@ -183,6 +242,46 @@
         # Return summary statistics with ACTUAL stored counts
         # expected_commits already calculated above
 
+        # Calculate exclusion impact summary if exclusions are configured
+        exclusion_stats = {
+            "patterns_applied": len(self.exclude_paths),
+            "enabled": bool(self.exclude_paths),
+        }
+
+        if self.exclude_paths and expected_commits > 0:
+            # Get aggregate stats from daily_commit_batches
+            session = self.database.get_session()
+            try:
+                batch_stats = (
+                    session.query(
+                        func.sum(DailyCommitBatch.total_lines_added).label("total_added"),
+                        func.sum(DailyCommitBatch.total_lines_deleted).label("total_deleted"),
+                    )
+                    .filter(
+                        DailyCommitBatch.project_key == project_key,
+                        DailyCommitBatch.repo_path == str(repo_path),
+                    )
+                    .first()
+                )
+
+                if batch_stats and batch_stats.total_added:
+                    total_lines = (batch_stats.total_added or 0) + (batch_stats.total_deleted or 0)
+                    exclusion_stats["total_lines_after_filtering"] = total_lines
+                    exclusion_stats["lines_added"] = batch_stats.total_added or 0
+                    exclusion_stats["lines_deleted"] = batch_stats.total_deleted or 0
+
+                    logger.info(
+                        f"📊 Exclusion Impact Summary for {project_key}:\n"
+                        f"  - Exclusion patterns applied: {len(self.exclude_paths)}\n"
+                        f"  - Total lines after filtering: {total_lines:,}\n"
+                        f"  - Lines added: {batch_stats.total_added:,}\n"
+                        f"  - Lines deleted: {batch_stats.total_deleted:,}"
+                    )
+            except Exception as e:
+                logger.debug(f"Could not calculate exclusion impact summary: {e}")
+            finally:
+                session.close()
+
         results = {
             "project_key": project_key,
             "repo_path": str(repo_path),
@@ -196,6 +295,7 @@
                 "correlations_created": correlations_created,
                 "batches_created": batches_created,
             },
+            "exclusions": exclusion_stats,
             "daily_commits": daily_commits,  # For immediate use if needed
         }
 
@@ -204,10 +304,20 @@
             logger.info(
                 f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
             )
+            # Finish repository in Rich display with success
+            if hasattr(progress, "_use_rich") and progress._use_rich:
+                progress.finish_repository(project_key, success=True)
         else:
             logger.error(
                 f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
             )
+            # Finish repository in Rich display with error
+            if hasattr(progress, "_use_rich") and progress._use_rich:
+                progress.finish_repository(
+                    project_key,
+                    success=False,
+                    error_message=f"Storage issue: {actual_stored_commits}/{expected_commits} commits",
+                )
 
         return results
 
@@ -225,24 +335,148 @@
         Returns:
             Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
         """
-        from git import Repo
+
+        # THREAD SAFETY: Use the module-level thread-local storage to ensure each thread
+        # gets its own Repo instance. This prevents thread-safety issues when called from ThreadPoolExecutor
+
+        # Set environment variables to prevent ANY password prompts before opening repo
+        original_env = {}
+        env_vars = {
+            "GIT_TERMINAL_PROMPT": "0",
+            "GIT_ASKPASS": "/bin/echo",  # Use full path to echo
+            "SSH_ASKPASS": "/bin/echo",
+            "GCM_INTERACTIVE": "never",
+            "GIT_SSH_COMMAND": "ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o PasswordAuthentication=no",
+            "DISPLAY": "",
+            "GIT_CREDENTIAL_HELPER": "",  # Disable credential helper
+            "GCM_PROVIDER": "none",  # Disable Git Credential Manager
+            "GIT_CREDENTIALS": "",  # Clear any cached credentials
+            "GIT_CONFIG_NOSYSTEM": "1",  # Don't use system config
+        }
+
+        # Save original environment and set our values
+        for key, value in env_vars.items():
+            original_env[key] = os.environ.get(key)
+            os.environ[key] = value
 
         try:
-            repo = Repo(repo_path)
+            # THREAD SAFETY: Create a fresh Repo instance for this thread
+            # Do NOT reuse Repo instances across threads
+            from git import Repo
+
+            # Check for security issues in repository configuration
+            self._check_repository_security(repo_path, project_key)
+
+            # When skip_remote_fetch is enabled, use a more restricted repository access
+            if self.skip_remote_fetch:
+                # Use a special git configuration that completely disables credential helpers
+                import tempfile
+
+                # Create a secure temporary directory for our config
+                temp_dir = tempfile.mkdtemp(prefix="gitflow_")
+
+                # Create a temporary git config that disables all authentication
+                tmp_config_path = os.path.join(temp_dir, ".gitconfig")
+                with open(tmp_config_path, "w") as tmp_config:
+                    tmp_config.write("[credential]\n")
+                    tmp_config.write("    helper = \n")
+                    tmp_config.write("[core]\n")
+                    tmp_config.write("    askpass = \n")
+
+                # Set GIT_CONFIG to use our temporary config
+                os.environ["GIT_CONFIG_GLOBAL"] = tmp_config_path
+                os.environ["GIT_CONFIG_SYSTEM"] = "/dev/null"
+
+                # Store temp_dir in thread-local storage for cleanup
+                _thread_local.temp_dir = temp_dir
+
+                try:
+                    # Open repository with our restricted configuration
+                    # THREAD SAFETY: Each thread gets its own Repo instance
+                    repo = Repo(repo_path)
+                finally:
+                    # Clean up temporary config directory
+                    try:
+                        import shutil
+
+                        if hasattr(_thread_local, "temp_dir"):
+                            shutil.rmtree(_thread_local.temp_dir, ignore_errors=True)
+                            delattr(_thread_local, "temp_dir")
+                    except:
+                        pass
+
+                try:
+                    # Configure git to never prompt for credentials
+                    with repo.config_writer() as git_config:
+                        git_config.set_value("core", "askpass", "")
+                        git_config.set_value("credential", "helper", "")
+                except Exception as e:
+                    logger.debug(f"Could not update git config: {e}")
+
+                # Note: We can't monkey-patch remotes as it's a property without setter
+                # The skip_remote_fetch flag will prevent remote operations elsewhere
+
+                logger.debug(
+                    f"Opened repository {project_key} in offline mode (skip_remote_fetch=true)"
+                )
+            else:
+                # THREAD SAFETY: Each thread gets its own Repo instance
+                repo = Repo(repo_path)
+
+            # Track repository status
+            self.repository_status[project_key] = {
+                "path": str(repo_path),
+                "remote_update": "skipped" if self.skip_remote_fetch else "pending",
+                "authentication_issues": False,
+                "error": None,
+            }
 
             # Update repository from remote before analysis
-            self._update_repository(repo)
+            if not self.skip_remote_fetch:
+                logger.info(f"📥 Updating repository {project_key} from remote...")
+
+            update_success = self._update_repository(repo)
+            if not self.skip_remote_fetch:
+                self.repository_status[project_key]["remote_update"] = (
+                    "success" if update_success else "failed"
+                )
+                if not update_success:
+                    logger.warning(
+                        f"⚠️ {project_key}: Continuing with local repository state (remote update failed)"
+                    )
+
         except Exception as e:
-            logger.error(f"Failed to open repository at {repo_path}: {e}")
+            logger.error(f"Failed to open repository {project_key} at {repo_path}: {e}")
+            self.repository_status[project_key] = {
+                "path": str(repo_path),
+                "remote_update": "error",
+                "authentication_issues": "authentication" in str(e).lower()
+                or "password" in str(e).lower(),
+                "error": str(e),
+            }
+            # Restore original environment variables before returning
+            for key, value in original_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
            return {}
 
         # Get branches to analyze
         branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)
 
         if not branches_to_analyze:
-            logger.warning(f"No accessible branches found in repository {repo_path}")
+            logger.warning(
+                f"No accessible branches found in repository {project_key} at {repo_path}"
+            )
+            # Restore original environment variables before returning
+            for key, value in original_env.items():
+                if value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = value
             return {}
 
-        logger.info(f"Analyzing branches: {branches_to_analyze}")
+        logger.info(f"🌿 {project_key}: Analyzing branches: {branches_to_analyze}")
 
         # Calculate days to process
         current_date = start_date.date()
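The save-and-restore dance around `os.environ` above is repeated on every exit path of this method (both early returns here and the normal return later in the hunk at new line 620). The same pattern expressed as an illustrative context manager — not something the package defines:

```python
import os
from contextlib import contextmanager

@contextmanager
def scoped_env(overrides: dict[str, str]):
    """Temporarily set environment variables, restoring the originals on exit."""
    saved = {key: os.environ.get(key) for key in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

# Same non-interactive settings the diff installs, scoped to a block:
with scoped_env({"GIT_TERMINAL_PROMPT": "0", "GIT_ASKPASS": "/bin/echo"}):
    ...  # open repos / run git commands without credential prompts
```

Because `os.environ` is process-global, this (like the code above) still races when several threads mutate it concurrently; the restore only guarantees cleanup on a single code path.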
@@ -263,18 +497,34 @@
         daily_commits = {}
         all_commit_hashes = set()  # Track all hashes for deduplication
 
+        # Count total commits first for Rich display
+        try:
+            for branch_name in branches_to_analyze[:1]:  # Sample from first branch
+                sample_commits = list(
+                    repo.iter_commits(branch_name, since=start_date, until=end_date)
+                )
+                # Estimate based on first branch (multiply by number of branches for rough estimate)
+                len(sample_commits) * len(branches_to_analyze)
+                break
+        except:
+            len(days_to_process) * 50  # Default estimate
+
+        # Update repository in Rich display with estimated commit count
+        if hasattr(progress, "_use_rich") and progress._use_rich:
+            progress.update_repository(project_key, 0, 0.0)
+
         # Create nested progress for day-by-day processing
         with progress.progress(
             total=len(days_to_process),
-            description=f"Fetching commits for {project_key}",
+            description=f"📅 Fetching commits for repository: {project_key}",
             unit="days",
             nested=True,
         ) as day_progress_ctx:
 
             for day_date in days_to_process:
-                # Update description to show current day
+                # Update description to show current repository and day clearly
                 day_str = day_date.strftime("%Y-%m-%d")
-                progress.set_description(day_progress_ctx, f"{project_key}: Processing {day_str}")
+                progress.set_description(day_progress_ctx, f"🔍 {project_key}: Analyzing {day_str}")
 
                 # Calculate day boundaries
                 day_start = datetime.combine(day_date, datetime.min.time(), tzinfo=timezone.utc)
@@ -286,12 +536,26 @@
                 # Process each branch for this specific day
                 for branch_name in branches_to_analyze:
                     try:
-                        # Fetch commits for this specific day and branch
-                        branch_commits = list(
-                            repo.iter_commits(
-                                branch_name, since=day_start, until=day_end, reverse=False
+                        # Fetch commits for this specific day and branch with timeout protection
+                        def fetch_branch_commits():
+                            return list(
+                                repo.iter_commits(
+                                    branch_name, since=day_start, until=day_end, reverse=False
+                                )
                             )
-                        )
+
+                        # Use timeout wrapper to prevent hanging on iter_commits
+                        try:
+                            branch_commits = self.git_wrapper.run_with_timeout(
+                                fetch_branch_commits,
+                                timeout=15,  # 15 seconds per branch/day combination
+                                operation_name=f"iter_commits_{branch_name}_{day_str}",
+                            )
+                        except GitOperationTimeout:
+                            logger.warning(
+                                f"⏱️ Timeout fetching commits for branch {branch_name} on {day_str}, skipping"
+                            )
+                            continue
 
                         for commit in branch_commits:
                             # Skip if we've already processed this commit
@@ -307,6 +571,11 @@
                                 all_commit_hashes.add(commit.hexsha)
                                 commits_found_today += 1
 
+                    except GitOperationTimeout as e:
+                        logger.warning(
+                            f"⏱️ Timeout processing branch {branch_name} for day {day_str}: {e}"
+                        )
+                        continue
                     except Exception as e:
                         logger.warning(
                             f"Error processing branch {branch_name} for day {day_str}: {e}"
@@ -333,8 +602,28 @@
                 # Update progress bar
                 progress.update(day_progress_ctx)
 
+                # Update Rich display with current commit count and speed
+                if hasattr(progress, "_use_rich") and progress._use_rich:
+                    total_processed = len(all_commit_hashes)
+                    # Calculate speed (commits per second) based on elapsed time
+                    import time
+
+                    if not hasattr(self, "_fetch_start_time"):
+                        self._fetch_start_time = time.time()
+                    elapsed = time.time() - self._fetch_start_time
+                    speed = total_processed / elapsed if elapsed > 0 else 0
+                    progress.update_repository(project_key, total_processed, speed)
+
         total_commits = sum(len(commits) for commits in daily_commits.values())
         logger.info(f"Collected {total_commits} unique commits across {len(daily_commits)} days")
+
+        # Restore original environment variables
+        for key, value in original_env.items():
+            if value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = value
+
         return daily_commits
 
     def _extract_commit_data(
@@ -376,13 +665,19 @@
             line_stats = self._calculate_commit_stats(commit)
             total_insertions = line_stats["insertions"]
             total_deletions = line_stats["deletions"]
+            raw_insertions = line_stats.get("raw_insertions", total_insertions)
+            raw_deletions = line_stats.get("raw_deletions", total_deletions)
 
             commit_data.update(
                 {
                     "files_changed": files_changed,
                     "files_changed_count": len(files_changed),
-                    "lines_added": total_insertions,
+                    "lines_added": total_insertions,  # Filtered counts for backward compatibility
                     "lines_deleted": total_deletions,
+                    "filtered_insertions": total_insertions,  # Explicitly filtered counts
+                    "filtered_deletions": total_deletions,
+                    "raw_insertions": raw_insertions,  # Raw unfiltered counts
+                    "raw_deletions": raw_deletions,
                 }
             )
 
@@ -394,6 +689,10 @@
                     "files_changed_count": 0,
                     "lines_added": 0,
                     "lines_deleted": 0,
+                    "filtered_insertions": 0,
+                    "filtered_deletions": 0,
+                    "raw_insertions": 0,
+                    "raw_deletions": 0,
                 }
             )
 
@@ -422,9 +721,166 @@
 
     def _should_exclude_file(self, file_path: str) -> bool:
         """Check if a file should be excluded based on exclude patterns."""
+        return any(self._matches_glob_pattern(file_path, pattern) for pattern in self.exclude_paths)
+
+    def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
+        """Check if a file path matches a glob pattern, handling ** recursion correctly.
+
+        This method properly handles different glob pattern types:
+        - **/vendor/** : matches files inside vendor directories at any level
+        - **/*.min.js : matches files with specific suffix anywhere in directory tree
+        - vendor/** : matches files inside vendor directory at root level only
+        - **pattern** : handles other complex patterns with pathlib.match()
+        - simple patterns : uses fnmatch for basic wildcards
+
+        Args:
+            filepath: The file path to check
+            pattern: The glob pattern to match against
+
+        Returns:
+            True if the file path matches the pattern, False otherwise
+        """
+        import fnmatch
+        import re
+        from pathlib import PurePath
+
+        # Handle empty or invalid inputs
+        if not filepath or not pattern:
+            return False
+
+        path = PurePath(filepath)
+
+        # Check for multiple ** patterns first (most complex)
+        if "**" in pattern and pattern.count("**") > 1:
+            # Multiple ** patterns - use custom recursive matching for complex patterns
+            return self._match_recursive_pattern(filepath, pattern)
+
+        # Then handle simple ** patterns
+        elif pattern.startswith("**/") and pattern.endswith("/**"):
+            # Pattern like **/vendor/** - matches files inside vendor directories at any level
+            dir_name = pattern[3:-3]  # Extract 'vendor' from '**/vendor/**'
+            if not dir_name:  # Handle edge case of '**/**'
+                return True
+            return dir_name in path.parts
+
+        elif pattern.startswith("**/"):
+            # Pattern like **/*.min.js - matches files with specific suffix anywhere
+            suffix_pattern = pattern[3:]
+            if not suffix_pattern:  # Handle edge case of '**/'
+                return True
+            # Check against filename for file patterns, or any path part for directory patterns
+            if suffix_pattern.endswith("/"):
+                # Directory pattern like **/build/
+                dir_name = suffix_pattern[:-1]
+                return dir_name in path.parts
+            else:
+                # File pattern like *.min.js
+                return fnmatch.fnmatch(path.name, suffix_pattern)
+
+        elif pattern.endswith("/**"):
+            # Pattern like vendor/** or docs/build/** - matches files inside directory at root level
+            dir_name = pattern[:-3]
+            if not dir_name:  # Handle edge case of '/**'
+                return True
+
+            # Handle both single directory names and nested paths
+            expected_parts = PurePath(dir_name).parts
+            return (
+                len(path.parts) >= len(expected_parts)
+                and path.parts[: len(expected_parts)] == expected_parts
+            )
+
+        elif "**" in pattern:
+            # Single ** pattern - use pathlib matching with fallback
+            try:
+                return path.match(pattern)
+            except (ValueError, TypeError):
+                # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
+                try:
+                    return fnmatch.fnmatch(filepath, pattern)
+                except re.error:
+                    # Invalid regex pattern - return False to be safe
+                    return False
+        else:
+            # Simple pattern - use fnmatch for basic wildcards
+            try:
+                # Try matching the full path first
+                if fnmatch.fnmatch(filepath, pattern):
+                    return True
+                # Also try matching just the filename for simple patterns
+                # This allows "package-lock.json" to match "src/package-lock.json"
+                return fnmatch.fnmatch(path.name, pattern)
+            except re.error:
+                # Invalid regex pattern - return False to be safe
+                return False
+
+    def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
+        """Handle complex patterns with multiple ** wildcards.
+
+        Args:
+            filepath: The file path to check
+            pattern: The pattern with multiple ** wildcards
+
+        Returns:
+            True if the path matches the pattern, False otherwise
+        """
         import fnmatch
+        from pathlib import PurePath
+
+        # Split pattern by ** to handle each segment
+        parts = pattern.split("**")
+
+        # Validate that we have actual segments
+        if not parts:
+            return False
 
-        return any(fnmatch.fnmatch(file_path, pattern) for pattern in self.exclude_paths)
+        # Convert filepath to parts for easier matching
+        path_parts = list(PurePath(filepath).parts)
+
+        # Start matching from the beginning
+        path_index = 0
+
+        for i, part in enumerate(parts):
+            if not part:
+                # Empty part (e.g., from leading or trailing **)
+                if i == 0 or i == len(parts) - 1:
+                    # Leading or trailing ** - continue
+                    continue
+                # Middle empty part (consecutive **) - match any number of path components
+                continue
+
+            # Clean the part (remove leading/trailing slashes)
+            part = part.strip("/")
+
+            if not part:
+                continue
+
+            # Find where this part matches in the remaining path
+            found = False
+            for j in range(path_index, len(path_parts)):
+                # Check if the current path part matches the pattern part
+                if "/" in part:
+                    # Part contains multiple path components
+                    sub_parts = part.split("/")
+                    if j + len(sub_parts) <= len(path_parts) and all(
+                        fnmatch.fnmatch(path_parts[j + k], sub_parts[k])
+                        for k in range(len(sub_parts))
+                    ):
+                        path_index = j + len(sub_parts)
+                        found = True
+                        break
+                else:
+                    # Single component part
+                    if fnmatch.fnmatch(path_parts[j], part):
+                        path_index = j + 1
+                        found = True
+                        break
+
+            if not found and part:
+                # Required part not found in path
+                return False
+
+        return True
 
     def _get_branches_to_analyze(
         self, repo: Any, branch_patterns: Optional[list[str]]
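The docstring of `_matches_glob_pattern` defines the matching semantics; the checks below restate them with only the standard library, using made-up paths:

```python
import fnmatch
from pathlib import PurePath

# **/vendor/**  — a 'vendor' directory component at any depth
assert "vendor" in PurePath("src/vendor/lib.js").parts

# **/*.min.js  — match on the basename anywhere in the tree
assert fnmatch.fnmatch(PurePath("dist/js/app.min.js").name, "*.min.js")

# vendor/**  — only files under a root-level 'vendor' directory
assert PurePath("vendor/lib.js").parts[:1] == ("vendor",)
assert PurePath("src/vendor/lib.js").parts[:1] != ("vendor",)

# Simple patterns also try the basename, so "package-lock.json" excludes
# "src/package-lock.json" even though the full path alone would not match:
assert not fnmatch.fnmatch("src/package-lock.json", "package-lock.json")
assert fnmatch.fnmatch(PurePath("src/package-lock.json").name, "package-lock.json")
```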
@@ -441,33 +897,45 @@
         - Handle missing remotes gracefully
         - Skip remote tracking branches to avoid duplicates
         - Use actual branch existence checking rather than assuming branches exist
+
+        THREAD SAFETY: This method is thread-safe as it doesn't modify shared state
+        and works with a repo instance passed as a parameter.
         """
         # Collect all available branches (local branches preferred)
         available_branches = []
 
         # First, try local branches
         try:
-            local_branches = [branch.name for branch in repo.branches]
+            # THREAD SAFETY: Create a new list to avoid sharing references
+            local_branches = list([branch.name for branch in repo.branches])
             available_branches.extend(local_branches)
             logger.debug(f"Found local branches: {local_branches}")
         except Exception as e:
             logger.debug(f"Error getting local branches: {e}")
 
         # If we have remotes, also consider remote branches (but clean the names)
-        try:
-            if repo.remotes and hasattr(repo.remotes, "origin"):
-                remote_branches = [
-                    ref.name.replace("origin/", "")
-                    for ref in repo.remotes.origin.refs
-                    if not ref.name.endswith("HEAD")  # Skip HEAD ref
-                ]
-                # Only add remote branches that aren't already in local branches
-                for branch in remote_branches:
-                    if branch not in available_branches:
-                        available_branches.append(branch)
-                logger.debug(f"Found remote branches: {remote_branches}")
-        except Exception as e:
-            logger.debug(f"Error getting remote branches: {e}")
+        # Skip remote branch checking if skip_remote_fetch is enabled to avoid auth prompts
+        if not self.skip_remote_fetch:
+            try:
+                if repo.remotes and hasattr(repo.remotes, "origin"):
+                    # THREAD SAFETY: Create a new list to avoid sharing references
+                    remote_branches = list(
+                        [
+                            ref.name.replace("origin/", "")
+                            for ref in repo.remotes.origin.refs
+                            if not ref.name.endswith("HEAD")  # Skip HEAD ref
+                        ]
+                    )
+                    # Only add remote branches that aren't already in local branches
+                    for branch in remote_branches:
+                        if branch not in available_branches:
+                            available_branches.append(branch)
+                    logger.debug(f"Found remote branches: {remote_branches}")
+            except Exception as e:
+                logger.debug(f"Error getting remote branches (may require authentication): {e}")
+                # Continue with local branches only
+        else:
+            logger.debug("Skipping remote branch enumeration (skip_remote_fetch=true)")
 
         # If no branches found, fallback to trying common names directly
         if not available_branches:
@@ -497,6 +965,7 @@
         accessible_branches = []
         for branch in branches_to_test:
             try:
+                # THREAD SAFETY: Use iterator without storing intermediate results
                 next(iter(repo.iter_commits(branch, max_count=1)), None)
                 accessible_branches.append(branch)
             except Exception as e:
@@ -550,82 +1019,62 @@
         Returns:
             bool: True if update succeeded, False if failed (but analysis continues)
         """
-        try:
-            if repo.remotes:
-                logger.info("Fetching latest changes from remote")
+        # Skip remote operations if configured
+        if self.skip_remote_fetch:
+            logger.info("🚫 Skipping remote fetch (skip_remote_fetch=true)")
+            return True
 
-                # Use subprocess for git fetch to prevent password prompts
-                env = os.environ.copy()
-                env["GIT_TERMINAL_PROMPT"] = "0"
-                env["GIT_ASKPASS"] = ""
-                env["GCM_INTERACTIVE"] = "never"
+        # Check for stale repository (last fetch > 1 hour ago)
+        self._check_repository_staleness(repo)
 
-                # Run git fetch with timeout
-                try:
-                    result = subprocess.run(
-                        ["git", "fetch", "--all"],
-                        cwd=repo.working_dir,
-                        env=env,
-                        capture_output=True,
-                        text=True,
-                        timeout=30,  # 30 second timeout
-                    )
+        try:
+            # Check if we have remotes without triggering authentication
+            has_remotes = False
+            try:
+                has_remotes = bool(repo.remotes)
+            except Exception as e:
+                logger.debug(f"Could not check for remotes (may require authentication): {e}")
+                return True  # Continue with local analysis
 
-                    if result.returncode != 0:
-                        error_str = result.stderr.lower()
-                        if any(
-                            x in error_str
-                            for x in ["authentication", "permission denied", "401", "403"]
-                        ):
-                            logger.error(
-                                "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
-                                "You can verify with: gh auth status"
-                            )
-                            # Skip fetch but continue with local analysis
-                            return False
-                        logger.warning(f"Git fetch failed: {result.stderr}")
-                        return False
+            if has_remotes:
+                logger.info("Fetching latest changes from remote")
 
-                except subprocess.TimeoutExpired:
-                    logger.error(
-                        "Git fetch timed out (likely authentication failure). Continuing with local repository state."
+                # Use our timeout wrapper for safe git operations
+                repo_path = Path(repo.working_dir)
+
+                # Try to fetch with timeout protection
+                fetch_success = self.git_wrapper.fetch_with_timeout(repo_path, timeout=30)
+
+                if not fetch_success:
+                    # Mark this repository as having authentication issues if applicable
+                    if hasattr(self, "repository_status"):
+                        for key in self.repository_status:
+                            if repo.working_dir.endswith(key) or key in repo.working_dir:
+                                self.repository_status[key]["remote_update"] = "failed"
+                                break
+
+                    # Explicit warning to user about stale data
+                    logger.warning(
+                        f"❌ Failed to fetch updates for {repo_path.name}. "
+                        f"Analysis will use potentially stale local data. "
+                        f"Check authentication or network connectivity."
                     )
                     return False
+                else:
+                    # Explicit success confirmation
+                    logger.info(f"✅ Successfully fetched updates for {repo_path.name}")
 
                 # Only try to pull if not in detached HEAD state
                 if not repo.head.is_detached:
                     current_branch = repo.active_branch
                     tracking = current_branch.tracking_branch()
                     if tracking:
-                        # Pull latest changes using subprocess
-                        try:
-                            result = subprocess.run(
-                                ["git", "pull"],
-                                cwd=repo.working_dir,
-                                env=env,
-                                capture_output=True,
-                                text=True,
-                                timeout=30,
-                            )
-
-                            if result.returncode != 0:
-                                error_str = result.stderr.lower()
-                                if any(
-                                    x in error_str
-                                    for x in ["authentication", "permission denied", "401", "403"]
-                                ):
-                                    logger.error(
-                                        "GitHub authentication failed during pull. Continuing with local repository state."
-                                    )
-                                    return False
-                                logger.warning(f"Git pull failed: {result.stderr}")
-                                return False
+                        # Pull latest changes using timeout wrapper
+                        pull_success = self.git_wrapper.pull_with_timeout(repo_path, timeout=30)
+                        if pull_success:
                             logger.debug(f"Pulled latest changes for {current_branch.name}")
-
-                        except subprocess.TimeoutExpired:
-                            logger.error(
-                                "Git pull timed out. Continuing with local repository state."
-                            )
+                        else:
+                            logger.warning("Git pull failed, continuing with fetched state")
                             return False
                 else:
                     logger.debug(
@@ -642,6 +1091,114 @@
             # Continue with analysis using local state
             return False
 
+    def _check_repository_staleness(self, repo) -> None:
+        """Check if repository hasn't been fetched recently and warn user.
+
+        Args:
+            repo: GitPython Repo object
+        """
+        try:
+            repo_path = Path(repo.working_dir)
+            fetch_head_path = repo_path / ".git" / "FETCH_HEAD"
+
+            if fetch_head_path.exists():
+                # Get last fetch time from FETCH_HEAD modification time
+                last_fetch_time = datetime.fromtimestamp(
+                    fetch_head_path.stat().st_mtime, tz=timezone.utc
+                )
+                now = datetime.now(timezone.utc)
+                hours_since_fetch = (now - last_fetch_time).total_seconds() / 3600
+
+                if hours_since_fetch > 1:
+                    logger.warning(
+                        f"⏰ Repository {repo_path.name} last fetched {hours_since_fetch:.1f} hours ago. "
+                        f"Data may be stale."
+                    )
+            else:
+                logger.warning(
+                    f"⚠️ Repository {repo_path.name} has never been fetched. "
+                    f"Will attempt to fetch now."
+                )
+        except Exception as e:
+            logger.debug(f"Could not check repository staleness: {e}")
+
+    def _check_repository_security(self, repo_path: Path, project_key: str) -> None:
+        """Check for security issues in repository configuration.
+
+        Warns about:
+        - Exposed tokens in remote URLs
+        - Insecure credential storage
+        """
+
+        try:
+            # Check for tokens in remote URLs
+            result = subprocess.run(
+                ["git", "remote", "-v"],
+                cwd=repo_path,
+                capture_output=True,
+                text=True,
+                timeout=2,
+                env={"GIT_TERMINAL_PROMPT": "0"},
+            )
+
+            if result.returncode == 0:
+                output = result.stdout
+                # Check for various token patterns in URLs
+                token_patterns = [
+                    r"https://[^@]*@",  # Any HTTPS URL with embedded credentials
+                    r"ghp_[a-zA-Z0-9]+",  # GitHub Personal Access Token
+                    r"ghs_[a-zA-Z0-9]+",  # GitHub Server Token
+                    r"github_pat_[a-zA-Z0-9]+",  # New GitHub PAT format
+                ]
+
+                for pattern in token_patterns:
+                    import re
+
+                    if re.search(pattern, output):
+                        logger.warning(
+                            f"⚠️ SECURITY WARNING for {project_key}: "
+                            f"Repository appears to have credentials in remote URL. "
+                            f"This is a security risk! Consider using: "
+                            f"1) GitHub CLI (gh auth login), "
+                            f"2) SSH keys, or "
+                            f"3) Git credential manager instead."
+                        )
+                        break
+        except:
+            # Don't fail analysis due to security check
+            pass
+
+    def get_repository_status_summary(self) -> dict[str, Any]:
+        """Get a summary of repository fetch status.
+
+        Returns:
+            Dictionary with status summary including any repositories with issues
+        """
+        summary = {
+            "total_repositories": len(self.repository_status),
+            "successful_updates": 0,
+            "failed_updates": 0,
+            "skipped_updates": 0,
+            "authentication_issues": [],
+            "errors": [],
+        }
+
+        for project_key, status in self.repository_status.items():
+            if status["remote_update"] == "success":
+                summary["successful_updates"] += 1
+            elif status["remote_update"] == "failed":
+                summary["failed_updates"] += 1
+                if status.get("authentication_issues"):
+                    summary["authentication_issues"].append(project_key)
+            elif status["remote_update"] == "skipped":
+                summary["skipped_updates"] += 1
+            elif status["remote_update"] == "error":
+                summary["errors"].append(
+                    {"repository": project_key, "error": status.get("error", "Unknown error")}
+                )
+
+        return summary
+
     def _extract_all_ticket_references(
         self, daily_commits: dict[str, list[dict[str, Any]]]
     ) -> set[str]:
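The token detection in `_check_repository_security` is plain `re.search` over `git remote -v` output, so it is easy to verify in isolation (the remote lines below are fabricated for the demonstration):

```python
import re

token_patterns = [
    r"https://[^@]*@",           # any HTTPS URL with embedded credentials
    r"ghp_[a-zA-Z0-9]+",         # GitHub personal access token
    r"github_pat_[a-zA-Z0-9]+",  # newer GitHub PAT format
]

leaky = "origin  https://x:ghp_abc123DEF@github.com/org/repo.git (fetch)"
clean = "origin  git@github.com:org/repo.git (fetch)"

assert any(re.search(p, leaky) for p in token_patterns)
assert not any(re.search(p, clean) for p in token_patterns)
```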
@@ -919,8 +1476,20 @@
                             "branch": commit.get("branch", "main"),
                             "is_merge": commit.get("is_merge", False),
                             "files_changed_count": commit.get("files_changed_count", 0),
-                            "insertions": commit.get("lines_added", 0),
-                            "deletions": commit.get("lines_deleted", 0),
+                            # Store raw unfiltered values in insertions/deletions
+                            "insertions": commit.get(
+                                "raw_insertions", commit.get("lines_added", 0)
+                            ),
+                            "deletions": commit.get(
+                                "raw_deletions", commit.get("lines_deleted", 0)
+                            ),
+                            # Store filtered values separately
+                            "filtered_insertions": commit.get(
+                                "filtered_insertions", commit.get("lines_added", 0)
+                            ),
+                            "filtered_deletions": commit.get(
+                                "filtered_deletions", commit.get("lines_deleted", 0)
+                            ),
                             "story_points": commit.get("story_points"),
                             "ticket_references": commit.get("ticket_references", []),
                         }
@@ -1193,24 +1762,48 @@
             session.close()
 
     def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
-        """Calculate commit statistics using reliable git diff --numstat.
+        """Calculate commit statistics using reliable git diff --numstat with exclude_paths filtering.
 
         Returns:
-            Dictionary with 'files', 'insertions', and 'deletions' counts
+            Dictionary with both raw and filtered statistics:
+            - 'files', 'insertions', 'deletions': filtered counts
+            - 'raw_insertions', 'raw_deletions': unfiltered counts
+
+        THREAD SAFETY: This method is thread-safe as it works with commit objects
+        that have their own repo references.
         """
         stats = {"files": 0, "insertions": 0, "deletions": 0}
 
+        # Track raw stats for storage
+        raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
+        excluded_stats = {"files": 0, "insertions": 0, "deletions": 0}
+
         # For initial commits or commits without parents
         parent = commit.parents[0] if commit.parents else None
 
         try:
-            # Use git command directly for accurate line counts
+            # THREAD SAFETY: Use the repo reference from the commit object
+            # Each thread has its own commit object with its own repo reference
             repo = commit.repo
-            if parent:
-                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
-            else:
-                # Initial commit - use git show with --numstat
-                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+            Path(repo.working_dir)
+
+            def get_diff_output():
+                if parent:
+                    return repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+                else:
+                    # Initial commit - use git show with --numstat
+                    return repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Use timeout wrapper for git diff operations
+            try:
+                diff_output = self.git_wrapper.run_with_timeout(
+                    get_diff_output,
+                    timeout=10,  # 10 seconds for diff operations
+                    operation_name=f"diff_{commit.hexsha[:8]}",
+                )
+            except GitOperationTimeout:
+                logger.warning(f"⏱️ Timeout calculating stats for commit {commit.hexsha[:8]}")
+                return stats  # Return zeros
 
             # Parse the numstat output: insertions\tdeletions\tfilename
             for line in diff_output.strip().split("\n"):
@@ -1222,9 +1815,22 @@
                 try:
                     insertions = int(parts[0]) if parts[0] != "-" else 0
                     deletions = int(parts[1]) if parts[1] != "-" else 0
-                    # filename = parts[2] - we don't filter here since files_changed list is handled separately
-
-                    # Count all changes (raw stats, no filtering)
+                    filename = parts[2]
+
+                    # Always count raw stats
+                    raw_stats["files"] += 1
+                    raw_stats["insertions"] += insertions
+                    raw_stats["deletions"] += deletions
+
+                    # Skip excluded files based on exclude_paths patterns
+                    if self._should_exclude_file(filename):
+                        logger.debug(f"Excluding file from line counts: {filename}")
+                        excluded_stats["files"] += 1
+                        excluded_stats["insertions"] += insertions
+                        excluded_stats["deletions"] += deletions
+                        continue
+
+                    # Count only non-excluded files and their changes
                     stats["files"] += 1
                     stats["insertions"] += insertions
                     stats["deletions"] += deletions
@@ -1233,10 +1839,36 @@
                     # Skip binary files or malformed lines
                     continue
 
+            # Log exclusion statistics if significant
+            if excluded_stats["files"] > 0 or (
+                raw_stats["insertions"] > 0 and stats["insertions"] < raw_stats["insertions"]
+            ):
+                reduction_pct = (
+                    100 * (1 - stats["insertions"] / raw_stats["insertions"])
+                    if raw_stats["insertions"] > 0
+                    else 0
+                )
+                logger.info(
+                    f"Commit {commit.hexsha[:8]}: Excluded {excluded_stats['files']} files, "
+                    f"{excluded_stats['insertions']} insertions, {excluded_stats['deletions']} deletions "
+                    f"({reduction_pct:.1f}% reduction)"
+                )
+
+            # Log if exclusions are configured
+            if self.exclude_paths and raw_stats["files"] > 0:
+                logger.debug(
+                    f"Commit {commit.hexsha[:8]}: Applied {len(self.exclude_paths)} exclusion patterns. "
+                    f"Raw: {raw_stats['files']} files, +{raw_stats['insertions']} -{raw_stats['deletions']}. "
+                    f"Filtered: {stats['files']} files, +{stats['insertions']} -{stats['deletions']}"
+                )
+
         except Exception as e:
             # Log the error for debugging but don't crash
             logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")
 
+        # Return both raw and filtered stats
+        stats["raw_insertions"] = raw_stats["insertions"]
+        stats["raw_deletions"] = raw_stats["deletions"]
         return stats
 
     def _store_day_commits_incremental(
@@ -1266,8 +1898,16 @@
                     "branch": commit.get("branch", "main"),
                     "is_merge": commit.get("is_merge", False),
                     "files_changed_count": commit.get("files_changed_count", 0),
-                    "insertions": commit.get("lines_added", 0),
-                    "deletions": commit.get("lines_deleted", 0),
+                    # Store raw unfiltered values
+                    "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
+                    "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
+                    # Store filtered values
+                    "filtered_insertions": commit.get(
+                        "filtered_insertions", commit.get("lines_added", 0)
+                    ),
+                    "filtered_deletions": commit.get(
+                        "filtered_deletions", commit.get("lines_deleted", 0)
+                    ),
                     "story_points": commit.get("story_points"),
                     "ticket_references": commit.get("ticket_references", []),
                 }
@@ -1283,3 +1923,297 @@
         except Exception as e:
             # Log error but don't fail - commits will be stored again in batch at the end
             logger.warning(f"Failed to incrementally store commits for {date_str}: {e}")
+
+    def process_repositories_parallel(
+        self,
+        repositories: list[dict],
+        weeks_back: int = 4,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        max_workers: int = 3,
+    ) -> dict[str, Any]:
+        """Process multiple repositories in parallel with proper timeout protection.
+
+        Args:
+            repositories: List of repository configurations
+            weeks_back: Number of weeks to analyze
+            jira_integration: Optional JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            max_workers: Maximum number of parallel workers
+
+        Returns:
+            Dictionary containing processing results and statistics
+        """
+        logger.info(
+            f"🚀 Starting parallel processing of {len(repositories)} repositories with {max_workers} workers"
+        )
+
+        # Initialize statistics
+        self.processing_stats = {
+            "total": len(repositories),
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
+        # Get progress service for updates
+        progress = get_progress_service()
+
+        # Start heartbeat logger for monitoring
+        with HeartbeatLogger(interval=5):
+            results = {}
+
+            # Use ThreadPoolExecutor for parallel processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all repository processing tasks
+                future_to_repo = {}
+
+                for repo_config in repositories:
+                    repo_path = Path(repo_config.get("path", ""))
+                    project_key = repo_config.get("project_key", repo_path.name)
+                    branch_patterns = repo_config.get("branch_patterns")
+
+                    # Submit task with timeout wrapper
+                    future = executor.submit(
+                        self._process_repository_with_timeout,
+                        repo_path,
+                        project_key,
+                        weeks_back,
+                        branch_patterns,
+                        jira_integration,
+                        start_date,
+                        end_date,
+                    )
+                    future_to_repo[future] = {
+                        "path": repo_path,
+                        "project_key": project_key,
+                        "start_time": time.time(),
+                    }
+
+                    logger.info(f"📋 Submitted {project_key} for processing")
+
+                # Process results as they complete
+                for future in as_completed(future_to_repo):
+                    repo_info = future_to_repo[future]
+                    project_key = repo_info["project_key"]
+                    elapsed_time = time.time() - repo_info["start_time"]
+
+                    try:
+                        result = future.result(timeout=5)  # Short timeout to get result
+
+                        if result:
+                            self.processing_stats["success"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "success",
+                                "elapsed_time": elapsed_time,
+                                "commits": result.get("stats", {}).get("total_commits", 0),
+                                "tickets": result.get("stats", {}).get("unique_tickets", 0),
+                            }
+                            results[project_key] = result
+
+                            logger.info(
+                                f"✅ {project_key}: Successfully processed "
+                                f"{result['stats']['total_commits']} commits in {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(project_key, success=True)
+                                else:
+                                    progress.finish_repository(project_key, success=True)
+                        else:
+                            self.processing_stats["failed"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "failed",
+                                "elapsed_time": elapsed_time,
+                                "error": "Processing returned no result",
+                            }
+
+                            logger.error(
+                                f"❌ {project_key}: Processing failed after {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+                                else:
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+
+                    except GitOperationTimeout:
+                        self.processing_stats["timeout"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "timeout",
+                            "elapsed_time": elapsed_time,
+                            "error": "Operation timed out",
+                        }
+
+                        logger.error(f"⏱️ {project_key}: Timed out after {elapsed_time:.1f}s")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+
+                    except Exception as e:
+                        self.processing_stats["failed"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "failed",
+                            "elapsed_time": elapsed_time,
+                            "error": str(e),
+                        }
+
+                        logger.error(f"❌ {project_key}: Error after {elapsed_time:.1f}s - {e}")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+
+                    finally:
+                        # Update processed counter BEFORE logging and progress updates
+                        self.processing_stats["processed"] += 1
+
+                        # Update progress service with actual processing stats
+                        if hasattr(progress, "update_stats"):
+                            progress.update_stats(
+                                processed=self.processing_stats["processed"],
+                                success=self.processing_stats["success"],
+                                failed=self.processing_stats["failed"],
+                                timeout=self.processing_stats["timeout"],
+                                total=self.processing_stats["total"],
+                            )
+
+                        # Log progress
+                        logger.info(
+                            f"📊 Progress: {self.processing_stats['processed']}/{self.processing_stats['total']} repositories "
+                            f"(✅ {self.processing_stats['success']} | ❌ {self.processing_stats['failed']} | ⏱️ {self.processing_stats['timeout']})"
+                        )
+
+        # Final summary
+        logger.info("=" * 60)
+        logger.info("📈 PARALLEL PROCESSING SUMMARY")
+        logger.info(f"  Total repositories: {self.processing_stats['total']}")
+        logger.info(f"  Successfully processed: {self.processing_stats['success']}")
+        logger.info(f"  Failed: {self.processing_stats['failed']}")
+        logger.info(f"  Timed out: {self.processing_stats['timeout']}")
+        logger.info("=" * 60)
+
+        return {"results": results, "statistics": self.processing_stats}
+
+    def _process_repository_with_timeout(
+        self,
+        repo_path: Path,
+        project_key: str,
+        weeks_back: int = 4,
+        branch_patterns: Optional[list[str]] = None,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        timeout_per_operation: int = 30,
+    ) -> Optional[dict[str, Any]]:
+        """Process a single repository with comprehensive timeout protection.
+
+        Args:
+            repo_path: Path to the repository
+            project_key: Project identifier
+            weeks_back: Number of weeks to analyze
+            branch_patterns: Branch patterns to include
+            jira_integration: JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            timeout_per_operation: Timeout for individual git operations
+
+        Returns:
+            Repository processing results or None if failed
+        """
+        try:
+            # Track this repository in progress
+            progress = get_progress_service()
+            if hasattr(progress, "start_repository"):
+                progress.start_repository(project_key, 0)
+
+            logger.info(f"🔍 Processing repository: {project_key} at {repo_path}")
+
+            # Use the regular fetch method but with timeout wrapper active
+            with self.git_wrapper.operation_tracker("fetch_repository_data", repo_path):
+                result = self.fetch_repository_data(
+                    repo_path=repo_path,
+                    project_key=project_key,
+                    weeks_back=weeks_back,
+                    branch_patterns=branch_patterns,
+                    jira_integration=jira_integration,
+                    progress_callback=None,  # We handle progress at a higher level
+                    start_date=start_date,
+                    end_date=end_date,
+                )
+
+            return result
+
+        except GitOperationTimeout as e:
+            logger.error(f"⏱️ Repository {project_key} processing timed out: {e}")
+            raise
+
+        except Exception as e:
+            logger.error(f"❌ Error processing repository {project_key}: {e}")
+            import traceback
+
+            logger.debug(f"Full traceback: {traceback.format_exc()}")
+            return None
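`process_repositories_parallel` is the new batch entry point that ties the pieces above together. A hedged usage sketch — the paths and project keys are invented, and the `GitAnalysisCache` construction is assumed from how the cache is used elsewhere in the diff (`cache.cache_dir`, `cache.db`):

```python
from pathlib import Path

from gitflow_analytics.core.cache import GitAnalysisCache
from gitflow_analytics.core.data_fetcher import GitDataFetcher

cache = GitAnalysisCache(Path("~/.gitflow-cache").expanduser())  # assumed signature
fetcher = GitDataFetcher(cache, skip_remote_fetch=True)  # offline: no fetch/pull

outcome = fetcher.process_repositories_parallel(
    repositories=[
        {"path": "/repos/service-a", "project_key": "SVC-A"},
        {"path": "/repos/service-b", "project_key": "SVC-B", "branch_patterns": ["main"]},
    ],
    weeks_back=8,
    max_workers=3,
)

print(outcome["statistics"])                    # processed / success / failed / timeout
print(fetcher.get_repository_status_summary())  # per-repo remote-update status
```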