gitflow-analytics 1.3.6-py3-none-any.whl → 3.3.0-py3-none-any.whl

This diff shows the changes between package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
Files changed (48)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/batch_classifier.py +156 -4
  3. gitflow_analytics/cli.py +897 -179
  4. gitflow_analytics/config/loader.py +40 -1
  5. gitflow_analytics/config/schema.py +4 -0
  6. gitflow_analytics/core/cache.py +20 -0
  7. gitflow_analytics/core/data_fetcher.py +1254 -228
  8. gitflow_analytics/core/git_auth.py +169 -0
  9. gitflow_analytics/core/git_timeout_wrapper.py +347 -0
  10. gitflow_analytics/core/metrics_storage.py +12 -3
  11. gitflow_analytics/core/progress.py +219 -18
  12. gitflow_analytics/core/subprocess_git.py +145 -0
  13. gitflow_analytics/extractors/ml_tickets.py +3 -2
  14. gitflow_analytics/extractors/tickets.py +93 -8
  15. gitflow_analytics/integrations/jira_integration.py +1 -1
  16. gitflow_analytics/integrations/orchestrator.py +47 -29
  17. gitflow_analytics/metrics/branch_health.py +3 -2
  18. gitflow_analytics/models/database.py +72 -1
  19. gitflow_analytics/pm_framework/adapters/jira_adapter.py +12 -5
  20. gitflow_analytics/pm_framework/orchestrator.py +8 -3
  21. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +24 -4
  22. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +3 -1
  23. gitflow_analytics/qualitative/core/llm_fallback.py +34 -2
  24. gitflow_analytics/reports/narrative_writer.py +118 -74
  25. gitflow_analytics/security/__init__.py +11 -0
  26. gitflow_analytics/security/config.py +189 -0
  27. gitflow_analytics/security/extractors/__init__.py +7 -0
  28. gitflow_analytics/security/extractors/dependency_checker.py +379 -0
  29. gitflow_analytics/security/extractors/secret_detector.py +197 -0
  30. gitflow_analytics/security/extractors/vulnerability_scanner.py +333 -0
  31. gitflow_analytics/security/llm_analyzer.py +347 -0
  32. gitflow_analytics/security/reports/__init__.py +5 -0
  33. gitflow_analytics/security/reports/security_report.py +358 -0
  34. gitflow_analytics/security/security_analyzer.py +414 -0
  35. gitflow_analytics/tui/app.py +3 -1
  36. gitflow_analytics/tui/progress_adapter.py +313 -0
  37. gitflow_analytics/tui/screens/analysis_progress_screen.py +407 -46
  38. gitflow_analytics/tui/screens/results_screen.py +219 -206
  39. gitflow_analytics/ui/__init__.py +21 -0
  40. gitflow_analytics/ui/progress_display.py +1477 -0
  41. gitflow_analytics/verify_activity.py +697 -0
  42. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/METADATA +2 -1
  43. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/RECORD +47 -31
  44. gitflow_analytics/cli_rich.py +0 -503
  45. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/WHEEL +0 -0
  46. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/entry_points.txt +0 -0
  47. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/licenses/LICENSE +0 -0
  48. {gitflow_analytics-1.3.6.dist-info → gitflow_analytics-3.3.0.dist-info}/top_level.txt +0 -0
@@ -5,15 +5,17 @@ focusing purely on data collection from Git repositories and ticket systems
5
5
  without performing any LLM-based classification.
6
6
  """
7
7
 
8
- import contextlib
9
8
  import logging
10
9
  import os
11
10
  import subprocess
12
- from collections import defaultdict
11
+ import threading
12
+ import time
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
14
  from datetime import datetime, timedelta, timezone
14
15
  from pathlib import Path
15
16
  from typing import Any, Optional
16
17
 
18
+ from sqlalchemy import func
17
19
  from sqlalchemy.orm import Session
18
20
 
19
21
  from ..extractors.story_points import StoryPointExtractor
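The new threading and ThreadPoolExecutor imports above support the per-thread Repo handling added throughout this file (see the THREAD SAFETY notes in the following hunks): each worker thread opens and caches its own GitPython Repo instead of sharing one across threads. The sketch below only illustrates that pattern; open_repo, count_recent_commits, and repo_paths are illustrative names, not part of the package.

import threading
from concurrent.futures import ThreadPoolExecutor

from git import Repo  # GitPython, as used elsewhere in data_fetcher.py

_local = threading.local()

def open_repo(path: str) -> Repo:
    # Lazily create one Repo per (thread, path); instances never cross threads.
    if not hasattr(_local, "repos"):
        _local.repos = {}
    if path not in _local.repos:
        _local.repos[path] = Repo(path)
    return _local.repos[path]

def count_recent_commits(path: str) -> int:
    return sum(1 for _ in open_repo(path).iter_commits(max_count=100))

# repo_paths would be a list of local clone paths:
# with ThreadPoolExecutor(max_workers=4) as pool:
#     totals = dict(zip(repo_paths, pool.map(count_recent_commits, repo_paths)))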
@@ -27,11 +29,17 @@ from ..models.database import (
27
29
  )
28
30
  from .branch_mapper import BranchToProjectMapper
29
31
  from .cache import GitAnalysisCache
32
+ from .git_timeout_wrapper import GitOperationTimeout, GitTimeoutWrapper, HeartbeatLogger
30
33
  from .identity import DeveloperIdentityResolver
31
34
  from .progress import get_progress_service
32
35
 
33
36
  logger = logging.getLogger(__name__)
34
37
 
38
+ # THREAD SAFETY: Module-level thread-local storage for repository instances
39
+ # Each thread gets its own isolated storage to prevent thread-safety issues
40
+ # when GitDataFetcher is called from ThreadPoolExecutor
41
+ _thread_local = threading.local()
42
+
35
43
 
36
44
  class GitDataFetcher:
37
45
  """Fetches raw Git commit data and organizes it by day for efficient batch processing.
@@ -50,6 +58,7 @@ class GitDataFetcher:
50
58
  branch_mapping_rules: Optional[dict[str, list[str]]] = None,
51
59
  allowed_ticket_platforms: Optional[list[str]] = None,
52
60
  exclude_paths: Optional[list[str]] = None,
61
+ skip_remote_fetch: bool = False,
53
62
  ):
54
63
  """Initialize the data fetcher.
55
64
 
@@ -58,8 +67,11 @@ class GitDataFetcher:
58
67
  branch_mapping_rules: Rules for mapping branches to projects
59
68
  allowed_ticket_platforms: List of allowed ticket platforms
60
69
  exclude_paths: List of file paths to exclude from analysis
70
+ skip_remote_fetch: If True, skip git fetch/pull operations
61
71
  """
62
72
  self.cache = cache
73
+ self.skip_remote_fetch = skip_remote_fetch
74
+ self.repository_status = {} # Track status of each repository
63
75
  # CRITICAL FIX: Use the same database instance as the cache to avoid session conflicts
64
76
  self.database = cache.db
65
77
  self.story_point_extractor = StoryPointExtractor()
@@ -67,10 +79,35 @@ class GitDataFetcher:
67
79
  self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
68
80
  self.exclude_paths = exclude_paths or []
69
81
 
82
+ # Log exclusion configuration
83
+ if self.exclude_paths:
84
+ logger.info(
85
+ f"GitDataFetcher initialized with {len(self.exclude_paths)} exclusion patterns:"
86
+ )
87
+ for pattern in self.exclude_paths[:5]: # Show first 5 patterns
88
+ logger.debug(f" - {pattern}")
89
+ if len(self.exclude_paths) > 5:
90
+ logger.debug(f" ... and {len(self.exclude_paths) - 5} more patterns")
91
+ else:
92
+ logger.info("GitDataFetcher initialized with no file exclusions")
93
+
70
94
  # Initialize identity resolver
71
95
  identity_db_path = cache.cache_dir / "identities.db"
72
96
  self.identity_resolver = DeveloperIdentityResolver(identity_db_path)
73
97
 
98
+ # Initialize git timeout wrapper for safe operations
99
+ self.git_wrapper = GitTimeoutWrapper(default_timeout=30)
100
+
101
+ # Statistics for tracking repository processing
102
+ self.processing_stats = {
103
+ "total": 0,
104
+ "processed": 0,
105
+ "success": 0,
106
+ "failed": 0,
107
+ "timeout": 0,
108
+ "repositories": {},
109
+ }
110
+
74
111
  def fetch_repository_data(
75
112
  self,
76
113
  repo_path: Path,
@@ -117,37 +154,75 @@ class GitDataFetcher:
117
154
  f"🔍 DEBUG: Calculated date range from weeks_back: {start_date} to {end_date}"
118
155
  )
119
156
 
120
- # Step 1: Collect all commits organized by day
157
+ # Get progress service for top-level progress tracking
158
+ progress = get_progress_service()
159
+
160
+ # Start Rich display for this repository if enabled
161
+ if hasattr(progress, "_use_rich") and progress._use_rich:
162
+ # Count total commits for progress estimation
163
+ try:
164
+ import git
165
+
166
+ git.Repo(repo_path)
167
+ # Check if we need to clone or pull
168
+ if not repo_path.exists() or not (repo_path / ".git").exists():
169
+ logger.info(f"📥 Repository {project_key} needs cloning")
170
+ progress.start_repository(f"{project_key} (cloning)", 0)
171
+ else:
172
+ # Rough estimate based on weeks
173
+ estimated_commits = weeks_back * 50 # Estimate ~50 commits per week
174
+ progress.start_repository(project_key, estimated_commits)
175
+ except Exception:
176
+ progress.start_repository(project_key, 100) # Default estimate
177
+
178
+ # Step 1: Collect all commits organized by day with enhanced progress tracking
121
179
  logger.info("🔍 DEBUG: About to fetch commits by day")
122
- logger.info("Fetching commits organized by day...")
123
- daily_commits = self._fetch_commits_by_day(
124
- repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
125
- )
126
- logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
127
-
128
- # Step 2: Extract and fetch all referenced tickets
129
- logger.info("🔍 DEBUG: About to extract ticket references")
130
- logger.info("Extracting ticket references...")
131
- ticket_ids = self._extract_all_ticket_references(daily_commits)
132
- logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")
133
-
134
- if jira_integration and ticket_ids:
135
- logger.info(f"Fetching {len(ticket_ids)} unique tickets from JIRA...")
136
- self._fetch_detailed_tickets(
137
- ticket_ids, jira_integration, project_key, progress_callback
180
+ logger.info(f"Fetching commits organized by day for repository: {project_key}")
181
+
182
+ # Create top-level progress for this repository
183
+ with progress.progress(
184
+ total=3, # Three main steps: fetch commits, extract tickets, store data
185
+ description=f"📊 Processing repository: {project_key}",
186
+ unit="steps",
187
+ ) as repo_progress_ctx:
188
+
189
+ # Step 1: Fetch commits
190
+ progress.set_description(repo_progress_ctx, f"🔍 {project_key}: Fetching commits")
191
+ daily_commits = self._fetch_commits_by_day(
192
+ repo_path, project_key, start_date, end_date, branch_patterns, progress_callback
138
193
  )
194
+ logger.info(f"🔍 DEBUG: Fetched {len(daily_commits)} days of commits")
195
+ progress.update(repo_progress_ctx)
139
196
 
140
- # Step 3: Store commit-ticket correlations
141
- logger.info("Building commit-ticket correlations...")
142
- correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
197
+ # Step 2: Extract and fetch all referenced tickets
198
+ progress.set_description(repo_progress_ctx, f"🎫 {project_key}: Processing tickets")
199
+ logger.info("🔍 DEBUG: About to extract ticket references")
200
+ logger.info(f"Extracting ticket references for {project_key}...")
201
+ ticket_ids = self._extract_all_ticket_references(daily_commits)
202
+ logger.info(f"🔍 DEBUG: Extracted {len(ticket_ids)} ticket IDs")
143
203
 
144
- # Step 4: Store daily commit batches
145
- logger.info(
146
- f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
147
- )
148
- logger.info("Storing daily commit batches...")
149
- batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
150
- logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
204
+ if jira_integration and ticket_ids:
205
+ logger.info(
206
+ f"Fetching {len(ticket_ids)} unique tickets from JIRA for {project_key}..."
207
+ )
208
+ self._fetch_detailed_tickets(
209
+ ticket_ids, jira_integration, project_key, progress_callback
210
+ )
211
+
212
+ # Build commit-ticket correlations
213
+ logger.info(f"Building commit-ticket correlations for {project_key}...")
214
+ correlations_created = self._build_commit_ticket_correlations(daily_commits, repo_path)
215
+ progress.update(repo_progress_ctx)
216
+
217
+ # Step 3: Store daily commit batches
218
+ progress.set_description(repo_progress_ctx, f"💾 {project_key}: Storing data")
219
+ logger.info(
220
+ f"🔍 DEBUG: About to store daily batches. Daily commits has {len(daily_commits)} days"
221
+ )
222
+ logger.info("Storing daily commit batches...")
223
+ batches_created = self._store_daily_batches(daily_commits, repo_path, project_key)
224
+ logger.info(f"🔍 DEBUG: Storage complete. Batches created: {batches_created}")
225
+ progress.update(repo_progress_ctx)
151
226
 
152
227
  # CRITICAL FIX: Verify actual storage before reporting success
153
228
  session = self.database.get_session()
@@ -167,6 +242,46 @@ class GitDataFetcher:
167
242
  # Return summary statistics with ACTUAL stored counts
168
243
  # expected_commits already calculated above
169
244
 
245
+ # Calculate exclusion impact summary if exclusions are configured
246
+ exclusion_stats = {
247
+ "patterns_applied": len(self.exclude_paths),
248
+ "enabled": bool(self.exclude_paths),
249
+ }
250
+
251
+ if self.exclude_paths and expected_commits > 0:
252
+ # Get aggregate stats from daily_commit_batches
253
+ session = self.database.get_session()
254
+ try:
255
+ batch_stats = (
256
+ session.query(
257
+ func.sum(DailyCommitBatch.total_lines_added).label("total_added"),
258
+ func.sum(DailyCommitBatch.total_lines_deleted).label("total_deleted"),
259
+ )
260
+ .filter(
261
+ DailyCommitBatch.project_key == project_key,
262
+ DailyCommitBatch.repo_path == str(repo_path),
263
+ )
264
+ .first()
265
+ )
266
+
267
+ if batch_stats and batch_stats.total_added:
268
+ total_lines = (batch_stats.total_added or 0) + (batch_stats.total_deleted or 0)
269
+ exclusion_stats["total_lines_after_filtering"] = total_lines
270
+ exclusion_stats["lines_added"] = batch_stats.total_added or 0
271
+ exclusion_stats["lines_deleted"] = batch_stats.total_deleted or 0
272
+
273
+ logger.info(
274
+ f"📊 Exclusion Impact Summary for {project_key}:\n"
275
+ f" - Exclusion patterns applied: {len(self.exclude_paths)}\n"
276
+ f" - Total lines after filtering: {total_lines:,}\n"
277
+ f" - Lines added: {batch_stats.total_added:,}\n"
278
+ f" - Lines deleted: {batch_stats.total_deleted:,}"
279
+ )
280
+ except Exception as e:
281
+ logger.debug(f"Could not calculate exclusion impact summary: {e}")
282
+ finally:
283
+ session.close()
284
+
170
285
  results = {
171
286
  "project_key": project_key,
172
287
  "repo_path": str(repo_path),
@@ -180,6 +295,7 @@ class GitDataFetcher:
180
295
  "correlations_created": correlations_created,
181
296
  "batches_created": batches_created,
182
297
  },
298
+ "exclusions": exclusion_stats,
183
299
  "daily_commits": daily_commits, # For immediate use if needed
184
300
  }
185
301
 
@@ -188,10 +304,20 @@ class GitDataFetcher:
188
304
  logger.info(
189
305
  f"✅ Data fetch completed successfully for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
190
306
  )
307
+ # Finish repository in Rich display with success
308
+ if hasattr(progress, "_use_rich") and progress._use_rich:
309
+ progress.finish_repository(project_key, success=True)
191
310
  else:
192
311
  logger.error(
193
312
  f"⚠️ Data fetch completed with storage issues for {project_key}: {actual_stored_commits}/{expected_commits} commits stored, {len(ticket_ids)} tickets"
194
313
  )
314
+ # Finish repository in Rich display with error
315
+ if hasattr(progress, "_use_rich") and progress._use_rich:
316
+ progress.finish_repository(
317
+ project_key,
318
+ success=False,
319
+ error_message=f"Storage issue: {actual_stored_commits}/{expected_commits} commits",
320
+ )
195
321
 
196
322
  return results
197
323
 
@@ -209,74 +335,296 @@ class GitDataFetcher:
209
335
  Returns:
210
336
  Dictionary mapping date strings (YYYY-MM-DD) to lists of commit data
211
337
  """
212
- from git import Repo
338
+
339
+ # THREAD SAFETY: Use the module-level thread-local storage to ensure each thread
340
+ # gets its own Repo instance. This prevents thread-safety issues when called from ThreadPoolExecutor
341
+
342
+ # Set environment variables to prevent ANY password prompts before opening repo
343
+ original_env = {}
344
+ env_vars = {
345
+ "GIT_TERMINAL_PROMPT": "0",
346
+ "GIT_ASKPASS": "/bin/echo", # Use full path to echo
347
+ "SSH_ASKPASS": "/bin/echo",
348
+ "GCM_INTERACTIVE": "never",
349
+ "GIT_SSH_COMMAND": "ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o PasswordAuthentication=no",
350
+ "DISPLAY": "",
351
+ "GIT_CREDENTIAL_HELPER": "", # Disable credential helper
352
+ "GCM_PROVIDER": "none", # Disable Git Credential Manager
353
+ "GIT_CREDENTIALS": "", # Clear any cached credentials
354
+ "GIT_CONFIG_NOSYSTEM": "1", # Don't use system config
355
+ }
356
+
357
+ # Save original environment and set our values
358
+ for key, value in env_vars.items():
359
+ original_env[key] = os.environ.get(key)
360
+ os.environ[key] = value
213
361
 
214
362
  try:
215
- repo = Repo(repo_path)
363
+ # THREAD SAFETY: Create a fresh Repo instance for this thread
364
+ # Do NOT reuse Repo instances across threads
365
+ from git import Repo
366
+
367
+ # Check for security issues in repository configuration
368
+ self._check_repository_security(repo_path, project_key)
369
+
370
+ # When skip_remote_fetch is enabled, use a more restricted repository access
371
+ if self.skip_remote_fetch:
372
+ # Use a special git configuration that completely disables credential helpers
373
+ import tempfile
374
+
375
+ # Create a secure temporary directory for our config
376
+ temp_dir = tempfile.mkdtemp(prefix="gitflow_")
377
+
378
+ # Create a temporary git config that disables all authentication
379
+ tmp_config_path = os.path.join(temp_dir, ".gitconfig")
380
+ with open(tmp_config_path, "w") as tmp_config:
381
+ tmp_config.write("[credential]\n")
382
+ tmp_config.write(" helper = \n")
383
+ tmp_config.write("[core]\n")
384
+ tmp_config.write(" askpass = \n")
385
+
386
+ # Set GIT_CONFIG to use our temporary config
387
+ os.environ["GIT_CONFIG_GLOBAL"] = tmp_config_path
388
+ os.environ["GIT_CONFIG_SYSTEM"] = "/dev/null"
389
+
390
+ # Store temp_dir in thread-local storage for cleanup
391
+ _thread_local.temp_dir = temp_dir
392
+
393
+ try:
394
+ # Open repository with our restricted configuration
395
+ # THREAD SAFETY: Each thread gets its own Repo instance
396
+ repo = Repo(repo_path)
397
+ finally:
398
+ # Clean up temporary config directory
399
+ try:
400
+ import shutil
401
+
402
+ if hasattr(_thread_local, "temp_dir"):
403
+ shutil.rmtree(_thread_local.temp_dir, ignore_errors=True)
404
+ delattr(_thread_local, "temp_dir")
405
+ except:
406
+ pass
407
+
408
+ try:
409
+ # Configure git to never prompt for credentials
410
+ with repo.config_writer() as git_config:
411
+ git_config.set_value("core", "askpass", "")
412
+ git_config.set_value("credential", "helper", "")
413
+ except Exception as e:
414
+ logger.debug(f"Could not update git config: {e}")
415
+
416
+ # Note: We can't monkey-patch remotes as it's a property without setter
417
+ # The skip_remote_fetch flag will prevent remote operations elsewhere
418
+
419
+ logger.debug(
420
+ f"Opened repository {project_key} in offline mode (skip_remote_fetch=true)"
421
+ )
422
+ else:
423
+ # THREAD SAFETY: Each thread gets its own Repo instance
424
+ repo = Repo(repo_path)
425
+ # Track repository status
426
+ self.repository_status[project_key] = {
427
+ "path": str(repo_path),
428
+ "remote_update": "skipped" if self.skip_remote_fetch else "pending",
429
+ "authentication_issues": False,
430
+ "error": None,
431
+ }
432
+
216
433
  # Update repository from remote before analysis
217
- self._update_repository(repo)
434
+ if not self.skip_remote_fetch:
435
+ logger.info(f"📥 Updating repository {project_key} from remote...")
436
+
437
+ update_success = self._update_repository(repo)
438
+ if not self.skip_remote_fetch:
439
+ self.repository_status[project_key]["remote_update"] = (
440
+ "success" if update_success else "failed"
441
+ )
442
+ if not update_success:
443
+ logger.warning(
444
+ f"⚠️ {project_key}: Continuing with local repository state (remote update failed)"
445
+ )
446
+
218
447
  except Exception as e:
219
- logger.error(f"Failed to open repository at {repo_path}: {e}")
448
+ logger.error(f"Failed to open repository {project_key} at {repo_path}: {e}")
449
+ self.repository_status[project_key] = {
450
+ "path": str(repo_path),
451
+ "remote_update": "error",
452
+ "authentication_issues": "authentication" in str(e).lower()
453
+ or "password" in str(e).lower(),
454
+ "error": str(e),
455
+ }
456
+ # Restore original environment variables before returning
457
+ for key, value in original_env.items():
458
+ if value is None:
459
+ os.environ.pop(key, None)
460
+ else:
461
+ os.environ[key] = value
220
462
  return {}
221
463
 
222
- # Collect commits from all relevant branches
223
- all_commits = []
464
+ # Get branches to analyze
224
465
  branches_to_analyze = self._get_branches_to_analyze(repo, branch_patterns)
225
466
 
226
467
  if not branches_to_analyze:
227
- logger.warning(f"No accessible branches found in repository {repo_path}")
468
+ logger.warning(
469
+ f"No accessible branches found in repository {project_key} at {repo_path}"
470
+ )
471
+ # Restore original environment variables before returning
472
+ for key, value in original_env.items():
473
+ if value is None:
474
+ os.environ.pop(key, None)
475
+ else:
476
+ os.environ[key] = value
228
477
  return {}
229
478
 
230
- logger.info(f"Analyzing branches: {branches_to_analyze}")
479
+ logger.info(f"🌿 {project_key}: Analyzing branches: {branches_to_analyze}")
231
480
 
232
- for branch_name in branches_to_analyze:
233
- try:
234
- branch_commits = list(
235
- repo.iter_commits(branch_name, since=start_date, until=end_date, reverse=False)
236
- )
481
+ # Calculate days to process
482
+ current_date = start_date.date()
483
+ end_date_only = end_date.date()
484
+ days_to_process = []
485
+ while current_date <= end_date_only:
486
+ days_to_process.append(current_date)
487
+ current_date += timedelta(days=1)
237
488
 
238
- logger.debug(
239
- f"Found {len(branch_commits)} commits in branch {branch_name} for date range"
489
+ logger.info(
490
+ f"Processing {len(days_to_process)} days from {start_date.date()} to {end_date.date()}"
491
+ )
492
+
493
+ # Get progress service for nested progress tracking
494
+ progress = get_progress_service()
495
+
496
+ # Dictionary to store commits by day
497
+ daily_commits = {}
498
+ all_commit_hashes = set() # Track all hashes for deduplication
499
+
500
+ # Count total commits first for Rich display
501
+ try:
502
+ for branch_name in branches_to_analyze[:1]: # Sample from first branch
503
+ sample_commits = list(
504
+ repo.iter_commits(branch_name, since=start_date, until=end_date)
240
505
  )
506
+ # Estimate based on first branch (multiply by number of branches for rough estimate)
507
+ len(sample_commits) * len(branches_to_analyze)
508
+ break
509
+ except:
510
+ len(days_to_process) * 50 # Default estimate
511
+
512
+ # Update repository in Rich display with estimated commit count
513
+ if hasattr(progress, "_use_rich") and progress._use_rich:
514
+ progress.update_repository(project_key, 0, 0.0)
515
+
516
+ # Create nested progress for day-by-day processing
517
+ with progress.progress(
518
+ total=len(days_to_process),
519
+ description=f"📅 Fetching commits for repository: {project_key}",
520
+ unit="days",
521
+ nested=True,
522
+ ) as day_progress_ctx:
523
+
524
+ for day_date in days_to_process:
525
+ # Update description to show current repository and day clearly
526
+ day_str = day_date.strftime("%Y-%m-%d")
527
+ progress.set_description(day_progress_ctx, f"🔍 {project_key}: Analyzing {day_str}")
528
+
529
+ # Calculate day boundaries
530
+ day_start = datetime.combine(day_date, datetime.min.time(), tzinfo=timezone.utc)
531
+ day_end = datetime.combine(day_date, datetime.max.time(), tzinfo=timezone.utc)
532
+
533
+ day_commits = []
534
+ commits_found_today = 0
535
+
536
+ # Process each branch for this specific day
537
+ for branch_name in branches_to_analyze:
538
+ try:
539
+ # Fetch commits for this specific day and branch with timeout protection
540
+ def fetch_branch_commits():
541
+ return list(
542
+ repo.iter_commits(
543
+ branch_name, since=day_start, until=day_end, reverse=False
544
+ )
545
+ )
546
+
547
+ # Use timeout wrapper to prevent hanging on iter_commits
548
+ try:
549
+ branch_commits = self.git_wrapper.run_with_timeout(
550
+ fetch_branch_commits,
551
+ timeout=15, # 15 seconds per branch/day combination
552
+ operation_name=f"iter_commits_{branch_name}_{day_str}",
553
+ )
554
+ except GitOperationTimeout:
555
+ logger.warning(
556
+ f"⏱️ Timeout fetching commits for branch {branch_name} on {day_str}, skipping"
557
+ )
558
+ continue
241
559
 
242
- for commit in branch_commits:
243
- # Include merge commits like the original analyzer
244
- # The original analyzer marks merge commits with is_merge=True but doesn't skip them
560
+ for commit in branch_commits:
561
+ # Skip if we've already processed this commit
562
+ if commit.hexsha in all_commit_hashes:
563
+ continue
245
564
 
246
- # Extract commit data with full metadata
247
- commit_data = self._extract_commit_data(
248
- commit, branch_name, project_key, repo_path
565
+ # Extract commit data with full metadata
566
+ commit_data = self._extract_commit_data(
567
+ commit, branch_name, project_key, repo_path
568
+ )
569
+ if commit_data:
570
+ day_commits.append(commit_data)
571
+ all_commit_hashes.add(commit.hexsha)
572
+ commits_found_today += 1
573
+
574
+ except GitOperationTimeout as e:
575
+ logger.warning(
576
+ f"⏱️ Timeout processing branch {branch_name} for day {day_str}: {e}"
577
+ )
578
+ continue
579
+ except Exception as e:
580
+ logger.warning(
581
+ f"Error processing branch {branch_name} for day {day_str}: {e}"
582
+ )
583
+ continue
584
+
585
+ # Store commits for this day if any were found
586
+ if day_commits:
587
+ # Sort commits by timestamp
588
+ day_commits.sort(key=lambda c: c["timestamp"])
589
+ daily_commits[day_str] = day_commits
590
+
591
+ # Incremental caching - store commits for this day immediately
592
+ self._store_day_commits_incremental(
593
+ repo_path, day_str, day_commits, project_key
249
594
  )
250
- if commit_data:
251
- all_commits.append(commit_data)
252
595
 
253
- except Exception as e:
254
- logger.warning(f"Error processing branch {branch_name}: {e}")
255
- continue
596
+ logger.debug(f"Found {commits_found_today} commits on {day_str}")
597
+
598
+ # Update progress callback if provided
599
+ if progress_callback:
600
+ progress_callback(f"Processed {day_str}: {commits_found_today} commits")
256
601
 
257
- # Deduplicate commits (same commit may appear in multiple branches)
258
- seen_hashes = set()
259
- unique_commits = []
260
- for commit_data in all_commits:
261
- commit_hash = commit_data["commit_hash"]
262
- if commit_hash not in seen_hashes:
263
- seen_hashes.add(commit_hash)
264
- unique_commits.append(commit_data)
265
-
266
- # Organize commits by day
267
- daily_commits = defaultdict(list)
268
- for commit_data in unique_commits:
269
- # Convert timestamp to date key
270
- commit_date = commit_data["timestamp"].date()
271
- date_key = commit_date.strftime("%Y-%m-%d")
272
- daily_commits[date_key].append(commit_data)
273
-
274
- # Sort commits within each day by timestamp
275
- for date_key in daily_commits:
276
- daily_commits[date_key].sort(key=lambda c: c["timestamp"])
277
-
278
- logger.info(f"Collected {len(unique_commits)} commits across {len(daily_commits)} days")
279
- return dict(daily_commits)
602
+ # Update progress bar
603
+ progress.update(day_progress_ctx)
604
+
605
+ # Update Rich display with current commit count and speed
606
+ if hasattr(progress, "_use_rich") and progress._use_rich:
607
+ total_processed = len(all_commit_hashes)
608
+ # Calculate speed (commits per second) based on elapsed time
609
+ import time
610
+
611
+ if not hasattr(self, "_fetch_start_time"):
612
+ self._fetch_start_time = time.time()
613
+ elapsed = time.time() - self._fetch_start_time
614
+ speed = total_processed / elapsed if elapsed > 0 else 0
615
+ progress.update_repository(project_key, total_processed, speed)
616
+
617
+ total_commits = sum(len(commits) for commits in daily_commits.values())
618
+ logger.info(f"Collected {total_commits} unique commits across {len(daily_commits)} days")
619
+
620
+ # Restore original environment variables
621
+ for key, value in original_env.items():
622
+ if value is None:
623
+ os.environ.pop(key, None)
624
+ else:
625
+ os.environ[key] = value
626
+
627
+ return daily_commits
280
628
 
281
629
  def _extract_commit_data(
282
630
  self, commit: Any, branch_name: str, project_key: str, repo_path: Path
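The hunk above sets a block of GIT_*/SSH_* environment variables up front and then restores them by hand at every exit path. One compact way to express that save/set/restore pattern is a context manager; this sketch is illustrative only and is not code from the package.

import os
from contextlib import contextmanager

@contextmanager
def scoped_git_env(overrides: dict[str, str]):
    saved = {key: os.environ.get(key) for key in overrides}
    os.environ.update(overrides)
    try:
        yield
    finally:
        # Restore exactly what was there before, dropping keys that were unset.
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

# with scoped_git_env({"GIT_TERMINAL_PROMPT": "0", "GIT_ASKPASS": "/bin/echo"}):
#     ...  # open the repository and iterate commits with prompts disabled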
@@ -317,13 +665,19 @@ class GitDataFetcher:
317
665
  line_stats = self._calculate_commit_stats(commit)
318
666
  total_insertions = line_stats["insertions"]
319
667
  total_deletions = line_stats["deletions"]
668
+ raw_insertions = line_stats.get("raw_insertions", total_insertions)
669
+ raw_deletions = line_stats.get("raw_deletions", total_deletions)
320
670
 
321
671
  commit_data.update(
322
672
  {
323
673
  "files_changed": files_changed,
324
674
  "files_changed_count": len(files_changed),
325
- "lines_added": total_insertions,
675
+ "lines_added": total_insertions, # Filtered counts for backward compatibility
326
676
  "lines_deleted": total_deletions,
677
+ "filtered_insertions": total_insertions, # Explicitly filtered counts
678
+ "filtered_deletions": total_deletions,
679
+ "raw_insertions": raw_insertions, # Raw unfiltered counts
680
+ "raw_deletions": raw_deletions,
327
681
  }
328
682
  )
329
683
 
@@ -335,6 +689,10 @@ class GitDataFetcher:
335
689
  "files_changed_count": 0,
336
690
  "lines_added": 0,
337
691
  "lines_deleted": 0,
692
+ "filtered_insertions": 0,
693
+ "filtered_deletions": 0,
694
+ "raw_insertions": 0,
695
+ "raw_deletions": 0,
338
696
  }
339
697
  )
340
698
 
@@ -363,9 +721,166 @@ class GitDataFetcher:
363
721
 
364
722
  def _should_exclude_file(self, file_path: str) -> bool:
365
723
  """Check if a file should be excluded based on exclude patterns."""
724
+ return any(self._matches_glob_pattern(file_path, pattern) for pattern in self.exclude_paths)
725
+
726
+ def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
727
+ """Check if a file path matches a glob pattern, handling ** recursion correctly.
728
+
729
+ This method properly handles different glob pattern types:
730
+ - **/vendor/** : matches files inside vendor directories at any level
731
+ - **/*.min.js : matches files with specific suffix anywhere in directory tree
732
+ - vendor/** : matches files inside vendor directory at root level only
733
+ - **pattern** : handles other complex patterns with pathlib.match()
734
+ - simple patterns : uses fnmatch for basic wildcards
735
+
736
+ Args:
737
+ filepath: The file path to check
738
+ pattern: The glob pattern to match against
739
+
740
+ Returns:
741
+ True if the file path matches the pattern, False otherwise
742
+ """
743
+ import fnmatch
744
+ import re
745
+ from pathlib import PurePath
746
+
747
+ # Handle empty or invalid inputs
748
+ if not filepath or not pattern:
749
+ return False
750
+
751
+ path = PurePath(filepath)
752
+
753
+ # Check for multiple ** patterns first (most complex)
754
+ if "**" in pattern and pattern.count("**") > 1:
755
+ # Multiple ** patterns - use custom recursive matching for complex patterns
756
+ return self._match_recursive_pattern(filepath, pattern)
757
+
758
+ # Then handle simple ** patterns
759
+ elif pattern.startswith("**/") and pattern.endswith("/**"):
760
+ # Pattern like **/vendor/** - matches files inside vendor directories at any level
761
+ dir_name = pattern[3:-3] # Extract 'vendor' from '**/vendor/**'
762
+ if not dir_name: # Handle edge case of '**/**'
763
+ return True
764
+ return dir_name in path.parts
765
+
766
+ elif pattern.startswith("**/"):
767
+ # Pattern like **/*.min.js - matches files with specific suffix anywhere
768
+ suffix_pattern = pattern[3:]
769
+ if not suffix_pattern: # Handle edge case of '**/'
770
+ return True
771
+ # Check against filename for file patterns, or any path part for directory patterns
772
+ if suffix_pattern.endswith("/"):
773
+ # Directory pattern like **/build/
774
+ dir_name = suffix_pattern[:-1]
775
+ return dir_name in path.parts
776
+ else:
777
+ # File pattern like *.min.js
778
+ return fnmatch.fnmatch(path.name, suffix_pattern)
779
+
780
+ elif pattern.endswith("/**"):
781
+ # Pattern like vendor/** or docs/build/** - matches files inside directory at root level
782
+ dir_name = pattern[:-3]
783
+ if not dir_name: # Handle edge case of '/**'
784
+ return True
785
+
786
+ # Handle both single directory names and nested paths
787
+ expected_parts = PurePath(dir_name).parts
788
+ return (
789
+ len(path.parts) >= len(expected_parts)
790
+ and path.parts[: len(expected_parts)] == expected_parts
791
+ )
792
+
793
+ elif "**" in pattern:
794
+ # Single ** pattern - use pathlib matching with fallback
795
+ try:
796
+ return path.match(pattern)
797
+ except (ValueError, TypeError):
798
+ # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
799
+ try:
800
+ return fnmatch.fnmatch(filepath, pattern)
801
+ except re.error:
802
+ # Invalid regex pattern - return False to be safe
803
+ return False
804
+ else:
805
+ # Simple pattern - use fnmatch for basic wildcards
806
+ try:
807
+ # Try matching the full path first
808
+ if fnmatch.fnmatch(filepath, pattern):
809
+ return True
810
+ # Also try matching just the filename for simple patterns
811
+ # This allows "package-lock.json" to match "src/package-lock.json"
812
+ return fnmatch.fnmatch(path.name, pattern)
813
+ except re.error:
814
+ # Invalid regex pattern - return False to be safe
815
+ return False
816
+
817
+ def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
818
+ """Handle complex patterns with multiple ** wildcards.
819
+
820
+ Args:
821
+ filepath: The file path to check
822
+ pattern: The pattern with multiple ** wildcards
823
+
824
+ Returns:
825
+ True if the path matches the pattern, False otherwise
826
+ """
366
827
  import fnmatch
828
+ from pathlib import PurePath
829
+
830
+ # Split pattern by ** to handle each segment
831
+ parts = pattern.split("**")
832
+
833
+ # Validate that we have actual segments
834
+ if not parts:
835
+ return False
836
+
837
+ # Convert filepath to parts for easier matching
838
+ path_parts = list(PurePath(filepath).parts)
839
+
840
+ # Start matching from the beginning
841
+ path_index = 0
842
+
843
+ for i, part in enumerate(parts):
844
+ if not part:
845
+ # Empty part (e.g., from leading or trailing **)
846
+ if i == 0 or i == len(parts) - 1:
847
+ # Leading or trailing ** - continue
848
+ continue
849
+ # Middle empty part (consecutive **) - match any number of path components
850
+ continue
851
+
852
+ # Clean the part (remove leading/trailing slashes)
853
+ part = part.strip("/")
367
854
 
368
- return any(fnmatch.fnmatch(file_path, pattern) for pattern in self.exclude_paths)
855
+ if not part:
856
+ continue
857
+
858
+ # Find where this part matches in the remaining path
859
+ found = False
860
+ for j in range(path_index, len(path_parts)):
861
+ # Check if the current path part matches the pattern part
862
+ if "/" in part:
863
+ # Part contains multiple path components
864
+ sub_parts = part.split("/")
865
+ if j + len(sub_parts) <= len(path_parts) and all(
866
+ fnmatch.fnmatch(path_parts[j + k], sub_parts[k])
867
+ for k in range(len(sub_parts))
868
+ ):
869
+ path_index = j + len(sub_parts)
870
+ found = True
871
+ break
872
+ else:
873
+ # Single component part
874
+ if fnmatch.fnmatch(path_parts[j], part):
875
+ path_index = j + 1
876
+ found = True
877
+ break
878
+
879
+ if not found and part:
880
+ # Required part not found in path
881
+ return False
882
+
883
+ return True
369
884
 
370
885
  def _get_branches_to_analyze(
371
886
  self, repo: Any, branch_patterns: Optional[list[str]]
@@ -373,114 +888,115 @@ class GitDataFetcher:
373
888
  """Get list of branches to analyze based on patterns.
374
889
 
375
890
  WHY: Robust branch detection that handles missing remotes, missing default branches,
376
- and provides good fallback behavior. Based on the approach used in the existing analyzer.
891
+ and provides good fallback behavior. When no patterns specified, analyzes ALL branches
892
+ to capture the complete development picture.
377
893
 
378
894
  DESIGN DECISION:
379
- - Try default branches first, fall back to all available branches
895
+ - When no patterns: analyze ALL accessible branches (not just main)
896
+ - When patterns specified: match against those patterns only
380
897
  - Handle missing remotes gracefully
381
898
  - Skip remote tracking branches to avoid duplicates
382
899
  - Use actual branch existence checking rather than assuming branches exist
900
+
901
+ THREAD SAFETY: This method is thread-safe as it doesn't modify shared state
902
+ and works with a repo instance passed as a parameter.
383
903
  """
384
- if not branch_patterns:
385
- # Get all available branches (local branches preferred)
386
- available_branches = []
904
+ # Collect all available branches (local branches preferred)
905
+ available_branches = []
387
906
 
388
- # First, try local branches
389
- try:
390
- local_branches = [branch.name for branch in repo.branches]
391
- available_branches.extend(local_branches)
392
- logger.debug(f"Found local branches: {local_branches}")
393
- except Exception as e:
394
- logger.debug(f"Error getting local branches: {e}")
907
+ # First, try local branches
908
+ try:
909
+ # THREAD SAFETY: Create a new list to avoid sharing references
910
+ local_branches = list([branch.name for branch in repo.branches])
911
+ available_branches.extend(local_branches)
912
+ logger.debug(f"Found local branches: {local_branches}")
913
+ except Exception as e:
914
+ logger.debug(f"Error getting local branches: {e}")
395
915
 
396
- # If we have remotes, also consider remote branches (but clean the names)
916
+ # If we have remotes, also consider remote branches (but clean the names)
917
+ # Skip remote branch checking if skip_remote_fetch is enabled to avoid auth prompts
918
+ if not self.skip_remote_fetch:
397
919
  try:
398
920
  if repo.remotes and hasattr(repo.remotes, "origin"):
399
- remote_branches = [
400
- ref.name.replace("origin/", "")
401
- for ref in repo.remotes.origin.refs
402
- if not ref.name.endswith("HEAD") # Skip HEAD ref
403
- ]
921
+ # THREAD SAFETY: Create a new list to avoid sharing references
922
+ remote_branches = list(
923
+ [
924
+ ref.name.replace("origin/", "")
925
+ for ref in repo.remotes.origin.refs
926
+ if not ref.name.endswith("HEAD") # Skip HEAD ref
927
+ ]
928
+ )
404
929
  # Only add remote branches that aren't already in local branches
405
930
  for branch in remote_branches:
406
931
  if branch not in available_branches:
407
932
  available_branches.append(branch)
408
933
  logger.debug(f"Found remote branches: {remote_branches}")
409
934
  except Exception as e:
410
- logger.debug(f"Error getting remote branches: {e}")
935
+ logger.debug(f"Error getting remote branches (may require authentication): {e}")
936
+ # Continue with local branches only
937
+ else:
938
+ logger.debug("Skipping remote branch enumeration (skip_remote_fetch=true)")
411
939
 
412
- # If no branches found, fallback to trying common names directly
413
- if not available_branches:
414
- logger.warning(
415
- "No branches found via normal detection, falling back to common names"
416
- )
417
- available_branches = ["main", "master", "develop", "dev"]
940
+ # If no branches found, fallback to trying common names directly
941
+ if not available_branches:
942
+ logger.warning("No branches found via normal detection, falling back to common names")
943
+ available_branches = ["main", "master", "develop", "dev"]
944
+
945
+ # Filter branches based on patterns if provided
946
+ if branch_patterns:
947
+ import fnmatch
948
+
949
+ matching_branches = []
950
+ for pattern in branch_patterns:
951
+ matching = [
952
+ branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)
953
+ ]
954
+ matching_branches.extend(matching)
955
+ # Remove duplicates while preserving order
956
+ branches_to_test = list(dict.fromkeys(matching_branches))
957
+ else:
958
+ # No patterns specified - analyze ALL branches for complete coverage
959
+ branches_to_test = available_branches
960
+ logger.info(
961
+ f"No branch patterns specified - will analyze all {len(branches_to_test)} branches"
962
+ )
418
963
 
419
- # Try default main branches first, in order of preference
964
+ # Test that branches are actually accessible
965
+ accessible_branches = []
966
+ for branch in branches_to_test:
967
+ try:
968
+ # THREAD SAFETY: Use iterator without storing intermediate results
969
+ next(iter(repo.iter_commits(branch, max_count=1)), None)
970
+ accessible_branches.append(branch)
971
+ except Exception as e:
972
+ logger.debug(f"Branch {branch} not accessible: {e}")
973
+
974
+ if not accessible_branches:
975
+ # Last resort: try to find ANY working branch
976
+ logger.warning("No accessible branches found from patterns/default, trying fallback")
420
977
  main_branches = ["main", "master", "develop", "dev"]
421
978
  for branch in main_branches:
422
979
  if branch in available_branches:
423
- # Test that we can actually access this branch
424
980
  try:
425
- # Just try to get the commit object to verify branch exists and is accessible
426
981
  next(iter(repo.iter_commits(branch, max_count=1)), None)
427
- logger.info(f"Using main branch: {branch}")
982
+ logger.info(f"Using fallback main branch: {branch}")
428
983
  return [branch]
429
- except Exception as e:
430
- logger.debug(f"Branch {branch} exists but not accessible: {e}")
984
+ except Exception:
431
985
  continue
432
986
 
433
- # If no main branches work, try the first available branch that actually works
987
+ # Try any available branch
434
988
  for branch in available_branches:
435
989
  try:
436
990
  next(iter(repo.iter_commits(branch, max_count=1)), None)
437
991
  logger.info(f"Using fallback branch: {branch}")
438
992
  return [branch]
439
- except Exception as e:
440
- logger.debug(f"Branch {branch} not accessible: {e}")
993
+ except Exception:
441
994
  continue
442
995
 
443
- # Last resort: return empty list (will be handled gracefully by caller)
444
996
  logger.warning("No accessible branches found")
445
997
  return []
446
998
 
447
- # Use specified patterns - match against all available branches
448
- import fnmatch
449
-
450
- available_branches = []
451
-
452
- # Collect all branches (local and remote)
453
- with contextlib.suppress(Exception):
454
- available_branches.extend([branch.name for branch in repo.branches])
455
-
456
- try:
457
- if repo.remotes and hasattr(repo.remotes, "origin"):
458
- remote_branches = [
459
- ref.name.replace("origin/", "")
460
- for ref in repo.remotes.origin.refs
461
- if not ref.name.endswith("HEAD")
462
- ]
463
- for branch in remote_branches:
464
- if branch not in available_branches:
465
- available_branches.append(branch)
466
- except Exception:
467
- pass
468
-
469
- # Match patterns against available branches
470
- matching_branches = []
471
- for pattern in branch_patterns:
472
- matching = [branch for branch in available_branches if fnmatch.fnmatch(branch, pattern)]
473
- matching_branches.extend(matching)
474
-
475
- # Test that matched branches are actually accessible
476
- accessible_branches = []
477
- for branch in list(set(matching_branches)): # Remove duplicates
478
- try:
479
- next(iter(repo.iter_commits(branch, max_count=1)), None)
480
- accessible_branches.append(branch)
481
- except Exception as e:
482
- logger.debug(f"Matched branch {branch} not accessible: {e}")
483
-
999
+ logger.info(f"Will analyze {len(accessible_branches)} branches: {accessible_branches}")
484
1000
  return accessible_branches
485
1001
 
486
1002
  def _update_repository(self, repo) -> bool:
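Worked examples of the exclusion semantics that _matches_glob_pattern documents above. Here fetcher stands for any GitDataFetcher instance; the expected results follow the docstring and branch logic in this hunk rather than any test fixtures shipped with the package.

checks = [
    ("src/vendor/lib.js",     "**/vendor/**",      True),   # vendor directory at any depth
    ("vendor/lib.js",         "vendor/**",         True),   # root-level vendor only
    ("src/vendor/lib.js",     "vendor/**",         False),
    ("dist/app.min.js",       "**/*.min.js",       True),   # filename suffix, any directory
    ("src/package-lock.json", "package-lock.json", True),   # bare names also match by filename
]
for path, pattern, expected in checks:
    assert fetcher._matches_glob_pattern(path, pattern) == expected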
@@ -503,82 +1019,62 @@ class GitDataFetcher:
503
1019
  Returns:
504
1020
  bool: True if update succeeded, False if failed (but analysis continues)
505
1021
  """
506
- try:
507
- if repo.remotes:
508
- logger.info("Fetching latest changes from remote")
1022
+ # Skip remote operations if configured
1023
+ if self.skip_remote_fetch:
1024
+ logger.info("🚫 Skipping remote fetch (skip_remote_fetch=true)")
1025
+ return True
509
1026
 
510
- # Use subprocess for git fetch to prevent password prompts
511
- env = os.environ.copy()
512
- env["GIT_TERMINAL_PROMPT"] = "0"
513
- env["GIT_ASKPASS"] = ""
514
- env["GCM_INTERACTIVE"] = "never"
1027
+ # Check for stale repository (last fetch > 1 hour ago)
1028
+ self._check_repository_staleness(repo)
515
1029
 
516
- # Run git fetch with timeout
517
- try:
518
- result = subprocess.run(
519
- ["git", "fetch", "--all", "--config", "credential.helper="],
520
- cwd=repo.working_dir,
521
- env=env,
522
- capture_output=True,
523
- text=True,
524
- timeout=30, # 30 second timeout
525
- )
1030
+ try:
1031
+ # Check if we have remotes without triggering authentication
1032
+ has_remotes = False
1033
+ try:
1034
+ has_remotes = bool(repo.remotes)
1035
+ except Exception as e:
1036
+ logger.debug(f"Could not check for remotes (may require authentication): {e}")
1037
+ return True # Continue with local analysis
526
1038
 
527
- if result.returncode != 0:
528
- error_str = result.stderr.lower()
529
- if any(
530
- x in error_str
531
- for x in ["authentication", "permission denied", "401", "403"]
532
- ):
533
- logger.error(
534
- "GitHub authentication failed. Please check your token is valid and has appropriate permissions. "
535
- "You can verify with: gh auth status"
536
- )
537
- # Skip fetch but continue with local analysis
538
- return False
539
- logger.warning(f"Git fetch failed: {result.stderr}")
540
- return False
1039
+ if has_remotes:
1040
+ logger.info("Fetching latest changes from remote")
541
1041
 
542
- except subprocess.TimeoutExpired:
543
- logger.error(
544
- "Git fetch timed out (likely authentication failure). Continuing with local repository state."
1042
+ # Use our timeout wrapper for safe git operations
1043
+ repo_path = Path(repo.working_dir)
1044
+
1045
+ # Try to fetch with timeout protection
1046
+ fetch_success = self.git_wrapper.fetch_with_timeout(repo_path, timeout=30)
1047
+
1048
+ if not fetch_success:
1049
+ # Mark this repository as having authentication issues if applicable
1050
+ if hasattr(self, "repository_status"):
1051
+ for key in self.repository_status:
1052
+ if repo.working_dir.endswith(key) or key in repo.working_dir:
1053
+ self.repository_status[key]["remote_update"] = "failed"
1054
+ break
1055
+
1056
+ # Explicit warning to user about stale data
1057
+ logger.warning(
1058
+ f"❌ Failed to fetch updates for {repo_path.name}. "
1059
+ f"Analysis will use potentially stale local data. "
1060
+ f"Check authentication or network connectivity."
545
1061
  )
546
1062
  return False
1063
+ else:
1064
+ # Explicit success confirmation
1065
+ logger.info(f"✅ Successfully fetched updates for {repo_path.name}")
547
1066
 
548
1067
  # Only try to pull if not in detached HEAD state
549
1068
  if not repo.head.is_detached:
550
1069
  current_branch = repo.active_branch
551
1070
  tracking = current_branch.tracking_branch()
552
1071
  if tracking:
553
- # Pull latest changes using subprocess
554
- try:
555
- result = subprocess.run(
556
- ["git", "pull", "--config", "credential.helper="],
557
- cwd=repo.working_dir,
558
- env=env,
559
- capture_output=True,
560
- text=True,
561
- timeout=30,
562
- )
563
-
564
- if result.returncode != 0:
565
- error_str = result.stderr.lower()
566
- if any(
567
- x in error_str
568
- for x in ["authentication", "permission denied", "401", "403"]
569
- ):
570
- logger.error(
571
- "GitHub authentication failed during pull. Continuing with local repository state."
572
- )
573
- return False
574
- logger.warning(f"Git pull failed: {result.stderr}")
575
- return False
1072
+ # Pull latest changes using timeout wrapper
1073
+ pull_success = self.git_wrapper.pull_with_timeout(repo_path, timeout=30)
1074
+ if pull_success:
576
1075
  logger.debug(f"Pulled latest changes for {current_branch.name}")
577
-
578
- except subprocess.TimeoutExpired:
579
- logger.error(
580
- "Git pull timed out. Continuing with local repository state."
581
- )
1076
+ else:
1077
+ logger.warning("Git pull failed, continuing with fetched state")
582
1078
  return False
583
1079
  else:
584
1080
  logger.debug(
@@ -595,6 +1091,114 @@ class GitDataFetcher:
595
1091
  # Continue with analysis using local state
596
1092
  return False
597
1093
 
1094
+ def _check_repository_staleness(self, repo) -> None:
1095
+ """Check if repository hasn't been fetched recently and warn user.
1096
+
1097
+ Args:
1098
+ repo: GitPython Repo object
1099
+ """
1100
+ try:
1101
+ repo_path = Path(repo.working_dir)
1102
+ fetch_head_path = repo_path / ".git" / "FETCH_HEAD"
1103
+
1104
+ if fetch_head_path.exists():
1105
+ # Get last fetch time from FETCH_HEAD modification time
1106
+ last_fetch_time = datetime.fromtimestamp(
1107
+ fetch_head_path.stat().st_mtime, tz=timezone.utc
1108
+ )
1109
+ now = datetime.now(timezone.utc)
1110
+ hours_since_fetch = (now - last_fetch_time).total_seconds() / 3600
1111
+
1112
+ if hours_since_fetch > 1:
1113
+ logger.warning(
1114
+ f"⏰ Repository {repo_path.name} last fetched {hours_since_fetch:.1f} hours ago. "
1115
+ f"Data may be stale."
1116
+ )
1117
+ else:
1118
+ logger.warning(
1119
+ f"⚠️ Repository {repo_path.name} has never been fetched. "
1120
+ f"Will attempt to fetch now."
1121
+ )
1122
+ except Exception as e:
1123
+ logger.debug(f"Could not check repository staleness: {e}")
1124
+
1125
+ def _check_repository_security(self, repo_path: Path, project_key: str) -> None:
1126
+ """Check for security issues in repository configuration.
1127
+
1128
+ Warns about:
1129
+ - Exposed tokens in remote URLs
1130
+ - Insecure credential storage
1131
+ """
1132
+
1133
+ try:
1134
+ # Check for tokens in remote URLs
1135
+ result = subprocess.run(
1136
+ ["git", "remote", "-v"],
1137
+ cwd=repo_path,
1138
+ capture_output=True,
1139
+ text=True,
1140
+ timeout=2,
1141
+ env={"GIT_TERMINAL_PROMPT": "0"},
1142
+ )
1143
+
1144
+ if result.returncode == 0:
1145
+ output = result.stdout
1146
+ # Check for various token patterns in URLs
1147
+ token_patterns = [
1148
+ r"https://[^@]*@", # Any HTTPS URL with embedded credentials
1149
+ r"ghp_[a-zA-Z0-9]+", # GitHub Personal Access Token
1150
+ r"ghs_[a-zA-Z0-9]+", # GitHub Server Token
1151
+ r"github_pat_[a-zA-Z0-9]+", # New GitHub PAT format
1152
+ ]
1153
+
1154
+ for pattern in token_patterns:
1155
+ import re
1156
+
1157
+ if re.search(pattern, output):
1158
+ logger.warning(
1159
+ f"⚠️ SECURITY WARNING for {project_key}: "
1160
+ f"Repository appears to have credentials in remote URL. "
1161
+ f"This is a security risk! Consider using: "
1162
+ f"1) GitHub CLI (gh auth login), "
1163
+ f"2) SSH keys, or "
1164
+ f"3) Git credential manager instead."
1165
+ )
1166
+ break
1167
+ except:
1168
+ # Don't fail analysis due to security check
1169
+ pass
1170
+
1171
+ def get_repository_status_summary(self) -> dict[str, Any]:
1172
+ """Get a summary of repository fetch status.
1173
+
1174
+ Returns:
1175
+ Dictionary with status summary including any repositories with issues
1176
+ """
1177
+ summary = {
1178
+ "total_repositories": len(self.repository_status),
1179
+ "successful_updates": 0,
1180
+ "failed_updates": 0,
1181
+ "skipped_updates": 0,
1182
+ "authentication_issues": [],
1183
+ "errors": [],
1184
+ }
1185
+
1186
+ for project_key, status in self.repository_status.items():
1187
+ if status["remote_update"] == "success":
1188
+ summary["successful_updates"] += 1
1189
+ elif status["remote_update"] == "failed":
1190
+ summary["failed_updates"] += 1
1191
+ if status.get("authentication_issues"):
1192
+ summary["authentication_issues"].append(project_key)
1193
+ elif status["remote_update"] == "skipped":
1194
+ summary["skipped_updates"] += 1
1195
+ elif status["remote_update"] == "error":
1196
+ summary["errors"].append(
1197
+ {"repository": project_key, "error": status.get("error", "Unknown error")}
1198
+ )
1199
+
1200
+ return summary
1201
+
598
1202
  def _extract_all_ticket_references(
599
1203
  self, daily_commits: dict[str, list[dict[str, Any]]]
600
1204
  ) -> set[str]:
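The repository_status dictionary populated in this file is surfaced through get_repository_status_summary above; a caller might consume it roughly as follows. The key names come from that method, while fetcher and the print wording are illustrative.

summary = fetcher.get_repository_status_summary()
print(
    f"{summary['successful_updates']}/{summary['total_repositories']} repositories updated, "
    f"{summary['failed_updates']} failed, {summary['skipped_updates']} skipped"
)
for repo_key in summary["authentication_issues"]:
    print(f"  authentication issue: {repo_key}")
for item in summary["errors"]:
    print(f"  {item['repository']}: {item['error']}")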
@@ -872,8 +1476,20 @@ class GitDataFetcher:
872
1476
  "branch": commit.get("branch", "main"),
873
1477
  "is_merge": commit.get("is_merge", False),
874
1478
  "files_changed_count": commit.get("files_changed_count", 0),
875
- "insertions": commit.get("lines_added", 0),
876
- "deletions": commit.get("lines_deleted", 0),
1479
+ # Store raw unfiltered values in insertions/deletions
1480
+ "insertions": commit.get(
1481
+ "raw_insertions", commit.get("lines_added", 0)
1482
+ ),
1483
+ "deletions": commit.get(
1484
+ "raw_deletions", commit.get("lines_deleted", 0)
1485
+ ),
1486
+ # Store filtered values separately
1487
+ "filtered_insertions": commit.get(
1488
+ "filtered_insertions", commit.get("lines_added", 0)
1489
+ ),
1490
+ "filtered_deletions": commit.get(
1491
+ "filtered_deletions", commit.get("lines_deleted", 0)
1492
+ ),
877
1493
  "story_points": commit.get("story_points"),
878
1494
  "ticket_references": commit.get("ticket_references", []),
879
1495
  }
@@ -1146,24 +1762,48 @@ class GitDataFetcher:
1146
1762
  session.close()
1147
1763
 
1148
1764
  def _calculate_commit_stats(self, commit: Any) -> dict[str, int]:
1149
- """Calculate commit statistics using reliable git diff --numstat.
1765
+ """Calculate commit statistics using reliable git diff --numstat with exclude_paths filtering.
1150
1766
 
1151
1767
  Returns:
1152
- Dictionary with 'files', 'insertions', and 'deletions' counts
1768
+ Dictionary with both raw and filtered statistics:
1769
+ - 'files', 'insertions', 'deletions': filtered counts
1770
+ - 'raw_insertions', 'raw_deletions': unfiltered counts
1771
+
1772
+ THREAD SAFETY: This method is thread-safe as it works with commit objects
1773
+ that have their own repo references.
1153
1774
  """
1154
1775
  stats = {"files": 0, "insertions": 0, "deletions": 0}
1155
1776
 
1777
+ # Track raw stats for storage
1778
+ raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
1779
+ excluded_stats = {"files": 0, "insertions": 0, "deletions": 0}
1780
+
1156
1781
  # For initial commits or commits without parents
1157
1782
  parent = commit.parents[0] if commit.parents else None
1158
1783
 
1159
1784
  try:
1160
- # Use git command directly for accurate line counts
1785
+ # THREAD SAFETY: Use the repo reference from the commit object
1786
+ # Each thread has its own commit object with its own repo reference
1161
1787
  repo = commit.repo
1162
- if parent:
1163
- diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
1164
- else:
1165
- # Initial commit - use git show with --numstat
1166
- diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
1788
+ Path(repo.working_dir)
1789
+
1790
+ def get_diff_output():
1791
+ if parent:
1792
+ return repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
1793
+ else:
1794
+ # Initial commit - use git show with --numstat
1795
+ return repo.git.show(commit.hexsha, "--numstat", "--format=")
1796
+
1797
+ # Use timeout wrapper for git diff operations
1798
+ try:
1799
+ diff_output = self.git_wrapper.run_with_timeout(
1800
+ get_diff_output,
1801
+ timeout=10, # 10 seconds for diff operations
1802
+ operation_name=f"diff_{commit.hexsha[:8]}",
1803
+ )
1804
+ except GitOperationTimeout:
1805
+ logger.warning(f"⏱️ Timeout calculating stats for commit {commit.hexsha[:8]}")
1806
+ return stats # Return zeros
1167
1807
 
1168
1808
  # Parse the numstat output: insertions\tdeletions\tfilename
1169
1809
  for line in diff_output.strip().split("\n"):
@@ -1175,9 +1815,22 @@ class GitDataFetcher:
1175
1815
  try:
1176
1816
  insertions = int(parts[0]) if parts[0] != "-" else 0
1177
1817
  deletions = int(parts[1]) if parts[1] != "-" else 0
1178
- # filename = parts[2] - we don't filter here since files_changed list is handled separately
1179
-
1180
- # Count all changes (raw stats, no filtering)
1818
+ filename = parts[2]
1819
+
1820
+ # Always count raw stats
1821
+ raw_stats["files"] += 1
1822
+ raw_stats["insertions"] += insertions
1823
+ raw_stats["deletions"] += deletions
1824
+
1825
+ # Skip excluded files based on exclude_paths patterns
1826
+ if self._should_exclude_file(filename):
1827
+ logger.debug(f"Excluding file from line counts: {filename}")
1828
+ excluded_stats["files"] += 1
1829
+ excluded_stats["insertions"] += insertions
1830
+ excluded_stats["deletions"] += deletions
1831
+ continue
1832
+
1833
+ # Count only non-excluded files and their changes
1181
1834
  stats["files"] += 1
1182
1835
  stats["insertions"] += insertions
1183
1836
  stats["deletions"] += deletions
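A standalone illustration of the git diff --numstat parsing performed above. The sample output string is made up; the format it exercises (insertions, a tab, deletions, a tab, then the path, with "-" marking binary files) is the one this method parses.

sample = "12\t3\tsrc/app.py\n-\t-\tassets/logo.png\n410\t0\tpackage-lock.json"

totals = {"files": 0, "insertions": 0, "deletions": 0}
for line in sample.strip().split("\n"):
    parts = line.split("\t")
    if len(parts) != 3:
        continue
    insertions = int(parts[0]) if parts[0] != "-" else 0   # "-" marks a binary file
    deletions = int(parts[1]) if parts[1] != "-" else 0
    totals["files"] += 1
    totals["insertions"] += insertions
    totals["deletions"] += deletions

print(totals)  # {'files': 3, 'insertions': 422, 'deletions': 3}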
@@ -1186,8 +1839,381 @@ class GitDataFetcher:
                         # Skip binary files or malformed lines
                         continue
 
+            # Log exclusion statistics if significant
+            if excluded_stats["files"] > 0 or (
+                raw_stats["insertions"] > 0 and stats["insertions"] < raw_stats["insertions"]
+            ):
+                reduction_pct = (
+                    100 * (1 - stats["insertions"] / raw_stats["insertions"])
+                    if raw_stats["insertions"] > 0
+                    else 0
+                )
+                logger.info(
+                    f"Commit {commit.hexsha[:8]}: Excluded {excluded_stats['files']} files, "
+                    f"{excluded_stats['insertions']} insertions, {excluded_stats['deletions']} deletions "
+                    f"({reduction_pct:.1f}% reduction)"
+                )
+
+            # Log if exclusions are configured
+            if self.exclude_paths and raw_stats["files"] > 0:
+                logger.debug(
+                    f"Commit {commit.hexsha[:8]}: Applied {len(self.exclude_paths)} exclusion patterns. "
+                    f"Raw: {raw_stats['files']} files, +{raw_stats['insertions']} -{raw_stats['deletions']}. "
+                    f"Filtered: {stats['files']} files, +{stats['insertions']} -{stats['deletions']}"
+                )
+
         except Exception as e:
             # Log the error for debugging but don't crash
             logger.warning(f"Error calculating commit stats for {commit.hexsha[:8]}: {e}")
 
+        # Return both raw and filtered stats
+        stats["raw_insertions"] = raw_stats["insertions"]
+        stats["raw_deletions"] = raw_stats["deletions"]
         return stats
+
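
The filtering added above boils down to one pass over `git diff --numstat` output with two sets of running totals, raw and filtered. Below is a minimal, self-contained sketch of that idea; it assumes fnmatch-style exclude patterns, whereas the package's own `_should_exclude_file` may match paths differently.

from fnmatch import fnmatch


def split_numstat(numstat_output: str, exclude_paths: list[str]) -> dict[str, int]:
    """Parse "insertions<TAB>deletions<TAB>filename" lines into raw and filtered totals."""
    totals = {"files": 0, "insertions": 0, "deletions": 0, "raw_insertions": 0, "raw_deletions": 0}
    for line in numstat_output.strip().splitlines():
        parts = line.split("\t")
        if len(parts) != 3:
            continue
        ins = int(parts[0]) if parts[0] != "-" else 0  # "-" marks binary files
        dels = int(parts[1]) if parts[1] != "-" else 0
        totals["raw_insertions"] += ins
        totals["raw_deletions"] += dels
        if any(fnmatch(parts[2], pattern) for pattern in exclude_paths):
            continue  # excluded paths are dropped from the filtered counts only
        totals["files"] += 1
        totals["insertions"] += ins
        totals["deletions"] += dels
    return totals


# A lockfile churning 5000 lines inflates the raw totals but not the filtered ones.
print(split_numstat("10\t2\tsrc/app.py\n5000\t0\tpackage-lock.json", ["package-lock.json", "vendor/*"]))

Keeping both totals lets downstream reports decide whether vendored or generated churn should count toward a developer's line counts.
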
+    def _store_day_commits_incremental(
+        self, repo_path: Path, date_str: str, commits: list[dict[str, Any]], project_key: str
+    ) -> None:
+        """Store commits for a single day incrementally to enable progress tracking.
+
+        This method stores commits immediately after fetching them for a day,
+        allowing for better progress tracking and recovery from interruptions.
+
+        Args:
+            repo_path: Path to the repository
+            date_str: Date string in YYYY-MM-DD format
+            commits: List of commit data for the day
+            project_key: Project identifier
+        """
+        try:
+            # Transform commits to cache format
+            cache_format_commits = []
+            for commit in commits:
+                cache_format_commit = {
+                    "hash": commit["commit_hash"],
+                    "author_name": commit.get("author_name", ""),
+                    "author_email": commit.get("author_email", ""),
+                    "message": commit.get("message", ""),
+                    "timestamp": commit["timestamp"],
+                    "branch": commit.get("branch", "main"),
+                    "is_merge": commit.get("is_merge", False),
+                    "files_changed_count": commit.get("files_changed_count", 0),
+                    # Store raw unfiltered values
+                    "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
+                    "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
+                    # Store filtered values
+                    "filtered_insertions": commit.get(
+                        "filtered_insertions", commit.get("lines_added", 0)
+                    ),
+                    "filtered_deletions": commit.get(
+                        "filtered_deletions", commit.get("lines_deleted", 0)
+                    ),
+                    "story_points": commit.get("story_points"),
+                    "ticket_references": commit.get("ticket_references", []),
+                }
+                cache_format_commits.append(cache_format_commit)
+
+            # Use bulk store for efficiency
+            if cache_format_commits:
+                bulk_stats = self.cache.bulk_store_commits(str(repo_path), cache_format_commits)
+                logger.debug(
+                    f"Incrementally stored {bulk_stats['inserted']} commits for {date_str} "
+                    f"({bulk_stats['skipped']} already cached)"
+                )
+        except Exception as e:
+            # Log error but don't fail - commits will be stored again in batch at the end
+            logger.warning(f"Failed to incrementally store commits for {date_str}: {e}")
+
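
The only subtle part of this cache mapping is the fallback chain for line counts: raw values fall back to the legacy lines_added/lines_deleted fields when no raw figure was recorded, and the filtered fields do the same. A trimmed sketch of just that mapping (key names as in the diff; everything else is illustrative):

from typing import Any


def to_cache_row(commit: dict[str, Any]) -> dict[str, Any]:
    # Mirrors the commit.get(...) fallback chains used above.
    return {
        "hash": commit["commit_hash"],
        "insertions": commit.get("raw_insertions", commit.get("lines_added", 0)),
        "deletions": commit.get("raw_deletions", commit.get("lines_deleted", 0)),
        "filtered_insertions": commit.get("filtered_insertions", commit.get("lines_added", 0)),
        "filtered_deletions": commit.get("filtered_deletions", commit.get("lines_deleted", 0)),
    }


# Older records that only carry lines_added/lines_deleted still produce both column sets.
row = to_cache_row({"commit_hash": "abc1234", "lines_added": 12, "lines_deleted": 3})
print(row["insertions"], row["filtered_insertions"])  # 12 12
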
+    def process_repositories_parallel(
+        self,
+        repositories: list[dict],
+        weeks_back: int = 4,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        max_workers: int = 3,
+    ) -> dict[str, Any]:
+        """Process multiple repositories in parallel with proper timeout protection.
+
+        Args:
+            repositories: List of repository configurations
+            weeks_back: Number of weeks to analyze
+            jira_integration: Optional JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            max_workers: Maximum number of parallel workers
+
+        Returns:
+            Dictionary containing processing results and statistics
+        """
+        logger.info(
+            f"🚀 Starting parallel processing of {len(repositories)} repositories with {max_workers} workers"
+        )
+
+        # Initialize statistics
+        self.processing_stats = {
+            "total": len(repositories),
+            "processed": 0,
+            "success": 0,
+            "failed": 0,
+            "timeout": 0,
+            "repositories": {},
+        }
+
+        # Get progress service for updates
+        progress = get_progress_service()
+
+        # Start heartbeat logger for monitoring
+        with HeartbeatLogger(interval=5):
+            results = {}
+
+            # Use ThreadPoolExecutor for parallel processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all repository processing tasks
+                future_to_repo = {}
+
+                for repo_config in repositories:
+                    repo_path = Path(repo_config.get("path", ""))
+                    project_key = repo_config.get("project_key", repo_path.name)
+                    branch_patterns = repo_config.get("branch_patterns")
+
+                    # Submit task with timeout wrapper
+                    future = executor.submit(
+                        self._process_repository_with_timeout,
+                        repo_path,
+                        project_key,
+                        weeks_back,
+                        branch_patterns,
+                        jira_integration,
+                        start_date,
+                        end_date,
+                    )
+                    future_to_repo[future] = {
+                        "path": repo_path,
+                        "project_key": project_key,
+                        "start_time": time.time(),
+                    }
+
+                    logger.info(f"📋 Submitted {project_key} for processing")
+
+                # Process results as they complete
+                for future in as_completed(future_to_repo):
+                    repo_info = future_to_repo[future]
+                    project_key = repo_info["project_key"]
+                    elapsed_time = time.time() - repo_info["start_time"]
+
+                    try:
+                        result = future.result(timeout=5)  # Short timeout to get result
+
+                        if result:
+                            self.processing_stats["success"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "success",
+                                "elapsed_time": elapsed_time,
+                                "commits": result.get("stats", {}).get("total_commits", 0),
+                                "tickets": result.get("stats", {}).get("unique_tickets", 0),
+                            }
+                            results[project_key] = result
+
+                            logger.info(
+                                f"✅ {project_key}: Successfully processed "
+                                f"{result['stats']['total_commits']} commits in {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(project_key, success=True)
+                                else:
+                                    progress.finish_repository(project_key, success=True)
+                        else:
+                            self.processing_stats["failed"] += 1
+                            self.processing_stats["repositories"][project_key] = {
+                                "status": "failed",
+                                "elapsed_time": elapsed_time,
+                                "error": "Processing returned no result",
+                            }
+
+                            logger.error(
+                                f"❌ {project_key}: Processing failed after {elapsed_time:.1f}s"
+                            )
+
+                            # Update progress
+                            if hasattr(progress, "finish_repository"):
+                                # Check if progress adapter supports stats parameter
+                                if hasattr(progress, "update_stats"):
+                                    progress.update_stats(
+                                        processed=self.processing_stats["processed"],
+                                        success=self.processing_stats["success"],
+                                        failed=self.processing_stats["failed"],
+                                        timeout=self.processing_stats["timeout"],
+                                        total=self.processing_stats["total"],
+                                    )
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+                                else:
+                                    progress.finish_repository(
+                                        project_key,
+                                        success=False,
+                                        error_message="Processing failed",
+                                    )
+
+                    except GitOperationTimeout:
+                        self.processing_stats["timeout"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "timeout",
+                            "elapsed_time": elapsed_time,
+                            "error": "Operation timed out",
+                        }
+
+                        logger.error(f"⏱️ {project_key}: Timed out after {elapsed_time:.1f}s")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message="Timeout"
+                                )
+
+                    except Exception as e:
+                        self.processing_stats["failed"] += 1
+                        self.processing_stats["repositories"][project_key] = {
+                            "status": "failed",
+                            "elapsed_time": elapsed_time,
+                            "error": str(e),
+                        }
+
+                        logger.error(f"❌ {project_key}: Error after {elapsed_time:.1f}s - {e}")
+
+                        # Update progress
+                        if hasattr(progress, "finish_repository"):
+                            # Check if progress adapter supports stats parameter
+                            if hasattr(progress, "update_stats"):
+                                progress.update_stats(
+                                    processed=self.processing_stats["processed"],
+                                    success=self.processing_stats["success"],
+                                    failed=self.processing_stats["failed"],
+                                    timeout=self.processing_stats["timeout"],
+                                    total=self.processing_stats["total"],
+                                )
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+                            else:
+                                progress.finish_repository(
+                                    project_key, success=False, error_message=str(e)
+                                )
+
+                    finally:
+                        # Update processed counter BEFORE logging and progress updates
+                        self.processing_stats["processed"] += 1
+
+                        # Update progress service with actual processing stats
+                        if hasattr(progress, "update_stats"):
+                            progress.update_stats(
+                                processed=self.processing_stats["processed"],
+                                success=self.processing_stats["success"],
+                                failed=self.processing_stats["failed"],
+                                timeout=self.processing_stats["timeout"],
+                                total=self.processing_stats["total"],
+                            )
+
+                        # Log progress
+                        logger.info(
+                            f"📊 Progress: {self.processing_stats['processed']}/{self.processing_stats['total']} repositories "
+                            f"(✅ {self.processing_stats['success']} | ❌ {self.processing_stats['failed']} | ⏱️ {self.processing_stats['timeout']})"
+                        )
+
+        # Final summary
+        logger.info("=" * 60)
+        logger.info("📈 PARALLEL PROCESSING SUMMARY")
+        logger.info(f" Total repositories: {self.processing_stats['total']}")
+        logger.info(f" Successfully processed: {self.processing_stats['success']}")
+        logger.info(f" Failed: {self.processing_stats['failed']}")
+        logger.info(f" Timed out: {self.processing_stats['timeout']}")
+        logger.info("=" * 60)
+
+        return {"results": results, "statistics": self.processing_stats}
+
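
Stripped of progress reporting, the orchestration added above is the standard submit / as_completed fan-out, with each future's outcome bucketed as success, failure, or timeout. A compact sketch of that pattern (the names below are illustrative stand-ins, not the package's API):

from concurrent.futures import ThreadPoolExecutor, as_completed


class OperationTimeout(Exception):
    """Stand-in for the package's GitOperationTimeout."""


def fan_out(tasks: dict, worker, max_workers: int = 3) -> dict:
    # One counter per outcome plus a per-task record, like processing_stats above.
    stats = {"success": 0, "failed": 0, "timeout": 0, "repositories": {}}
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(worker, cfg): name for name, cfg in tasks.items()}
        for future in as_completed(futures):
            name = futures[future]
            try:
                result = future.result()
                outcome = "success" if result else "failed"
                if result:
                    results[name] = result
            except OperationTimeout:
                outcome = "timeout"
            except Exception:
                outcome = "failed"
            stats[outcome] += 1
            stats["repositories"][name] = outcome
    return {"results": results, "statistics": stats}


print(fan_out({"repo-a": {}, "repo-b": {}}, lambda cfg: {"stats": {"total_commits": 0}}))

Because the per-repository worker below re-raises GitOperationTimeout, timeouts surface in this loop as their own bucket rather than being folded into generic failures.
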
+    def _process_repository_with_timeout(
+        self,
+        repo_path: Path,
+        project_key: str,
+        weeks_back: int = 4,
+        branch_patterns: Optional[list[str]] = None,
+        jira_integration: Optional[JIRAIntegration] = None,
+        start_date: Optional[datetime] = None,
+        end_date: Optional[datetime] = None,
+        timeout_per_operation: int = 30,
+    ) -> Optional[dict[str, Any]]:
+        """Process a single repository with comprehensive timeout protection.
+
+        Args:
+            repo_path: Path to the repository
+            project_key: Project identifier
+            weeks_back: Number of weeks to analyze
+            branch_patterns: Branch patterns to include
+            jira_integration: JIRA integration for ticket data
+            start_date: Optional explicit start date
+            end_date: Optional explicit end date
+            timeout_per_operation: Timeout for individual git operations
+
+        Returns:
+            Repository processing results or None if failed
+        """
+        try:
+            # Track this repository in progress
+            progress = get_progress_service()
+            if hasattr(progress, "start_repository"):
+                progress.start_repository(project_key, 0)
+
+            logger.info(f"🔍 Processing repository: {project_key} at {repo_path}")
+
+            # Use the regular fetch method but with timeout wrapper active
+            with self.git_wrapper.operation_tracker("fetch_repository_data", repo_path):
+                result = self.fetch_repository_data(
+                    repo_path=repo_path,
+                    project_key=project_key,
+                    weeks_back=weeks_back,
+                    branch_patterns=branch_patterns,
+                    jira_integration=jira_integration,
+                    progress_callback=None,  # We handle progress at a higher level
+                    start_date=start_date,
+                    end_date=end_date,
+                )
+
+            return result
+
+        except GitOperationTimeout as e:
+            logger.error(f"⏱️ Repository {project_key} processing timed out: {e}")
+            raise
+
+        except Exception as e:
+            logger.error(f"❌ Error processing repository {project_key}: {e}")
+            import traceback
+
+            logger.debug(f"Full traceback: {traceback.format_exc()}")
+            return None
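
Note that the progress object in both new methods is duck-typed: the hasattr() checks mean any adapter exposing some subset of start_repository, update_stats, and finish_repository will work. A minimal compatible adapter might look like the sketch below, written against the call sites shown in this diff rather than the actual progress-service interface definition:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("progress")


class LoggingProgressAdapter:
    """Minimal duck-typed adapter: implements only the methods the fetcher probes for."""

    def start_repository(self, project_key: str, total_items: int) -> None:
        logger.info("started %s (%d items)", project_key, total_items)

    def update_stats(self, **counts: int) -> None:
        logger.info("stats: %s", counts)

    def finish_repository(self, project_key: str, success: bool, error_message: str = "") -> None:
        status = "ok" if success else f"failed: {error_message}"
        logger.info("finished %s (%s)", project_key, status)


# Any object with this shape passes the hasattr() checks; omitting update_stats simply
# routes the calling code through the plain finish_repository branch.
adapter = LoggingProgressAdapter()
adapter.update_stats(processed=1, success=1, failed=0, timeout=0, total=3)
adapter.finish_repository("repo-a", success=True)
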