gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/__init__.py +11 -11
- gitflow_analytics/_version.py +2 -2
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4490 -378
- gitflow_analytics/cli_rich.py +503 -0
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +904 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +441 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -398
- gitflow_analytics/core/analyzer.py +1320 -172
- gitflow_analytics/core/branch_mapper.py +132 -132
- gitflow_analytics/core/cache.py +1554 -175
- gitflow_analytics/core/data_fetcher.py +1193 -0
- gitflow_analytics/core/identity.py +571 -185
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/base.py +13 -11
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +77 -59
- gitflow_analytics/extractors/tickets.py +841 -89
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +258 -87
- gitflow_analytics/integrations/jira_integration.py +572 -123
- gitflow_analytics/integrations/orchestrator.py +206 -82
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +542 -179
- gitflow_analytics/models/database.py +986 -59
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +29 -0
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
- gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
- gitflow_analytics/qualitative/core/__init__.py +13 -0
- gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
- gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
- gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
- gitflow_analytics/qualitative/core/processor.py +673 -0
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +25 -0
- gitflow_analytics/qualitative/models/schemas.py +306 -0
- gitflow_analytics/qualitative/utils/__init__.py +13 -0
- gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
- gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
- gitflow_analytics/qualitative/utils/metrics.py +361 -0
- gitflow_analytics/qualitative/utils/text_processing.py +285 -0
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +550 -18
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1700 -216
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2289 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +5 -0
- gitflow_analytics/tui/app.py +724 -0
- gitflow_analytics/tui/screens/__init__.py +8 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
- gitflow_analytics/tui/screens/configuration_screen.py +523 -0
- gitflow_analytics/tui/screens/loading_screen.py +348 -0
- gitflow_analytics/tui/screens/main_screen.py +321 -0
- gitflow_analytics/tui/screens/results_screen.py +722 -0
- gitflow_analytics/tui/widgets/__init__.py +7 -0
- gitflow_analytics/tui/widgets/data_table.py +255 -0
- gitflow_analytics/tui/widgets/export_modal.py +301 -0
- gitflow_analytics/tui/widgets/progress_widget.py +187 -0
- gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
- gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/integrations/github_integration.py
@@ -1,167 +1,338 @@
 """GitHub API integration for PR and issue enrichment."""
+
 import time
 from datetime import datetime, timezone
-from typing import Any,
+from typing import Any, Optional
 
 from github import Github
 from github.GithubException import RateLimitExceededException, UnknownObjectException
 
 from ..core.cache import GitAnalysisCache
+from ..core.schema_version import create_schema_manager
 
 
 class GitHubIntegration:
     """Integrate with GitHub API for PR and issue data."""
-
-    def __init__(
-
-
+
+    def __init__(
+        self,
+        token: str,
+        cache: GitAnalysisCache,
+        rate_limit_retries: int = 3,
+        backoff_factor: int = 2,
+        allowed_ticket_platforms: Optional[list[str]] = None,
+    ):
         """Initialize GitHub integration."""
         self.github = Github(token)
         self.cache = cache
         self.rate_limit_retries = rate_limit_retries
         self.backoff_factor = backoff_factor
         self.allowed_ticket_platforms = allowed_ticket_platforms
-
-
-
-
+
+        # Initialize schema version manager for incremental API data fetching
+        self.schema_manager = create_schema_manager(cache.cache_dir)
+
+    def _get_incremental_fetch_date(
+        self, component: str, requested_since: datetime, config: dict[str, Any]
+    ) -> datetime:
+        """Determine the actual fetch date based on schema versioning."""
+        # Ensure requested_since is timezone-aware
+        if requested_since.tzinfo is None:
+            requested_since = requested_since.replace(tzinfo=timezone.utc)
+
+        # Check if schema has changed
+        if self.schema_manager.has_schema_changed(component, config):
+            print(
+                f" 🔄 {component.title()} API schema changed, fetching all data since {requested_since}"
+            )
+            return requested_since
+
+        # Get last processed date
+        last_processed = self.schema_manager.get_last_processed_date(component)
+        if not last_processed:
+            print(f" 📥 First {component} API fetch, getting data since {requested_since}")
+            return requested_since
+
+        # Ensure last_processed is timezone-aware
+        if last_processed.tzinfo is None:
+            last_processed = last_processed.replace(tzinfo=timezone.utc)
+
+        # Use the later of the two dates (don't go backwards)
+        fetch_since = max(last_processed, requested_since)
+
+        if fetch_since > requested_since:
+            print(f" ⚡ {component.title()} incremental fetch since {fetch_since}")
+        else:
+            print(f" 📥 {component.title()} full fetch since {requested_since}")
+
+        return fetch_since
+
+    def enrich_repository_with_prs(
+        self, repo_name: str, commits: list[dict[str, Any]], since: datetime
+    ) -> list[dict[str, Any]]:
+        """Enrich repository commits with PR data using incremental fetching."""
         try:
             repo = self.github.get_repo(repo_name)
         except UnknownObjectException:
             print(f" ⚠️ GitHub repo not found: {repo_name}")
             return []
-
-        #
-
-
+
+        # Check if we need to fetch new PR data
+        github_config = {
+            "rate_limit_retries": self.rate_limit_retries,
+            "backoff_factor": self.backoff_factor,
+            "allowed_ticket_platforms": self.allowed_ticket_platforms,
+        }
+
+        # Determine the actual start date for fetching
+        fetch_since = self._get_incremental_fetch_date("github", since, github_config)
+
+        # Check cache first for existing PRs in this time period
+        cached_prs_data = self._get_cached_prs_bulk(repo_name, fetch_since)
+
+        # Get PRs for the time period (may be incremental)
+        prs = self._get_pull_requests(repo, fetch_since)
+
+        # Track cache performance
+        cached_pr_numbers = {pr["number"] for pr in cached_prs_data}
+        new_prs = [pr for pr in prs if pr.number not in cached_pr_numbers]
+        cache_hits = len(cached_prs_data)
+        cache_misses = len(new_prs)
+
+        if cache_hits > 0 or cache_misses > 0:
+            print(
+                f" 📊 GitHub PR cache: {cache_hits} hits, {cache_misses} misses ({cache_hits/(cache_hits+cache_misses)*100:.1f}% hit rate)"
+                if (cache_hits + cache_misses) > 0
+                else ""
+            )
+
+        # Update schema tracking after successful fetch
+        if prs:
+            self.schema_manager.mark_date_processed("github", since, github_config)
+
+        # Process new PRs and cache them
+        new_pr_data = []
+        for pr in new_prs:
+            pr_data = self._extract_pr_data(pr)
+            new_pr_data.append(pr_data)
+
+        # Bulk cache new PR data
+        if new_pr_data:
+            self._cache_prs_bulk(repo_name, new_pr_data)
+            print(f" 💾 Cached {len(new_pr_data)} new GitHub PRs")
+
+        # Combine cached and new PR data
+        all_pr_data = cached_prs_data + new_pr_data
+
         # Build commit to PR mapping
         commit_to_pr = {}
-        for
-
-
-
-
-
-            # Map commits to this PR
-            for commit in pr.get_commits():
-                commit_to_pr[commit.sha] = pr_data
-
+        for pr_data in all_pr_data:
+            # Map commits to this PR (need to get commit hashes from cached data)
+            for commit_hash in pr_data.get("commit_hashes", []):
+                commit_to_pr[commit_hash] = pr_data
+
         # Enrich commits with PR data
         enriched_prs = []
         for commit in commits:
-            if commit[
-                pr_data = commit_to_pr[commit[
-
+            if commit["hash"] in commit_to_pr:
+                pr_data = commit_to_pr[commit["hash"]]
+
                 # Use PR story points if commit doesn't have them
-                if not commit.get(
-                    commit[
-
+                if not commit.get("story_points") and pr_data.get("story_points"):
+                    commit["story_points"] = pr_data["story_points"]
+
                 # Add PR reference
-                commit[
-                commit[
-
+                commit["pr_number"] = pr_data["number"]
+                commit["pr_title"] = pr_data["title"]
+
                 # Add to PR list if not already there
                 if pr_data not in enriched_prs:
                     enriched_prs.append(pr_data)
-
+
         return enriched_prs
-
-    def
+
+    def _get_cached_prs_bulk(self, repo_name: str, since: datetime) -> list[dict[str, Any]]:
+        """Get cached PRs for a repository from the given date onwards.
+
+        WHY: Bulk PR cache lookups avoid redundant GitHub API calls and
+        significantly improve performance on repeated analysis runs.
+
+        Args:
+            repo_name: GitHub repository name (e.g., "owner/repo")
+            since: Only return PRs merged after this date
+
+        Returns:
+            List of cached PR data dictionaries
+        """
+        cached_prs = []
+        with self.cache.get_session() as session:
+            from ..models.database import PullRequestCache
+
+            # Ensure since is timezone-aware for comparison
+            if since.tzinfo is None:
+                since = since.replace(tzinfo=timezone.utc)
+
+            cached_results = (
+                session.query(PullRequestCache)
+                .filter(
+                    PullRequestCache.repo_path == repo_name,
+                    PullRequestCache.merged_at >= since.replace(tzinfo=None), # Store as naive UTC
+                )
+                .all()
+            )
+
+            for cached_pr in cached_results:
+                if not self._is_pr_stale(cached_pr.cached_at):
+                    pr_data = {
+                        "number": cached_pr.pr_number,
+                        "title": cached_pr.title or "",
+                        "description": cached_pr.description or "",
+                        "author": cached_pr.author or "",
+                        "created_at": cached_pr.created_at,
+                        "merged_at": cached_pr.merged_at,
+                        "story_points": cached_pr.story_points or 0,
+                        "labels": cached_pr.labels or [],
+                        "commit_hashes": cached_pr.commit_hashes or [],
+                        "ticket_references": [], # Would need additional extraction
+                        "review_comments": 0, # Not stored in current schema
+                        "changed_files": 0, # Not stored in current schema
+                        "additions": 0, # Not stored in current schema
+                        "deletions": 0, # Not stored in current schema
+                    }
+                    cached_prs.append(pr_data)
+
+        return cached_prs
+
+    def _cache_prs_bulk(self, repo_name: str, prs: list[dict[str, Any]]) -> None:
+        """Cache multiple PRs in bulk for better performance.
+
+        WHY: Bulk caching is more efficient than individual cache operations,
+        reducing database overhead when caching many PRs from GitHub API.
+
+        Args:
+            repo_name: GitHub repository name
+            prs: List of PR data dictionaries to cache
+        """
+        if not prs:
+            return
+
+        for pr_data in prs:
+            # Use existing cache_pr method which handles upserts properly
+            self.cache.cache_pr(repo_name, pr_data)
+
+    def _is_pr_stale(self, cached_at: datetime) -> bool:
+        """Check if cached PR data is stale based on cache TTL.
+
+        Args:
+            cached_at: When the PR was cached
+
+        Returns:
+            True if stale and should be refreshed, False if still fresh
+        """
+        from datetime import timedelta
+
+        if self.cache.ttl_hours == 0: # No expiration
+            return False
+
+        stale_threshold = datetime.utcnow() - timedelta(hours=self.cache.ttl_hours)
+        return cached_at < stale_threshold
+
+    def _get_pull_requests(self, repo, since: datetime) -> list[Any]:
         """Get pull requests with rate limit handling."""
         prs = []
-
+
         # Ensure since is timezone-aware for comparison with GitHub's timezone-aware datetimes
         if since.tzinfo is None:
             since = since.replace(tzinfo=timezone.utc)
-
+
         for attempt in range(self.rate_limit_retries):
             try:
                 # Get all PRs updated since the date
-                for pr in repo.get_pulls(state=
+                for pr in repo.get_pulls(state="all", sort="updated", direction="desc"):
                     if pr.updated_at < since:
                         break
-
+
                     # Only include PRs that were merged in our time period
                     if pr.merged and pr.merged_at >= since:
                         prs.append(pr)
-
+
                 return prs
-
+
             except RateLimitExceededException:
                 if attempt < self.rate_limit_retries - 1:
-                    wait_time = self.backoff_factor
+                    wait_time = self.backoff_factor**attempt
                     print(f" ⏳ GitHub rate limit hit, waiting {wait_time}s...")
                     time.sleep(wait_time)
                 else:
                     print(" ❌ GitHub rate limit exceeded, skipping PR enrichment")
                     return []
-
+
         return prs
-
-    def _extract_pr_data(self, pr) ->
+
+    def _extract_pr_data(self, pr) -> dict[str, Any]:
         """Extract relevant data from a GitHub PR object."""
         from ..extractors.story_points import StoryPointExtractor
         from ..extractors.tickets import TicketExtractor
-
+
         sp_extractor = StoryPointExtractor()
         ticket_extractor = TicketExtractor(allowed_platforms=self.allowed_ticket_platforms)
-
+
         # Extract story points from PR title and body
         pr_text = f"{pr.title} {pr.body or ''}"
         story_points = sp_extractor.extract_from_text(pr_text)
-
+
         # Extract ticket references
         tickets = ticket_extractor.extract_from_text(pr_text)
-
+
         # Get commit SHAs
         commit_hashes = [c.sha for c in pr.get_commits()]
-
+
         return {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            "number": pr.number,
+            "title": pr.title,
+            "description": pr.body,
+            "author": pr.user.login,
+            "created_at": pr.created_at,
+            "merged_at": pr.merged_at,
+            "story_points": story_points,
+            "labels": [label.name for label in pr.labels],
+            "commit_hashes": commit_hashes,
+            "ticket_references": tickets,
+            "review_comments": pr.review_comments,
+            "changed_files": pr.changed_files,
+            "additions": pr.additions,
+            "deletions": pr.deletions,
         }
-
-    def calculate_pr_metrics(self, prs:
+
+    def calculate_pr_metrics(self, prs: list[dict[str, Any]]) -> dict[str, Any]:
         """Calculate PR-level metrics."""
         if not prs:
             return {
-
-
-
-
+                "avg_pr_size": 0,
+                "avg_pr_lifetime_hours": 0,
+                "avg_files_per_pr": 0,
+                "total_review_comments": 0,
             }
-
-        total_size = sum(pr[
-        total_files = sum(pr.get(
-        total_comments = sum(pr.get(
-
+
+        total_size = sum(pr["additions"] + pr["deletions"] for pr in prs)
+        total_files = sum(pr.get("changed_files", 0) for pr in prs)
+        total_comments = sum(pr.get("review_comments", 0) for pr in prs)
+
         # Calculate average PR lifetime
        lifetimes = []
         for pr in prs:
-            if pr.get(
-                lifetime = (pr[
+            if pr.get("merged_at") and pr.get("created_at"):
+                lifetime = (pr["merged_at"] - pr["created_at"]).total_seconds() / 3600
                 lifetimes.append(lifetime)
-
+
         avg_lifetime = sum(lifetimes) / len(lifetimes) if lifetimes else 0
-
+
         return {
-
-
-
-
-
-
-
-        }
+            "total_prs": len(prs),
+            "avg_pr_size": total_size / len(prs),
+            "avg_pr_lifetime_hours": avg_lifetime,
+            "avg_files_per_pr": total_files / len(prs),
+            "total_review_comments": total_comments,
+            "prs_with_story_points": sum(1 for pr in prs if pr.get("story_points")),
+            "story_point_coverage": sum(1 for pr in prs if pr.get("story_points")) / len(prs) * 100,
+        }