ado_git_repo_insights-1.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ado_git_repo_insights/__init__.py +3 -0
- ado_git_repo_insights/cli.py +354 -0
- ado_git_repo_insights/config.py +186 -0
- ado_git_repo_insights/extractor/__init__.py +1 -0
- ado_git_repo_insights/extractor/ado_client.py +246 -0
- ado_git_repo_insights/extractor/pr_extractor.py +239 -0
- ado_git_repo_insights/persistence/__init__.py +1 -0
- ado_git_repo_insights/persistence/database.py +193 -0
- ado_git_repo_insights/persistence/models.py +145 -0
- ado_git_repo_insights/persistence/repository.py +376 -0
- ado_git_repo_insights/transform/__init__.py +1 -0
- ado_git_repo_insights/transform/csv_generator.py +132 -0
- ado_git_repo_insights/utils/__init__.py +1 -0
- ado_git_repo_insights/utils/datetime_utils.py +101 -0
- ado_git_repo_insights/utils/logging_config.py +172 -0
- ado_git_repo_insights/utils/run_summary.py +206 -0
- ado_git_repo_insights-1.2.1.dist-info/METADATA +225 -0
- ado_git_repo_insights-1.2.1.dist-info/RECORD +22 -0
- ado_git_repo_insights-1.2.1.dist-info/WHEEL +5 -0
- ado_git_repo_insights-1.2.1.dist-info/entry_points.txt +2 -0
- ado_git_repo_insights-1.2.1.dist-info/licenses/LICENSE +21 -0
- ado_git_repo_insights-1.2.1.dist-info/top_level.txt +1 -0

ado_git_repo_insights/utils/datetime_utils.py
@@ -0,0 +1,101 @@

```python
"""Datetime utilities for ado-git-repo-insights.

Ported from the original generate_raw_data.py to ensure identical behavior.
"""

from __future__ import annotations

import logging
from datetime import datetime

logger = logging.getLogger(__name__)


def parse_iso_datetime(date_str: str | None) -> datetime | None:
    """Parse ISO 8601 datetime strings from the ADO API.

    Handles the 7-digit fractional seconds and 'Z' suffix quirks of ADO API
    responses. Preserved from the original implementation for compatibility.

    Args:
        date_str: ISO 8601 datetime string, or None.

    Returns:
        Parsed datetime, or None if parsing fails or input is None.

    Examples:
        >>> parse_iso_datetime("2024-01-15T10:30:45.1234567Z")
        datetime.datetime(2024, 1, 15, 10, 30, 45, 123456)
        >>> parse_iso_datetime(None)
    """
    if not date_str:
        return None

    try:
        # Remove trailing 'Z' (Zulu/UTC indicator)
        date_str = date_str.rstrip("Z")

        if "." in date_str:
            # ADO API sometimes returns 7-digit fractional seconds; Python only supports 6
            date_part, microseconds = date_str.split(".")
            microseconds = microseconds[:6]  # Truncate to 6 digits
            date_str = f"{date_part}.{microseconds}"
            return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f")
        else:
            # No fractional seconds
            return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")

    except ValueError as e:
        logger.warning(f"Failed to parse date '{date_str}': {e}")
        return None


def calculate_cycle_time_minutes(
    creation_date: str | None, closed_date: str | None
) -> float | None:
    """Calculate PR cycle time in minutes.

    Cycle time is the duration from PR creation to closure.
    The minimum value is 1 minute, to avoid zero/negative values.

    Args:
        creation_date: ISO 8601 creation date string.
        closed_date: ISO 8601 closed date string.

    Returns:
        Cycle time in minutes (minimum 1.0), or None if dates are invalid.

    Examples:
        >>> calculate_cycle_time_minutes(
        ...     "2024-01-15T10:00:00Z",
        ...     "2024-01-15T10:30:00Z"
        ... )
        30.0
    """
    created = parse_iso_datetime(creation_date)
    closed = parse_iso_datetime(closed_date)

    if created and closed:
        delta_seconds = (closed - created).total_seconds()
        minutes = delta_seconds / 60
        # Minimum 1 minute, rounded to 2 decimal places
        return max(1.0, round(minutes, 2))

    return None


def format_date_for_api(dt: datetime) -> str:
    """Format a datetime for ADO API queries.

    Args:
        dt: Datetime to format.

    Returns:
        ISO 8601 formatted string with 'Z' suffix.

    Examples:
        >>> format_date_for_api(datetime(2024, 1, 15, 10, 30, 0))
        '2024-01-15T10:30:00Z'
    """
    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
```

ado_git_repo_insights/utils/logging_config.py
@@ -0,0 +1,172 @@

```python
"""Logging configuration with selective secret redaction.

Provides console and JSONL logging formats with precise redaction rules.
"""

from __future__ import annotations

import json
import logging
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any


@dataclass
class RedactionConfig:
    """Configuration for selective secret redaction."""

    # Known secret value patterns (regex)
    value_patterns: list[str] = field(
        default_factory=lambda: [
            r"[A-Za-z0-9]{52}",  # Azure DevOps PAT format (52 chars)
            r"Bearer\s+[A-Za-z0-9\-._~+/]+=*",  # Bearer tokens
        ]
    )

    # Explicit key deny-list (exact matches, case-insensitive)
    key_denylist: set[str] = field(
        default_factory=lambda: {
            "pat",
            "personal_access_token",
            "auth_header",
            "authorization",
            "webhook_url",
            "secret",
            "password",
        }
    )

    def should_redact_key(self, key: str) -> bool:
        """Check whether a key should be redacted based on the deny-list."""
        return key.lower() in self.key_denylist

    def redact_value(self, value: str) -> str:
        """Redact known secret patterns in a value."""
        result = value
        for pattern in self.value_patterns:
            result = re.sub(pattern, "***REDACTED***", result)
        return result


class RedactingFormatter(logging.Formatter):
    """Formatter that redacts sensitive information."""

    def __init__(self, fmt: str | None = None, datefmt: str | None = None) -> None:
        super().__init__(fmt, datefmt)
        self.redaction_config = RedactionConfig()

    def format(self, record: logging.LogRecord) -> str:
        # Redact the message
        if isinstance(record.msg, str):
            record.msg = self.redaction_config.redact_value(record.msg)

        # Redact args
        if record.args:
            record.args = tuple(
                self.redaction_config.redact_value(str(arg))
                if isinstance(arg, str)
                else arg
                for arg in record.args
            )

        return super().format(record)


class JsonlHandler(logging.Handler):
    """Handler that writes structured JSONL log entries with redaction."""

    def __init__(self, log_file: Path) -> None:
        super().__init__()
        self.log_file = log_file
        self.redaction_config = RedactionConfig()

        # Set a basic formatter for timestamp formatting
        self.setFormatter(logging.Formatter())

        # Ensure the parent directory exists
        self.log_file.parent.mkdir(parents=True, exist_ok=True)

    def emit(self, record: logging.LogRecord) -> None:
        try:
            # P1 Fix: Redact the message before writing to JSONL
            message = record.getMessage()
            redacted_message = self.redaction_config.redact_value(message)

            log_entry: dict[str, Any] = {
                "timestamp": self.formatter.formatTime(record)
                if self.formatter
                else "",
                "level": record.levelname,
                "logger": record.name,
                "message": redacted_message,
            }

            # Add extra fields if present (context dict)
            if hasattr(record, "extra") and isinstance(record.extra, dict):
                log_entry["context"] = self._redact_dict(record.extra)

            with self.log_file.open("a", encoding="utf-8") as f:
                f.write(json.dumps(log_entry) + "\n")

        except Exception:
            self.handleError(record)

    def _redact_dict(self, data: dict[str, Any]) -> dict[str, Any]:
        """Recursively redact sensitive keys/values in a dictionary."""
        result: dict[str, Any] = {}
        for key, value in data.items():
            if self.redaction_config.should_redact_key(key):
                result[key] = "***REDACTED***"
            elif isinstance(value, str):
                result[key] = self.redaction_config.redact_value(value)
            elif isinstance(value, dict):
                result[key] = self._redact_dict(value)  # Recursive call
            else:
                result[key] = value
        return result


@dataclass
class LoggingConfig:
    """Configuration for logging setup."""

    format: str = "console"  # "console" or "jsonl"
    artifacts_dir: Path = field(default_factory=lambda: Path("run_artifacts"))
    log_file: Path | None = None


def setup_logging(config: LoggingConfig) -> None:
    """Configure logging based on format selection.

    Args:
        config: Logging configuration.
    """
    # Get the root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Remove existing handlers
    root_logger.handlers.clear()

    if config.format == "console":
        # Console handler with redaction
        handler = logging.StreamHandler()
        formatter = RedactingFormatter(
            "%(asctime)s - %(levelname)s - %(message)s",
        )
        handler.setFormatter(formatter)
        root_logger.addHandler(handler)

    elif config.format == "jsonl":
        # JSONL file handler with redaction
        if config.log_file is None:
            config.log_file = config.artifacts_dir / f"run_{os.getpid()}.log.jsonl"

        jsonl_handler: logging.Handler = JsonlHandler(config.log_file)
        root_logger.addHandler(jsonl_handler)

    else:
        raise ValueError(f"Invalid log format: {config.format}")
```

ado_git_repo_insights/utils/run_summary.py
@@ -0,0 +1,206 @@

```python
"""Run summary tracking with enriched error diagnostics.

Captures comprehensive run telemetry including per-project status and first fatal error.
"""
# ruff: noqa: S603, S607

from __future__ import annotations

import json
import os
import re
import subprocess
from dataclasses import dataclass, field
from datetime import date
from pathlib import Path
from typing import Any, Literal


def normalize_error_message(error: str, max_length: int = 500) -> str:
    """Normalize and bound error messages to prevent secret leakage.

    Args:
        error: Raw error message.
        max_length: Maximum length for the bounded message.

    Returns:
        Normalized error message.
    """
    # Strip URLs with query strings (can contain secrets)
    error = re.sub(r"https?://[^\s]+\?[^\s]+", "[URL_WITH_PARAMS]", error)

    # Strip full URLs (can contain hostnames/paths)
    error = re.sub(r"https?://[^\s]+", "[URL]", error)

    # Truncate to max length
    if len(error) > max_length:
        error = error[:max_length] + "...[truncated]"

    return error


@dataclass
class RunCounts:
    """Counts of extracted/generated items."""

    prs_fetched: int = 0
    prs_updated: int = 0
    rows_per_csv: dict[str, int] = field(default_factory=dict)


@dataclass
class RunTimings:
    """Timing information for run phases."""

    total_seconds: float = 0.0
    extract_seconds: float = 0.0
    persist_seconds: float = 0.0
    export_seconds: float = 0.0


@dataclass
class RunSummary:
    """Comprehensive run summary with forensic diagnostics."""

    tool_version: str
    git_sha: str | None
    organization: str
    projects: list[str]
    date_range_start: str  # ISO format date
    date_range_end: str  # ISO format date
    counts: RunCounts
    timings: RunTimings
    warnings: list[str]
    final_status: Literal["success", "failed"]
    per_project_status: dict[str, str] = field(default_factory=dict)
    first_fatal_error: str | None = None

    def __post_init__(self) -> None:
        """Normalize the error message on initialization."""
        if self.first_fatal_error:
            self.first_fatal_error = normalize_error_message(self.first_fatal_error)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary for JSON serialization."""
        return {
            "tool_version": self.tool_version,
            "git_sha": self.git_sha,
            "organization": self.organization,
            "projects": self.projects,
            "date_range": {
                "start": self.date_range_start,
                "end": self.date_range_end,
            },
            "counts": {
                "prs_fetched": self.counts.prs_fetched,
                "prs_updated": self.counts.prs_updated,
                "rows_per_csv": self.counts.rows_per_csv,
            },
            "timings": {
                "total_seconds": self.timings.total_seconds,
                "extract_seconds": self.timings.extract_seconds,
                "persist_seconds": self.timings.persist_seconds,
                "export_seconds": self.timings.export_seconds,
            },
            "warnings": self.warnings,
            "final_status": self.final_status,
            "per_project_status": self.per_project_status,
            "first_fatal_error": self.first_fatal_error,
        }

    def write(self, path: Path) -> None:
        """Write the summary to a JSON file.

        Args:
            path: Path to write the summary file.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

    def print_final_line(self) -> None:
        """Print a one-line summary to stdout."""
        status_symbol = "✓" if self.final_status == "success" else "✗"
        print(
            f"{status_symbol} {self.final_status.upper()}: "
            f"{self.counts.prs_fetched} PRs extracted, "
            f"{len(self.counts.rows_per_csv)} CSVs written "
            f"({self.timings.total_seconds:.1f}s)"
        )

    def emit_ado_commands(self) -> None:
        """Emit Azure Pipelines logging commands."""
        # Only emit if running in Azure Pipelines
        if os.environ.get("TF_BUILD") != "true":
            return

        if self.final_status == "failed":
            if self.first_fatal_error:
                print(f"##vso[task.logissue type=error]{self.first_fatal_error}")
            print("##vso[task.complete result=Failed]")
        elif self.warnings:
            for warning in self.warnings:
                print(f"##vso[task.logissue type=warning]{warning}")


def get_tool_version() -> str:
    """Get the tool version from the VERSION file."""
    version_file = Path(__file__).parent.parent.parent.parent / "VERSION"
    if version_file.exists():
        return version_file.read_text().strip()
    return "unknown"


def get_git_sha() -> str | None:
    """Get the Git SHA from the VERSION file or the git command.

    Returns:
        Git SHA, or None if unavailable.
    """
    # Try the VERSION file first
    version_file = Path(__file__).parent.parent.parent.parent / "VERSION"
    if version_file.exists():
        version = version_file.read_text().strip()
        if "+" in version:  # Version format like "1.0.7+8d88fb4"
            return version.split("+")[1]

    # Fall back to the git command
    try:
        result = subprocess.run(  # noqa: S603, S607
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True,
            text=True,
            check=True,
            timeout=5,
        )
        return result.stdout.strip()
    except Exception:
        return None


def create_minimal_summary(
    error_message: str,
    artifacts_dir: Path = Path("run_artifacts"),
) -> RunSummary:
    """Create a partial summary for early failures.

    Args:
        error_message: Error message describing the failure.
        artifacts_dir: Directory for artifacts.

    Returns:
        Minimal RunSummary with failure status.
    """
    return RunSummary(
        tool_version=get_tool_version(),
        git_sha=get_git_sha(),
        organization="unknown",
        projects=[],
        date_range_start=str(date.today()),
        date_range_end=str(date.today()),
        counts=RunCounts(),
        timings=RunTimings(),
        warnings=[],
        final_status="failed",
        first_fatal_error=normalize_error_message(error_message),
    )
```

ado_git_repo_insights-1.2.1.dist-info/METADATA
@@ -0,0 +1,225 @@

```text
Metadata-Version: 2.4
Name: ado-git-repo-insights
Version: 1.2.1
Summary: Extract Azure DevOps Pull Request metrics to SQLite and generate PowerBI-compatible CSVs.
Author-email: "Odd Essentials, LLC" <admin@oddessentials.com>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: requests>=2.28.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: pandas>=2.0.0
Requires-Dist: azure-storage-blob>=12.0.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0; extra == "dev"
Requires-Dist: pytest-cov>=4.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.0; extra == "dev"
Requires-Dist: pre-commit>=3.0; extra == "dev"
Requires-Dist: types-requests>=2.28.0; extra == "dev"
Requires-Dist: types-PyYAML>=6.0; extra == "dev"
Requires-Dist: pandas-stubs>=2.0.0; extra == "dev"
Dynamic: license-file
```

# ado-git-repo-insights


[](https://codecov.io/gh/oddessentials/ado-git-repo-insights)



Extract Azure DevOps Pull Request metrics to SQLite and generate PowerBI-compatible CSVs.

## Overview

This tool replaces the MongoDB-based `ado-pull-request-metrics` with a lightweight, file-based solution that:

- **Stores data in SQLite** - No external database required
- **Runs as an Azure DevOps Pipeline Task** - Scheduled daily extraction
- **Preserves the PowerBI CSV contract** - Same filenames, columns, and ordering
- **Supports incremental + backfill extraction** - Efficient daily updates with periodic convergence

## Quick Start

### Installation

```bash
pip install ado-git-repo-insights
```

## Usage Options

This tool provides **two ways** to extract Azure DevOps Pull Request metrics:

| Aspect | CLI (Option 1) | Extension (Option 2) |
|--------|----------------|----------------------|
| **Requires Python** | Yes | No (bundled) |
| **Installation** | `pip install` | Upload VSIX to ADO |
| **Pipeline syntax** | Script steps | Task step |
| **Works outside ADO** | Yes | No (ADO only) |
| **Flexibility** | Higher | Standard |

### Option 1: Python CLI

Best for users comfortable with Python/pip, custom scripts, and non-ADO CI/CD systems.

#### First Run (Extract Data)

```bash
ado-insights extract \
  --organization MyOrg \
  --projects "ProjectOne,ProjectTwo" \
  --pat $ADO_PAT \
  --database ./ado-insights.sqlite
```

> **Note**: The end date defaults to yesterday (to avoid incomplete data).
> To include today: `--end-date $(date +%Y-%m-%d)` (Bash) or `--end-date (Get-Date -Format yyyy-MM-dd)` (PowerShell).

#### Generate CSVs

```bash
ado-insights generate-csv \
  --database ./ado-insights.sqlite \
  --output ./csv_output
```

#### Backfill Mode (Weekly Convergence)

```bash
ado-insights extract \
  --organization MyOrg \
  --projects "ProjectOne,ProjectTwo" \
  --pat $ADO_PAT \
  --database ./ado-insights.sqlite \
  --backfill-days 60
```

### Option 2: Azure DevOps Extension

Best for teams that prefer the ADO pipeline editor UI or want a self-contained task without managing Python dependencies.

```yaml
steps:
  - task: ExtractPullRequests@1
    inputs:
      organization: 'MyOrg'
      projects: 'Project1,Project2'
      pat: '$(PAT_SECRET)'
      database: '$(Pipeline.Workspace)/data/ado-insights.sqlite'
      outputDir: '$(Pipeline.Workspace)/csv_output'
```

**Installation:**
1. Download the `.vsix` from [GitHub Releases](https://github.com/oddessentials/ado-git-repo-insights/releases)
2. Install it in your ADO organization: Organization Settings → Extensions → Browse local extensions

## Configuration

Create a `config.yaml` file:

```yaml
organization: MyOrg

projects:
  - ProjectOne
  - ProjectTwo
  - Project%20Three  # URL-encoded names supported

api:
  base_url: https://dev.azure.com
  version: 7.1-preview.1
  rate_limit_sleep_seconds: 0.5
  max_retries: 3
  retry_delay_seconds: 5
  retry_backoff_multiplier: 2.0

backfill:
  enabled: true
  window_days: 60
```

Then run:

```bash
ado-insights extract --config config.yaml --pat $ADO_PAT
```

## Azure DevOps Pipeline Integration

See [sample-pipeline.yml](sample-pipeline.yml) for a complete example.

### Scheduled Daily Extraction

```yaml
schedules:
  - cron: "0 6 * * *"  # Daily at 6 AM UTC
    displayName: "Daily PR Extraction"
    branches:
      include: [main]
    always: true
```

### Weekly Backfill

```yaml
schedules:
  - cron: "0 6 * * 0"  # Weekly on Sunday
    displayName: "Weekly Backfill"
    branches:
      include: [main]
    always: true
```

## CSV Output Contract

The following CSVs are generated with an **exact schema and column order** for PowerBI compatibility:

| File | Columns |
|------|---------|
| `organizations.csv` | `organization_name` |
| `projects.csv` | `organization_name`, `project_name` |
| `repositories.csv` | `repository_id`, `repository_name`, `project_name`, `organization_name` |
| `pull_requests.csv` | `pull_request_uid`, `pull_request_id`, `organization_name`, `project_name`, `repository_id`, `user_id`, `title`, `status`, `description`, `creation_date`, `closed_date`, `cycle_time_minutes` |
| `users.csv` | `user_id`, `display_name`, `email` |
| `reviewers.csv` | `pull_request_uid`, `user_id`, `vote`, `repository_id` |
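
Because the filenames and column order are fixed, these CSVs are also easy to consume outside PowerBI. A minimal pandas sketch (paths assume the `generate-csv` example above; `pandas` is already a runtime dependency):

```python
import pandas as pd

prs = pd.read_csv("csv_output/pull_requests.csv")
users = pd.read_csv("csv_output/users.csv")

# Median cycle time per author, joined on the documented user_id key.
per_author = (
    prs.merge(users, on="user_id")
    .groupby("display_name")["cycle_time_minutes"]
    .median()
    .sort_values()
)
print(per_author)
```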

## Governance

This project is governed by authoritative documents in `agents/`:

- [INVARIANTS.md](agents/INVARIANTS.md) - 25 non-negotiable invariants
- [definition-of-done.md](agents/definition-of-done.md) - Completion criteria
- [victory-gates.md](agents/victory-gates.md) - Verification gates

## Development

```bash
# Setup
python -m venv .venv
source .venv/bin/activate  # or .venv\Scripts\activate on Windows
pip install -e .[dev]

# Lint + Format
ruff check .
ruff format .

# Type Check
mypy src/

# Test
pytest
```

## License

MIT