ado-git-repo-insights 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ """Azure DevOps REST API client.
2
+
3
+ Implements pagination (continuation tokens), bounded retry with exponential backoff,
4
+ and fail-fast on partial failures per Invariants 12-13 and Adjustment 4.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import logging
11
+ import time
12
+ from collections.abc import Iterator
13
+ from dataclasses import dataclass
14
+ from datetime import date, timedelta
15
+ from typing import Any
16
+
17
+ import requests
18
+ from requests.exceptions import HTTPError, RequestException
19
+
20
+ from ..config import APIConfig
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ExtractionError(Exception):
    """Raised when extraction fails; aborts the whole run (Invariant 7, Adjustment 4)."""
27
+
28
+
29
@dataclass
class ExtractionStats:
    """Counters accumulated over a single extraction run."""

    # Total pull requests yielded across all dates.
    total_prs: int = 0
    # API pages fetched, including continuation pages.
    pages_fetched: int = 0
    # Failed request attempts that entered the retry path.
    retries_used: int = 0
36
+
37
+
38
class ADOClient:
    """Azure DevOps REST API client with pagination, retry, and rate limiting.

    Invariant 12: Pagination must be complete (continuation tokens).
    Invariant 13: Retries must be bounded and predictable.
    Adjustment 4: Partial failures fail the run.
    """

    def __init__(self, organization: str, pat: str, config: APIConfig) -> None:
        """Initialize the ADO client.

        Args:
            organization: Azure DevOps organization name.
            pat: Personal Access Token with Code (Read) scope.
            config: API configuration settings.
        """
        self.organization = organization
        self.base_url = f"{config.base_url}/{organization}"
        self.config = config
        self.headers = self._build_auth_headers(pat)
        self.stats = ExtractionStats()

    def _build_auth_headers(self, pat: str) -> dict[str, str]:
        """Build authorization headers for the ADO API.

        Args:
            pat: Personal Access Token.

        Returns:
            Headers dict with Basic auth.
        """
        # Invariant 19: the PAT is only base64-encoded into the header, never logged.
        encoded = base64.b64encode(f":{pat}".encode()).decode()
        return {
            "Authorization": f"Basic {encoded}",
            "Content-Type": "application/json",
        }

    def get_pull_requests(
        self,
        project: str,
        start_date: date,
        end_date: date,
    ) -> Iterator[dict[str, Any]]:
        """Fetch completed PRs for a date range with automatic pagination.

        Adjustment 4: Handles continuation tokens, bounded retries with backoff.
        Raises on partial failures (deterministic failure over silent partial success).

        Args:
            project: Project name.
            start_date: Start of date range (inclusive).
            end_date: End of date range (inclusive).

        Yields:
            PR data dictionaries.

        Raises:
            ExtractionError: If extraction fails for any date.
        """
        current_date = start_date
        while current_date <= end_date:
            try:
                prs = self._fetch_prs_for_date_paginated(project, current_date)
                yield from prs
            except ExtractionError as e:
                # Fail the entire run on any date failure (Adjustment 4).
                raise ExtractionError(
                    f"Failed extracting {project} on {current_date}: {e}"
                ) from e

            # Simple client-side rate limiting between per-day fetches.
            time.sleep(self.config.rate_limit_sleep_seconds)
            current_date += timedelta(days=1)

    def _fetch_prs_for_date_paginated(
        self, project: str, dt: date
    ) -> list[dict[str, Any]]:
        """Fetch all PRs for a single date, following continuation tokens.

        Invariant 12: Complete pagination via continuation tokens.

        Args:
            project: Project name.
            dt: Date to fetch.

        Returns:
            List of all PRs for the date.
        """
        all_prs: list[dict[str, Any]] = []
        continuation_token: str | None = None

        while True:
            prs, continuation_token = self._fetch_page(project, dt, continuation_token)
            all_prs.extend(prs)
            self.stats.pages_fetched += 1

            if not continuation_token:
                break

            logger.debug("Fetching next page for %s/%s", project, dt)

        self.stats.total_prs += len(all_prs)
        if all_prs:
            logger.debug("Fetched %d PRs for %s/%s", len(all_prs), project, dt)

        return all_prs

    def _fetch_page(
        self,
        project: str,
        dt: date,
        token: str | None,
    ) -> tuple[list[dict[str, Any]], str | None]:
        """Fetch a single page of PRs with bounded retry.

        Invariant 13: Bounded retries with exponential backoff.

        Args:
            project: Project name.
            dt: Date to fetch.
            token: Continuation token from previous page.

        Returns:
            Tuple of (PR list, next continuation token or None).

        Raises:
            ExtractionError: After max retries exhausted.
        """
        url = self._build_pr_url(project, dt, token)

        last_error: Exception | None = None
        delay = self.config.retry_delay_seconds

        for attempt in range(1, self.config.max_retries + 1):
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()

                next_token = response.headers.get("x-ms-continuationtoken")
                data = response.json()
                return data.get("value", []), next_token

            # HTTPError is a RequestException subclass, so listing both was
            # redundant; ValueError additionally covers a malformed JSON body
            # from response.json(), which previously escaped the retry loop.
            except (RequestException, ValueError) as e:
                last_error = e
                self.stats.retries_used += 1
                logger.warning(
                    "Attempt %d/%d failed: %s", attempt, self.config.max_retries, e
                )

                if attempt < self.config.max_retries:
                    logger.info("Retrying in %.1fs...", delay)
                    time.sleep(delay)
                    delay *= self.config.retry_backoff_multiplier

        # All retries exhausted - fail the run (Adjustment 4).
        raise ExtractionError(
            f"Max retries ({self.config.max_retries}) exhausted for {project}/{dt}: "
            f"{last_error}"
        )

    def _build_pr_url(self, project: str, dt: date, token: str | None) -> str:
        """Build the ADO API URL for fetching PRs.

        Args:
            project: Project name.
            dt: Date to query.
            token: Optional continuation token.

        Returns:
            Fully constructed URL.
        """
        from urllib.parse import quote

        url = (
            f"{self.base_url}/{project}/_apis/git/pullrequests"
            f"?searchCriteria.status=completed"
            f"&searchCriteria.queryTimeRangeType=closed"
            f"&searchCriteria.minTime={dt}T00:00:00Z"
            f"&searchCriteria.maxTime={dt}T23:59:59Z"
            f"&$top=1000"
            f"&api-version={self.config.version}"
        )

        if token:
            # Continuation tokens are opaque and may contain reserved URL
            # characters, so percent-encode before appending.
            url += f"&continuationToken={quote(token, safe='')}"

        return url

    def test_connection(self, project: str) -> bool:
        """Test connectivity to the ADO API.

        Args:
            project: Project name to test.

        Returns:
            True if connection successful.

        Raises:
            ExtractionError: If connection fails.
        """
        url = f"{self.base_url}/{project}/_apis/git/repositories?api-version={self.config.version}"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except RequestException as e:
            raise ExtractionError(
                f"Failed to connect to {self.organization}/{project}: {e}"
            ) from e

        logger.info("Successfully connected to %s/%s", self.organization, project)
        return True
@@ -0,0 +1,239 @@
1
+ """Pull Request extractor orchestration.
2
+
3
+ Coordinates extraction across multiple projects with incremental and backfill support.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from datetime import date, timedelta
11
+
12
+ from ..config import Config
13
+ from ..persistence.database import DatabaseManager
14
+ from ..persistence.repository import PRRepository
15
+ from .ado_client import ADOClient, ExtractionError
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class ProjectExtractionResult:
    """Outcome of extracting PRs for one project."""

    # Project whose PRs were extracted.
    project: str
    # Inclusive date window that was covered.
    start_date: date
    end_date: date
    # Number of PRs extracted in this run.
    prs_extracted: int
    # Whether extraction completed without error.
    success: bool
    # Error message when success is False; None otherwise.
    error: str | None = None
30
+
31
+
32
@dataclass
class ExtractionSummary:
    """Summary of an extraction run."""

    # Per-project results in extraction order.
    projects: list[ProjectExtractionResult] = field(default_factory=list)
    total_prs: int = 0
    success: bool = True

    def add_result(self, result: ProjectExtractionResult) -> None:
        """Record one project's result and fold it into the run totals."""
        self.projects.append(result)
        self.total_prs += result.prs_extracted
        # Once any project fails, the whole run is marked failed.
        self.success = self.success and result.success

    def log_summary(self) -> None:
        """Emit a human-readable summary of the run to the log."""
        separator = "=" * 50
        logger.info(separator)
        logger.info("Extraction Summary")
        logger.info(separator)
        for entry in self.projects:
            marker = "✓" if entry.success else "✗"
            logger.info(
                f" {marker} {entry.project}: "
                f"{entry.prs_extracted} PRs ({entry.start_date} → {entry.end_date})"
            )
            if entry.error:
                logger.error(f" Error: {entry.error}")
        logger.info(f"Total: {self.total_prs} PRs")
        logger.info(f"Status: {'SUCCESS' if self.success else 'FAILED'}")
        logger.info(separator)
63
+
64
+
65
class PRExtractor:
    """Orchestrates PR extraction across multiple projects.

    Invariant 10: Daily incremental extraction is the default mode.
    Invariant 11: Periodic backfill is required to prevent drift.
    """

    def __init__(
        self,
        client: ADOClient,
        db: DatabaseManager,
        config: Config,
    ) -> None:
        """Initialize the PR extractor.

        Args:
            client: ADO API client.
            db: Database manager.
            config: Extraction configuration.
        """
        self.client = client
        self.db = db
        self.repository = PRRepository(db)
        self.config = config

    def extract_all(self, backfill_days: int | None = None) -> ExtractionSummary:
        """Extract PRs for all configured projects.

        For each project:
        1. Determine date range (incremental from last extraction, or configured)
        2. Fetch PRs from ADO API
        3. UPSERT into SQLite
        4. Update extraction metadata

        Args:
            backfill_days: If provided, re-extract the last N days (Adjustment 1).

        Returns:
            Summary of extraction results.
        """
        summary = ExtractionSummary()

        for project in self.config.projects:
            result = self._extract_project(project, backfill_days)
            summary.add_result(result)

            # Adjustment 4: fail fast - abort the run on the first project failure.
            if not result.success:
                logger.error("Extraction failed for %s, aborting run", project)
                break

        summary.log_summary()
        return summary

    def _extract_project(
        self,
        project: str,
        backfill_days: int | None,
    ) -> ProjectExtractionResult:
        """Extract PRs for a single project.

        Args:
            project: Project name.
            backfill_days: Optional backfill window.

        Returns:
            Extraction result for this project.
        """
        # Initialize the window up front so the error path below always has
        # valid dates to report. (The previous code probed locals with
        # `"start_date" in dir()`, which is fragile and obscure.)
        start_date = date.today()
        end_date = date.today()

        try:
            start_date = self._determine_start_date(project, backfill_days)
            end_date = self._determine_end_date()

            if start_date > end_date:
                logger.info("%s: Already up to date (last: %s)", project, start_date)
                return ProjectExtractionResult(
                    project=project,
                    start_date=start_date,
                    end_date=end_date,
                    prs_extracted=0,
                    success=True,
                )

            logger.info(
                "Extracting %s/%s: %s → %s",
                self.config.organization,
                project,
                start_date,
                end_date,
            )

            count = 0
            for pr_data in self.client.get_pull_requests(project, start_date, end_date):
                self.repository.upsert_pr_with_related(
                    pr_data=pr_data,
                    organization_name=self.config.organization,
                    project_name=project,
                )
                count += 1

            # Update extraction metadata only on success, so a failed run is
            # retried over the same window next time.
            self.repository.update_extraction_metadata(
                self.config.organization,
                project,
                end_date,
            )

            logger.info("%s: Extracted %d PRs", project, count)
            return ProjectExtractionResult(
                project=project,
                start_date=start_date,
                end_date=end_date,
                prs_extracted=count,
                success=True,
            )

        except ExtractionError as e:
            logger.error("%s: Extraction failed: %s", project, e)
            return ProjectExtractionResult(
                project=project,
                start_date=start_date,
                end_date=end_date,
                prs_extracted=0,
                success=False,
                error=str(e),
            )

    def _determine_start_date(
        self,
        project: str,
        backfill_days: int | None,
    ) -> date:
        """Determine the start date for extraction.

        Invariant 10: Incremental by default.
        Invariant 11: Backfill for convergence.

        Args:
            project: Project name.
            backfill_days: Optional backfill window.

        Returns:
            Start date for extraction.
        """
        # Priority 1: Explicit date range from config
        if self.config.date_range.start:
            return self.config.date_range.start

        # Priority 2: Backfill mode (note: a value of 0 is treated as "no backfill")
        if backfill_days:
            backfill_start = date.today() - timedelta(days=backfill_days)
            logger.info("%s: Backfill mode - %d days", project, backfill_days)
            return backfill_start

        # Priority 3: Incremental from last extraction
        last_date = self.repository.get_last_extraction_date(
            self.config.organization,
            project,
        )
        if last_date:
            # Start from the day after the last extraction.
            return last_date + timedelta(days=1)

        # Default: Start of current year (first run)
        default_start = date(date.today().year, 1, 1)
        logger.info("%s: First run - starting from %s", project, default_start)
        return default_start

    def _determine_end_date(self) -> date:
        """Determine the end date for extraction.

        Returns:
            End date (yesterday by default, or configured).
        """
        if self.config.date_range.end:
            return self.config.date_range.end

        # Default: yesterday, so a partially-elapsed day is never extracted.
        return date.today() - timedelta(days=1)
@@ -0,0 +1 @@
1
+ """Persistence module for SQLite storage operations."""
@@ -0,0 +1,193 @@
1
+ """SQLite database connection and management.
2
+
3
+ This module handles database connections, schema initialization, and
4
+ ensures safe transaction handling per Invariant 7 (no publish-on-failure).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import sqlite3
11
+ from collections.abc import Iterator
12
+ from contextlib import contextmanager
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ from .models import SCHEMA_SQL
17
+
18
+ if TYPE_CHECKING:
19
+ from sqlite3 import Connection, Cursor
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class DatabaseError(Exception):
    """Raised when a database operation fails."""
26
+
27
+
28
+ class DatabaseManager:
29
+ """Manages SQLite database connections and schema.
30
+
31
+ Invariant 5: SQLite is the source of truth for derived outputs.
32
+ Invariant 9: Persistence must be recoverable.
33
+ """
34
+
35
+ def __init__(self, db_path: Path) -> None:
36
+ """Initialize the database manager.
37
+
38
+ Args:
39
+ db_path: Path to the SQLite database file.
40
+ """
41
+ self.db_path = db_path
42
+ self._connection: Connection | None = None
43
+
44
+ @property
45
+ def connection(self) -> Connection:
46
+ """Get the active database connection.
47
+
48
+ Raises:
49
+ DatabaseError: If not connected.
50
+ """
51
+ if self._connection is None:
52
+ raise DatabaseError("Database not connected. Call connect() first.")
53
+ return self._connection
54
+
55
+ def connect(self) -> None:
56
+ """Open a connection to the database.
57
+
58
+ Creates the database file and parent directories if they don't exist.
59
+ Initializes the schema on first connection.
60
+ """
61
+ # Ensure parent directory exists
62
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
63
+
64
+ is_new_db = not self.db_path.exists()
65
+
66
+ try:
67
+ self._connection = sqlite3.connect(
68
+ str(self.db_path),
69
+ isolation_level=None, # Autocommit; we'll manage transactions explicitly
70
+ )
71
+ self._connection.row_factory = sqlite3.Row
72
+
73
+ # Enable foreign keys
74
+ self._connection.execute("PRAGMA foreign_keys = ON")
75
+
76
+ if is_new_db:
77
+ logger.info(f"Creating new database at {self.db_path}")
78
+ self._initialize_schema()
79
+ else:
80
+ logger.info(f"Connected to existing database at {self.db_path}")
81
+ self._validate_schema()
82
+
83
+ except sqlite3.Error as e:
84
+ self.close() # Ensure connection is closed on error
85
+ raise DatabaseError(f"Failed to connect to database: {e}") from e
86
+ except DatabaseError:
87
+ self.close() # Ensure connection is closed on validation error
88
+ raise
89
+
90
+ def close(self) -> None:
91
+ """Close the database connection."""
92
+ if self._connection is not None:
93
+ self._connection.close()
94
+ self._connection = None
95
+ logger.debug("Database connection closed")
96
+
97
+ def _initialize_schema(self) -> None:
98
+ """Create all tables and indexes."""
99
+ try:
100
+ self._connection.executescript(SCHEMA_SQL) # type: ignore[union-attr]
101
+ logger.info("Database schema initialized")
102
+ except sqlite3.Error as e:
103
+ raise DatabaseError(f"Failed to initialize schema: {e}") from e
104
+
105
+ def _validate_schema(self) -> None:
106
+ """Validate that required tables exist.
107
+
108
+ Invariant 9: If schema is invalid, fail fast with clear error.
109
+ """
110
+ required_tables = [
111
+ "extraction_metadata",
112
+ "organizations",
113
+ "projects",
114
+ "repositories",
115
+ "users",
116
+ "pull_requests",
117
+ "reviewers",
118
+ ]
119
+
120
+ cursor = self.connection.execute(
121
+ "SELECT name FROM sqlite_master WHERE type='table'"
122
+ )
123
+ existing_tables = {row["name"] for row in cursor.fetchall()}
124
+
125
+ missing = set(required_tables) - existing_tables
126
+ if missing:
127
+ raise DatabaseError(
128
+ f"Database schema invalid. Missing tables: {missing}. "
129
+ "Consider creating a fresh database."
130
+ )
131
+
132
+ @contextmanager
133
+ def transaction(self) -> Iterator[Cursor]:
134
+ """Execute operations within a transaction.
135
+
136
+ Invariant 7: On failure, changes are rolled back.
137
+
138
+ Yields:
139
+ Database cursor for executing queries.
140
+ """
141
+ conn = self.connection
142
+ cursor = conn.cursor()
143
+
144
+ try:
145
+ cursor.execute("BEGIN TRANSACTION")
146
+ yield cursor
147
+ cursor.execute("COMMIT")
148
+ except Exception:
149
+ cursor.execute("ROLLBACK")
150
+ raise
151
+ finally:
152
+ cursor.close()
153
+
154
+ def execute(self, sql: str, parameters: tuple[Any, ...] = ()) -> Cursor: # noqa: UP006
155
+ """Execute a single SQL statement.
156
+
157
+ Args:
158
+ sql: SQL statement to execute.
159
+ parameters: Parameters for the statement.
160
+
161
+ Returns:
162
+ Cursor with results.
163
+ """
164
+ return self.connection.execute(sql, parameters)
165
+
166
+ def executemany(
167
+ self,
168
+ sql: str,
169
+ parameters: list[tuple[Any, ...]], # noqa: UP006
170
+ ) -> Cursor:
171
+ """Execute a SQL statement with multiple parameter sets.
172
+
173
+ Args:
174
+ sql: SQL statement to execute.
175
+ parameters: List of parameter tuples.
176
+
177
+ Returns:
178
+ Cursor with results.
179
+ """
180
+ return self.connection.executemany(sql, parameters)
181
+
182
+ def get_schema_version(self) -> int:
183
+ """Get the current schema version.
184
+
185
+ Returns:
186
+ Current schema version number.
187
+ """
188
+ try:
189
+ cursor = self.execute("SELECT MAX(version) as version FROM schema_version")
190
+ row = cursor.fetchone()
191
+ return int(row["version"]) if row and row["version"] is not None else 0
192
+ except sqlite3.Error:
193
+ return 0