ado-git-repo-insights 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ """Azure DevOps REST API client.
2
+
3
+ Implements pagination (continuation tokens), bounded retry with exponential backoff,
4
+ and fail-fast on partial failures per Invariants 12-13 and Adjustment 4.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import logging
11
+ import time
12
+ from collections.abc import Iterator
13
+ from dataclasses import dataclass
14
+ from datetime import date, timedelta
15
+ from typing import Any
16
+
17
+ import requests
18
+ from requests.exceptions import HTTPError, RequestException
19
+
20
+ from ..config import APIConfig
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ExtractionError(Exception):
    """Raised when extraction fails; aborts the whole run (Invariant 7, Adjustment 4)."""
27
+
28
+
29
@dataclass
class ExtractionStats:
    """Counters accumulated over a single extraction run."""

    # Total pull requests yielded across all dates.
    total_prs: int = 0
    # API pages fetched, including continuation pages.
    pages_fetched: int = 0
    # Failed request attempts that entered the retry path.
    retries_used: int = 0
36
+
37
+
38
class ADOClient:
    """Azure DevOps REST API client with pagination, retry, and rate limiting.

    Invariant 12: Pagination must be complete (continuation tokens).
    Invariant 13: Retries must be bounded and predictable.
    Adjustment 4: Partial failures fail the run.
    """

    def __init__(self, organization: str, pat: str, config: APIConfig) -> None:
        """Initialize the ADO client.

        Args:
            organization: Azure DevOps organization name.
            pat: Personal Access Token with Code (Read) scope.
            config: API configuration settings.
        """
        self.organization = organization
        self.base_url = f"{config.base_url}/{organization}"
        self.config = config
        self.headers = self._build_auth_headers(pat)
        self.stats = ExtractionStats()

    def _build_auth_headers(self, pat: str) -> dict[str, str]:
        """Build authorization headers for the ADO API.

        Args:
            pat: Personal Access Token.

        Returns:
            Headers dict with Basic auth.
        """
        # Invariant 19: the PAT is only base64-encoded into the header, never logged.
        encoded = base64.b64encode(f":{pat}".encode()).decode()
        return {
            "Authorization": f"Basic {encoded}",
            "Content-Type": "application/json",
        }

    def get_pull_requests(
        self,
        project: str,
        start_date: date,
        end_date: date,
    ) -> Iterator[dict[str, Any]]:
        """Fetch completed PRs for a date range with automatic pagination.

        Adjustment 4: Handles continuation tokens, bounded retries with backoff.
        Raises on partial failures (deterministic failure over silent partial success).

        Args:
            project: Project name.
            start_date: Start of date range (inclusive).
            end_date: End of date range (inclusive).

        Yields:
            PR data dictionaries.

        Raises:
            ExtractionError: If extraction fails for any date.
        """
        current_date = start_date
        while current_date <= end_date:
            try:
                prs = self._fetch_prs_for_date_paginated(project, current_date)
                yield from prs
            except ExtractionError as e:
                # Fail the entire run on any date failure (Adjustment 4).
                raise ExtractionError(
                    f"Failed extracting {project} on {current_date}: {e}"
                ) from e

            # Simple client-side rate limiting between per-day fetches.
            time.sleep(self.config.rate_limit_sleep_seconds)
            current_date += timedelta(days=1)

    def _fetch_prs_for_date_paginated(
        self, project: str, dt: date
    ) -> list[dict[str, Any]]:
        """Fetch all PRs for a single date, following continuation tokens.

        Invariant 12: Complete pagination via continuation tokens.

        Args:
            project: Project name.
            dt: Date to fetch.

        Returns:
            List of all PRs for the date.
        """
        all_prs: list[dict[str, Any]] = []
        continuation_token: str | None = None

        while True:
            prs, continuation_token = self._fetch_page(project, dt, continuation_token)
            all_prs.extend(prs)
            self.stats.pages_fetched += 1

            if not continuation_token:
                break

            logger.debug("Fetching next page for %s/%s", project, dt)

        self.stats.total_prs += len(all_prs)
        if all_prs:
            logger.debug("Fetched %d PRs for %s/%s", len(all_prs), project, dt)

        return all_prs

    def _fetch_page(
        self,
        project: str,
        dt: date,
        token: str | None,
    ) -> tuple[list[dict[str, Any]], str | None]:
        """Fetch a single page of PRs with bounded retry.

        Invariant 13: Bounded retries with exponential backoff.

        Args:
            project: Project name.
            dt: Date to fetch.
            token: Continuation token from previous page.

        Returns:
            Tuple of (PR list, next continuation token or None).

        Raises:
            ExtractionError: After max retries exhausted.
        """
        url = self._build_pr_url(project, dt, token)

        last_error: Exception | None = None
        delay = self.config.retry_delay_seconds

        for attempt in range(1, self.config.max_retries + 1):
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()

                next_token = response.headers.get("x-ms-continuationtoken")
                data = response.json()
                return data.get("value", []), next_token

            # HTTPError is a RequestException subclass, so listing both was
            # redundant; ValueError additionally covers a malformed JSON body
            # from response.json(), which previously escaped the retry loop.
            except (RequestException, ValueError) as e:
                last_error = e
                self.stats.retries_used += 1
                logger.warning(
                    "Attempt %d/%d failed: %s", attempt, self.config.max_retries, e
                )

                if attempt < self.config.max_retries:
                    logger.info("Retrying in %.1fs...", delay)
                    time.sleep(delay)
                    delay *= self.config.retry_backoff_multiplier

        # All retries exhausted - fail the run (Adjustment 4).
        raise ExtractionError(
            f"Max retries ({self.config.max_retries}) exhausted for {project}/{dt}: "
            f"{last_error}"
        )

    def _build_pr_url(self, project: str, dt: date, token: str | None) -> str:
        """Build the ADO API URL for fetching PRs.

        Args:
            project: Project name.
            dt: Date to query.
            token: Optional continuation token.

        Returns:
            Fully constructed URL.
        """
        from urllib.parse import quote

        url = (
            f"{self.base_url}/{project}/_apis/git/pullrequests"
            f"?searchCriteria.status=completed"
            f"&searchCriteria.queryTimeRangeType=closed"
            f"&searchCriteria.minTime={dt}T00:00:00Z"
            f"&searchCriteria.maxTime={dt}T23:59:59Z"
            f"&$top=1000"
            f"&api-version={self.config.version}"
        )

        if token:
            # Continuation tokens are opaque and may contain reserved URL
            # characters, so percent-encode before appending.
            url += f"&continuationToken={quote(token, safe='')}"

        return url

    def test_connection(self, project: str) -> bool:
        """Test connectivity to the ADO API.

        Args:
            project: Project name to test.

        Returns:
            True if connection successful.

        Raises:
            ExtractionError: If connection fails.
        """
        url = f"{self.base_url}/{project}/_apis/git/repositories?api-version={self.config.version}"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except RequestException as e:
            raise ExtractionError(
                f"Failed to connect to {self.organization}/{project}: {e}"
            ) from e

        logger.info("Successfully connected to %s/%s", self.organization, project)
        return True
@@ -0,0 +1,239 @@
1
+ """Pull Request extractor orchestration.
2
+
3
+ Coordinates extraction across multiple projects with incremental and backfill support.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from datetime import date, timedelta
11
+
12
+ from ..config import Config
13
+ from ..persistence.database import DatabaseManager
14
+ from ..persistence.repository import PRRepository
15
+ from .ado_client import ADOClient, ExtractionError
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class ProjectExtractionResult:
    """Outcome of extracting PRs for one project."""

    # Project whose PRs were extracted.
    project: str
    # Inclusive date window that was covered.
    start_date: date
    end_date: date
    # Number of PRs extracted in this run.
    prs_extracted: int
    # Whether extraction completed without error.
    success: bool
    # Error message when success is False; None otherwise.
    error: str | None = None
30
+
31
+
32
@dataclass
class ExtractionSummary:
    """Summary of an extraction run."""

    # Per-project results in extraction order.
    projects: list[ProjectExtractionResult] = field(default_factory=list)
    total_prs: int = 0
    success: bool = True

    def add_result(self, result: ProjectExtractionResult) -> None:
        """Record one project's result and fold it into the run totals."""
        self.projects.append(result)
        self.total_prs += result.prs_extracted
        # Once any project fails, the whole run is marked failed.
        self.success = self.success and result.success

    def log_summary(self) -> None:
        """Emit a human-readable summary of the run to the log."""
        separator = "=" * 50
        logger.info(separator)
        logger.info("Extraction Summary")
        logger.info(separator)
        for entry in self.projects:
            marker = "✓" if entry.success else "✗"
            logger.info(
                f" {marker} {entry.project}: "
                f"{entry.prs_extracted} PRs ({entry.start_date} → {entry.end_date})"
            )
            if entry.error:
                logger.error(f" Error: {entry.error}")
        logger.info(f"Total: {self.total_prs} PRs")
        logger.info(f"Status: {'SUCCESS' if self.success else 'FAILED'}")
        logger.info(separator)
63
+
64
+
65
class PRExtractor:
    """Orchestrates PR extraction across multiple projects.

    Invariant 10: Daily incremental extraction is the default mode.
    Invariant 11: Periodic backfill is required to prevent drift.
    """

    def __init__(
        self,
        client: ADOClient,
        db: DatabaseManager,
        config: Config,
    ) -> None:
        """Initialize the PR extractor.

        Args:
            client: ADO API client.
            db: Database manager.
            config: Extraction configuration.
        """
        self.client = client
        self.db = db
        self.repository = PRRepository(db)
        self.config = config

    def extract_all(self, backfill_days: int | None = None) -> ExtractionSummary:
        """Extract PRs for all configured projects.

        For each project:
        1. Determine date range (incremental from last extraction, or configured)
        2. Fetch PRs from ADO API
        3. UPSERT into SQLite
        4. Update extraction metadata

        Args:
            backfill_days: If provided, re-extract the last N days (Adjustment 1).

        Returns:
            Summary of extraction results.
        """
        summary = ExtractionSummary()

        for project in self.config.projects:
            result = self._extract_project(project, backfill_days)
            summary.add_result(result)

            # Adjustment 4: fail fast - abort the run on the first project failure.
            if not result.success:
                logger.error("Extraction failed for %s, aborting run", project)
                break

        summary.log_summary()
        return summary

    def _extract_project(
        self,
        project: str,
        backfill_days: int | None,
    ) -> ProjectExtractionResult:
        """Extract PRs for a single project.

        Args:
            project: Project name.
            backfill_days: Optional backfill window.

        Returns:
            Extraction result for this project.
        """
        # Initialize the window up front so the error path below always has
        # valid dates to report. (The previous code probed locals with
        # `"start_date" in dir()`, which is fragile and obscure.)
        start_date = date.today()
        end_date = date.today()

        try:
            start_date = self._determine_start_date(project, backfill_days)
            end_date = self._determine_end_date()

            if start_date > end_date:
                logger.info("%s: Already up to date (last: %s)", project, start_date)
                return ProjectExtractionResult(
                    project=project,
                    start_date=start_date,
                    end_date=end_date,
                    prs_extracted=0,
                    success=True,
                )

            logger.info(
                "Extracting %s/%s: %s → %s",
                self.config.organization,
                project,
                start_date,
                end_date,
            )

            count = 0
            for pr_data in self.client.get_pull_requests(project, start_date, end_date):
                self.repository.upsert_pr_with_related(
                    pr_data=pr_data,
                    organization_name=self.config.organization,
                    project_name=project,
                )
                count += 1

            # Update extraction metadata only on success, so a failed run is
            # retried over the same window next time.
            self.repository.update_extraction_metadata(
                self.config.organization,
                project,
                end_date,
            )

            logger.info("%s: Extracted %d PRs", project, count)
            return ProjectExtractionResult(
                project=project,
                start_date=start_date,
                end_date=end_date,
                prs_extracted=count,
                success=True,
            )

        except ExtractionError as e:
            logger.error("%s: Extraction failed: %s", project, e)
            return ProjectExtractionResult(
                project=project,
                start_date=start_date,
                end_date=end_date,
                prs_extracted=0,
                success=False,
                error=str(e),
            )

    def _determine_start_date(
        self,
        project: str,
        backfill_days: int | None,
    ) -> date:
        """Determine the start date for extraction.

        Invariant 10: Incremental by default.
        Invariant 11: Backfill for convergence.

        Args:
            project: Project name.
            backfill_days: Optional backfill window.

        Returns:
            Start date for extraction.
        """
        # Priority 1: Explicit date range from config
        if self.config.date_range.start:
            return self.config.date_range.start

        # Priority 2: Backfill mode (note: a value of 0 is treated as "no backfill")
        if backfill_days:
            backfill_start = date.today() - timedelta(days=backfill_days)
            logger.info("%s: Backfill mode - %d days", project, backfill_days)
            return backfill_start

        # Priority 3: Incremental from last extraction
        last_date = self.repository.get_last_extraction_date(
            self.config.organization,
            project,
        )
        if last_date:
            # Start from the day after the last extraction.
            return last_date + timedelta(days=1)

        # Default: Start of current year (first run)
        default_start = date(date.today().year, 1, 1)
        logger.info("%s: First run - starting from %s", project, default_start)
        return default_start

    def _determine_end_date(self) -> date:
        """Determine the end date for extraction.

        Returns:
            End date (yesterday by default, or configured).
        """
        if self.config.date_range.end:
            return self.config.date_range.end

        # Default: yesterday, so a partially-elapsed day is never extracted.
        return date.today() - timedelta(days=1)
@@ -0,0 +1 @@
1
+ """Persistence module for SQLite storage operations."""
@@ -0,0 +1,193 @@
1
+ """SQLite database connection and management.
2
+
3
+ This module handles database connections, schema initialization, and
4
+ ensures safe transaction handling per Invariant 7 (no publish-on-failure).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import sqlite3
11
+ from collections.abc import Iterator
12
+ from contextlib import contextmanager
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ from .models import SCHEMA_SQL
17
+
18
+ if TYPE_CHECKING:
19
+ from sqlite3 import Connection, Cursor
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class DatabaseError(Exception):
    """Raised when a database operation fails."""
26
+
27
+
28
+ class DatabaseManager:
29
+ """Manages SQLite database connections and schema.
30
+
31
+ Invariant 5: SQLite is the source of truth for derived outputs.
32
+ Invariant 9: Persistence must be recoverable.
33
+ """
34
+
35
+ def __init__(self, db_path: Path) -> None:
36
+ """Initialize the database manager.
37
+
38
+ Args:
39
+ db_path: Path to the SQLite database file.
40
+ """
41
+ self.db_path = db_path
42
+ self._connection: Connection | None = None
43
+
44
+ @property
45
+ def connection(self) -> Connection:
46
+ """Get the active database connection.
47
+
48
+ Raises:
49
+ DatabaseError: If not connected.
50
+ """
51
+ if self._connection is None:
52
+ raise DatabaseError("Database not connected. Call connect() first.")
53
+ return self._connection
54
+
55
+ def connect(self) -> None:
56
+ """Open a connection to the database.
57
+
58
+ Creates the database file and parent directories if they don't exist.
59
+ Initializes the schema on first connection.
60
+ """
61
+ # Ensure parent directory exists
62
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
63
+
64
+ is_new_db = not self.db_path.exists()
65
+
66
+ try:
67
+ self._connection = sqlite3.connect(
68
+ str(self.db_path),
69
+ isolation_level=None, # Autocommit; we'll manage transactions explicitly
70
+ )
71
+ self._connection.row_factory = sqlite3.Row
72
+
73
+ # Enable foreign keys
74
+ self._connection.execute("PRAGMA foreign_keys = ON")
75
+
76
+ if is_new_db:
77
+ logger.info(f"Creating new database at {self.db_path}")
78
+ self._initialize_schema()
79
+ else:
80
+ logger.info(f"Connected to existing database at {self.db_path}")
81
+ self._validate_schema()
82
+
83
+ except sqlite3.Error as e:
84
+ self.close() # Ensure connection is closed on error
85
+ raise DatabaseError(f"Failed to connect to database: {e}") from e
86
+ except DatabaseError:
87
+ self.close() # Ensure connection is closed on validation error
88
+ raise
89
+
90
+ def close(self) -> None:
91
+ """Close the database connection."""
92
+ if self._connection is not None:
93
+ self._connection.close()
94
+ self._connection = None
95
+ logger.debug("Database connection closed")
96
+
97
+ def _initialize_schema(self) -> None:
98
+ """Create all tables and indexes."""
99
+ try:
100
+ self._connection.executescript(SCHEMA_SQL) # type: ignore[union-attr]
101
+ logger.info("Database schema initialized")
102
+ except sqlite3.Error as e:
103
+ raise DatabaseError(f"Failed to initialize schema: {e}") from e
104
+
105
+ def _validate_schema(self) -> None:
106
+ """Validate that required tables exist.
107
+
108
+ Invariant 9: If schema is invalid, fail fast with clear error.
109
+ """
110
+ required_tables = [
111
+ "extraction_metadata",
112
+ "organizations",
113
+ "projects",
114
+ "repositories",
115
+ "users",
116
+ "pull_requests",
117
+ "reviewers",
118
+ ]
119
+
120
+ cursor = self.connection.execute(
121
+ "SELECT name FROM sqlite_master WHERE type='table'"
122
+ )
123
+ existing_tables = {row["name"] for row in cursor.fetchall()}
124
+
125
+ missing = set(required_tables) - existing_tables
126
+ if missing:
127
+ raise DatabaseError(
128
+ f"Database schema invalid. Missing tables: {missing}. "
129
+ "Consider creating a fresh database."
130
+ )
131
+
132
+ @contextmanager
133
+ def transaction(self) -> Iterator[Cursor]:
134
+ """Execute operations within a transaction.
135
+
136
+ Invariant 7: On failure, changes are rolled back.
137
+
138
+ Yields:
139
+ Database cursor for executing queries.
140
+ """
141
+ conn = self.connection
142
+ cursor = conn.cursor()
143
+
144
+ try:
145
+ cursor.execute("BEGIN TRANSACTION")
146
+ yield cursor
147
+ cursor.execute("COMMIT")
148
+ except Exception:
149
+ cursor.execute("ROLLBACK")
150
+ raise
151
+ finally:
152
+ cursor.close()
153
+
154
+ def execute(self, sql: str, parameters: tuple[Any, ...] = ()) -> Cursor: # noqa: UP006
155
+ """Execute a single SQL statement.
156
+
157
+ Args:
158
+ sql: SQL statement to execute.
159
+ parameters: Parameters for the statement.
160
+
161
+ Returns:
162
+ Cursor with results.
163
+ """
164
+ return self.connection.execute(sql, parameters)
165
+
166
+ def executemany(
167
+ self,
168
+ sql: str,
169
+ parameters: list[tuple[Any, ...]], # noqa: UP006
170
+ ) -> Cursor:
171
+ """Execute a SQL statement with multiple parameter sets.
172
+
173
+ Args:
174
+ sql: SQL statement to execute.
175
+ parameters: List of parameter tuples.
176
+
177
+ Returns:
178
+ Cursor with results.
179
+ """
180
+ return self.connection.executemany(sql, parameters)
181
+
182
+ def get_schema_version(self) -> int:
183
+ """Get the current schema version.
184
+
185
+ Returns:
186
+ Current schema version number.
187
+ """
188
+ try:
189
+ cursor = self.execute("SELECT MAX(version) as version FROM schema_version")
190
+ row = cursor.fetchone()
191
+ return int(row["version"]) if row and row["version"] is not None else 0
192
+ except sqlite3.Error:
193
+ return 0