ado-git-repo-insights 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ """SQLite database schema and models for ado-git-repo-insights.
2
+
3
+ This module defines the SQLite schema that maps directly to the CSV output contract.
4
+ Schema changes must preserve invariants 1-4, 14-16 from INVARIANTS.md.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
# SQLite DDL script that creates every table and index used by the package.
# Mirrors the CSV output contract exactly.
# Every CREATE uses IF NOT EXISTS and the version seed uses INSERT OR IGNORE,
# so running the script against an already-initialised database does not fail
# on objects that are already present.

SCHEMA_SQL = """
-- Metadata table for incremental extraction state (Invariant 6)
CREATE TABLE IF NOT EXISTS extraction_metadata (
    id INTEGER PRIMARY KEY,
    organization_name TEXT NOT NULL,
    project_name TEXT NOT NULL,
    last_extraction_date TEXT NOT NULL, -- ISO 8601 (YYYY-MM-DD)
    last_extraction_timestamp TEXT NOT NULL, -- ISO 8601 with time
    UNIQUE(organization_name, project_name)
);

-- Core entity tables (matching CSV output contract - Invariants 1-4)

-- organizations.csv: organization_name
CREATE TABLE IF NOT EXISTS organizations (
    organization_name TEXT PRIMARY KEY
);

-- projects.csv: organization_name, project_name
CREATE TABLE IF NOT EXISTS projects (
    organization_name TEXT NOT NULL,
    project_name TEXT NOT NULL,
    PRIMARY KEY (organization_name, project_name),
    FOREIGN KEY (organization_name) REFERENCES organizations(organization_name)
);

-- repositories.csv: repository_id, repository_name, project_name, organization_name
-- Invariant 14: repository_id is the stable ADO ID
CREATE TABLE IF NOT EXISTS repositories (
    repository_id TEXT PRIMARY KEY,
    repository_name TEXT NOT NULL,
    project_name TEXT NOT NULL,
    organization_name TEXT NOT NULL,
    FOREIGN KEY (organization_name, project_name)
        REFERENCES projects(organization_name, project_name)
);
CREATE INDEX IF NOT EXISTS idx_repositories_project
    ON repositories(organization_name, project_name);

-- users.csv: user_id, display_name, email
-- Invariant 16: user_id is stable ADO ID, display_name/email are mutable labels
CREATE TABLE IF NOT EXISTS users (
    user_id TEXT PRIMARY KEY,
    display_name TEXT NOT NULL,
    email TEXT
);

-- pull_requests.csv: pull_request_uid, pull_request_id, organization_name, project_name,
-- repository_id, user_id, title, status, description,
-- creation_date, closed_date, cycle_time_minutes
-- Invariant 14: pull_request_uid = {repository_id}-{pull_request_id}
CREATE TABLE IF NOT EXISTS pull_requests (
    pull_request_uid TEXT PRIMARY KEY,
    pull_request_id INTEGER NOT NULL,
    organization_name TEXT NOT NULL,
    project_name TEXT NOT NULL,
    repository_id TEXT NOT NULL,
    user_id TEXT NOT NULL,
    title TEXT NOT NULL,
    status TEXT NOT NULL,
    description TEXT,
    creation_date TEXT NOT NULL, -- ISO 8601
    closed_date TEXT, -- ISO 8601
    cycle_time_minutes REAL,
    raw_json TEXT, -- Original ADO response for auditing
    FOREIGN KEY (repository_id) REFERENCES repositories(repository_id),
    FOREIGN KEY (user_id) REFERENCES users(user_id)
);
CREATE INDEX IF NOT EXISTS idx_pull_requests_closed_date
    ON pull_requests(closed_date);
CREATE INDEX IF NOT EXISTS idx_pull_requests_org_project
    ON pull_requests(organization_name, project_name);

-- reviewers.csv: pull_request_uid, user_id, vote, repository_id
CREATE TABLE IF NOT EXISTS reviewers (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    pull_request_uid TEXT NOT NULL,
    user_id TEXT NOT NULL,
    vote INTEGER NOT NULL,
    repository_id TEXT NOT NULL,
    FOREIGN KEY (pull_request_uid) REFERENCES pull_requests(pull_request_uid),
    FOREIGN KEY (user_id) REFERENCES users(user_id),
    UNIQUE(pull_request_uid, user_id) -- One vote per reviewer per PR
);
CREATE INDEX IF NOT EXISTS idx_reviewers_pr ON reviewers(pull_request_uid);

-- Schema version for future migrations
CREATE TABLE IF NOT EXISTS schema_version (
    version INTEGER PRIMARY KEY,
    applied_at TEXT NOT NULL
);

-- Insert initial schema version
INSERT OR IGNORE INTO schema_version (version, applied_at)
VALUES (1, datetime('now'));
"""

# CSV column order contract (NON-NEGOTIABLE per Invariants 1-4).
# Keys are table names (each is written to "<key>.csv"); values are the exact
# column names, in output order. The surrogate `id` columns and
# `pull_requests.raw_json` are deliberately absent, so they are never exported.
# The dict's insertion order is also the order in which the CSV generator
# iterates the tables.
CSV_SCHEMAS: dict[str, list[str]] = {
    "organizations": ["organization_name"],
    "projects": ["organization_name", "project_name"],
    "repositories": [
        "repository_id",
        "repository_name",
        "project_name",
        "organization_name",
    ],
    "pull_requests": [
        "pull_request_uid",
        "pull_request_id",
        "organization_name",
        "project_name",
        "repository_id",
        "user_id",
        "title",
        "status",
        "description",
        "creation_date",
        "closed_date",
        "cycle_time_minutes",
    ],
    "users": ["user_id", "display_name", "email"],
    "reviewers": ["pull_request_uid", "user_id", "vote", "repository_id"],
}

# Deterministic row ordering: primary key + tie-breaker (Adjustment 3).
# Keys match CSV_SCHEMAS; values are the columns each table's rows are sorted
# by (ascending) before being written.
SORT_KEYS: dict[str, list[str]] = {
    "organizations": ["organization_name"],
    "projects": ["organization_name", "project_name"],
    "repositories": ["repository_id"],
    "pull_requests": ["pull_request_uid", "creation_date"],
    "users": ["user_id"],
    "reviewers": ["pull_request_uid", "user_id"],
}
@@ -0,0 +1,376 @@
1
+ """Data access layer for ado-git-repo-insights.
2
+
3
+ This module implements UPSERT operations and state tracking per Invariant 8
4
+ (idempotent and convergent state updates).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ from dataclasses import dataclass
12
+ from datetime import date, datetime, timezone
13
+ from typing import TYPE_CHECKING, Any
14
+
15
+ if TYPE_CHECKING:
16
+ from .database import DatabaseManager
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
@dataclass
class ExtractionMetadata:
    """Record of the most recent extraction run for one project.

    The fields carry the same names as the non-key columns of the
    ``extraction_metadata`` table, but hold parsed ``date``/``datetime``
    objects rather than ISO 8601 text.
    """

    # Scope of the record: one organization + project pair.
    organization_name: str
    project_name: str
    # Calendar day that was last extracted.
    last_extraction_date: date
    # Moment at which that extraction was recorded.
    last_extraction_timestamp: datetime
29
+
30
+
31
class PRRepository:
    """Data access layer for Pull Request data.

    Every statement is issued through ``self.db.execute``. Writes use either
    ``INSERT OR IGNORE`` (key-only tables) or SQLite's UPSERT form
    ``INSERT ... ON CONFLICT(...) DO UPDATE`` (requires SQLite >= 3.24), so an
    existing row is updated in place instead of being deleted and re-inserted
    as ``INSERT OR REPLACE`` would do. That keeps surrogate ``id`` values
    stable across repeated runs.

    Invariant 8: State updates must be idempotent and converge.
    Invariant 14: Stable identifiers are required for UPSERT keys.
    Invariant 15: All entities must be scoped to organization + project.
    """

    def __init__(self, db: DatabaseManager) -> None:
        """Initialize the repository.

        Args:
            db: Database manager instance; only its ``execute`` method is
                used by this class.
        """
        self.db = db

    # --- Extraction Metadata ---

    def get_last_extraction_date(self, organization: str, project: str) -> date | None:
        """Get the last successful extraction date for a project.

        Args:
            organization: Organization name.
            project: Project name.

        Returns:
            Last extraction date, or None if never extracted or metadata is corrupt.
        """
        cursor = self.db.execute(
            """
            SELECT last_extraction_date FROM extraction_metadata
            WHERE organization_name = ? AND project_name = ?
            """,
            (organization, project),
        )
        row = cursor.fetchone()
        if not row:
            return None

        date_value = row["last_extraction_date"]
        # Handle NULL or empty string
        if not date_value:
            return None

        # Handle corrupt date format gracefully (warn + fallback)
        try:
            return date.fromisoformat(date_value)
        except (ValueError, TypeError) as e:
            logger.warning(
                "Invalid/corrupt extraction metadata date for %s/%s: '%s' - %s",
                organization,
                project,
                date_value,
                e,
            )
            return None

    def update_extraction_metadata(
        self, organization: str, project: str, extraction_date: date
    ) -> None:
        """Record successful extraction for the given date.

        The row for (organization, project) is created on first use and
        updated in place afterwards; the recorded timestamp is the current
        UTC time.

        Args:
            organization: Organization name.
            project: Project name.
            extraction_date: Date that was extracted.
        """
        self.db.execute(
            """
            INSERT INTO extraction_metadata
            (organization_name, project_name, last_extraction_date, last_extraction_timestamp)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(organization_name, project_name) DO UPDATE SET
                last_extraction_date = excluded.last_extraction_date,
                last_extraction_timestamp = excluded.last_extraction_timestamp
            """,
            (
                organization,
                project,
                extraction_date.isoformat(),
                datetime.now(timezone.utc).isoformat(),
            ),
        )
        logger.debug(
            "Updated extraction metadata: %s/%s = %s",
            organization,
            project,
            extraction_date,
        )

    # --- Organizations ---

    def upsert_organization(self, organization_name: str) -> None:
        """Insert an organization if it is not already present.

        Args:
            organization_name: Organization name.
        """
        self.db.execute(
            "INSERT OR IGNORE INTO organizations (organization_name) VALUES (?)",
            (organization_name,),
        )

    # --- Projects ---

    def upsert_project(self, organization_name: str, project_name: str) -> None:
        """Insert a project (and its organization) if not already present.

        Args:
            organization_name: Organization name.
            project_name: Project name.
        """
        # Ensure organization exists first
        self.upsert_organization(organization_name)

        self.db.execute(
            """
            INSERT OR IGNORE INTO projects (organization_name, project_name)
            VALUES (?, ?)
            """,
            (organization_name, project_name),
        )

    # --- Repositories ---

    def upsert_repository(
        self,
        repository_id: str,
        repository_name: str,
        project_name: str,
        organization_name: str,
    ) -> None:
        """Insert or update a repository.

        Invariant 14: repository_id is the stable ADO ID.
        Invariant 16: repository_name is a mutable label.

        Args:
            repository_id: Stable ADO repository ID.
            repository_name: Current repository name.
            project_name: Project name.
            organization_name: Organization name.
        """
        # Ensure project exists first
        self.upsert_project(organization_name, project_name)

        self.db.execute(
            """
            INSERT INTO repositories
            (repository_id, repository_name, project_name, organization_name)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(repository_id) DO UPDATE SET
                repository_name = excluded.repository_name,
                project_name = excluded.project_name,
                organization_name = excluded.organization_name
            """,
            (repository_id, repository_name, project_name, organization_name),
        )

    # --- Users ---

    def upsert_user(
        self, user_id: str, display_name: str, email: str | None = None
    ) -> None:
        """Insert or update a user.

        Invariant 16: user_id is stable, display_name/email are mutable.

        Args:
            user_id: Stable ADO user ID.
            display_name: Current display name.
            email: Current email (optional).
        """
        self.db.execute(
            """
            INSERT INTO users (user_id, display_name, email)
            VALUES (?, ?, ?)
            ON CONFLICT(user_id) DO UPDATE SET
                display_name = excluded.display_name,
                email = excluded.email
            """,
            (user_id, display_name, email),
        )

    # --- Pull Requests ---

    def upsert_pull_request(
        self,
        pull_request_uid: str,
        pull_request_id: int,
        organization_name: str,
        project_name: str,
        repository_id: str,
        user_id: str,
        title: str,
        status: str,
        description: str | None,
        creation_date: str,
        closed_date: str | None,
        cycle_time_minutes: float | None,
        raw_json: dict[str, Any] | None = None,
    ) -> None:
        """Insert or update a pull request.

        Invariant 8: UPSERT semantics ensure idempotent updates.
        Invariant 14: pull_request_uid = {repository_id}-{pull_request_id}.

        Args:
            pull_request_uid: Unique identifier (repo_id-pr_id).
            pull_request_id: ADO PR ID.
            organization_name: Organization name.
            project_name: Project name.
            repository_id: Repository ID.
            user_id: Author user ID.
            title: PR title.
            status: PR status.
            description: PR description.
            creation_date: ISO 8601 creation date.
            closed_date: ISO 8601 closed date.
            cycle_time_minutes: Calculated cycle time.
            raw_json: Original ADO API response for auditing. Stored as a
                JSON string; a missing or empty mapping is stored as NULL.
        """
        self.db.execute(
            """
            INSERT INTO pull_requests (
                pull_request_uid, pull_request_id, organization_name, project_name,
                repository_id, user_id, title, status, description,
                creation_date, closed_date, cycle_time_minutes, raw_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(pull_request_uid) DO UPDATE SET
                pull_request_id = excluded.pull_request_id,
                organization_name = excluded.organization_name,
                project_name = excluded.project_name,
                repository_id = excluded.repository_id,
                user_id = excluded.user_id,
                title = excluded.title,
                status = excluded.status,
                description = excluded.description,
                creation_date = excluded.creation_date,
                closed_date = excluded.closed_date,
                cycle_time_minutes = excluded.cycle_time_minutes,
                raw_json = excluded.raw_json
            """,
            (
                pull_request_uid,
                pull_request_id,
                organization_name,
                project_name,
                repository_id,
                user_id,
                title,
                status,
                description,
                creation_date,
                closed_date,
                cycle_time_minutes,
                json.dumps(raw_json) if raw_json else None,
            ),
        )

    # --- Reviewers ---

    def upsert_reviewer(
        self,
        pull_request_uid: str,
        user_id: str,
        vote: int,
        repository_id: str,
    ) -> None:
        """Insert or update a reviewer.

        The (pull_request_uid, user_id) pair is the conflict key, so a
        repeated call updates the vote in place and keeps the row's
        autoincrement ``id``.

        Args:
            pull_request_uid: PR unique identifier.
            user_id: Reviewer user ID.
            vote: Vote value.
            repository_id: Repository ID.
        """
        self.db.execute(
            """
            INSERT INTO reviewers
            (pull_request_uid, user_id, vote, repository_id)
            VALUES (?, ?, ?, ?)
            ON CONFLICT(pull_request_uid, user_id) DO UPDATE SET
                vote = excluded.vote,
                repository_id = excluded.repository_id
            """,
            (pull_request_uid, user_id, vote, repository_id),
        )

    # --- Bulk Operations ---

    def upsert_pr_with_related(
        self,
        pr_data: dict[str, Any],
        organization_name: str,
        project_name: str,
    ) -> None:
        """Insert or update a PR and all related entities.

        This is the main entry point for processing a PR from the ADO API.
        Handles repository, user, reviewers, and the PR itself, in an order
        that creates parent rows before the rows that reference them.

        Args:
            pr_data: Raw PR data from ADO API.
            organization_name: Organization name.
            project_name: Project name.
        """
        from ..utils.datetime_utils import calculate_cycle_time_minutes

        # `or {}` / `or []` below also cover keys that are present but null,
        # which a plain `.get(key, default)` would pass through as None.
        # NOTE(review): missing IDs still fall back to "" (and PR id to 0),
        # which yields placeholder rows - confirm this best-effort is intended.

        # Extract repository
        repo = pr_data.get("repository") or {}
        repository_id = repo.get("id", "")
        repository_name = repo.get("name", "")

        self.upsert_repository(
            repository_id=repository_id,
            repository_name=repository_name,
            project_name=project_name,
            organization_name=organization_name,
        )

        # Extract author
        created_by = pr_data.get("createdBy") or {}
        user_id = created_by.get("id", "")
        display_name = created_by.get("displayName", "")
        email = created_by.get("uniqueName")

        self.upsert_user(
            user_id=user_id,
            display_name=display_name,
            email=email,
        )

        # Build PR UID (Invariant 14)
        pr_id = pr_data.get("pullRequestId", 0)
        pull_request_uid = f"{repository_id}-{pr_id}"

        # Calculate cycle time
        creation_date = pr_data.get("creationDate", "")
        closed_date = pr_data.get("closedDate")
        cycle_time = calculate_cycle_time_minutes(creation_date, closed_date)

        # Upsert PR
        self.upsert_pull_request(
            pull_request_uid=pull_request_uid,
            pull_request_id=pr_id,
            organization_name=organization_name,
            project_name=project_name,
            repository_id=repository_id,
            user_id=user_id,
            title=pr_data.get("title", ""),
            status=pr_data.get("status", ""),
            description=pr_data.get("description"),
            creation_date=creation_date,
            closed_date=closed_date,
            cycle_time_minutes=cycle_time,
            raw_json=pr_data,
        )

        # Upsert reviewers
        # NOTE(review): reviewers that were removed from the PR upstream are
        # not deleted here, so their rows persist - confirm whether that is
        # acceptable under Invariant 8 (convergence).
        for reviewer in pr_data.get("reviewers") or []:
            reviewer_id = reviewer.get("id", "")
            reviewer_name = reviewer.get("displayName", "")
            reviewer_email = reviewer.get("uniqueName")
            vote = reviewer.get("vote", 0)

            # The user row must exist before the reviewer row references it.
            self.upsert_user(
                user_id=reviewer_id,
                display_name=reviewer_name,
                email=reviewer_email,
            )

            self.upsert_reviewer(
                pull_request_uid=pull_request_uid,
                user_id=reviewer_id,
                vote=vote,
                repository_id=repository_id,
            )

        logger.debug("Upserted PR: %s", pull_request_uid)
@@ -0,0 +1 @@
1
+ """Transform module for CSV generation."""
@@ -0,0 +1,132 @@
1
+ """CSV generator for PowerBI-compatible output.
2
+
3
+ Generates CSVs that are:
4
+ - Schema-compliant (exact columns, exact order - Invariants 1-4)
5
+ - Deterministic (same DB → same bytes - Adjustment 3)
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING
13
+
14
+ import pandas as pd
15
+
16
+ from ..persistence.models import CSV_SCHEMAS, SORT_KEYS
17
+
18
+ if TYPE_CHECKING:
19
+ from ..persistence.database import DatabaseManager
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class CSVGenerationError(Exception):
    """Raised when a CSV file cannot be produced or fails schema validation."""
26
+
27
+
28
class CSVGenerator:
    """Writes one PowerBI-compatible CSV per contract table from SQLite.

    Invariant 1: CSV schema is a hard contract.
    Invariant 3: CSV output must be deterministic.
    """

    def __init__(self, db: DatabaseManager, output_dir: Path) -> None:
        """Store the collaborators; nothing is read or written here.

        Args:
            db: Database manager instance; its ``connection`` is queried.
            output_dir: Directory for CSV output files.
        """
        self.db = db
        self.output_dir = output_dir

    def generate_all(self) -> dict[str, int]:
        """Write a CSV for every table listed in ``CSV_SCHEMAS``.

        The output directory is created if necessary. Processing stops at
        the first table that fails.

        Returns:
            Dict mapping table names to row counts.

        Raises:
            CSVGenerationError: If generation fails.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        row_counts: dict[str, int] = {}
        for name, contract_columns in CSV_SCHEMAS.items():
            try:
                written = self._generate_table(name, contract_columns)
                row_counts[name] = written
                logger.info(f"Generated {name}.csv: {written} rows")
            except Exception as exc:
                # Wrap any failure so callers only need to handle one type.
                message = f"Failed to generate {name}.csv: {exc}"
                raise CSVGenerationError(message) from exc

        return row_counts

    def _generate_table(self, table_name: str, columns: list[str]) -> int:
        """Export one table to ``<output_dir>/<table_name>.csv``.

        Args:
            table_name: Name of the table/CSV.
            columns: Expected column order (contract).

        Returns:
            Number of rows written.
        """
        # Only the contract columns are selected from the table.
        selected = ", ".join(columns)
        query = f"SELECT {selected} FROM {table_name}"  # noqa: S608
        frame = pd.read_sql_query(query, self.db.connection)

        # Re-index so the column order is exactly the contract (Invariant 1),
        # then apply the deterministic row ordering (Adjustment 3).
        ordering = SORT_KEYS.get(table_name, columns[:1])
        frame = frame[columns].sort_values(by=ordering, ascending=True)

        # Fixed encoding, newline and datetime format keep the bytes stable.
        destination = self.output_dir / f"{table_name}.csv"
        frame.to_csv(
            destination,
            index=False,
            encoding="utf-8",
            lineterminator="\n",
            date_format="%Y-%m-%dT%H:%M:%S",
        )

        return len(frame)

    def validate_schemas(self) -> bool:
        """Check that each generated CSV has exactly the contract header.

        Returns:
            True if all schemas valid.

        Raises:
            CSVGenerationError: If a CSV is missing or its header differs
                from the contract.
        """
        for name, expected in CSV_SCHEMAS.items():
            path = self.output_dir / f"{name}.csv"
            if not path.exists():
                raise CSVGenerationError(f"Missing CSV: {path}")

            # nrows=0 parses the header row only.
            header = pd.read_csv(path, nrows=0)
            actual = list(header.columns)
            if actual == expected:
                continue

            raise CSVGenerationError(
                f"Schema mismatch in {name}.csv:\n"
                f" Expected: {expected}\n"
                f" Actual: {actual}"
            )

        logger.info("All CSV schemas validated successfully")
        return True
@@ -0,0 +1 @@
1
+ """Utilities module for shared helper functions."""