ado-git-repo-insights 1.2.1__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those released versions.
Files changed (28)
  1. ado_git_repo_insights/__init__.py +3 -3
  2. ado_git_repo_insights/cli.py +703 -354
  3. ado_git_repo_insights/config.py +186 -186
  4. ado_git_repo_insights/extractor/__init__.py +1 -1
  5. ado_git_repo_insights/extractor/ado_client.py +452 -246
  6. ado_git_repo_insights/extractor/pr_extractor.py +239 -239
  7. ado_git_repo_insights/ml/__init__.py +13 -0
  8. ado_git_repo_insights/ml/date_utils.py +70 -0
  9. ado_git_repo_insights/ml/forecaster.py +288 -0
  10. ado_git_repo_insights/ml/insights.py +497 -0
  11. ado_git_repo_insights/persistence/__init__.py +1 -1
  12. ado_git_repo_insights/persistence/database.py +193 -193
  13. ado_git_repo_insights/persistence/models.py +207 -145
  14. ado_git_repo_insights/persistence/repository.py +662 -376
  15. ado_git_repo_insights/transform/__init__.py +1 -1
  16. ado_git_repo_insights/transform/aggregators.py +950 -0
  17. ado_git_repo_insights/transform/csv_generator.py +132 -132
  18. ado_git_repo_insights/utils/__init__.py +1 -1
  19. ado_git_repo_insights/utils/datetime_utils.py +101 -101
  20. ado_git_repo_insights/utils/logging_config.py +172 -172
  21. ado_git_repo_insights/utils/run_summary.py +207 -206
  22. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/METADATA +56 -15
  23. ado_git_repo_insights-2.7.4.dist-info/RECORD +27 -0
  24. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/licenses/LICENSE +21 -21
  25. ado_git_repo_insights-1.2.1.dist-info/RECORD +0 -22
  26. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/WHEEL +0 -0
  27. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/entry_points.txt +0 -0
  28. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/top_level.txt +0 -0
ado_git_repo_insights/ml/insights.py (new file)
@@ -0,0 +1,497 @@
+"""OpenAI-based insights generator for Phase 5.
+
+Produces insights/summary.json with contract-compliant insights:
+- schema_version: 1
+- is_stub: false
+- generated_by: "openai-v1.0"
+- Categories: bottleneck, trend, anomaly
+- Severities: info, warning, critical
+- Single API call for up to 3 insights
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from ..persistence.database import DatabaseManager
+
+logger = logging.getLogger(__name__)
+
+# Schema version (locked)
+INSIGHTS_SCHEMA_VERSION = 1
+GENERATOR_ID = "openai-v1.0"
+
+# Cache invalidation control:
+# Bumping PROMPT_VERSION intentionally invalidates all cached insights.
+# This ensures users get fresh insights after prompt improvements or bug fixes.
+# Current: "phase5-v2" (bumped from v1 for deterministic cache key fix)
+PROMPT_VERSION = "phase5-v2"
+
+# Default model (can be overridden with OPENAI_MODEL env var)
+# PHASE5.md locked decision: gpt-5-nano
+DEFAULT_MODEL = "gpt-5-nano"
+
+
+class LLMInsightsGenerator:
+    """Generate OpenAI-based insights from PR metrics.
+
+    Single API call requesting JSON with up to 3 insights (one per category).
+    Supports dry-run mode and 24h caching.
+    """
+
+    def __init__(
+        self,
+        db: DatabaseManager,
+        output_dir: Path,
+        max_tokens: int = 1000,
+        cache_ttl_hours: int = 24,
+        dry_run: bool = False,
+    ) -> None:
+        """Initialize the insights generator.
+
+        Args:
+            db: Database manager with PR data.
+            output_dir: Directory for output files.
+            max_tokens: Maximum tokens for OpenAI response.
+            cache_ttl_hours: Cache TTL in hours.
+            dry_run: If True, write prompt artifact without calling API.
+        """
+        self.db = db
+        self.output_dir = output_dir
+        self.max_tokens = max_tokens
+        self.cache_ttl_hours = cache_ttl_hours
+        self.dry_run = dry_run
+        self.model = os.environ.get("OPENAI_MODEL", DEFAULT_MODEL)
+
+    def generate(self) -> bool:
+        """Generate insights and write to summary.json.
+
+        Returns:
+            True if file was written successfully, False otherwise.
+
+        Behavior:
+            - Dry-run: writes prompt.json, does NOT write summary.json, returns False
+            - Cache hit: writes summary.json from cache, skips API call
+            - API failure: warns, does NOT write file, returns False
+        """
+        start_time = time.perf_counter()
+
+        insights_dir = self.output_dir / "insights"
+        insights_dir.mkdir(parents=True, exist_ok=True)
+
+        # Build prompt (returns prompt string and canonical data for cache key)
+        prompt, prompt_data = self._build_prompt()
+
+        if self.dry_run:
+            # Dry-run: write prompt artifact and exit
+            # NO API call, NO client creation
+            prompt_artifact = {
+                "model": self.model,
+                "max_tokens": self.max_tokens,
+                "prompt": prompt,
+                "generated_at": datetime.now(timezone.utc).isoformat(),
+            }
+            prompt_path = insights_dir / "prompt.json"
+            with prompt_path.open("w", encoding="utf-8") as f:
+                json.dump(prompt_artifact, f, indent=2)
+            logger.info(
+                f"DRY RUN: Wrote prompt artifact to {prompt_path}. "
+                "No API call made, no costs incurred."
+            )
+            return False  # Don't write summary.json in dry-run
+
+        # Check cache
+        cache_path = insights_dir / "cache.json"
+        cache_key = self._get_cache_key(prompt_data)
+
+        cached_insights = self._check_cache(cache_path, cache_key)
+        if cached_insights:
+            # Cache hit - write summary.json from cache
+            summary_path = insights_dir / "summary.json"
+            with summary_path.open("w", encoding="utf-8") as f:
+                json.dump(cached_insights, f, indent=2, sort_keys=True)
+            logger.info("Cache hit - wrote insights from cache")
+            return True
+
+        # Call OpenAI API
+        try:
+            insights_data = self._call_openai(prompt)
+        except Exception as e:
+            logger.warning(f"OpenAI API call failed: {type(e).__name__}: {e}")
+            return False
+
+        if not insights_data:
+            logger.warning("OpenAI returned no insights")
+            return False
+
+        # Write summary.json
+        summary_path = insights_dir / "summary.json"
+        with summary_path.open("w", encoding="utf-8") as f:
+            json.dump(insights_data, f, indent=2, sort_keys=True)
+
+        # Update cache
+        self._write_cache(cache_path, cache_key, insights_data)
+
+        elapsed = time.perf_counter() - start_time
+        logger.info(
+            f"OpenAI insights generation completed in {elapsed:.2f}s "
+            f"({len(insights_data.get('insights', []))} insights)"
+        )
+        return True
+
+    def _build_prompt(self) -> tuple[str, dict[str, Any]]:
+        """Build the prompt for OpenAI.
+
+        Returns:
+            Tuple of (prompt_string, canonical_data_dict)
+            The canonical_data_dict is used for deterministic cache key generation.
+        """
+        # Get aggregate stats from database
+        stats = self._get_pr_stats()
+
+        # Canonical data for cache key (sorted, normalized)
+        canonical_data = {
+            "prompt_version": PROMPT_VERSION,
+            "stats": stats,
+        }
+
+        prompt = f"""You are a DevOps metrics analyst. Analyze the following pull request metrics and provide up to 3 actionable insights.
+
+**Metrics Summary:**
+- Total PRs: {stats["total_prs"]}
+- Date range: {stats["date_range_start"]} to {stats["date_range_end"]}
+- Average cycle time: {stats["avg_cycle_time_minutes"]} minutes
+- P90 cycle time: {stats["p90_cycle_time_minutes"]} minutes
+- Authors: {stats["authors_count"]}
+- Repositories: {stats["repositories_count"]}
+
+**Instructions:**
+- Provide up to 3 insights, one per category: "bottleneck", "trend", "anomaly"
+- For each insight, identify severity: "info", "warning", or "critical"
+- Focus on actual patterns, NOT recommendations
+- Use descriptive language only - no action items
+
+**Required JSON format:**
+{{
+  "insights": [
+    {{
+      "id": "unique-id",
+      "category": "bottleneck | trend | anomaly",
+      "severity": "info | warning | critical",
+      "title": "Short summary",
+      "description": "Detailed description of the pattern observed",
+      "affected_entities": ["entity:name", ...]
+    }}
+  ]
+}}
+
+Respond ONLY with valid JSON matching this format."""
+
+        return prompt, canonical_data
+
+    def _get_pr_stats(self) -> dict[str, Any]:
+        """Get PR statistics from database for prompt.
+
+        Returns:
+            Dict with aggregate statistics.
+        """
+        # Total PRs
+        cursor = self.db.execute(
+            "SELECT COUNT(*) as cnt FROM pull_requests WHERE status = 'completed'"
+        )
+        total_prs = cursor.fetchone()["cnt"]
+
+        # Date range
+        cursor = self.db.execute(
+            """
+            SELECT MIN(closed_date) as min_date, MAX(closed_date) as max_date
+            FROM pull_requests
+            WHERE closed_date IS NOT NULL
+            """
+        )
+        row = cursor.fetchone()
+        date_range_start = row["min_date"][:10] if row["min_date"] else "N/A"
+        date_range_end = row["max_date"][:10] if row["max_date"] else "N/A"
+
+        # Cycle time stats
+        cursor = self.db.execute(
+            """
+            SELECT
+                AVG(cycle_time_minutes) as avg_cycle,
+                MAX(cycle_time_minutes) as max_cycle
+            FROM pull_requests
+            WHERE cycle_time_minutes IS NOT NULL
+            """
+        )
+        row = cursor.fetchone()
+        avg_cycle_time = round(row["avg_cycle"], 1) if row["avg_cycle"] else 0
+
+        # P90 approximation (use 90% of max as rough estimate)
+        p90_cycle_time = round(row["max_cycle"] * 0.9, 1) if row["max_cycle"] else 0
+
+        # Authors
+        cursor = self.db.execute(
+            "SELECT COUNT(DISTINCT user_id) as cnt FROM pull_requests"
+        )
+        authors_count = cursor.fetchone()["cnt"]
+
+        # Repositories
+        cursor = self.db.execute("SELECT COUNT(*) as cnt FROM repositories")
+        repositories_count = cursor.fetchone()["cnt"]
+
+        return {
+            "total_prs": total_prs,
+            "date_range_start": date_range_start,
+            "date_range_end": date_range_end,
+            "avg_cycle_time_minutes": avg_cycle_time,
+            "p90_cycle_time_minutes": p90_cycle_time,
+            "authors_count": authors_count,
+            "repositories_count": repositories_count,
+        }
+
+    def _get_cache_key(self, prompt_data: dict[str, Any]) -> str:
+        """Generate deterministic cache key using canonical JSON.
+
+        Args:
+            prompt_data: Canonical data dict (not prompt string)
+
+        Returns:
+            SHA256 hash of cache key inputs.
+        """
+        # Deterministic DB freshness markers:
+        # 1. Max closed_date from PRs
+        # 2. Max updated_at (if available) to catch backfill/metadata changes
+        # Note: Use deterministic fallback for empty datasets
+        cursor = self.db.execute(
+            """
+            SELECT
+                MAX(closed_date) as max_closed,
+                MAX(COALESCE(updated_at, closed_date)) as max_updated
+            FROM pull_requests
+            """
+        )
+        row = cursor.fetchone()
+        max_closed = row["max_closed"] if row and row["max_closed"] else "empty-dataset"
+        max_updated = (
+            row["max_updated"] if row and row["max_updated"] else "empty-dataset"
+        )
+
+        # Use canonical JSON with sorted keys for deterministic hashing
+        # This prevents cache misses from non-deterministic ordering or whitespace
+        canonical_json = json.dumps(prompt_data, sort_keys=True, ensure_ascii=True)
+        prompt_hash = hashlib.sha256(canonical_json.encode()).hexdigest()[:16]
+
+        # Cache key components
+        key_parts = [
+            PROMPT_VERSION,
+            self.model,
+            max_closed,
+            max_updated,
+            prompt_hash,
+        ]
+        key_string = "|".join(str(p) for p in key_parts)
+        return hashlib.sha256(key_string.encode()).hexdigest()
+
+    def _check_cache(self, cache_path: Path, cache_key: str) -> dict[str, Any] | None:
+        """Check if valid cache exists.
+
+        Args:
+            cache_path: Path to cache file.
+            cache_key: Expected cache key.
+
+        Returns:
+            Cached insights data if valid, None otherwise.
+        """
+        if not cache_path.exists():
+            return None
+
+        try:
+            with cache_path.open("r", encoding="utf-8") as f:
+                cache_data = json.load(f)
+
+            # Validate cache key
+            if cache_data.get("cache_key") != cache_key:
+                logger.debug("Cache miss: key mismatch")
+                return None
+
+            # Validate TTL
+            cached_at = datetime.fromisoformat(cache_data["cached_at"])
+            age_hours = (datetime.now(timezone.utc) - cached_at).total_seconds() / 3600
+            if age_hours > self.cache_ttl_hours:
+                logger.debug(
+                    f"Cache expired: {age_hours:.1f}h > {self.cache_ttl_hours}h"
+                )
+                return None
+
+            logger.info(f"Cache hit: age {age_hours:.1f}h")
+            return cache_data.get("insights_data")
+
+        except Exception as e:
+            logger.debug(f"Cache read failed: {e}")
+            return None
+
+    def _write_cache(
+        self, cache_path: Path, cache_key: str, insights_data: dict[str, Any]
+    ) -> None:
+        """Write insights to cache.
+
+        Args:
+            cache_path: Path to cache file.
+            cache_key: Cache key.
+            insights_data: Insights data to cache.
+        """
+        cache_data = {
+            "cache_key": cache_key,
+            "cached_at": datetime.now(timezone.utc).isoformat(),
+            "insights_data": insights_data,
+        }
+        with cache_path.open("w", encoding="utf-8") as f:
+            json.dump(cache_data, f, indent=2)
+
+    def _call_openai(self, prompt: str) -> dict[str, Any] | None:
+        """Call OpenAI API and parse response.
+
+        Args:
+            prompt: The prompt string.
+
+        Returns:
+            Insights data dict or None if failed.
+        """
+        import openai
+
+        api_key = os.environ.get("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError("OPENAI_API_KEY not set")
+
+        # OpenAI SDK v1.0+ client-based API
+        client = openai.OpenAI(api_key=api_key)
+
+        try:
+            response = client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a DevOps metrics analyst. Respond only with valid JSON.",
+                    },
+                    {"role": "user", "content": prompt},
+                ],
+                max_tokens=self.max_tokens,
+                temperature=0.7,
+            )
+
+            # Extract response text
+            if not response.choices:
+                logger.warning("OpenAI returned no choices")
+                return None
+
+            content = response.choices[0].message.content
+            if not content:
+                logger.warning("OpenAI returned empty content")
+                return None
+
+            # Parse JSON
+            try:
+                insights_json = json.loads(content)
+            except json.JSONDecodeError as e:
+                logger.warning(f"Failed to parse OpenAI response as JSON: {e}")
+                return None
+
+            # Get DB freshness markers for deterministic ID generation
+            # Handle empty datasets (None values) with deterministic fallback
+            cursor = self.db.execute(
+                """
+                SELECT
+                    MAX(closed_date) as max_closed,
+                    MAX(COALESCE(updated_at, closed_date)) as max_updated
+                FROM pull_requests
+                """
+            )
+            row = cursor.fetchone()
+            # Deterministic fallback for empty datasets
+            max_closed = (
+                row["max_closed"] if row and row["max_closed"] else "empty-dataset"
+            )
+            max_updated = (
+                row["max_updated"] if row and row["max_updated"] else "empty-dataset"
+            )
+
+            # Validate and enforce contract with deterministic IDs
+            return self._validate_and_fix_insights(
+                insights_json, max_closed, max_updated
+            )
+
+        except Exception as e:
+            logger.warning(f"OpenAI API error: {type(e).__name__}: {e}")
+            return None
+
+    def _validate_and_fix_insights(
+        self, insights_json: dict[str, Any], max_closed: str, max_updated: str
+    ) -> dict[str, Any] | None:
+        """Validate and fix insights to match contract.
+
+        Generates deterministic IDs to ensure cache stability and prevent UI flicker.
+
+        Args:
+            insights_json: Raw JSON from OpenAI.
+            max_closed: Max closed_date from database (for ID generation).
+            max_updated: Max updated_at from database (for ID generation).
+
+        Returns:
+            Contract-compliant insights or None if invalid.
+        """
+        if "insights" not in insights_json:
+            logger.warning("Missing 'insights' array in response")
+            return None
+
+        insights_list = insights_json["insights"]
+        if not isinstance(insights_list, list):
+            logger.warning("'insights' is not an array")
+            return None
+
+        # Fix each insight
+        fixed_insights = []
+        for idx, insight in enumerate(insights_list):
+            if not isinstance(insight, dict):
+                continue
+
+            # Enforce required fields
+            if "affected_entities" not in insight:
+                insight["affected_entities"] = []  # Enforce empty array if missing
+
+            # Validate category (needed for deterministic ID)
+            category = insight.get("category", "unknown")
+            if not isinstance(category, str):
+                logger.warning(f"Insight missing valid category: {insight}")
+                continue
+
+            # Generate deterministic ID based on category + dataset + prompt version
+            # This ensures the same data produces the same IDs across cache hits
+            id_input = f"{category}|{max_closed}|{max_updated}|{PROMPT_VERSION}|{idx}"
+            deterministic_id = hashlib.sha256(id_input.encode()).hexdigest()[:12]
+            insight["id"] = f"{category}-{deterministic_id}"
+
+            # Validate other required fields exist
+            required = ["severity", "title", "description"]
+            if not all(field in insight for field in required):
+                logger.warning(f"Insight missing required fields: {insight}")
+                continue
+
+            fixed_insights.append(insight)
+
+        # Build contract-compliant output
+        return {
+            "schema_version": INSIGHTS_SCHEMA_VERSION,
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "is_stub": False,
+            "generated_by": GENERATOR_ID,
+            "insights": fixed_insights,
+        }
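
For orientation, a minimal driver for the new generator is sketched below. This is not taken from the package documentation: the import paths follow the file list above and the LLMInsightsGenerator arguments follow the diff, but the DatabaseManager constructor argument is an assumption, since persistence/database.py is not shown in this diff.

import os
from pathlib import Path

from ado_git_repo_insights.ml.insights import LLMInsightsGenerator
from ado_git_repo_insights.persistence.database import DatabaseManager

# Assumption: DatabaseManager accepts a path to the SQLite database produced by
# the extract phase; check persistence/database.py for the real signature.
db = DatabaseManager(Path("pr_insights.db"))
output_dir = Path("artifacts")

# Dry run: writes artifacts/insights/prompt.json, makes no API call,
# and returns False by design.
LLMInsightsGenerator(db, output_dir, dry_run=True).generate()

# Real run: requires OPENAI_API_KEY; OPENAI_MODEL overrides the default
# "gpt-5-nano". Results are cached in artifacts/insights/cache.json for 24h,
# and summary.json is written on success.
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not set"
wrote = LLMInsightsGenerator(db, output_dir, max_tokens=1000).generate()
print("summary.json written:", wrote)

Because the cache key folds in PROMPT_VERSION, the model name, the dataset freshness markers, and a hash of the canonical prompt data, rerunning on unchanged data inside the TTL reuses the cached insights instead of calling the API.
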
ado_git_repo_insights/persistence/__init__.py
@@ -1 +1 @@
-"""Persistence module for SQLite storage operations."""
+"""Persistence module for SQLite storage operations."""