ado-git-repo-insights 1.2.1__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (28)
  1. ado_git_repo_insights/__init__.py +3 -3
  2. ado_git_repo_insights/cli.py +703 -354
  3. ado_git_repo_insights/config.py +186 -186
  4. ado_git_repo_insights/extractor/__init__.py +1 -1
  5. ado_git_repo_insights/extractor/ado_client.py +452 -246
  6. ado_git_repo_insights/extractor/pr_extractor.py +239 -239
  7. ado_git_repo_insights/ml/__init__.py +13 -0
  8. ado_git_repo_insights/ml/date_utils.py +70 -0
  9. ado_git_repo_insights/ml/forecaster.py +288 -0
  10. ado_git_repo_insights/ml/insights.py +497 -0
  11. ado_git_repo_insights/persistence/__init__.py +1 -1
  12. ado_git_repo_insights/persistence/database.py +193 -193
  13. ado_git_repo_insights/persistence/models.py +207 -145
  14. ado_git_repo_insights/persistence/repository.py +662 -376
  15. ado_git_repo_insights/transform/__init__.py +1 -1
  16. ado_git_repo_insights/transform/aggregators.py +950 -0
  17. ado_git_repo_insights/transform/csv_generator.py +132 -132
  18. ado_git_repo_insights/utils/__init__.py +1 -1
  19. ado_git_repo_insights/utils/datetime_utils.py +101 -101
  20. ado_git_repo_insights/utils/logging_config.py +172 -172
  21. ado_git_repo_insights/utils/run_summary.py +207 -206
  22. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/METADATA +56 -15
  23. ado_git_repo_insights-2.7.4.dist-info/RECORD +27 -0
  24. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/licenses/LICENSE +21 -21
  25. ado_git_repo_insights-1.2.1.dist-info/RECORD +0 -22
  26. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/WHEEL +0 -0
  27. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/entry_points.txt +0 -0
  28. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/top_level.txt +0 -0
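
The bulk of this release is the new ado_git_repo_insights/transform/aggregators.py module (+950 lines), whose full diff follows. For orientation, here is a minimal driver sketch based on the AggregateGenerator constructor and generate_all() shown in that diff; the DatabaseManager setup and the concrete argument values are assumptions, not taken from the package:

from pathlib import Path

from ado_git_repo_insights.transform.aggregators import AggregateGenerator


def build_dataset(db, dataset_root: Path) -> None:
    """Sketch only: `db` is an already-open DatabaseManager from
    ado_git_repo_insights.persistence.database; its construction is not
    shown in this diff, so it is left abstract here."""
    generator = AggregateGenerator(
        db=db,
        output_dir=dataset_root,
        run_id="20240101120000",   # defaults to a UTC timestamp when omitted
        enable_predictions=False,  # Phase 5 Prophet forecasts (needs the '.[ml]' extras)
        enable_insights=False,     # Phase 5 OpenAI insights (needs the '.[ml]' extras)
    )
    manifest = generator.generate_all()
    print("weekly rollups:", len(manifest.aggregate_index.weekly_rollups))
    print("feature gates:", manifest.features)
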
@@ -0,0 +1,950 @@
1
+ """Chunked aggregate generator for Phase 3 UI.
2
+
3
+ Generates JSON aggregates from SQLite for scale-safe UI rendering:
4
+ - weekly_rollups/YYYY-Www.json - Weekly PR metrics
5
+ - distributions/YYYY.json - Yearly distribution data
6
+ - dimensions.json - Filter dimensions (repos, users, teams)
7
+ - dataset-manifest.json - Discovery metadata with schema versions
8
+ - predictions/trends.json - Trend forecasts (Phase 3.5)
9
+ - insights/summary.json - AI insights (Phase 3.5)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ import json
16
+ import logging
17
+ import os
18
+ import random
19
+ from dataclasses import asdict, dataclass, field
20
+ from datetime import date, datetime, timedelta, timezone
21
+ from pathlib import Path
22
+ from typing import TYPE_CHECKING, Any
23
+
24
+ import pandas as pd
25
+
26
+ if TYPE_CHECKING:
27
+ from ..persistence.database import DatabaseManager
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # Schema versions (Phase 3 locked)
32
+ MANIFEST_SCHEMA_VERSION = 1
33
+ DATASET_SCHEMA_VERSION = 1
34
+ AGGREGATES_SCHEMA_VERSION = 1
35
+
36
+ # Phase 3.5 schema versions
37
+ PREDICTIONS_SCHEMA_VERSION = 1
38
+ INSIGHTS_SCHEMA_VERSION = 1
39
+
40
+ # Stub generator identifier
41
+ STUB_GENERATOR_ID = "phase3.5-stub-v1"
42
+
43
+
44
+ class AggregationError(Exception):
45
+ """Aggregation failed."""
46
+
47
+
48
+ @dataclass
49
+ class WeeklyRollup:
50
+ """Weekly PR metrics rollup."""
51
+
52
+ week: str # ISO week: YYYY-Www
53
+ start_date: str # ISO date
54
+ end_date: str # ISO date
55
+ pr_count: int = 0
56
+ cycle_time_p50: float | None = None
57
+ cycle_time_p90: float | None = None
58
+ authors_count: int = 0
59
+ reviewers_count: int = 0
60
+
61
+
62
+ @dataclass
63
+ class YearlyDistribution:
64
+ """Yearly distribution metrics."""
65
+
66
+ year: str # YYYY
67
+ start_date: str
68
+ end_date: str
69
+ total_prs: int = 0
70
+ cycle_time_buckets: dict[str, int] = field(default_factory=dict)
71
+ prs_by_month: dict[str, int] = field(default_factory=dict)
72
+
73
+
74
+ @dataclass
75
+ class Dimensions:
76
+ """Filter dimensions for UI."""
77
+
78
+ repositories: list[dict[str, Any]] = field(default_factory=list)
79
+ users: list[dict[str, Any]] = field(default_factory=list)
80
+ projects: list[dict[str, Any]] = field(default_factory=list)
81
+ teams: list[dict[str, Any]] = field(default_factory=list) # Phase 3.3
82
+ date_range: dict[str, str] = field(default_factory=dict)
83
+
84
+
85
+ @dataclass
86
+ class AggregateIndex:
87
+ """Index of available aggregate files."""
88
+
89
+ weekly_rollups: list[dict[str, Any]] = field(default_factory=list)
90
+ distributions: list[dict[str, Any]] = field(default_factory=list)
91
+
92
+
93
+ @dataclass
94
+ class DatasetManifest:
95
+ """Dataset discovery manifest."""
96
+
97
+ manifest_schema_version: int = MANIFEST_SCHEMA_VERSION
98
+ dataset_schema_version: int = DATASET_SCHEMA_VERSION
99
+ aggregates_schema_version: int = AGGREGATES_SCHEMA_VERSION
100
+ predictions_schema_version: int = PREDICTIONS_SCHEMA_VERSION # Phase 3.5
101
+ insights_schema_version: int = INSIGHTS_SCHEMA_VERSION # Phase 3.5
102
+ generated_at: str = ""
103
+ run_id: str = ""
104
+ warnings: list[str] = field(default_factory=list) # Phase 3.5: stub warnings
105
+ aggregate_index: AggregateIndex = field(default_factory=AggregateIndex)
106
+ defaults: dict[str, Any] = field(default_factory=dict)
107
+ limits: dict[str, Any] = field(default_factory=dict)
108
+ features: dict[str, bool] = field(default_factory=dict)
109
+ coverage: dict[str, Any] = field(default_factory=dict)
110
+
111
+
112
+ class AggregateGenerator:
113
+ """Generate chunked JSON aggregates from SQLite.
114
+
115
+ Phase 3: Produces weekly rollups and distributions for lazy UI loading.
116
+ Phase 3.5: Optionally generates predictions/insights stubs.
117
+ Phase 5: Integrates Prophet forecaster and OpenAI insights.
118
+ """
119
+
120
+ def __init__(
121
+ self,
122
+ db: DatabaseManager,
123
+ output_dir: Path,
124
+ run_id: str = "",
125
+ enable_ml_stubs: bool = False,
126
+ seed_base: str = "",
127
+ # Phase 5: ML parameters
128
+ enable_predictions: bool = False,
129
+ enable_insights: bool = False,
130
+ insights_max_tokens: int = 1000,
131
+ insights_cache_ttl_hours: int = 24,
132
+ insights_dry_run: bool = False,
133
+ stub_mode: bool = False,
134
+ ) -> None:
135
+ """Initialize the aggregate generator.
136
+
137
+ Args:
138
+ db: Database manager instance.
139
+ output_dir: Directory for aggregate output.
140
+ run_id: Pipeline run ID for manifest.
141
+ enable_ml_stubs: Whether to generate stub predictions/insights (Phase 3.5).
142
+ seed_base: Base string for deterministic stub seeding.
143
+ enable_predictions: Enable Prophet-based forecasting (Phase 5).
144
+ enable_insights: Enable OpenAI-based insights (Phase 5).
145
+ insights_max_tokens: Max tokens for OpenAI response.
146
+ insights_cache_ttl_hours: Cache TTL for insights.
147
+ insights_dry_run: Write prompt artifact without calling API.
148
+ stub_mode: Use deprecated stubs instead of real ML.
149
+ """
150
+ self.db = db
151
+ self.output_dir = output_dir
152
+ self.run_id = run_id or datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
153
+ self.enable_ml_stubs = enable_ml_stubs
154
+ self.seed_base = seed_base or self.run_id
155
+ # Phase 5
156
+ self.enable_predictions = enable_predictions
157
+ self.enable_insights = enable_insights
158
+ self.insights_max_tokens = insights_max_tokens
159
+ self.insights_cache_ttl_hours = insights_cache_ttl_hours
160
+ self.insights_dry_run = insights_dry_run
161
+ self.stub_mode = stub_mode
162
+
163
+ def generate_all(self) -> DatasetManifest:
164
+ """Generate all aggregate files and manifest.
165
+
166
+ Returns:
167
+ DatasetManifest with generated file index.
168
+
169
+ Raises:
170
+ AggregationError: If generation fails.
171
+ StubGenerationError: If stubs requested without ALLOW_ML_STUBS env var.
172
+ """
173
+ import warnings as py_warnings
174
+
175
+ # Create output directories
176
+ self.output_dir.mkdir(parents=True, exist_ok=True)
177
+ (self.output_dir / "aggregates").mkdir(exist_ok=True)
178
+ (self.output_dir / "aggregates" / "weekly_rollups").mkdir(exist_ok=True)
179
+ (self.output_dir / "aggregates" / "distributions").mkdir(exist_ok=True)
180
+
181
+ try:
182
+ # Generate dimensions
183
+ dimensions = self._generate_dimensions()
184
+ self._write_json(
185
+ self.output_dir / "aggregates" / "dimensions.json",
186
+ asdict(dimensions),
187
+ )
188
+ logger.info("Generated dimensions.json")
189
+
190
+ # Generate weekly rollups
191
+ weekly_index = self._generate_weekly_rollups()
192
+ logger.info(f"Generated {len(weekly_index)} weekly rollup files")
193
+
194
+ # Generate yearly distributions
195
+ dist_index = self._generate_distributions()
196
+ logger.info(f"Generated {len(dist_index)} distribution files")
197
+
198
+ # Phase 5: ML features generation
199
+ predictions_generated = False
200
+ insights_generated = False
201
+ warnings: list[str] = []
202
+
203
+ # Stub mode (deprecated, for testing only)
204
+ if self.stub_mode:
205
+ py_warnings.warn(
206
+ "Stub mode is deprecated. Use --enable-predictions and "
207
+ "--enable-insights for real ML features.",
208
+ DeprecationWarning,
209
+ stacklevel=2,
210
+ )
211
+ # Use legacy stubs
212
+ pred_gen = PredictionGenerator(self.output_dir, self.seed_base)
213
+ pred_gen.generate()
214
+ predictions_generated = True
215
+
216
+ insights_gen = InsightsGenerator(self.output_dir, self.seed_base)
217
+ insights_gen.generate()
218
+ insights_generated = True
219
+
220
+ warnings.append("STUB DATA - NOT PRODUCTION")
221
+ logger.warning(
222
+ "Generated stub predictions/insights - NOT FOR PRODUCTION"
223
+ )
224
+
225
+ # Legacy enable_ml_stubs (LOUD WARNING - maps to stub mode)
226
+ elif self.enable_ml_stubs:
227
+ # Hard warning to prevent accidental stub usage in production
228
+ logger.warning("=" * 80)
229
+ logger.warning(
230
+ "WARNING: --enable-ml-stubs is DEPRECATED and generates "
231
+ "STUB DATA with is_stub:true"
232
+ )
233
+ logger.warning(
234
+ "Use --enable-predictions and --enable-insights for real ML features."
235
+ )
236
+ logger.warning(
237
+ "To explicitly use stubs for testing, use --stub-mode instead."
238
+ )
239
+ logger.warning("=" * 80)
240
+
241
+ pred_gen = PredictionGenerator(self.output_dir, self.seed_base)
242
+ pred_gen.generate()
243
+ predictions_generated = True
244
+
245
+ insights_gen = InsightsGenerator(self.output_dir, self.seed_base)
246
+ insights_gen.generate()
247
+ insights_generated = True
248
+
249
+ warnings.append("STUB DATA - NOT PRODUCTION - DEPRECATED FLAG USED")
250
+ logger.warning(
251
+ "Generated stub predictions/insights - NOT FOR PRODUCTION"
252
+ )
253
+
254
+ else:
255
+ # Phase 5: Real ML features
256
+ if self.enable_predictions:
257
+ predictions_generated = self._generate_predictions()
258
+
259
+ if self.enable_insights:
260
+ insights_generated = self._generate_insights()
261
+
262
+ # Check if files exist from previous runs
263
+ if not predictions_generated:
264
+ predictions_generated = (
265
+ self.output_dir / "predictions" / "trends.json"
266
+ ).exists()
267
+ if not insights_generated:
268
+ insights_generated = (
269
+ self.output_dir / "insights" / "summary.json"
270
+ ).exists()
271
+
272
+ # Build manifest
273
+ manifest = DatasetManifest(
274
+ generated_at=datetime.now(timezone.utc).isoformat(),
275
+ run_id=self.run_id,
276
+ warnings=warnings,
277
+ aggregate_index=AggregateIndex(
278
+ weekly_rollups=weekly_index,
279
+ distributions=dist_index,
280
+ ),
281
+ defaults={"default_date_range_days": 90},
282
+ limits={"max_date_range_days_soft": 730},
283
+ features={
284
+ "teams": len(dimensions.teams) > 0, # Phase 3.3: dynamic
285
+ "comments": self._has_comments(), # Phase 3.4: dynamic
286
+ "predictions": predictions_generated, # Phase 3.5/5: file-gated
287
+ "ai_insights": insights_generated, # Phase 3.5/5: file-gated
288
+ },
289
+ coverage={
290
+ "total_prs": self._get_pr_count(),
291
+ "date_range": dimensions.date_range,
292
+ "teams_count": len(dimensions.teams), # Phase 3.3
293
+ "comments": self._get_comments_coverage(), # Phase 3.4
294
+ # Phase 4 §5: Operational visibility
295
+ "row_counts": self._get_row_counts(),
296
+ },
297
+ )
298
+
299
+ # Phase 4 §5: attach the operational summary (artifact sizes, file counts),
300
+ # computed from the aggregate indexes, before the manifest is written
301
+ manifest_dict = asdict(manifest)
302
+ manifest_dict["operational"] = self._get_operational_summary(
303
+ weekly_index, dist_index
304
+ )
305
+
306
+ # Write manifest
307
+ self._write_json(
308
+ self.output_dir / "dataset-manifest.json",
309
+ manifest_dict,
310
+ )
311
+ logger.info("Generated dataset-manifest.json")
312
+
313
+ return manifest
314
+
315
+ except Exception as e:
316
+ raise AggregationError(f"Failed to generate aggregates: {e}") from e
317
+
318
+ def _generate_predictions(self) -> bool:
319
+ """Generate Prophet-based predictions (Phase 5).
320
+
321
+ Returns:
322
+ True if predictions file was successfully written, False otherwise.
323
+ """
324
+ try:
325
+ from ..ml.forecaster import ProphetForecaster
326
+ except ImportError:
327
+ logger.warning(
328
+ "Prophet not installed. Install ML extras: pip install -e '.[ml]'"
329
+ )
330
+ return False
331
+
332
+ try:
333
+ forecaster = ProphetForecaster(
334
+ db=self.db,
335
+ output_dir=self.output_dir,
336
+ )
337
+ return forecaster.generate()
338
+ except Exception as e:
339
+ logger.warning(f"Prediction generation failed: {type(e).__name__}: {e}")
340
+ return False
341
+
342
+ def _generate_insights(self) -> bool:
343
+ """Generate OpenAI-based insights (Phase 5).
344
+
345
+ Returns:
346
+ True if insights file was written, False otherwise.
347
+ """
348
+ try:
349
+ from ..ml.insights import LLMInsightsGenerator
350
+ except ImportError:
351
+ # This should not happen as CLI validates openai is installed
352
+ logger.error(
353
+ "OpenAI SDK not installed. Install ML extras: pip install -e '.[ml]'"
354
+ )
355
+ raise AggregationError(
356
+ "OpenAI SDK required for --enable-insights"
357
+ ) from None
358
+
359
+ try:
360
+ insights_gen = LLMInsightsGenerator(
361
+ db=self.db,
362
+ output_dir=self.output_dir,
363
+ max_tokens=self.insights_max_tokens,
364
+ cache_ttl_hours=self.insights_cache_ttl_hours,
365
+ dry_run=self.insights_dry_run,
366
+ )
367
+ return insights_gen.generate()
368
+ except Exception as e:
369
+ logger.warning(f"Insights generation failed: {type(e).__name__}: {e}")
370
+ return False
371
+
372
+ def _generate_dimensions(self) -> Dimensions:
373
+ """Generate filter dimensions from SQLite."""
374
+ # Repositories
375
+ repos_df = pd.read_sql_query(
376
+ """
377
+ SELECT repository_id, repository_name, project_name, organization_name
378
+ FROM repositories
379
+ ORDER BY organization_name, project_name, repository_name
380
+ """,
381
+ self.db.connection,
382
+ )
383
+
384
+ # Users (authors only, not all users)
385
+ users_df = pd.read_sql_query(
386
+ """
387
+ SELECT DISTINCT u.user_id, u.display_name
388
+ FROM users u
389
+ INNER JOIN pull_requests pr ON pr.user_id = u.user_id
390
+ ORDER BY u.display_name
391
+ """,
392
+ self.db.connection,
393
+ )
394
+
395
+ # Projects
396
+ projects_df = pd.read_sql_query(
397
+ """
398
+ SELECT organization_name, project_name
399
+ FROM projects
400
+ ORDER BY organization_name, project_name
401
+ """,
402
+ self.db.connection,
403
+ )
404
+
405
+ # Date range
406
+ date_range_df = pd.read_sql_query(
407
+ """
408
+ SELECT MIN(closed_date) as min_date, MAX(closed_date) as max_date
409
+ FROM pull_requests
410
+ WHERE closed_date IS NOT NULL
411
+ """,
412
+ self.db.connection,
413
+ )
414
+
415
+ date_range = {}
416
+ if not date_range_df.empty and date_range_df.iloc[0]["min_date"]:
417
+ date_range = {
418
+ "min": date_range_df.iloc[0]["min_date"][:10], # YYYY-MM-DD
419
+ "max": date_range_df.iloc[0]["max_date"][:10],
420
+ }
421
+
422
+ # Phase 3.3: Teams (defensive for legacy DBs without teams table)
423
+ try:
424
+ teams_df = pd.read_sql_query(
425
+ """
426
+ SELECT t.team_id, t.team_name, t.project_name, t.organization_name,
427
+ COUNT(tm.user_id) as member_count
428
+ FROM teams t
429
+ LEFT JOIN team_members tm ON t.team_id = tm.team_id
430
+ GROUP BY t.team_id, t.team_name, t.project_name, t.organization_name
431
+ ORDER BY t.organization_name, t.project_name, t.team_name
432
+ """,
433
+ self.db.connection,
434
+ )
435
+ except Exception as e:
436
+ # P1 fix: Legacy databases may not have teams table
437
+ logger.debug(f"Teams table not available (legacy DB?): {e}")
438
+ teams_df = pd.DataFrame()
439
+
440
+ return Dimensions(
441
+ repositories=list(repos_df.to_dict(orient="records")), # type: ignore[arg-type]
442
+ users=list(users_df.to_dict(orient="records")), # type: ignore[arg-type]
443
+ projects=list(projects_df.to_dict(orient="records")), # type: ignore[arg-type]
444
+ teams=(
445
+ list(teams_df.to_dict(orient="records")) # type: ignore[arg-type]
446
+ if not teams_df.empty
447
+ else []
448
+ ),
449
+ date_range=date_range,
450
+ )
451
+
452
+ def _generate_weekly_rollups(self) -> list[dict[str, Any]]:
453
+ """Generate weekly rollup files, one per ISO week."""
454
+ # Query PRs with closed dates
455
+ df = pd.read_sql_query(
456
+ """
457
+ SELECT
458
+ closed_date,
459
+ cycle_time_minutes,
460
+ user_id,
461
+ pull_request_uid
462
+ FROM pull_requests
463
+ WHERE closed_date IS NOT NULL AND status = 'completed'
464
+ ORDER BY closed_date
465
+ """,
466
+ self.db.connection,
467
+ )
468
+
469
+ if df.empty:
470
+ return []
471
+
472
+ # Convert to datetime and extract ISO week
473
+ df["closed_dt"] = pd.to_datetime(df["closed_date"])
474
+ df["iso_year"] = df["closed_dt"].dt.isocalendar().year
475
+ df["iso_week"] = df["closed_dt"].dt.isocalendar().week
476
+
477
+ index: list[dict[str, Any]] = []
478
+
479
+ # Group by ISO year-week
480
+ for (iso_year, iso_week), group in df.groupby(["iso_year", "iso_week"]):
481
+ week_str = f"{iso_year}-W{iso_week:02d}"
482
+
483
+ # Calculate week boundaries (iso_year/iso_week are UInt32 from pandas)
484
+ year_int = int(iso_year) # type: ignore[call-overload]
485
+ week_int = int(iso_week) # type: ignore[call-overload]
486
+ start_date = date.fromisocalendar(year_int, week_int, 1)
487
+ end_date = date.fromisocalendar(year_int, week_int, 7)
488
+
489
+ rollup = WeeklyRollup(
490
+ week=week_str,
491
+ start_date=start_date.isoformat(),
492
+ end_date=end_date.isoformat(),
493
+ pr_count=len(group),
494
+ cycle_time_p50=group["cycle_time_minutes"].quantile(0.5)
495
+ if not group["cycle_time_minutes"].isna().all()
496
+ else None,
497
+ cycle_time_p90=group["cycle_time_minutes"].quantile(0.9)
498
+ if not group["cycle_time_minutes"].isna().all()
499
+ else None,
500
+ authors_count=group["user_id"].nunique(),
501
+ reviewers_count=0, # TODO: Add reviewer counting
502
+ )
503
+
504
+ # Write file
505
+ file_path = (
506
+ self.output_dir / "aggregates" / "weekly_rollups" / f"{week_str}.json"
507
+ )
508
+ self._write_json(file_path, asdict(rollup))
509
+
510
+ # Add to index
511
+ index.append(
512
+ {
513
+ "week": week_str,
514
+ "path": f"aggregates/weekly_rollups/{week_str}.json",
515
+ "start_date": rollup.start_date,
516
+ "end_date": rollup.end_date,
517
+ "size_bytes": file_path.stat().st_size,
518
+ }
519
+ )
520
+
521
+ return index
522
+
523
+ def _generate_distributions(self) -> list[dict[str, Any]]:
524
+ """Generate yearly distribution files."""
525
+ df = pd.read_sql_query(
526
+ """
527
+ SELECT
528
+ closed_date,
529
+ cycle_time_minutes
530
+ FROM pull_requests
531
+ WHERE closed_date IS NOT NULL AND status = 'completed'
532
+ ORDER BY closed_date
533
+ """,
534
+ self.db.connection,
535
+ )
536
+
537
+ if df.empty:
538
+ return []
539
+
540
+ df["closed_dt"] = pd.to_datetime(df["closed_date"])
541
+ df["year"] = df["closed_dt"].dt.year
542
+ df["month"] = df["closed_dt"].dt.strftime("%Y-%m")
543
+
544
+ index: list[dict[str, Any]] = []
545
+
546
+ for year, group in df.groupby("year"):
547
+ year_str = str(year)
548
+
549
+ # Cycle time buckets (in hours)
550
+ cycle_times = group["cycle_time_minutes"].dropna() / 60 # Convert to hours
551
+ buckets = {
552
+ "0-1h": int((cycle_times < 1).sum()),
553
+ "1-4h": int(((cycle_times >= 1) & (cycle_times < 4)).sum()),
554
+ "4-24h": int(((cycle_times >= 4) & (cycle_times < 24)).sum()),
555
+ "1-3d": int(((cycle_times >= 24) & (cycle_times < 72)).sum()),
556
+ "3-7d": int(((cycle_times >= 72) & (cycle_times < 168)).sum()),
557
+ "7d+": int((cycle_times >= 168).sum()),
558
+ }
559
+
560
+ # PRs by month
561
+ prs_by_month = group.groupby("month").size().to_dict()
562
+
563
+ dist = YearlyDistribution(
564
+ year=year_str,
565
+ start_date=f"{year_str}-01-01",
566
+ end_date=f"{year_str}-12-31",
567
+ total_prs=len(group),
568
+ cycle_time_buckets=buckets,
569
+ prs_by_month={str(k): int(v) for k, v in prs_by_month.items()},
570
+ )
571
+
572
+ # Write file
573
+ file_path = (
574
+ self.output_dir / "aggregates" / "distributions" / f"{year_str}.json"
575
+ )
576
+ self._write_json(file_path, asdict(dist))
577
+
578
+ index.append(
579
+ {
580
+ "year": year_str,
581
+ "path": f"aggregates/distributions/{year_str}.json",
582
+ "start_date": dist.start_date,
583
+ "end_date": dist.end_date,
584
+ "size_bytes": file_path.stat().st_size,
585
+ }
586
+ )
587
+
588
+ return index
589
+
590
+ def _get_pr_count(self) -> int:
591
+ """Get total PR count."""
592
+ cursor = self.db.execute(
593
+ "SELECT COUNT(*) as cnt FROM pull_requests WHERE status = 'completed'"
594
+ )
595
+ row = cursor.fetchone()
596
+ return int(row["cnt"]) if row else 0
597
+
598
+ def _has_comments(self) -> bool:
599
+ """Check if comments data exists."""
600
+ try:
601
+ cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pr_threads")
602
+ row = cursor.fetchone()
603
+ return int(row["cnt"]) > 0 if row else False
604
+ except Exception:
605
+ # Legacy DB may not have pr_threads table
606
+ return False
607
+
608
+ def _get_comments_coverage(self) -> dict[str, Any]:
609
+ """Get comments coverage statistics.
610
+
611
+ §6: coverage.comments: "full" | "partial" | "disabled"
612
+ """
613
+ try:
614
+ # Count threads and comments
615
+ thread_cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pr_threads")
616
+ thread_row = thread_cursor.fetchone()
617
+ thread_count = int(thread_row["cnt"]) if thread_row else 0
618
+
619
+ comment_cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pr_comments")
620
+ comment_row = comment_cursor.fetchone()
621
+ comment_count = int(comment_row["cnt"]) if comment_row else 0
622
+
623
+ # Count PRs with threads
624
+ prs_with_threads_cursor = self.db.execute(
625
+ "SELECT COUNT(DISTINCT pull_request_uid) as cnt FROM pr_threads"
626
+ )
627
+ prs_with_threads_row = prs_with_threads_cursor.fetchone()
628
+ prs_with_threads = (
629
+ int(prs_with_threads_row["cnt"]) if prs_with_threads_row else 0
630
+ )
631
+ except Exception:
632
+ # Legacy DB may not have comments tables
633
+ thread_count = 0
634
+ comment_count = 0
635
+ prs_with_threads = 0
636
+
637
+ if thread_count == 0:
638
+ status = "disabled"
639
+ else:
640
+ # For now, assume full coverage if any comments exist
641
+ # A more complex implementation would track capped state
642
+ status = "full"
643
+
644
+ return {
645
+ "status": status,
646
+ "threads_fetched": thread_count,
647
+ "comments_fetched": comment_count,
648
+ "prs_with_threads": prs_with_threads,
649
+ "capped": False, # Set by extraction when limits hit
650
+ }
651
+
652
+ def _get_row_counts(self) -> dict[str, int]:
653
+ """Get row counts for key tables (Phase 4 §5: Operational visibility)."""
654
+ counts: dict[str, int] = {}
655
+
656
+ # PRs
657
+ try:
658
+ cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pull_requests")
659
+ row = cursor.fetchone()
660
+ counts["pull_requests"] = int(row["cnt"]) if row else 0
661
+ except Exception:
662
+ counts["pull_requests"] = 0
663
+
664
+ # Reviewers
665
+ try:
666
+ cursor = self.db.execute("SELECT COUNT(*) as cnt FROM reviewers")
667
+ row = cursor.fetchone()
668
+ counts["reviewers"] = int(row["cnt"]) if row else 0
669
+ except Exception:
670
+ counts["reviewers"] = 0
671
+
672
+ # Users
673
+ try:
674
+ cursor = self.db.execute("SELECT COUNT(*) as cnt FROM users")
675
+ row = cursor.fetchone()
676
+ counts["users"] = int(row["cnt"]) if row else 0
677
+ except Exception:
678
+ counts["users"] = 0
679
+
680
+ # Repositories
681
+ try:
682
+ cursor = self.db.execute("SELECT COUNT(*) as cnt FROM repositories")
683
+ row = cursor.fetchone()
684
+ counts["repositories"] = int(row["cnt"]) if row else 0
685
+ except Exception:
686
+ counts["repositories"] = 0
687
+
688
+ return counts
689
+
690
+ def _get_operational_summary(
691
+ self,
692
+ weekly_index: list[dict[str, Any]],
693
+ dist_index: list[dict[str, Any]],
694
+ ) -> dict[str, Any]:
695
+ """Generate operational summary for operators (Phase 4 §5).
696
+
697
+ Provides immediate insight into dataset health and scale.
698
+ """
699
+ # Calculate total artifact size from indexes
700
+ total_size = sum(item.get("size_bytes", 0) for item in weekly_index)
701
+ total_size += sum(item.get("size_bytes", 0) for item in dist_index)
702
+
703
+ # Add dimensions file size if it exists
704
+ dimensions_path = self.output_dir / "aggregates" / "dimensions.json"
705
+ if dimensions_path.exists():
706
+ total_size += dimensions_path.stat().st_size
707
+
708
+ # Add predictions/insights sizes if they exist
709
+ for extra_file in [
710
+ self.output_dir / "predictions" / "trends.json",
711
+ self.output_dir / "insights" / "summary.json",
712
+ ]:
713
+ if extra_file.exists():
714
+ total_size += extra_file.stat().st_size
715
+
716
+ return {
717
+ "artifact_size_bytes": total_size,
718
+ "weekly_rollup_count": len(weekly_index),
719
+ "distribution_count": len(dist_index),
720
+ "retention_notice": (
721
+ "Data older than 2 years may have reduced detail. "
722
+ "Consider archiving old data periodically."
723
+ if len(dist_index) > 2
724
+ else None
725
+ ),
726
+ }
727
+
728
+ def _write_json(self, path: Path, data: dict[str, Any]) -> None:
729
+ """Write JSON file with deterministic formatting."""
730
+ with path.open("w", encoding="utf-8") as f:
731
+ json.dump(data, f, indent=2, sort_keys=True)
732
+
733
+
734
+ class StubGenerationError(Exception):
735
+ """Stub generation failed due to missing ALLOW_ML_STUBS env var."""
736
+
737
+
738
+ class PredictionGenerator:
739
+ """Generate predictions stub data for Phase 3.5.
740
+
741
+ Produces deterministic synthetic forecasts using a stable seed.
742
+ Only enabled with --enable-ml-stubs AND ALLOW_ML_STUBS=1 env var.
743
+ """
744
+
745
+ METRICS = [
746
+ ("pr_throughput", "count"),
747
+ ("cycle_time_minutes", "minutes"),
748
+ ("review_time_minutes", "minutes"),
749
+ ]
750
+ HORIZON_WEEKS = 4
751
+
752
+ def __init__(
753
+ self,
754
+ output_dir: Path,
755
+ seed_base: str = "",
756
+ ) -> None:
757
+ """Initialize the prediction generator.
758
+
759
+ Args:
760
+ output_dir: Directory for output files.
761
+ seed_base: Base string for deterministic seeding (e.g., org+project).
762
+ """
763
+ self.output_dir = output_dir
764
+ self.seed_base = seed_base
765
+
766
+ def generate(self) -> dict[str, Any] | None:
767
+ """Generate predictions stub file.
768
+
769
+ Returns:
770
+ Dict with predictions data if generated, None otherwise.
771
+
772
+ Raises:
773
+ StubGenerationError: If ALLOW_ML_STUBS env var not set.
774
+ """
775
+ if os.environ.get("ALLOW_ML_STUBS") != "1":
776
+ raise StubGenerationError(
777
+ "Stub generation requires ALLOW_ML_STUBS=1 environment variable. "
778
+ "This is a safety gate to prevent accidental use of synthetic data."
779
+ )
780
+
781
+ predictions_dir = self.output_dir / "predictions"
782
+ predictions_dir.mkdir(parents=True, exist_ok=True)
783
+
784
+ forecasts = []
785
+ today = date.today()
786
+ # Monday-align to start of current week
787
+ start_monday = today - timedelta(days=today.weekday())
788
+
789
+ for metric, unit in self.METRICS:
790
+ values = []
791
+ for week_offset in range(self.HORIZON_WEEKS):
792
+ period_start = start_monday + timedelta(weeks=week_offset)
793
+
794
+ # Deterministic seed per metric+period
795
+ seed_str = f"{self.seed_base}:{metric}:{period_start.isoformat()}"
796
+ seed = int(hashlib.sha256(seed_str.encode()).hexdigest()[:8], 16)
797
+ rng = random.Random(seed) # noqa: S311 - intentional for deterministic stubs
798
+
799
+ # Generate synthetic values based on metric type
800
+ if metric == "pr_throughput":
801
+ base_value = rng.randint(15, 45)
802
+ variance = rng.randint(3, 10)
803
+ else: # time metrics in minutes
804
+ base_value = rng.randint(120, 480)
805
+ variance = rng.randint(30, 120)
806
+
807
+ values.append(
808
+ {
809
+ "period_start": period_start.isoformat(),
810
+ "predicted": base_value,
811
+ "lower_bound": max(0, base_value - variance),
812
+ "upper_bound": base_value + variance,
813
+ }
814
+ )
815
+
816
+ forecasts.append(
817
+ {
818
+ "metric": metric,
819
+ "unit": unit,
820
+ "horizon_weeks": self.HORIZON_WEEKS,
821
+ "values": values,
822
+ }
823
+ )
824
+
825
+ predictions = {
826
+ "schema_version": PREDICTIONS_SCHEMA_VERSION,
827
+ "generated_at": datetime.now(timezone.utc).isoformat(),
828
+ "is_stub": True,
829
+ "generated_by": STUB_GENERATOR_ID,
830
+ "forecasts": forecasts,
831
+ }
832
+
833
+ # Write file
834
+ file_path = predictions_dir / "trends.json"
835
+ with file_path.open("w", encoding="utf-8") as f:
836
+ json.dump(predictions, f, indent=2, sort_keys=True)
837
+
838
+ logger.info("Generated predictions/trends.json (stub data)")
839
+ return predictions
840
+
841
+
842
+ class InsightsGenerator:
843
+ """Generate AI insights stub data for Phase 3.5.
844
+
845
+ Produces deterministic synthetic insights using a stable seed.
846
+ Only enabled with --enable-ml-stubs AND ALLOW_ML_STUBS=1 env var.
847
+ """
848
+
849
+ # Sample insight templates for stub generation
850
+ INSIGHT_TEMPLATES = [
851
+ {
852
+ "category": "bottleneck",
853
+ "severity": "warning",
854
+ "title": "Code review latency increasing",
855
+ "description": "Average time from PR creation to first review has increased "
856
+ "by 15% over the past 4 weeks. This may indicate reviewer capacity constraints.",
857
+ },
858
+ {
859
+ "category": "trend",
860
+ "severity": "info",
861
+ "title": "PR throughput stable",
862
+ "description": "Weekly PR merge rate has remained consistent at approximately "
863
+ "25-30 PRs per week over the analyzed period.",
864
+ },
865
+ {
866
+ "category": "anomaly",
867
+ "severity": "critical",
868
+ "title": "Unusual cycle time spike detected",
869
+ "description": "P90 cycle time increased significantly in the most recent week, "
870
+ "exceeding the historical 95th percentile threshold.",
871
+ },
872
+ ]
873
+
874
+ def __init__(
875
+ self,
876
+ output_dir: Path,
877
+ seed_base: str = "",
878
+ ) -> None:
879
+ """Initialize the insights generator.
880
+
881
+ Args:
882
+ output_dir: Directory for output files.
883
+ seed_base: Base string for deterministic seeding.
884
+ """
885
+ self.output_dir = output_dir
886
+ self.seed_base = seed_base
887
+
888
+ def generate(self) -> dict[str, Any] | None:
889
+ """Generate insights stub file.
890
+
891
+ Returns:
892
+ Dict with insights data if generated, None otherwise.
893
+
894
+ Raises:
895
+ StubGenerationError: If ALLOW_ML_STUBS env var not set.
896
+ """
897
+ if os.environ.get("ALLOW_ML_STUBS") != "1":
898
+ raise StubGenerationError(
899
+ "Stub generation requires ALLOW_ML_STUBS=1 environment variable."
900
+ )
901
+
902
+ insights_dir = self.output_dir / "insights"
903
+ insights_dir.mkdir(parents=True, exist_ok=True)
904
+
905
+ # Deterministic selection of insights based on seed
906
+ seed_str = f"{self.seed_base}:insights"
907
+ seed = int(hashlib.sha256(seed_str.encode()).hexdigest()[:8], 16)
908
+ rng = random.Random(seed) # noqa: S311 - intentional for deterministic stubs
909
+
910
+ # Generate 2-3 insights from templates
911
+ num_insights = rng.randint(2, 3)
912
+ selected_templates = rng.sample(
913
+ self.INSIGHT_TEMPLATES, min(num_insights, len(self.INSIGHT_TEMPLATES))
914
+ )
915
+
916
+ insights_list = []
917
+ for i, template in enumerate(selected_templates):
918
+ insight_id = hashlib.sha256(
919
+ f"{self.seed_base}:insight:{i}".encode()
920
+ ).hexdigest()[:12]
921
+
922
+ insights_list.append(
923
+ {
924
+ "id": f"stub-{insight_id}",
925
+ "category": template["category"],
926
+ "severity": template["severity"],
927
+ "title": template["title"],
928
+ "description": template["description"],
929
+ "affected_entities": [
930
+ f"project:{self.seed_base.split(':')[0] if ':' in self.seed_base else 'default'}"
931
+ ],
932
+ "evidence_refs": [],
933
+ }
934
+ )
935
+
936
+ insights = {
937
+ "schema_version": INSIGHTS_SCHEMA_VERSION,
938
+ "generated_at": datetime.now(timezone.utc).isoformat(),
939
+ "is_stub": True,
940
+ "generated_by": STUB_GENERATOR_ID,
941
+ "insights": insights_list,
942
+ }
943
+
944
+ # Write file
945
+ file_path = insights_dir / "summary.json"
946
+ with file_path.open("w", encoding="utf-8") as f:
947
+ json.dump(insights, f, indent=2, sort_keys=True)
948
+
949
+ logger.info("Generated insights/summary.json (stub data)")
950
+ return insights
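
Downstream of the pipeline, the dataset-manifest.json and the chunked files it indexes are what a UI or reporting layer would read. A hedged consumer sketch, using only key names that appear in the dataclasses and index entries above; the dataset location and the date-window filter are illustrative assumptions:

import json
from pathlib import Path

dataset_root = Path("dist/dataset")  # assumed output_dir used when generating
manifest = json.loads(
    (dataset_root / "dataset-manifest.json").read_text(encoding="utf-8")
)

# Feature gates are file-gated by generate_all(): predictions/ai_insights are
# only True when the corresponding JSON artifacts were actually produced.
if manifest["features"].get("predictions"):
    trends = json.loads(
        (dataset_root / "predictions" / "trends.json").read_text(encoding="utf-8")
    )

# Lazy loading: fetch only the weekly rollup chunks inside the requested window.
for entry in manifest["aggregate_index"]["weekly_rollups"]:
    if entry["start_date"] >= "2024-01-01":  # ISO dates compare lexicographically
        rollup = json.loads(
            (dataset_root / entry["path"]).read_text(encoding="utf-8")
        )
        print(entry["week"], rollup["pr_count"], rollup["cycle_time_p50"])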