ado-git-repo-insights 1.2.1__py3-none-any.whl → 2.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ado_git_repo_insights/__init__.py +3 -3
- ado_git_repo_insights/cli.py +703 -354
- ado_git_repo_insights/config.py +186 -186
- ado_git_repo_insights/extractor/__init__.py +1 -1
- ado_git_repo_insights/extractor/ado_client.py +452 -246
- ado_git_repo_insights/extractor/pr_extractor.py +239 -239
- ado_git_repo_insights/ml/__init__.py +13 -0
- ado_git_repo_insights/ml/date_utils.py +70 -0
- ado_git_repo_insights/ml/forecaster.py +288 -0
- ado_git_repo_insights/ml/insights.py +497 -0
- ado_git_repo_insights/persistence/__init__.py +1 -1
- ado_git_repo_insights/persistence/database.py +193 -193
- ado_git_repo_insights/persistence/models.py +207 -145
- ado_git_repo_insights/persistence/repository.py +662 -376
- ado_git_repo_insights/transform/__init__.py +1 -1
- ado_git_repo_insights/transform/aggregators.py +950 -0
- ado_git_repo_insights/transform/csv_generator.py +132 -132
- ado_git_repo_insights/utils/__init__.py +1 -1
- ado_git_repo_insights/utils/datetime_utils.py +101 -101
- ado_git_repo_insights/utils/logging_config.py +172 -172
- ado_git_repo_insights/utils/run_summary.py +207 -206
- {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/METADATA +56 -15
- ado_git_repo_insights-2.7.4.dist-info/RECORD +27 -0
- {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/licenses/LICENSE +21 -21
- ado_git_repo_insights-1.2.1.dist-info/RECORD +0 -22
- {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/WHEEL +0 -0
- {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/entry_points.txt +0 -0
- {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/top_level.txt +0 -0
ado_git_repo_insights/transform/aggregators.py (new file)

@@ -0,0 +1,950 @@

```python
"""Chunked aggregate generator for Phase 3 UI.

Generates JSON aggregates from SQLite for scale-safe UI rendering:
- weekly_rollups/YYYY-Www.json - Weekly PR metrics
- distributions/YYYY.json - Yearly distribution data
- dimensions.json - Filter dimensions (repos, users, teams)
- dataset-manifest.json - Discovery metadata with schema versions
- predictions/trends.json - Trend forecasts (Phase 3.5)
- insights/summary.json - AI insights (Phase 3.5)
"""

from __future__ import annotations

import hashlib
import json
import logging
import os
import random
from dataclasses import asdict, dataclass, field
from datetime import date, datetime, timedelta, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any

import pandas as pd

if TYPE_CHECKING:
    from ..persistence.database import DatabaseManager

logger = logging.getLogger(__name__)

# Schema versions (Phase 3 locked)
MANIFEST_SCHEMA_VERSION = 1
DATASET_SCHEMA_VERSION = 1
AGGREGATES_SCHEMA_VERSION = 1

# Phase 3.5 schema versions
PREDICTIONS_SCHEMA_VERSION = 1
INSIGHTS_SCHEMA_VERSION = 1

# Stub generator identifier
STUB_GENERATOR_ID = "phase3.5-stub-v1"


class AggregationError(Exception):
    """Aggregation failed."""


@dataclass
class WeeklyRollup:
    """Weekly PR metrics rollup."""

    week: str  # ISO week: YYYY-Www
    start_date: str  # ISO date
    end_date: str  # ISO date
    pr_count: int = 0
    cycle_time_p50: float | None = None
    cycle_time_p90: float | None = None
    authors_count: int = 0
    reviewers_count: int = 0


@dataclass
class YearlyDistribution:
    """Yearly distribution metrics."""

    year: str  # YYYY
    start_date: str
    end_date: str
    total_prs: int = 0
    cycle_time_buckets: dict[str, int] = field(default_factory=dict)
    prs_by_month: dict[str, int] = field(default_factory=dict)


@dataclass
class Dimensions:
    """Filter dimensions for UI."""

    repositories: list[dict[str, Any]] = field(default_factory=list)
    users: list[dict[str, Any]] = field(default_factory=list)
    projects: list[dict[str, Any]] = field(default_factory=list)
    teams: list[dict[str, Any]] = field(default_factory=list)  # Phase 3.3
    date_range: dict[str, str] = field(default_factory=dict)


@dataclass
class AggregateIndex:
    """Index of available aggregate files."""

    weekly_rollups: list[dict[str, Any]] = field(default_factory=list)
    distributions: list[dict[str, Any]] = field(default_factory=list)


@dataclass
class DatasetManifest:
    """Dataset discovery manifest."""

    manifest_schema_version: int = MANIFEST_SCHEMA_VERSION
    dataset_schema_version: int = DATASET_SCHEMA_VERSION
    aggregates_schema_version: int = AGGREGATES_SCHEMA_VERSION
    predictions_schema_version: int = PREDICTIONS_SCHEMA_VERSION  # Phase 3.5
    insights_schema_version: int = INSIGHTS_SCHEMA_VERSION  # Phase 3.5
    generated_at: str = ""
    run_id: str = ""
    warnings: list[str] = field(default_factory=list)  # Phase 3.5: stub warnings
    aggregate_index: AggregateIndex = field(default_factory=AggregateIndex)
    defaults: dict[str, Any] = field(default_factory=dict)
    limits: dict[str, Any] = field(default_factory=dict)
    features: dict[str, bool] = field(default_factory=dict)
    coverage: dict[str, Any] = field(default_factory=dict)


class AggregateGenerator:
    """Generate chunked JSON aggregates from SQLite.

    Phase 3: Produces weekly rollups and distributions for lazy UI loading.
    Phase 3.5: Optionally generates predictions/insights stubs.
    Phase 5: Integrates Prophet forecaster and OpenAI insights.
    """

    def __init__(
        self,
        db: DatabaseManager,
        output_dir: Path,
        run_id: str = "",
        enable_ml_stubs: bool = False,
        seed_base: str = "",
        # Phase 5: ML parameters
        enable_predictions: bool = False,
        enable_insights: bool = False,
        insights_max_tokens: int = 1000,
        insights_cache_ttl_hours: int = 24,
        insights_dry_run: bool = False,
        stub_mode: bool = False,
    ) -> None:
        """Initialize the aggregate generator.

        Args:
            db: Database manager instance.
            output_dir: Directory for aggregate output.
            run_id: Pipeline run ID for manifest.
            enable_ml_stubs: Whether to generate stub predictions/insights (Phase 3.5).
            seed_base: Base string for deterministic stub seeding.
            enable_predictions: Enable Prophet-based forecasting (Phase 5).
            enable_insights: Enable OpenAI-based insights (Phase 5).
            insights_max_tokens: Max tokens for OpenAI response.
            insights_cache_ttl_hours: Cache TTL for insights.
            insights_dry_run: Write prompt artifact without calling API.
            stub_mode: Use deprecated stubs instead of real ML.
        """
        self.db = db
        self.output_dir = output_dir
        self.run_id = run_id or datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
        self.enable_ml_stubs = enable_ml_stubs
        self.seed_base = seed_base or self.run_id
        # Phase 5
        self.enable_predictions = enable_predictions
        self.enable_insights = enable_insights
        self.insights_max_tokens = insights_max_tokens
        self.insights_cache_ttl_hours = insights_cache_ttl_hours
        self.insights_dry_run = insights_dry_run
        self.stub_mode = stub_mode

    def generate_all(self) -> DatasetManifest:
        """Generate all aggregate files and manifest.

        Returns:
            DatasetManifest with generated file index.

        Raises:
            AggregationError: If generation fails.
            StubGenerationError: If stubs requested without ALLOW_ML_STUBS env var.
        """
        import warnings as py_warnings

        # Create output directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        (self.output_dir / "aggregates").mkdir(exist_ok=True)
        (self.output_dir / "aggregates" / "weekly_rollups").mkdir(exist_ok=True)
        (self.output_dir / "aggregates" / "distributions").mkdir(exist_ok=True)

        try:
            # Generate dimensions
            dimensions = self._generate_dimensions()
            self._write_json(
                self.output_dir / "aggregates" / "dimensions.json",
                asdict(dimensions),
            )
            logger.info("Generated dimensions.json")

            # Generate weekly rollups
            weekly_index = self._generate_weekly_rollups()
            logger.info(f"Generated {len(weekly_index)} weekly rollup files")

            # Generate yearly distributions
            dist_index = self._generate_distributions()
            logger.info(f"Generated {len(dist_index)} distribution files")

            # Phase 5: ML features generation
            predictions_generated = False
            insights_generated = False
            warnings: list[str] = []

            # Stub mode (deprecated, for testing only)
            if self.stub_mode:
                py_warnings.warn(
                    "Stub mode is deprecated. Use --enable-predictions and "
                    "--enable-insights for real ML features.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                # Use legacy stubs
                pred_gen = PredictionGenerator(self.output_dir, self.seed_base)
                pred_gen.generate()
                predictions_generated = True

                insights_gen = InsightsGenerator(self.output_dir, self.seed_base)
                insights_gen.generate()
                insights_generated = True

                warnings.append("STUB DATA - NOT PRODUCTION")
                logger.warning(
                    "Generated stub predictions/insights - NOT FOR PRODUCTION"
                )

            # Legacy enable_ml_stubs (LOUD WARNING - maps to stub mode)
            elif self.enable_ml_stubs:
                # Hard warning to prevent accidental stub usage in production
                logger.warning("=" * 80)
                logger.warning(
                    "WARNING: --enable-ml-stubs is DEPRECATED and generates "
                    "STUB DATA with is_stub:true"
                )
                logger.warning(
                    "Use --enable-predictions and --enable-insights for real ML features."
                )
                logger.warning(
                    "To explicitly use stubs for testing, use --stub-mode instead."
                )
                logger.warning("=" * 80)

                pred_gen = PredictionGenerator(self.output_dir, self.seed_base)
                pred_gen.generate()
                predictions_generated = True

                insights_gen = InsightsGenerator(self.output_dir, self.seed_base)
                insights_gen.generate()
                insights_generated = True

                warnings.append("STUB DATA - NOT PRODUCTION - DEPRECATED FLAG USED")
                logger.warning(
                    "Generated stub predictions/insights - NOT FOR PRODUCTION"
                )

            else:
                # Phase 5: Real ML features
                if self.enable_predictions:
                    predictions_generated = self._generate_predictions()

                if self.enable_insights:
                    insights_generated = self._generate_insights()

            # Check if files exist from previous runs
            if not predictions_generated:
                predictions_generated = (
                    self.output_dir / "predictions" / "trends.json"
                ).exists()
            if not insights_generated:
                insights_generated = (
                    self.output_dir / "insights" / "summary.json"
                ).exists()

            # Build manifest
            manifest = DatasetManifest(
                generated_at=datetime.now(timezone.utc).isoformat(),
                run_id=self.run_id,
                warnings=warnings,
                aggregate_index=AggregateIndex(
                    weekly_rollups=weekly_index,
                    distributions=dist_index,
                ),
                defaults={"default_date_range_days": 90},
                limits={"max_date_range_days_soft": 730},
                features={
                    "teams": len(dimensions.teams) > 0,  # Phase 3.3: dynamic
                    "comments": self._has_comments(),  # Phase 3.4: dynamic
                    "predictions": predictions_generated,  # Phase 3.5/5: file-gated
                    "ai_insights": insights_generated,  # Phase 3.5/5: file-gated
                },
                coverage={
                    "total_prs": self._get_pr_count(),
                    "date_range": dimensions.date_range,
                    "teams_count": len(dimensions.teams),  # Phase 3.3
                    "comments": self._get_comments_coverage(),  # Phase 3.4
                    # Phase 4 §5: Operational visibility
                    "row_counts": self._get_row_counts(),
                },
            )

            # Phase 4 §5: Calculate total artifact size after manifest written
            # We'll add this after initial manifest write
            manifest_dict = asdict(manifest)
            manifest_dict["operational"] = self._get_operational_summary(
                weekly_index, dist_index
            )

            # Write manifest
            self._write_json(
                self.output_dir / "dataset-manifest.json",
                manifest_dict,
            )
            logger.info("Generated dataset-manifest.json")

            return manifest

        except Exception as e:
            raise AggregationError(f"Failed to generate aggregates: {e}") from e

    def _generate_predictions(self) -> bool:
        """Generate Prophet-based predictions (Phase 5).

        Returns:
            True if predictions file was successfully written, False otherwise.
        """
        try:
            from ..ml.forecaster import ProphetForecaster
        except ImportError:
            logger.warning(
                "Prophet not installed. Install ML extras: pip install -e '.[ml]'"
            )
            return False

        try:
            forecaster = ProphetForecaster(
                db=self.db,
                output_dir=self.output_dir,
            )
            return forecaster.generate()
        except Exception as e:
            logger.warning(f"Prediction generation failed: {type(e).__name__}: {e}")
            return False

    def _generate_insights(self) -> bool:
        """Generate OpenAI-based insights (Phase 5).

        Returns:
            True if insights file was written, False otherwise.
        """
        try:
            from ..ml.insights import LLMInsightsGenerator
        except ImportError:
            # This should not happen as CLI validates openai is installed
            logger.error(
                "OpenAI SDK not installed. Install ML extras: pip install -e '.[ml]'"
            )
            raise AggregationError(
                "OpenAI SDK required for --enable-insights"
            ) from None

        try:
            insights_gen = LLMInsightsGenerator(
                db=self.db,
                output_dir=self.output_dir,
                max_tokens=self.insights_max_tokens,
                cache_ttl_hours=self.insights_cache_ttl_hours,
                dry_run=self.insights_dry_run,
            )
            return insights_gen.generate()
        except Exception as e:
            logger.warning(f"Insights generation failed: {type(e).__name__}: {e}")
            return False

    def _generate_dimensions(self) -> Dimensions:
        """Generate filter dimensions from SQLite."""
        # Repositories
        repos_df = pd.read_sql_query(
            """
            SELECT repository_id, repository_name, project_name, organization_name
            FROM repositories
            ORDER BY organization_name, project_name, repository_name
            """,
            self.db.connection,
        )

        # Users (authors only, not all users)
        users_df = pd.read_sql_query(
            """
            SELECT DISTINCT u.user_id, u.display_name
            FROM users u
            INNER JOIN pull_requests pr ON pr.user_id = u.user_id
            ORDER BY u.display_name
            """,
            self.db.connection,
        )

        # Projects
        projects_df = pd.read_sql_query(
            """
            SELECT organization_name, project_name
            FROM projects
            ORDER BY organization_name, project_name
            """,
            self.db.connection,
        )

        # Date range
        date_range_df = pd.read_sql_query(
            """
            SELECT MIN(closed_date) as min_date, MAX(closed_date) as max_date
            FROM pull_requests
            WHERE closed_date IS NOT NULL
            """,
            self.db.connection,
        )

        date_range = {}
        if not date_range_df.empty and date_range_df.iloc[0]["min_date"]:
            date_range = {
                "min": date_range_df.iloc[0]["min_date"][:10],  # YYYY-MM-DD
                "max": date_range_df.iloc[0]["max_date"][:10],
            }

        # Phase 3.3: Teams (defensive for legacy DBs without teams table)
        try:
            teams_df = pd.read_sql_query(
                """
                SELECT t.team_id, t.team_name, t.project_name, t.organization_name,
                       COUNT(tm.user_id) as member_count
                FROM teams t
                LEFT JOIN team_members tm ON t.team_id = tm.team_id
                GROUP BY t.team_id, t.team_name, t.project_name, t.organization_name
                ORDER BY t.organization_name, t.project_name, t.team_name
                """,
                self.db.connection,
            )
        except Exception as e:
            # P1 fix: Legacy databases may not have teams table
            logger.debug(f"Teams table not available (legacy DB?): {e}")
            teams_df = pd.DataFrame()

        return Dimensions(
            repositories=list(repos_df.to_dict(orient="records")),  # type: ignore[arg-type]
            users=list(users_df.to_dict(orient="records")),  # type: ignore[arg-type]
            projects=list(projects_df.to_dict(orient="records")),  # type: ignore[arg-type]
            teams=(
                list(teams_df.to_dict(orient="records"))  # type: ignore[arg-type]
                if not teams_df.empty
                else []
            ),
            date_range=date_range,
        )

    def _generate_weekly_rollups(self) -> list[dict[str, Any]]:
        """Generate weekly rollup files, one per ISO week."""
        # Query PRs with closed dates
        df = pd.read_sql_query(
            """
            SELECT
                closed_date,
                cycle_time_minutes,
                user_id,
                pull_request_uid
            FROM pull_requests
            WHERE closed_date IS NOT NULL AND status = 'completed'
            ORDER BY closed_date
            """,
            self.db.connection,
        )

        if df.empty:
            return []

        # Convert to datetime and extract ISO week
        df["closed_dt"] = pd.to_datetime(df["closed_date"])
        df["iso_year"] = df["closed_dt"].dt.isocalendar().year
        df["iso_week"] = df["closed_dt"].dt.isocalendar().week

        index: list[dict[str, Any]] = []

        # Group by ISO year-week
        for (iso_year, iso_week), group in df.groupby(["iso_year", "iso_week"]):
            week_str = f"{iso_year}-W{iso_week:02d}"

            # Calculate week boundaries (iso_year/iso_week are UInt32 from pandas)
            year_int = int(iso_year)  # type: ignore[call-overload]
            week_int = int(iso_week)  # type: ignore[call-overload]
            start_date = date.fromisocalendar(year_int, week_int, 1)
            end_date = date.fromisocalendar(year_int, week_int, 7)

            rollup = WeeklyRollup(
                week=week_str,
                start_date=start_date.isoformat(),
                end_date=end_date.isoformat(),
                pr_count=len(group),
                cycle_time_p50=group["cycle_time_minutes"].quantile(0.5)
                if not group["cycle_time_minutes"].isna().all()
                else None,
                cycle_time_p90=group["cycle_time_minutes"].quantile(0.9)
                if not group["cycle_time_minutes"].isna().all()
                else None,
                authors_count=group["user_id"].nunique(),
                reviewers_count=0,  # TODO: Add reviewer counting
            )

            # Write file
            file_path = (
                self.output_dir / "aggregates" / "weekly_rollups" / f"{week_str}.json"
            )
            self._write_json(file_path, asdict(rollup))

            # Add to index
            index.append(
                {
                    "week": week_str,
                    "path": f"aggregates/weekly_rollups/{week_str}.json",
                    "start_date": rollup.start_date,
                    "end_date": rollup.end_date,
                    "size_bytes": file_path.stat().st_size,
                }
            )

        return index

    def _generate_distributions(self) -> list[dict[str, Any]]:
        """Generate yearly distribution files."""
        df = pd.read_sql_query(
            """
            SELECT
                closed_date,
                cycle_time_minutes
            FROM pull_requests
            WHERE closed_date IS NOT NULL AND status = 'completed'
            ORDER BY closed_date
            """,
            self.db.connection,
        )

        if df.empty:
            return []

        df["closed_dt"] = pd.to_datetime(df["closed_date"])
        df["year"] = df["closed_dt"].dt.year
        df["month"] = df["closed_dt"].dt.strftime("%Y-%m")

        index: list[dict[str, Any]] = []

        for year, group in df.groupby("year"):
            year_str = str(year)

            # Cycle time buckets (in hours)
            cycle_times = group["cycle_time_minutes"].dropna() / 60  # Convert to hours
            buckets = {
                "0-1h": int((cycle_times < 1).sum()),
                "1-4h": int(((cycle_times >= 1) & (cycle_times < 4)).sum()),
                "4-24h": int(((cycle_times >= 4) & (cycle_times < 24)).sum()),
                "1-3d": int(((cycle_times >= 24) & (cycle_times < 72)).sum()),
                "3-7d": int(((cycle_times >= 72) & (cycle_times < 168)).sum()),
                "7d+": int((cycle_times >= 168).sum()),
            }

            # PRs by month
            prs_by_month = group.groupby("month").size().to_dict()

            dist = YearlyDistribution(
                year=year_str,
                start_date=f"{year_str}-01-01",
                end_date=f"{year_str}-12-31",
                total_prs=len(group),
                cycle_time_buckets=buckets,
                prs_by_month={str(k): int(v) for k, v in prs_by_month.items()},
            )

            # Write file
            file_path = (
                self.output_dir / "aggregates" / "distributions" / f"{year_str}.json"
            )
            self._write_json(file_path, asdict(dist))

            index.append(
                {
                    "year": year_str,
                    "path": f"aggregates/distributions/{year_str}.json",
                    "start_date": dist.start_date,
                    "end_date": dist.end_date,
                    "size_bytes": file_path.stat().st_size,
                }
            )

        return index

    def _get_pr_count(self) -> int:
        """Get total PR count."""
        cursor = self.db.execute(
            "SELECT COUNT(*) as cnt FROM pull_requests WHERE status = 'completed'"
        )
        row = cursor.fetchone()
        return int(row["cnt"]) if row else 0

    def _has_comments(self) -> bool:
        """Check if comments data exists."""
        try:
            cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pr_threads")
            row = cursor.fetchone()
            return int(row["cnt"]) > 0 if row else False
        except Exception:
            # Legacy DB may not have pr_threads table
            return False

    def _get_comments_coverage(self) -> dict[str, Any]:
        """Get comments coverage statistics.

        §6: coverage.comments: "full" | "partial" | "disabled"
        """
        try:
            # Count threads and comments
            thread_cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pr_threads")
            thread_row = thread_cursor.fetchone()
            thread_count = int(thread_row["cnt"]) if thread_row else 0

            comment_cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pr_comments")
            comment_row = comment_cursor.fetchone()
            comment_count = int(comment_row["cnt"]) if comment_row else 0

            # Count PRs with threads
            prs_with_threads_cursor = self.db.execute(
                "SELECT COUNT(DISTINCT pull_request_uid) as cnt FROM pr_threads"
            )
            prs_with_threads_row = prs_with_threads_cursor.fetchone()
            prs_with_threads = (
                int(prs_with_threads_row["cnt"]) if prs_with_threads_row else 0
            )
        except Exception:
            # Legacy DB may not have comments tables
            thread_count = 0
            comment_count = 0
            prs_with_threads = 0

        if thread_count == 0:
            status = "disabled"
        else:
            # For now, assume full coverage if any comments exist
            # A more complex implementation would track capped state
            status = "full"

        return {
            "status": status,
            "threads_fetched": thread_count,
            "comments_fetched": comment_count,
            "prs_with_threads": prs_with_threads,
            "capped": False,  # Set by extraction when limits hit
        }

    def _get_row_counts(self) -> dict[str, int]:
        """Get row counts for key tables (Phase 4 §5: Operational visibility)."""
        counts: dict[str, int] = {}

        # PRs
        try:
            cursor = self.db.execute("SELECT COUNT(*) as cnt FROM pull_requests")
            row = cursor.fetchone()
            counts["pull_requests"] = int(row["cnt"]) if row else 0
        except Exception:
            counts["pull_requests"] = 0

        # Reviewers
        try:
            cursor = self.db.execute("SELECT COUNT(*) as cnt FROM reviewers")
            row = cursor.fetchone()
            counts["reviewers"] = int(row["cnt"]) if row else 0
        except Exception:
            counts["reviewers"] = 0

        # Users
        try:
            cursor = self.db.execute("SELECT COUNT(*) as cnt FROM users")
            row = cursor.fetchone()
            counts["users"] = int(row["cnt"]) if row else 0
        except Exception:
            counts["users"] = 0

        # Repositories
        try:
            cursor = self.db.execute("SELECT COUNT(*) as cnt FROM repositories")
            row = cursor.fetchone()
            counts["repositories"] = int(row["cnt"]) if row else 0
        except Exception:
            counts["repositories"] = 0

        return counts

    def _get_operational_summary(
        self,
        weekly_index: list[dict[str, Any]],
        dist_index: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Generate operational summary for operators (Phase 4 §5).

        Provides immediate insight into dataset health and scale.
        """
        # Calculate total artifact size from indexes
        total_size = sum(item.get("size_bytes", 0) for item in weekly_index)
        total_size += sum(item.get("size_bytes", 0) for item in dist_index)

        # Add dimensions file size if it exists
        dimensions_path = self.output_dir / "aggregates" / "dimensions.json"
        if dimensions_path.exists():
            total_size += dimensions_path.stat().st_size

        # Add predictions/insights sizes if they exist
        for extra_file in [
            self.output_dir / "predictions" / "trends.json",
            self.output_dir / "insights" / "summary.json",
        ]:
            if extra_file.exists():
                total_size += extra_file.stat().st_size

        return {
            "artifact_size_bytes": total_size,
            "weekly_rollup_count": len(weekly_index),
            "distribution_count": len(dist_index),
            "retention_notice": (
                "Data older than 2 years may have reduced detail. "
                "Consider archiving old data periodically."
                if len(dist_index) > 2
                else None
            ),
        }

    def _write_json(self, path: Path, data: dict[str, Any]) -> None:
        """Write JSON file with deterministic formatting."""
        with path.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, sort_keys=True)
```
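For orientation, a minimal usage sketch of the `AggregateGenerator` class above. This is illustration only and is not part of the packaged file; in particular, the `DatabaseManager(...)` constructor call is an assumption, since the class above only relies on `db.connection` and `db.execute(...)`.

```python
# Hypothetical driver, illustration only (not shipped in the wheel).
from pathlib import Path

from ado_git_repo_insights.persistence.database import DatabaseManager
from ado_git_repo_insights.transform.aggregators import AggregateGenerator

db = DatabaseManager(Path("insights.db"))  # assumed constructor; adapt to the real API
generator = AggregateGenerator(db=db, output_dir=Path("artifacts"))
manifest = generator.generate_all()

# generate_all() writes aggregates/, dataset-manifest.json, and (when enabled)
# predictions/ and insights/ under output_dir, and returns the manifest it wrote.
print(len(manifest.aggregate_index.weekly_rollups), "weekly rollup files")
print(manifest.features)  # {"teams": ..., "comments": ..., "predictions": ..., "ai_insights": ...}
```

The remainder of the file, shown below, defines the deprecated Phase 3.5 stub generators.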
ado_git_repo_insights/transform/aggregators.py (continued):

```python
class StubGenerationError(Exception):
    """Stub generation failed due to missing ALLOW_ML_STUBS env var."""


class PredictionGenerator:
    """Generate predictions stub data for Phase 3.5.

    Produces deterministic synthetic forecasts using a stable seed.
    Only enabled with --enable-ml-stubs AND ALLOW_ML_STUBS=1 env var.
    """

    METRICS = [
        ("pr_throughput", "count"),
        ("cycle_time_minutes", "minutes"),
        ("review_time_minutes", "minutes"),
    ]
    HORIZON_WEEKS = 4

    def __init__(
        self,
        output_dir: Path,
        seed_base: str = "",
    ) -> None:
        """Initialize the prediction generator.

        Args:
            output_dir: Directory for output files.
            seed_base: Base string for deterministic seeding (e.g., org+project).
        """
        self.output_dir = output_dir
        self.seed_base = seed_base

    def generate(self) -> dict[str, Any] | None:
        """Generate predictions stub file.

        Returns:
            Dict with predictions data if generated, None otherwise.

        Raises:
            StubGenerationError: If ALLOW_ML_STUBS env var not set.
        """
        if not os.environ.get("ALLOW_ML_STUBS") == "1":
            raise StubGenerationError(
                "Stub generation requires ALLOW_ML_STUBS=1 environment variable. "
                "This is a safety gate to prevent accidental use of synthetic data."
            )

        predictions_dir = self.output_dir / "predictions"
        predictions_dir.mkdir(parents=True, exist_ok=True)

        forecasts = []
        today = date.today()
        # Monday-align to start of current week
        start_monday = today - timedelta(days=today.weekday())

        for metric, unit in self.METRICS:
            values = []
            for week_offset in range(self.HORIZON_WEEKS):
                period_start = start_monday + timedelta(weeks=week_offset)

                # Deterministic seed per metric+period
                seed_str = f"{self.seed_base}:{metric}:{period_start.isoformat()}"
                seed = int(hashlib.sha256(seed_str.encode()).hexdigest()[:8], 16)
                rng = random.Random(seed)  # noqa: S311 - intentional for deterministic stubs

                # Generate synthetic values based on metric type
                if metric == "pr_throughput":
                    base_value = rng.randint(15, 45)
                    variance = rng.randint(3, 10)
                else:  # time metrics in minutes
                    base_value = rng.randint(120, 480)
                    variance = rng.randint(30, 120)

                values.append(
                    {
                        "period_start": period_start.isoformat(),
                        "predicted": base_value,
                        "lower_bound": max(0, base_value - variance),
                        "upper_bound": base_value + variance,
                    }
                )

            forecasts.append(
                {
                    "metric": metric,
                    "unit": unit,
                    "horizon_weeks": self.HORIZON_WEEKS,
                    "values": values,
                }
            )

        predictions = {
            "schema_version": PREDICTIONS_SCHEMA_VERSION,
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "is_stub": True,
            "generated_by": STUB_GENERATOR_ID,
            "forecasts": forecasts,
        }

        # Write file
        file_path = predictions_dir / "trends.json"
        with file_path.open("w", encoding="utf-8") as f:
            json.dump(predictions, f, indent=2, sort_keys=True)

        logger.info("Generated predictions/trends.json (stub data)")
        return predictions


class InsightsGenerator:
    """Generate AI insights stub data for Phase 3.5.

    Produces deterministic synthetic insights using a stable seed.
    Only enabled with --enable-ml-stubs AND ALLOW_ML_STUBS=1 env var.
    """

    # Sample insight templates for stub generation
    INSIGHT_TEMPLATES = [
        {
            "category": "bottleneck",
            "severity": "warning",
            "title": "Code review latency increasing",
            "description": "Average time from PR creation to first review has increased "
            "by 15% over the past 4 weeks. This may indicate reviewer capacity constraints.",
        },
        {
            "category": "trend",
            "severity": "info",
            "title": "PR throughput stable",
            "description": "Weekly PR merge rate has remained consistent at approximately "
            "25-30 PRs per week over the analyzed period.",
        },
        {
            "category": "anomaly",
            "severity": "critical",
            "title": "Unusual cycle time spike detected",
            "description": "P90 cycle time increased significantly in the most recent week, "
            "exceeding the historical 95th percentile threshold.",
        },
    ]

    def __init__(
        self,
        output_dir: Path,
        seed_base: str = "",
    ) -> None:
        """Initialize the insights generator.

        Args:
            output_dir: Directory for output files.
            seed_base: Base string for deterministic seeding.
        """
        self.output_dir = output_dir
        self.seed_base = seed_base

    def generate(self) -> dict[str, Any] | None:
        """Generate insights stub file.

        Returns:
            Dict with insights data if generated, None otherwise.

        Raises:
            StubGenerationError: If ALLOW_ML_STUBS env var not set.
        """
        if not os.environ.get("ALLOW_ML_STUBS") == "1":
            raise StubGenerationError(
                "Stub generation requires ALLOW_ML_STUBS=1 environment variable."
            )

        insights_dir = self.output_dir / "insights"
        insights_dir.mkdir(parents=True, exist_ok=True)

        # Deterministic selection of insights based on seed
        seed_str = f"{self.seed_base}:insights"
        seed = int(hashlib.sha256(seed_str.encode()).hexdigest()[:8], 16)
        rng = random.Random(seed)  # noqa: S311 - intentional for deterministic stubs

        # Generate 2-3 insights from templates
        num_insights = rng.randint(2, 3)
        selected_templates = rng.sample(
            self.INSIGHT_TEMPLATES, min(num_insights, len(self.INSIGHT_TEMPLATES))
        )

        insights_list = []
        for i, template in enumerate(selected_templates):
            insight_id = hashlib.sha256(
                f"{self.seed_base}:insight:{i}".encode()
            ).hexdigest()[:12]

            insights_list.append(
                {
                    "id": f"stub-{insight_id}",
                    "category": template["category"],
                    "severity": template["severity"],
                    "title": template["title"],
                    "description": template["description"],
                    "affected_entities": [
                        f"project:{self.seed_base.split(':')[0] if ':' in self.seed_base else 'default'}"
                    ],
                    "evidence_refs": [],
                }
            )

        insights = {
            "schema_version": INSIGHTS_SCHEMA_VERSION,
            "generated_at": datetime.now(timezone.utc).isoformat(),
            "is_stub": True,
            "generated_by": STUB_GENERATOR_ID,
            "insights": insights_list,
        }

        # Write file
        file_path = insights_dir / "summary.json"
        with file_path.open("w", encoding="utf-8") as f:
            json.dump(insights, f, indent=2, sort_keys=True)

        logger.info("Generated insights/summary.json (stub data)")
        return insights
```
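The stub generators are double-gated: a CLI flag (`--stub-mode` or the deprecated `--enable-ml-stubs`) selects them, and `ALLOW_ML_STUBS=1` must also be set in the environment or `generate()` raises `StubGenerationError`. A small sketch of that gate, illustrative rather than shipped code:

```python
# Illustration of the ALLOW_ML_STUBS safety gate (not part of the package).
import os
from pathlib import Path

from ado_git_repo_insights.transform.aggregators import (
    PredictionGenerator,
    StubGenerationError,
)

gen = PredictionGenerator(Path("artifacts"), seed_base="myorg:myproject")
try:
    gen.generate()  # raises unless ALLOW_ML_STUBS=1 is set
except StubGenerationError as exc:
    print(exc)

os.environ["ALLOW_ML_STUBS"] = "1"
trends = gen.generate()  # writes artifacts/predictions/trends.json with "is_stub": true
print(trends["generated_by"])  # "phase3.5-stub-v1"
```

Because each forecast value is seeded from `seed_base`, the metric name, and the period start date, repeated runs over the same week produce identical stub output, which is what makes the stubs safe to diff in tests.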