ado-git-repo-insights 1.2.1__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff shows the changes between package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registries.
Files changed (28)
  1. ado_git_repo_insights/__init__.py +3 -3
  2. ado_git_repo_insights/cli.py +703 -354
  3. ado_git_repo_insights/config.py +186 -186
  4. ado_git_repo_insights/extractor/__init__.py +1 -1
  5. ado_git_repo_insights/extractor/ado_client.py +452 -246
  6. ado_git_repo_insights/extractor/pr_extractor.py +239 -239
  7. ado_git_repo_insights/ml/__init__.py +13 -0
  8. ado_git_repo_insights/ml/date_utils.py +70 -0
  9. ado_git_repo_insights/ml/forecaster.py +288 -0
  10. ado_git_repo_insights/ml/insights.py +497 -0
  11. ado_git_repo_insights/persistence/__init__.py +1 -1
  12. ado_git_repo_insights/persistence/database.py +193 -193
  13. ado_git_repo_insights/persistence/models.py +207 -145
  14. ado_git_repo_insights/persistence/repository.py +662 -376
  15. ado_git_repo_insights/transform/__init__.py +1 -1
  16. ado_git_repo_insights/transform/aggregators.py +950 -0
  17. ado_git_repo_insights/transform/csv_generator.py +132 -132
  18. ado_git_repo_insights/utils/__init__.py +1 -1
  19. ado_git_repo_insights/utils/datetime_utils.py +101 -101
  20. ado_git_repo_insights/utils/logging_config.py +172 -172
  21. ado_git_repo_insights/utils/run_summary.py +207 -206
  22. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/METADATA +56 -15
  23. ado_git_repo_insights-2.7.4.dist-info/RECORD +27 -0
  24. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/licenses/LICENSE +21 -21
  25. ado_git_repo_insights-1.2.1.dist-info/RECORD +0 -22
  26. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/WHEEL +0 -0
  27. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/entry_points.txt +0 -0
  28. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/top_level.txt +0 -0
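The headline changes in 2.7.4 are the new ml/ package (forecaster, insights), the new transform/aggregators.py module, and a roughly doubled cli.py that wires them up behind a generate-aggregates subcommand plus opt-in comment extraction on extract. The snippet below is a minimal, hypothetical sketch of exercising that new argparse surface from Python, assuming the 2.7.4 wheel is installed; it relies only on create_parser() as shown in the cli.py diff that follows and is not part of the package itself.

from ado_git_repo_insights.cli import create_parser

# Sketch: parse the new generate-aggregates subcommand with the Phase 5 ML
# flags. Option names are taken from the cli.py diff below; the values are
# illustrative placeholders.
parser = create_parser()
agg_args = parser.parse_args([
    "generate-aggregates",
    "--database", "ado-insights.sqlite",
    "--output", "aggregates_output",
    "--enable-predictions",   # Prophet-based forecasting (ml/forecaster.py)
    "--insights-dry-run",     # build the insights prompt without calling OpenAI
])
print(agg_args.command, agg_args.enable_predictions, agg_args.insights_dry_run)

# The extract subcommand gains opt-in, rate-limited comment extraction.
extract_args = parser.parse_args([
    "extract",
    "--pat", "<personal-access-token>",   # placeholder, never a real PAT
    "--organization", "my-org",           # placeholder
    "--projects", "ProjectA,ProjectB",    # placeholder
    "--include-comments",
    "--comments-max-prs-per-run", "50",
])
print(extract_args.include_comments, extract_args.comments_max_prs_per_run)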
@@ -1,354 +1,703 @@
1
- """CLI entry point for ado-git-repo-insights."""
2
-
3
- from __future__ import annotations
4
-
5
- import argparse
6
- import logging
7
- import sys
8
- import time
9
- from datetime import date
10
- from pathlib import Path
11
- from typing import TYPE_CHECKING
12
-
13
- from .config import ConfigurationError, load_config
14
- from .extractor.ado_client import ADOClient, ExtractionError
15
- from .extractor.pr_extractor import PRExtractor
16
- from .persistence.database import DatabaseError, DatabaseManager
17
- from .transform.csv_generator import CSVGenerationError, CSVGenerator
18
- from .utils.logging_config import LoggingConfig, setup_logging
19
- from .utils.run_summary import (
20
- RunCounts,
21
- RunSummary,
22
- RunTimings,
23
- create_minimal_summary,
24
- get_git_sha,
25
- get_tool_version,
26
- )
27
-
28
- if TYPE_CHECKING:
29
- from argparse import Namespace
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
-
34
- def create_parser() -> argparse.ArgumentParser: # pragma: no cover
35
- """Create the argument parser for the CLI."""
36
- parser = argparse.ArgumentParser(
37
- prog="ado-insights",
38
- description="Extract Azure DevOps PR metrics and generate PowerBI-compatible CSVs.",
39
- )
40
-
41
- # Global options
42
- parser.add_argument(
43
- "--log-format",
44
- type=str,
45
- choices=["console", "jsonl"],
46
- default="console",
47
- help="Log format: console (human-readable) or jsonl (structured)",
48
- )
49
- parser.add_argument(
50
- "--artifacts-dir",
51
- type=Path,
52
- default=Path("run_artifacts"),
53
- help="Directory for run artifacts (summary, logs)",
54
- )
55
-
56
- subparsers = parser.add_subparsers(dest="command", required=True)
57
-
58
- # Extract command
59
- extract_parser = subparsers.add_parser(
60
- "extract",
61
- help="Extract PR data from Azure DevOps",
62
- )
63
- extract_parser.add_argument(
64
- "--organization",
65
- type=str,
66
- help="Azure DevOps organization name",
67
- )
68
- extract_parser.add_argument(
69
- "--projects",
70
- type=str,
71
- help="Comma-separated list of project names",
72
- )
73
- extract_parser.add_argument(
74
- "--pat",
75
- type=str,
76
- required=True,
77
- help="Personal Access Token with Code (Read) scope",
78
- )
79
- extract_parser.add_argument(
80
- "--config",
81
- type=Path,
82
- help="Path to config.yaml file",
83
- )
84
- extract_parser.add_argument(
85
- "--database",
86
- type=Path,
87
- default=Path("ado-insights.sqlite"),
88
- help="Path to SQLite database file",
89
- )
90
- extract_parser.add_argument(
91
- "--start-date",
92
- type=str,
93
- help="Override start date (YYYY-MM-DD)",
94
- )
95
- extract_parser.add_argument(
96
- "--end-date",
97
- type=str,
98
- help="Override end date (YYYY-MM-DD)",
99
- )
100
- extract_parser.add_argument(
101
- "--backfill-days",
102
- type=int,
103
- help="Number of days to backfill for convergence",
104
- )
105
-
106
- # Generate CSV command
107
- csv_parser = subparsers.add_parser(
108
- "generate-csv",
109
- help="Generate CSV files from SQLite database",
110
- )
111
- csv_parser.add_argument(
112
- "--database",
113
- type=Path,
114
- required=True,
115
- help="Path to SQLite database file",
116
- )
117
- csv_parser.add_argument(
118
- "--output",
119
- type=Path,
120
- default=Path("csv_output"),
121
- help="Output directory for CSV files",
122
- )
123
-
124
- return parser
125
-
126
-
127
- def cmd_extract(args: Namespace) -> int:
128
- """Execute the extract command."""
129
- start_time = time.perf_counter()
130
- timing = RunTimings()
131
- counts = RunCounts()
132
- warnings_list: list[str] = []
133
- per_project_status: dict[str, str] = {}
134
- first_fatal_error: str | None = None
135
-
136
- try:
137
- # Load and validate configuration
138
- config = load_config(
139
- config_path=args.config,
140
- organization=args.organization,
141
- projects=args.projects,
142
- pat=args.pat,
143
- database=args.database,
144
- start_date=args.start_date,
145
- end_date=args.end_date,
146
- backfill_days=args.backfill_days,
147
- )
148
- config.log_summary()
149
-
150
- # Connect to database
151
- extract_start = time.perf_counter()
152
- db = DatabaseManager(config.database)
153
- db.connect()
154
-
155
- try:
156
- # Create ADO client
157
- client = ADOClient(
158
- organization=config.organization,
159
- pat=config.pat, # Invariant 19: PAT handled securely
160
- config=config.api,
161
- )
162
-
163
- # Test connection
164
- client.test_connection(config.projects[0])
165
-
166
- # Run extraction
167
- extractor = PRExtractor(client, db, config)
168
- summary = extractor.extract_all(backfill_days=args.backfill_days)
169
-
170
- # Collect timing
171
- timing.extract_seconds = time.perf_counter() - extract_start
172
-
173
- # Collect counts and warnings
174
- counts.prs_fetched = summary.total_prs
175
- if hasattr(summary, "warnings"):
176
- warnings_list.extend(summary.warnings)
177
-
178
- # Collect per-project status
179
- for project_result in summary.projects:
180
- status = "success" if project_result.success else "failed"
181
- per_project_status[project_result.project] = status
182
-
183
- # Capture first fatal error
184
- if not project_result.success and first_fatal_error is None:
185
- first_fatal_error = (
186
- project_result.error
187
- or f"Extraction failed for project: {project_result.project}"
188
- )
189
-
190
- # Fail-fast: any project failure = exit 1
191
- if not summary.success:
192
- logger.error("Extraction failed")
193
- timing.total_seconds = time.perf_counter() - start_time
194
-
195
- # Write failure summary
196
- run_summary = RunSummary(
197
- tool_version=get_tool_version(),
198
- git_sha=get_git_sha(),
199
- organization=config.organization,
200
- projects=config.projects,
201
- date_range_start=str(config.date_range.start or date.today()),
202
- date_range_end=str(config.date_range.end or date.today()),
203
- counts=counts,
204
- timings=timing,
205
- warnings=warnings_list,
206
- final_status="failed",
207
- per_project_status=per_project_status,
208
- first_fatal_error=first_fatal_error,
209
- )
210
- run_summary.write(args.artifacts_dir / "run_summary.json")
211
- run_summary.print_final_line()
212
- run_summary.emit_ado_commands()
213
- return 1
214
-
215
- logger.info(f"Extraction complete: {summary.total_prs} PRs")
216
- timing.total_seconds = time.perf_counter() - start_time
217
-
218
- # Write success summary
219
- run_summary = RunSummary(
220
- tool_version=get_tool_version(),
221
- git_sha=get_git_sha(),
222
- organization=config.organization,
223
- projects=config.projects,
224
- date_range_start=str(config.date_range.start or date.today()),
225
- date_range_end=str(config.date_range.end or date.today()),
226
- counts=counts,
227
- timings=timing,
228
- warnings=warnings_list,
229
- final_status="success",
230
- per_project_status=per_project_status,
231
- first_fatal_error=None,
232
- )
233
- run_summary.write(args.artifacts_dir / "run_summary.json")
234
- run_summary.print_final_line()
235
- run_summary.emit_ado_commands()
236
- return 0
237
-
238
- finally:
239
- db.close()
240
-
241
- except ConfigurationError as e:
242
- logger.error(f"Configuration error: {e}")
243
- # P2 Fix: Write minimal summary for caught errors
244
- minimal_summary = create_minimal_summary(
245
- f"Configuration error: {e}", args.artifacts_dir
246
- )
247
- minimal_summary.write(args.artifacts_dir / "run_summary.json")
248
- return 1
249
- except DatabaseError as e:
250
- logger.error(f"Database error: {e}")
251
- # P2 Fix: Write minimal summary for caught errors
252
- minimal_summary = create_minimal_summary(
253
- f"Database error: {e}", args.artifacts_dir
254
- )
255
- minimal_summary.write(args.artifacts_dir / "run_summary.json")
256
- return 1
257
- except ExtractionError as e:
258
- logger.error(f"Extraction error: {e}")
259
- # P2 Fix: Write minimal summary for caught errors
260
- minimal_summary = create_minimal_summary(
261
- f"Extraction error: {e}", args.artifacts_dir
262
- )
263
- minimal_summary.write(args.artifacts_dir / "run_summary.json")
264
- return 1
265
-
266
-
267
- def cmd_generate_csv(args: Namespace) -> int:
268
- """Execute the generate-csv command."""
269
- logger.info("Generating CSV files...")
270
- logger.info(f"Database: {args.database}")
271
- logger.info(f"Output: {args.output}")
272
-
273
- if not args.database.exists():
274
- logger.error(f"Database not found: {args.database}")
275
- return 1
276
-
277
- try:
278
- db = DatabaseManager(args.database)
279
- db.connect()
280
-
281
- try:
282
- generator = CSVGenerator(db, args.output)
283
- results = generator.generate_all()
284
-
285
- # Validate schemas (Invariant 1)
286
- generator.validate_schemas()
287
-
288
- logger.info("CSV generation complete:")
289
- for table, count in results.items():
290
- logger.info(f" {table}: {count} rows")
291
-
292
- return 0
293
-
294
- finally:
295
- db.close()
296
-
297
- except DatabaseError as e:
298
- logger.error(f"Database error: {e}")
299
- return 1
300
- except CSVGenerationError as e:
301
- logger.error(f"CSV generation error: {e}")
302
- return 1
303
-
304
-
305
- def main() -> int:
306
- """Main entry point for the CLI."""
307
- parser = create_parser()
308
- args = parser.parse_args()
309
-
310
- # Setup logging early
311
- log_config = LoggingConfig(
312
- format=getattr(args, "log_format", "console"),
313
- artifacts_dir=getattr(args, "artifacts_dir", Path("run_artifacts")),
314
- )
315
- setup_logging(log_config)
316
-
317
- # Ensure artifacts directory exists
318
- artifacts_dir = getattr(args, "artifacts_dir", Path("run_artifacts"))
319
- artifacts_dir.mkdir(parents=True, exist_ok=True)
320
-
321
- summary_path = artifacts_dir / "run_summary.json"
322
-
323
- try:
324
- if args.command == "extract":
325
- return cmd_extract(args)
326
- elif args.command == "generate-csv":
327
- return cmd_generate_csv(args)
328
- else:
329
- parser.print_help()
330
- return 1
331
- except KeyboardInterrupt:
332
- logger.info("Operation cancelled by user")
333
-
334
- # Write minimal failure summary if success summary doesn't exist
335
- if not summary_path.exists():
336
- minimal_summary = create_minimal_summary(
337
- "Operation cancelled by user", artifacts_dir
338
- )
339
- minimal_summary.write(summary_path)
340
-
341
- return 130
342
- except Exception as e:
343
- logger.exception(f"Unexpected error: {e}")
344
-
345
- # Write minimal failure summary if success summary doesn't exist
346
- if not summary_path.exists():
347
- minimal_summary = create_minimal_summary(str(e), artifacts_dir)
348
- minimal_summary.write(summary_path)
349
-
350
- return 1
351
-
352
-
353
- if __name__ == "__main__":
354
- sys.exit(main())
1
+ """CLI entry point for ado-git-repo-insights."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import logging
7
+ import sys
8
+ import time
9
+ from datetime import date
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING
12
+
13
+ from .config import ConfigurationError, load_config
14
+ from .extractor.ado_client import ADOClient, ExtractionError
15
+ from .extractor.pr_extractor import PRExtractor
16
+ from .persistence.database import DatabaseError, DatabaseManager
17
+ from .transform.aggregators import (
18
+ AggregateGenerator,
19
+ AggregationError,
20
+ StubGenerationError,
21
+ )
22
+ from .transform.csv_generator import CSVGenerationError, CSVGenerator
23
+ from .utils.logging_config import LoggingConfig, setup_logging
24
+ from .utils.run_summary import (
25
+ RunCounts,
26
+ RunSummary,
27
+ RunTimings,
28
+ create_minimal_summary,
29
+ get_git_sha,
30
+ get_tool_version,
31
+ )
32
+
33
+ if TYPE_CHECKING:
34
+ from argparse import Namespace
35
+
36
+ from .config import Config
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ def create_parser() -> argparse.ArgumentParser: # pragma: no cover
42
+ """Create the argument parser for the CLI."""
43
+ parser = argparse.ArgumentParser(
44
+ prog="ado-insights",
45
+ description="Extract Azure DevOps PR metrics and generate PowerBI-compatible CSVs.",
46
+ )
47
+
48
+ # Global options
49
+ parser.add_argument(
50
+ "--log-format",
51
+ type=str,
52
+ choices=["console", "jsonl"],
53
+ default="console",
54
+ help="Log format: console (human-readable) or jsonl (structured)",
55
+ )
56
+ parser.add_argument(
57
+ "--artifacts-dir",
58
+ type=Path,
59
+ default=Path("run_artifacts"),
60
+ help="Directory for run artifacts (summary, logs)",
61
+ )
62
+
63
+ subparsers = parser.add_subparsers(dest="command", required=True)
64
+
65
+ # Extract command
66
+ extract_parser = subparsers.add_parser(
67
+ "extract",
68
+ help="Extract PR data from Azure DevOps",
69
+ )
70
+ extract_parser.add_argument(
71
+ "--organization",
72
+ type=str,
73
+ help="Azure DevOps organization name",
74
+ )
75
+ extract_parser.add_argument(
76
+ "--projects",
77
+ type=str,
78
+ help="Comma-separated list of project names",
79
+ )
80
+ extract_parser.add_argument(
81
+ "--pat",
82
+ type=str,
83
+ required=True,
84
+ help="Personal Access Token with Code (Read) scope",
85
+ )
86
+ extract_parser.add_argument(
87
+ "--config",
88
+ type=Path,
89
+ help="Path to config.yaml file",
90
+ )
91
+ extract_parser.add_argument(
92
+ "--database",
93
+ type=Path,
94
+ default=Path("ado-insights.sqlite"),
95
+ help="Path to SQLite database file",
96
+ )
97
+ extract_parser.add_argument(
98
+ "--start-date",
99
+ type=str,
100
+ help="Override start date (YYYY-MM-DD)",
101
+ )
102
+ extract_parser.add_argument(
103
+ "--end-date",
104
+ type=str,
105
+ help="Override end date (YYYY-MM-DD)",
106
+ )
107
+ extract_parser.add_argument(
108
+ "--backfill-days",
109
+ type=int,
110
+ help="Number of days to backfill for convergence",
111
+ )
112
+ # Phase 3.4: Comments extraction (§6)
113
+ extract_parser.add_argument(
114
+ "--include-comments",
115
+ action="store_true",
116
+ default=False,
117
+ help="Extract PR threads and comments (feature-flagged)",
118
+ )
119
+ extract_parser.add_argument(
120
+ "--comments-max-prs-per-run",
121
+ type=int,
122
+ default=100,
123
+ help="Max PRs to fetch comments for per run (rate limit protection)",
124
+ )
125
+ extract_parser.add_argument(
126
+ "--comments-max-threads-per-pr",
127
+ type=int,
128
+ default=50,
129
+ help="Max threads to fetch per PR (optional limit)",
130
+ )
131
+
132
+ # Generate CSV command
133
+ csv_parser = subparsers.add_parser(
134
+ "generate-csv",
135
+ help="Generate CSV files from SQLite database",
136
+ )
137
+ csv_parser.add_argument(
138
+ "--database",
139
+ type=Path,
140
+ required=True,
141
+ help="Path to SQLite database file",
142
+ )
143
+ csv_parser.add_argument(
144
+ "--output",
145
+ type=Path,
146
+ default=Path("csv_output"),
147
+ help="Output directory for CSV files",
148
+ )
149
+
150
+ # Generate Aggregates command (Phase 3)
151
+ agg_parser = subparsers.add_parser(
152
+ "generate-aggregates",
153
+ help="Generate chunked JSON aggregates for UI (Phase 3)",
154
+ )
155
+ agg_parser.add_argument(
156
+ "--database",
157
+ type=Path,
158
+ required=True,
159
+ help="Path to SQLite database file",
160
+ )
161
+ agg_parser.add_argument(
162
+ "--output",
163
+ type=Path,
164
+ default=Path("aggregates_output"),
165
+ help="Output directory for aggregate files",
166
+ )
167
+ agg_parser.add_argument(
168
+ "--run-id",
169
+ type=str,
170
+ default="",
171
+ help="Pipeline run ID for manifest metadata",
172
+ )
173
+ # Phase 3.5: Stub generation (requires ALLOW_ML_STUBS=1 env var)
174
+ agg_parser.add_argument(
175
+ "--enable-ml-stubs",
176
+ action="store_true",
177
+ default=False,
178
+ help="Generate stub predictions/insights (requires ALLOW_ML_STUBS=1 env var)",
179
+ )
180
+ agg_parser.add_argument(
181
+ "--seed-base",
182
+ type=str,
183
+ default="",
184
+ help="Base string for deterministic stub seeding",
185
+ )
186
+ # Phase 5: ML feature flags
187
+ agg_parser.add_argument(
188
+ "--enable-predictions",
189
+ action="store_true",
190
+ default=False,
191
+ help="Enable Prophet-based trend forecasting (requires prophet package)",
192
+ )
193
+ agg_parser.add_argument(
194
+ "--enable-insights",
195
+ action="store_true",
196
+ default=False,
197
+ help="Enable OpenAI-based insights (requires openai package and OPENAI_API_KEY)",
198
+ )
199
+ agg_parser.add_argument(
200
+ "--insights-max-tokens",
201
+ type=int,
202
+ default=1000,
203
+ help="Maximum tokens for OpenAI insights response (default: 1000)",
204
+ )
205
+ agg_parser.add_argument(
206
+ "--insights-cache-ttl-hours",
207
+ type=int,
208
+ default=24,
209
+ help="Cache TTL for insights in hours (default: 24)",
210
+ )
211
+ agg_parser.add_argument(
212
+ "--insights-dry-run",
213
+ action="store_true",
214
+ default=False,
215
+ help="Generate prompt artifact without calling OpenAI API",
216
+ )
217
+ # Hidden flag for stub mode (testing only, not in help)
218
+ agg_parser.add_argument(
219
+ "--stub-mode",
220
+ action="store_true",
221
+ default=False,
222
+ help=argparse.SUPPRESS, # Hidden from help
223
+ )
224
+
225
+ return parser
226
+
227
+
228
+ def _extract_comments(
229
+ client: ADOClient,
230
+ db: DatabaseManager,
231
+ config: Config,
232
+ max_prs: int,
233
+ max_threads_per_pr: int,
234
+ ) -> dict[str, int | bool]:
235
+ """Extract PR threads and comments with rate limiting.
236
+
237
+ §6: Incremental strategy - only fetch for PRs in backfill window.
238
+ Rate limit protection via max_prs and max_threads_per_pr.
239
+
240
+ Args:
241
+ client: ADO API client.
242
+ db: Database manager.
243
+ config: Application config.
244
+ max_prs: Maximum PRs to process per run.
245
+ max_threads_per_pr: Maximum threads per PR (0 = unlimited).
246
+
247
+ Returns:
248
+ Stats dict with threads, comments, prs_processed, capped.
249
+ """
250
+ import json
251
+
252
+ from .persistence.repository import PRRepository
253
+
254
+ repo = PRRepository(db)
255
+ stats: dict[str, int | bool] = {
256
+ "threads": 0,
257
+ "comments": 0,
258
+ "prs_processed": 0,
259
+ "capped": False,
260
+ }
261
+
262
+ # Get recently completed PRs to extract comments for
263
+ # Limit by max_prs to avoid rate limiting
264
+ cursor = db.execute(
265
+ """
266
+ SELECT pull_request_uid, pull_request_id, repository_id
267
+ FROM pull_requests
268
+ WHERE status = 'completed'
269
+ ORDER BY closed_date DESC
270
+ LIMIT ?
271
+ """,
272
+ (max_prs,),
273
+ )
274
+ prs_to_process = cursor.fetchall()
275
+
276
+ if len(prs_to_process) >= max_prs:
277
+ stats["capped"] = True
278
+
279
+ for pr_row in prs_to_process:
280
+ pr_uid = pr_row["pull_request_uid"]
281
+ pr_id = pr_row["pull_request_id"]
282
+ repo_id = pr_row["repository_id"]
283
+
284
+ # §6: Incremental sync - check last_updated
285
+ last_updated = repo.get_thread_last_updated(pr_uid)
286
+
287
+ try:
288
+ # Fetch threads from API
289
+ threads = client.get_pr_threads(
290
+ project=config.projects[0], # TODO: get project from PR
291
+ repository_id=repo_id,
292
+ pull_request_id=pr_id,
293
+ )
294
+
295
+ # Apply max_threads_per_pr limit
296
+ if max_threads_per_pr > 0 and len(threads) > max_threads_per_pr:
297
+ threads = threads[:max_threads_per_pr]
298
+
299
+ for thread in threads:
300
+ thread_id = str(thread.get("id", ""))
301
+ thread_updated = thread.get("lastUpdatedDate", "")
302
+ thread_created = thread.get("publishedDate", thread_updated)
303
+ thread_status = thread.get("status", "unknown")
304
+
305
+ # §6: Skip unchanged threads (incremental sync)
306
+ if last_updated and thread_updated <= last_updated:
307
+ continue
308
+
309
+ # Serialize thread context
310
+ thread_context = None
311
+ if "threadContext" in thread:
312
+ thread_context = json.dumps(thread["threadContext"])
313
+
314
+ # Upsert thread
315
+ repo.upsert_thread(
316
+ thread_id=thread_id,
317
+ pull_request_uid=pr_uid,
318
+ status=thread_status,
319
+ thread_context=thread_context,
320
+ last_updated=thread_updated,
321
+ created_at=thread_created,
322
+ is_deleted=thread.get("isDeleted", False),
323
+ )
324
+ stats["threads"] = int(stats["threads"]) + 1
325
+
326
+ # Process comments in thread
327
+ for comment in thread.get("comments", []):
328
+ comment_id = str(comment.get("id", ""))
329
+ author = comment.get("author", {})
330
+ author_id = author.get("id", "unknown")
331
+
332
+ # Upsert author first to avoid FK violation (same as P2 fix)
333
+ repo.upsert_user(
334
+ user_id=author_id,
335
+ display_name=author.get("displayName", "Unknown"),
336
+ email=author.get("uniqueName"),
337
+ )
338
+
339
+ repo.upsert_comment(
340
+ comment_id=comment_id,
341
+ thread_id=thread_id,
342
+ pull_request_uid=pr_uid,
343
+ author_id=author_id,
344
+ content=comment.get("content"),
345
+ comment_type=comment.get("commentType", "text"),
346
+ created_at=comment.get("publishedDate", ""),
347
+ last_updated=comment.get("lastUpdatedDate"),
348
+ is_deleted=comment.get("isDeleted", False),
349
+ )
350
+ stats["comments"] = int(stats["comments"]) + 1
351
+
352
+ stats["prs_processed"] = int(stats["prs_processed"]) + 1
353
+
354
+ except ExtractionError as e:
355
+ logger.warning(f"Failed to extract comments for PR {pr_uid}: {e}")
356
+ # Continue with other PRs - don't fail entire run
357
+
358
+ db.connection.commit()
359
+ return stats
360
+
361
+
362
+ def cmd_extract(args: Namespace) -> int:
363
+ """Execute the extract command."""
364
+ start_time = time.perf_counter()
365
+ timing = RunTimings()
366
+ counts = RunCounts()
367
+ warnings_list: list[str] = []
368
+ per_project_status: dict[str, str] = {}
369
+ first_fatal_error: str | None = None
370
+
371
+ try:
372
+ # Load and validate configuration
373
+ config = load_config(
374
+ config_path=args.config,
375
+ organization=args.organization,
376
+ projects=args.projects,
377
+ pat=args.pat,
378
+ database=args.database,
379
+ start_date=args.start_date,
380
+ end_date=args.end_date,
381
+ backfill_days=args.backfill_days,
382
+ )
383
+ config.log_summary()
384
+
385
+ # Connect to database
386
+ extract_start = time.perf_counter()
387
+ db = DatabaseManager(config.database)
388
+ db.connect()
389
+
390
+ try:
391
+ # Create ADO client
392
+ client = ADOClient(
393
+ organization=config.organization,
394
+ pat=config.pat, # Invariant 19: PAT handled securely
395
+ config=config.api,
396
+ )
397
+
398
+ # Test connection
399
+ client.test_connection(config.projects[0])
400
+
401
+ # Run extraction
402
+ extractor = PRExtractor(client, db, config)
403
+ summary = extractor.extract_all(backfill_days=args.backfill_days)
404
+
405
+ # Collect timing
406
+ timing.extract_seconds = time.perf_counter() - extract_start
407
+
408
+ # Collect counts and warnings
409
+ counts.prs_fetched = summary.total_prs
410
+ if hasattr(summary, "warnings"):
411
+ warnings_list.extend(summary.warnings)
412
+
413
+ # Collect per-project status
414
+ for project_result in summary.projects:
415
+ status = "success" if project_result.success else "failed"
416
+ per_project_status[project_result.project] = status
417
+
418
+ # Capture first fatal error
419
+ if not project_result.success and first_fatal_error is None:
420
+ first_fatal_error = (
421
+ project_result.error
422
+ or f"Extraction failed for project: {project_result.project}"
423
+ )
424
+
425
+ # Fail-fast: any project failure = exit 1
426
+ if not summary.success:
427
+ logger.error("Extraction failed")
428
+ timing.total_seconds = time.perf_counter() - start_time
429
+
430
+ # Write failure summary
431
+ run_summary = RunSummary(
432
+ tool_version=get_tool_version(),
433
+ git_sha=get_git_sha(),
434
+ organization=config.organization,
435
+ projects=config.projects,
436
+ date_range_start=str(config.date_range.start or date.today()),
437
+ date_range_end=str(config.date_range.end or date.today()),
438
+ counts=counts,
439
+ timings=timing,
440
+ warnings=warnings_list,
441
+ final_status="failed",
442
+ per_project_status=per_project_status,
443
+ first_fatal_error=first_fatal_error,
444
+ )
445
+ run_summary.write(args.artifacts_dir / "run_summary.json")
446
+ run_summary.print_final_line()
447
+ run_summary.emit_ado_commands()
448
+ return 1
449
+
450
+ logger.info(f"Extraction complete: {summary.total_prs} PRs")
451
+
452
+ # Phase 3.4: Extract comments if enabled (§6)
453
+ comments_stats = {
454
+ "threads": 0,
455
+ "comments": 0,
456
+ "prs_processed": 0,
457
+ "capped": False,
458
+ }
459
+ if getattr(args, "include_comments", False):
460
+ logger.info("Extracting PR comments (--include-comments enabled)")
461
+ comments_stats = _extract_comments(
462
+ client=client,
463
+ db=db,
464
+ config=config,
465
+ max_prs=getattr(args, "comments_max_prs_per_run", 100),
466
+ max_threads_per_pr=getattr(args, "comments_max_threads_per_pr", 50),
467
+ )
468
+ logger.info(
469
+ f"Comments extraction: {comments_stats['threads']} threads, "
470
+ f"{comments_stats['comments']} comments from {comments_stats['prs_processed']} PRs"
471
+ )
472
+ if comments_stats["capped"]:
473
+ warnings_list.append(
474
+ f"Comments extraction capped at {args.comments_max_prs_per_run} PRs"
475
+ )
476
+
477
+ timing.total_seconds = time.perf_counter() - start_time
478
+
479
+ # Write success summary
480
+ run_summary = RunSummary(
481
+ tool_version=get_tool_version(),
482
+ git_sha=get_git_sha(),
483
+ organization=config.organization,
484
+ projects=config.projects,
485
+ date_range_start=str(config.date_range.start or date.today()),
486
+ date_range_end=str(config.date_range.end or date.today()),
487
+ counts=counts,
488
+ timings=timing,
489
+ warnings=warnings_list,
490
+ final_status="success",
491
+ per_project_status=per_project_status,
492
+ first_fatal_error=None,
493
+ )
494
+ run_summary.write(args.artifacts_dir / "run_summary.json")
495
+ run_summary.print_final_line()
496
+ run_summary.emit_ado_commands()
497
+ return 0
498
+
499
+ finally:
500
+ db.close()
501
+
502
+ except ConfigurationError as e:
503
+ logger.error(f"Configuration error: {e}")
504
+ # P2 Fix: Write minimal summary for caught errors
505
+ minimal_summary = create_minimal_summary(
506
+ f"Configuration error: {e}", args.artifacts_dir
507
+ )
508
+ minimal_summary.write(args.artifacts_dir / "run_summary.json")
509
+ return 1
510
+ except DatabaseError as e:
511
+ logger.error(f"Database error: {e}")
512
+ # P2 Fix: Write minimal summary for caught errors
513
+ minimal_summary = create_minimal_summary(
514
+ f"Database error: {e}", args.artifacts_dir
515
+ )
516
+ minimal_summary.write(args.artifacts_dir / "run_summary.json")
517
+ return 1
518
+ except ExtractionError as e:
519
+ logger.error(f"Extraction error: {e}")
520
+ # P2 Fix: Write minimal summary for caught errors
521
+ minimal_summary = create_minimal_summary(
522
+ f"Extraction error: {e}", args.artifacts_dir
523
+ )
524
+ minimal_summary.write(args.artifacts_dir / "run_summary.json")
525
+ return 1
526
+
527
+
528
+ def cmd_generate_csv(args: Namespace) -> int:
529
+ """Execute the generate-csv command."""
530
+ logger.info("Generating CSV files...")
531
+ logger.info(f"Database: {args.database}")
532
+ logger.info(f"Output: {args.output}")
533
+
534
+ if not args.database.exists():
535
+ logger.error(f"Database not found: {args.database}")
536
+ return 1
537
+
538
+ try:
539
+ db = DatabaseManager(args.database)
540
+ db.connect()
541
+
542
+ try:
543
+ generator = CSVGenerator(db, args.output)
544
+ results = generator.generate_all()
545
+
546
+ # Validate schemas (Invariant 1)
547
+ generator.validate_schemas()
548
+
549
+ logger.info("CSV generation complete:")
550
+ for table, count in results.items():
551
+ logger.info(f" {table}: {count} rows")
552
+
553
+ return 0
554
+
555
+ finally:
556
+ db.close()
557
+
558
+ except DatabaseError as e:
559
+ logger.error(f"Database error: {e}")
560
+ return 1
561
+ except CSVGenerationError as e:
562
+ logger.error(f"CSV generation error: {e}")
563
+ return 1
564
+
565
+
566
+ def cmd_generate_aggregates(args: Namespace) -> int:
567
+ """Execute the generate-aggregates command (Phase 3 + Phase 5 ML)."""
568
+ logger.info("Generating JSON aggregates...")
569
+ logger.info(f"Database: {args.database}")
570
+ logger.info(f"Output: {args.output}")
571
+
572
+ if not args.database.exists():
573
+ logger.error(f"Database not found: {args.database}")
574
+ return 1
575
+
576
+ # Phase 5: Early validation for insights
577
+ enable_insights = getattr(args, "enable_insights", False)
578
+ insights_dry_run = getattr(args, "insights_dry_run", False)
579
+ if enable_insights:
580
+ # Check for OPENAI_API_KEY only if NOT in dry-run mode
581
+ # Dry-run doesn't call API so shouldn't require a key
582
+ import os
583
+
584
+ if not insights_dry_run and not os.environ.get("OPENAI_API_KEY"):
585
+ logger.error(
586
+ "OPENAI_API_KEY is required for --enable-insights. "
587
+ "Set the environment variable, or use --insights-dry-run for prompt iteration."
588
+ )
589
+ return 1
590
+
591
+ # Check for openai package (needed even for dry-run to build prompt)
592
+ try:
593
+ import openai # noqa: F401
594
+ except ImportError:
595
+ logger.error(
596
+ "OpenAI SDK not installed. Install ML extras: pip install -e '.[ml]'"
597
+ )
598
+ return 1
599
+
600
+ try:
601
+ db = DatabaseManager(args.database)
602
+ db.connect()
603
+
604
+ try:
605
+ generator = AggregateGenerator(
606
+ db=db,
607
+ output_dir=args.output,
608
+ run_id=args.run_id,
609
+ enable_ml_stubs=getattr(args, "enable_ml_stubs", False),
610
+ seed_base=getattr(args, "seed_base", ""),
611
+ # Phase 5: ML parameters
612
+ enable_predictions=getattr(args, "enable_predictions", False),
613
+ enable_insights=enable_insights,
614
+ insights_max_tokens=getattr(args, "insights_max_tokens", 1000),
615
+ insights_cache_ttl_hours=getattr(args, "insights_cache_ttl_hours", 24),
616
+ insights_dry_run=getattr(args, "insights_dry_run", False),
617
+ stub_mode=getattr(args, "stub_mode", False),
618
+ )
619
+ manifest = generator.generate_all()
620
+
621
+ logger.info("Aggregate generation complete:")
622
+ logger.info(
623
+ f" Weekly rollups: {len(manifest.aggregate_index.weekly_rollups)}"
624
+ )
625
+ logger.info(
626
+ f" Distributions: {len(manifest.aggregate_index.distributions)}"
627
+ )
628
+ logger.info(f" Predictions: {manifest.features.get('predictions', False)}")
629
+ logger.info(f" AI Insights: {manifest.features.get('ai_insights', False)}")
630
+ logger.info(f" Manifest: {args.output / 'dataset-manifest.json'}")
631
+
632
+ if manifest.warnings:
633
+ for warning in manifest.warnings:
634
+ logger.warning(f" ⚠️ {warning}")
635
+
636
+ return 0
637
+
638
+ finally:
639
+ db.close()
640
+
641
+ except DatabaseError as e:
642
+ logger.error(f"Database error: {e}")
643
+ return 1
644
+ except StubGenerationError as e:
645
+ logger.error(f"Stub generation error: {e}")
646
+ return 1
647
+ except AggregationError as e:
648
+ logger.error(f"Aggregation error: {e}")
649
+ return 1
650
+
651
+
652
+ def main() -> int:
653
+ """Main entry point for the CLI."""
654
+ parser = create_parser()
655
+ args = parser.parse_args()
656
+
657
+ # Setup logging early
658
+ log_config = LoggingConfig(
659
+ format=getattr(args, "log_format", "console"),
660
+ artifacts_dir=getattr(args, "artifacts_dir", Path("run_artifacts")),
661
+ )
662
+ setup_logging(log_config)
663
+
664
+ # Ensure artifacts directory exists
665
+ artifacts_dir = getattr(args, "artifacts_dir", Path("run_artifacts"))
666
+ artifacts_dir.mkdir(parents=True, exist_ok=True)
667
+
668
+ summary_path = artifacts_dir / "run_summary.json"
669
+
670
+ try:
671
+ if args.command == "extract":
672
+ return cmd_extract(args)
673
+ elif args.command == "generate-csv":
674
+ return cmd_generate_csv(args)
675
+ elif args.command == "generate-aggregates":
676
+ return cmd_generate_aggregates(args)
677
+ else:
678
+ parser.print_help()
679
+ return 1
680
+ except KeyboardInterrupt:
681
+ logger.info("Operation cancelled by user")
682
+
683
+ # Write minimal failure summary if success summary doesn't exist
684
+ if not summary_path.exists():
685
+ minimal_summary = create_minimal_summary(
686
+ "Operation cancelled by user", artifacts_dir
687
+ )
688
+ minimal_summary.write(summary_path)
689
+
690
+ return 130
691
+ except Exception as e:
692
+ logger.exception(f"Unexpected error: {e}")
693
+
694
+ # Write minimal failure summary if success summary doesn't exist
695
+ if not summary_path.exists():
696
+ minimal_summary = create_minimal_summary(str(e), artifacts_dir)
697
+ minimal_summary.write(summary_path)
698
+
699
+ return 1
700
+
701
+
702
+ if __name__ == "__main__":
703
+ sys.exit(main())
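For orientation on the new output: cmd_generate_aggregates above logs that it writes a dataset-manifest.json into the output directory along with weekly rollup and distribution chunks, feature flags, and warnings. A hypothetical consumer might read that manifest roughly as below; the JSON field names are assumptions inferred from the attribute access in the logging calls (manifest.aggregate_index, manifest.features, manifest.warnings), not something this diff confirms.

import json
from pathlib import Path

# Assumed manifest layout, mirroring the attributes logged by
# cmd_generate_aggregates; adjust to the real schema if it differs.
manifest = json.loads(Path("aggregates_output/dataset-manifest.json").read_text())
features = manifest.get("features", {})
print("predictions enabled:", features.get("predictions", False))
print("ai_insights enabled:", features.get("ai_insights", False))
for warning in manifest.get("warnings", []):
    print("warning:", warning)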