ado-git-repo-insights 1.2.1__py3-none-any.whl → 2.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. ado_git_repo_insights/__init__.py +3 -3
  2. ado_git_repo_insights/cli.py +703 -354
  3. ado_git_repo_insights/config.py +186 -186
  4. ado_git_repo_insights/extractor/__init__.py +1 -1
  5. ado_git_repo_insights/extractor/ado_client.py +452 -246
  6. ado_git_repo_insights/extractor/pr_extractor.py +239 -239
  7. ado_git_repo_insights/ml/__init__.py +13 -0
  8. ado_git_repo_insights/ml/date_utils.py +70 -0
  9. ado_git_repo_insights/ml/forecaster.py +288 -0
  10. ado_git_repo_insights/ml/insights.py +497 -0
  11. ado_git_repo_insights/persistence/__init__.py +1 -1
  12. ado_git_repo_insights/persistence/database.py +193 -193
  13. ado_git_repo_insights/persistence/models.py +207 -145
  14. ado_git_repo_insights/persistence/repository.py +662 -376
  15. ado_git_repo_insights/transform/__init__.py +1 -1
  16. ado_git_repo_insights/transform/aggregators.py +950 -0
  17. ado_git_repo_insights/transform/csv_generator.py +132 -132
  18. ado_git_repo_insights/utils/__init__.py +1 -1
  19. ado_git_repo_insights/utils/datetime_utils.py +101 -101
  20. ado_git_repo_insights/utils/logging_config.py +172 -172
  21. ado_git_repo_insights/utils/run_summary.py +207 -206
  22. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/METADATA +56 -15
  23. ado_git_repo_insights-2.7.4.dist-info/RECORD +27 -0
  24. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/licenses/LICENSE +21 -21
  25. ado_git_repo_insights-1.2.1.dist-info/RECORD +0 -22
  26. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/WHEEL +0 -0
  27. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/entry_points.txt +0 -0
  28. {ado_git_repo_insights-1.2.1.dist-info → ado_git_repo_insights-2.7.4.dist-info}/top_level.txt +0 -0
ado_git_repo_insights/extractor/ado_client.py
@@ -1,246 +1,452 @@
-"""Azure DevOps REST API client.
-
-Implements pagination (continuation tokens), bounded retry with exponential backoff,
-and fail-fast on partial failures per Invariants 12-13 and Adjustment 4.
-"""
-
-from __future__ import annotations
-
-import base64
-import logging
-import time
-from collections.abc import Iterator
-from dataclasses import dataclass
-from datetime import date, timedelta
-from typing import Any
-
-import requests
-from requests.exceptions import HTTPError, RequestException
-
-from ..config import APIConfig
-
-logger = logging.getLogger(__name__)
-
-
-class ExtractionError(Exception):
-    """Extraction failed - causes run to fail (Invariant 7, Adjustment 4)."""
-
-
-@dataclass
-class ExtractionStats:
-    """Statistics for an extraction run."""
-
-    total_prs: int = 0
-    pages_fetched: int = 0
-    retries_used: int = 0
-
-
-class ADOClient:
-    """Azure DevOps REST API client with pagination, retry, and rate limiting.
-
-    Invariant 12: Pagination must be complete (continuation tokens).
-    Invariant 13: Retries must be bounded and predictable.
-    Adjustment 4: Partial failures fail the run.
-    """
-
-    def __init__(self, organization: str, pat: str, config: APIConfig) -> None:
-        """Initialize the ADO client.
-
-        Args:
-            organization: Azure DevOps organization name.
-            pat: Personal Access Token with Code (Read) scope.
-            config: API configuration settings.
-        """
-        self.organization = organization
-        self.base_url = f"{config.base_url}/{organization}"
-        self.config = config
-        self.headers = self._build_auth_headers(pat)
-        self.stats = ExtractionStats()
-
-    def _build_auth_headers(self, pat: str) -> dict[str, str]:
-        """Build authorization headers for ADO API.
-
-        Args:
-            pat: Personal Access Token.
-
-        Returns:
-            Headers dict with Basic auth.
-        """
-        # Invariant 19: PAT is never logged
-        encoded = base64.b64encode(f":{pat}".encode()).decode()
-        return {
-            "Authorization": f"Basic {encoded}",
-            "Content-Type": "application/json",
-        }
-
-    def get_pull_requests(
-        self,
-        project: str,
-        start_date: date,
-        end_date: date,
-    ) -> Iterator[dict[str, Any]]:
-        """Fetch completed PRs for a date range with automatic pagination.
-
-        Adjustment 4: Handles continuation tokens, bounded retries with backoff.
-        Raises on partial failures (deterministic failure over silent partial success).
-
-        Args:
-            project: Project name.
-            start_date: Start of date range (inclusive).
-            end_date: End of date range (inclusive).
-
-        Yields:
-            PR data dictionaries.
-
-        Raises:
-            ExtractionError: If extraction fails for any date.
-        """
-        current_date = start_date
-        while current_date <= end_date:
-            try:
-                prs = self._fetch_prs_for_date_paginated(project, current_date)
-                yield from prs
-            except ExtractionError as e:
-                # Fail the entire run on any date failure (Adjustment 4)
-                raise ExtractionError(
-                    f"Failed extracting {project} on {current_date}: {e}"
-                ) from e
-
-            time.sleep(self.config.rate_limit_sleep_seconds)
-            current_date += timedelta(days=1)
-
-    def _fetch_prs_for_date_paginated(
-        self, project: str, dt: date
-    ) -> list[dict[str, Any]]:
-        """Fetch all PRs for a single date, handling continuation tokens.
-
-        Invariant 12: Complete pagination via continuation tokens.
-
-        Args:
-            project: Project name.
-            dt: Date to fetch.
-
-        Returns:
-            List of all PRs for the date.
-        """
-        all_prs: list[dict[str, Any]] = []
-        continuation_token: str | None = None
-
-        while True:
-            prs, continuation_token = self._fetch_page(project, dt, continuation_token)
-            all_prs.extend(prs)
-            self.stats.pages_fetched += 1
-
-            if not continuation_token:
-                break
-
-            logger.debug(f"Fetching next page for {project}/{dt}")
-
-        self.stats.total_prs += len(all_prs)
-        if all_prs:
-            logger.debug(f"Fetched {len(all_prs)} PRs for {project}/{dt}")
-
-        return all_prs
-
-    def _fetch_page(
-        self,
-        project: str,
-        dt: date,
-        token: str | None,
-    ) -> tuple[list[dict[str, Any]], str | None]:
-        """Fetch a single page of PRs with retry logic.
-
-        Invariant 13: Bounded retries with exponential backoff.
-
-        Args:
-            project: Project name.
-            dt: Date to fetch.
-            token: Continuation token from previous page.
-
-        Returns:
-            Tuple of (PR list, next continuation token or None).
-
-        Raises:
-            ExtractionError: After max retries exhausted.
-        """
-        url = self._build_pr_url(project, dt, token)
-
-        last_error: Exception | None = None
-        delay = self.config.retry_delay_seconds
-
-        for attempt in range(1, self.config.max_retries + 1):
-            try:
-                response = requests.get(url, headers=self.headers, timeout=30)
-                response.raise_for_status()
-
-                next_token = response.headers.get("x-ms-continuationtoken")
-                data = response.json()
-                return data.get("value", []), next_token
-
-            except (RequestException, HTTPError) as e:
-                last_error = e
-                self.stats.retries_used += 1
-                logger.warning(
-                    f"Attempt {attempt}/{self.config.max_retries} failed: {e}"
-                )
-
-                if attempt < self.config.max_retries:
-                    logger.info(f"Retrying in {delay:.1f}s...")
-                    time.sleep(delay)
-                    delay *= self.config.retry_backoff_multiplier
-
-        # All retries exhausted - fail the run (Adjustment 4)
-        raise ExtractionError(
-            f"Max retries ({self.config.max_retries}) exhausted for {project}/{dt}: "
-            f"{last_error}"
-        )
-
-    def _build_pr_url(self, project: str, dt: date, token: str | None) -> str:
-        """Build the ADO API URL for fetching PRs.
-
-        Args:
-            project: Project name.
-            dt: Date to query.
-            token: Optional continuation token.
-
-        Returns:
-            Fully constructed URL.
-        """
-        url = (
-            f"{self.base_url}/{project}/_apis/git/pullrequests"
-            f"?searchCriteria.status=completed"
-            f"&searchCriteria.queryTimeRangeType=closed"
-            f"&searchCriteria.minTime={dt}T00:00:00Z"
-            f"&searchCriteria.maxTime={dt}T23:59:59Z"
-            f"&$top=1000"
-            f"&api-version={self.config.version}"
-        )
-
-        if token:
-            url += f"&continuationToken={token}"
-
-        return url
-
-    def test_connection(self, project: str) -> bool:
-        """Test connectivity to ADO API.
-
-        Args:
-            project: Project name to test.
-
-        Returns:
-            True if connection successful.
-
-        Raises:
-            ExtractionError: If connection fails.
-        """
-        url = f"{self.base_url}/{project}/_apis/git/repositories?api-version={self.config.version}"
-
-        try:
-            response = requests.get(url, headers=self.headers, timeout=10)
-            response.raise_for_status()
-            logger.info(f"Successfully connected to {self.organization}/{project}")
-            return True
-        except (RequestException, HTTPError) as e:
-            raise ExtractionError(
-                f"Failed to connect to {self.organization}/{project}: {e}"
-            ) from e
+"""Azure DevOps REST API client.
+
+Implements pagination (continuation tokens), bounded retry with exponential backoff,
+and fail-fast on partial failures per Invariants 12-13 and Adjustment 4.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import logging
+import time
+from collections.abc import Iterator
+from dataclasses import dataclass
+from datetime import date, timedelta
+from typing import Any
+
+import requests
+from requests.exceptions import HTTPError, RequestException
+
+from ..config import APIConfig
+
+logger = logging.getLogger(__name__)
+
+
+class ExtractionError(Exception):
+    """Extraction failed - causes run to fail (Invariant 7, Adjustment 4)."""
+
+
+@dataclass
+class ExtractionStats:
+    """Statistics for an extraction run."""
+
+    total_prs: int = 0
+    pages_fetched: int = 0
+    retries_used: int = 0
+
+
+class ADOClient:
+    """Azure DevOps REST API client with pagination, retry, and rate limiting.
+
+    Invariant 12: Pagination must be complete (continuation tokens).
+    Invariant 13: Retries must be bounded and predictable.
+    Adjustment 4: Partial failures fail the run.
+    """
+
+    def __init__(self, organization: str, pat: str, config: APIConfig) -> None:
+        """Initialize the ADO client.
+
+        Args:
+            organization: Azure DevOps organization name.
+            pat: Personal Access Token with Code (Read) scope.
+            config: API configuration settings.
+        """
+        self.organization = organization
+        self.base_url = f"{config.base_url}/{organization}"
+        self.config = config
+        self.headers = self._build_auth_headers(pat)
+        self.stats = ExtractionStats()
+
+    def _build_auth_headers(self, pat: str) -> dict[str, str]:
+        """Build authorization headers for ADO API.
+
+        Args:
+            pat: Personal Access Token.
+
+        Returns:
+            Headers dict with Basic auth.
+        """
+        # Invariant 19: PAT is never logged
+        encoded = base64.b64encode(f":{pat}".encode()).decode()
+        return {
+            "Authorization": f"Basic {encoded}",
+            "Content-Type": "application/json",
+        }
+
+    def _log_invalid_response(
+        self, response: requests.Response, error: json.JSONDecodeError
+    ) -> None:
+        """Log details of invalid JSON response for debugging.
+
+        Invariant 19: Never log auth headers or sensitive data.
+        Truncates body to avoid log bloat.
+        """
+        max_body_len = 2048  # Safe truncation limit
+
+        # Safely get response body
+        try:
+            body = response.text[:max_body_len] if response.text else "<empty>"
+        except Exception:
+            body = "<unable to decode response body>"
+
+        # Sanitize headers (remove auth)
+        safe_headers = {
+            k: v
+            for k, v in response.headers.items()
+            if k.lower() not in ("authorization", "x-ms-pat", "cookie")
+        }
+
+        logger.warning(
+            f"Invalid JSON response - Status: {response.status_code}, "
+            f"Headers: {safe_headers}, "
+            f"Body (truncated): {body!r}, "
+            f"Parse error: {error}"
+        )
+
+    def get_pull_requests(
+        self,
+        project: str,
+        start_date: date,
+        end_date: date,
+    ) -> Iterator[dict[str, Any]]:
+        """Fetch completed PRs for a date range with automatic pagination.
+
+        Adjustment 4: Handles continuation tokens, bounded retries with backoff.
+        Raises on partial failures (deterministic failure over silent partial success).
+
+        Args:
+            project: Project name.
+            start_date: Start of date range (inclusive).
+            end_date: End of date range (inclusive).
+
+        Yields:
+            PR data dictionaries.
+
+        Raises:
+            ExtractionError: If extraction fails for any date.
+        """
+        current_date = start_date
+        while current_date <= end_date:
+            try:
+                prs = self._fetch_prs_for_date_paginated(project, current_date)
+                yield from prs
+            except ExtractionError as e:
+                # Fail the entire run on any date failure (Adjustment 4)
+                raise ExtractionError(
+                    f"Failed extracting {project} on {current_date}: {e}"
+                ) from e
+
+            time.sleep(self.config.rate_limit_sleep_seconds)
+            current_date += timedelta(days=1)
+
+    def _fetch_prs_for_date_paginated(
+        self, project: str, dt: date
+    ) -> list[dict[str, Any]]:
+        """Fetch all PRs for a single date, handling continuation tokens.
+
+        Invariant 12: Complete pagination via continuation tokens.
+
+        Args:
+            project: Project name.
+            dt: Date to fetch.
+
+        Returns:
+            List of all PRs for the date.
+        """
+        all_prs: list[dict[str, Any]] = []
+        continuation_token: str | None = None
+
+        while True:
+            prs, continuation_token = self._fetch_page(project, dt, continuation_token)
+            all_prs.extend(prs)
+            self.stats.pages_fetched += 1
+
+            if not continuation_token:
+                break
+
+            logger.debug(f"Fetching next page for {project}/{dt}")
+
+        self.stats.total_prs += len(all_prs)
+        if all_prs:
+            logger.debug(f"Fetched {len(all_prs)} PRs for {project}/{dt}")
+
+        return all_prs
+
+    def _fetch_page(
+        self,
+        project: str,
+        dt: date,
+        token: str | None,
+    ) -> tuple[list[dict[str, Any]], str | None]:
+        """Fetch a single page of PRs with retry logic.
+
+        Invariant 13: Bounded retries with exponential backoff.
+
+        Args:
+            project: Project name.
+            dt: Date to fetch.
+            token: Continuation token from previous page.
+
+        Returns:
+            Tuple of (PR list, next continuation token or None).
+
+        Raises:
+            ExtractionError: After max retries exhausted.
+        """
+        url = self._build_pr_url(project, dt, token)
+
+        last_error: Exception | None = None
+        delay = self.config.retry_delay_seconds
+
+        for attempt in range(1, self.config.max_retries + 1):
+            try:
+                response = requests.get(url, headers=self.headers, timeout=30)
+                response.raise_for_status()
+
+                next_token = response.headers.get("x-ms-continuationtoken")
+                data = response.json()
+                return data.get("value", []), next_token
+
+            except (RequestException, HTTPError, json.JSONDecodeError) as e:
+                last_error = e
+                self.stats.retries_used += 1
+
+                # Safe logging for JSON decode errors (Invariant 19: no auth headers)
+                if isinstance(e, json.JSONDecodeError):
+                    self._log_invalid_response(response, e)
+
+                logger.warning(
+                    f"Attempt {attempt}/{self.config.max_retries} failed: {e}"
+                )
+
+                if attempt < self.config.max_retries:
+                    logger.info(f"Retrying in {delay:.1f}s...")
+                    time.sleep(delay)
+                    delay *= self.config.retry_backoff_multiplier
+
+        # All retries exhausted - fail the run (Adjustment 4)
+        raise ExtractionError(
+            f"Max retries ({self.config.max_retries}) exhausted for {project}/{dt}: "
+            f"{last_error}"
+        )
+
+    def _build_pr_url(self, project: str, dt: date, token: str | None) -> str:
+        """Build the ADO API URL for fetching PRs.
+
+        Args:
+            project: Project name.
+            dt: Date to query.
+            token: Optional continuation token.
+
+        Returns:
+            Fully constructed URL.
+        """
+        url = (
+            f"{self.base_url}/{project}/_apis/git/pullrequests"
+            f"?searchCriteria.status=completed"
+            f"&searchCriteria.queryTimeRangeType=closed"
+            f"&searchCriteria.minTime={dt}T00:00:00Z"
+            f"&searchCriteria.maxTime={dt}T23:59:59Z"
+            f"&$top=1000"
+            f"&api-version={self.config.version}"
+        )
+
+        if token:
+            url += f"&continuationToken={token}"
+
+        return url
+
+    def test_connection(self, project: str) -> bool:
+        """Test connectivity to ADO API.
+
+        Args:
+            project: Project name to test.
+
+        Returns:
+            True if connection successful.
+
+        Raises:
+            ExtractionError: If connection fails.
+        """
+        url = f"{self.base_url}/{project}/_apis/git/repositories?api-version={self.config.version}"
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+            logger.info(f"Successfully connected to {self.organization}/{project}")
+            return True
+        except (RequestException, HTTPError) as e:
+            raise ExtractionError(
+                f"Failed to connect to {self.organization}/{project}: {e}"
+            ) from e
+
+    # Phase 3.3: Team extraction methods
+
+    def get_teams(self, project: str) -> list[dict[str, Any]]:
+        """Fetch all teams for a project.
+
+        §5: Teams are project-scoped, fetched once per run per project.
+
+        Args:
+            project: Project name.
+
+        Returns:
+            List of team dictionaries.
+
+        Raises:
+            ExtractionError: If team fetch fails (allows graceful degradation).
+        """
+        url = (
+            f"{self.base_url}/_apis/projects/{project}/teams"
+            f"?api-version={self.config.version}"
+        )
+
+        all_teams: list[dict[str, Any]] = []
+        continuation_token: str | None = None
+
+        while True:
+            page_url = url
+            if continuation_token:
+                page_url += f"&continuationToken={continuation_token}"
+
+            try:
+                response = requests.get(page_url, headers=self.headers, timeout=30)
+                response.raise_for_status()
+
+                continuation_token = response.headers.get("x-ms-continuationtoken")
+                data = response.json()
+                teams = data.get("value", [])
+                all_teams.extend(teams)
+
+                if not continuation_token:
+                    break
+
+            except (RequestException, HTTPError) as e:
+                raise ExtractionError(
+                    f"Failed to fetch teams for {project}: {e}"
+                ) from e
+
+            time.sleep(self.config.rate_limit_sleep_seconds)
+
+        logger.info(f"Fetched {len(all_teams)} teams for {project}")
+        return all_teams
+
+    def get_team_members(self, project: str, team_id: str) -> list[dict[str, Any]]:
+        """Fetch all members of a team.
+
+        §5: Membership fetched once per run per team.
+
+        Args:
+            project: Project name.
+            team_id: Team identifier.
+
+        Returns:
+            List of team member dictionaries.
+
+        Raises:
+            ExtractionError: If member fetch fails.
+        """
+        url = (
+            f"{self.base_url}/_apis/projects/{project}/teams/{team_id}/members"
+            f"?api-version={self.config.version}"
+        )
+
+        all_members: list[dict[str, Any]] = []
+        continuation_token: str | None = None
+
+        while True:
+            page_url = url
+            if continuation_token:
+                page_url += f"&continuationToken={continuation_token}"
+
+            try:
+                response = requests.get(page_url, headers=self.headers, timeout=30)
+                response.raise_for_status()
+
+                continuation_token = response.headers.get("x-ms-continuationtoken")
+                data = response.json()
+                members = data.get("value", [])
+                all_members.extend(members)
+
+                if not continuation_token:
+                    break
+
+            except (RequestException, HTTPError) as e:
+                raise ExtractionError(
+                    f"Failed to fetch members for team {team_id}: {e}"
+                ) from e
+
+            time.sleep(self.config.rate_limit_sleep_seconds)
+
+        logger.debug(f"Fetched {len(all_members)} members for team {team_id}")
+        return all_members
+
+    # Phase 3.4: PR Threads/Comments extraction
+
+    def get_pr_threads(
+        self,
+        project: str,
+        repository_id: str,
+        pull_request_id: int,
+    ) -> list[dict[str, Any]]:
+        """Fetch all threads for a pull request.
+
+        §6: Incremental strategy - caller should filter by lastUpdatedDate.
+
+        Args:
+            project: Project name.
+            repository_id: Repository ID.
+            pull_request_id: PR ID.
+
+        Returns:
+            List of thread dictionaries.
+
+        Raises:
+            ExtractionError: If thread fetch fails.
+        """
+        url = (
+            f"{self.base_url}/{project}/_apis/git/repositories/{repository_id}"
+            f"/pullRequests/{pull_request_id}/threads"
+            f"?api-version={self.config.version}"
+        )
+
+        all_threads: list[dict[str, Any]] = []
+        continuation_token: str | None = None
+
+        while True:
+            page_url = url
+            if continuation_token:
+                page_url += f"&continuationToken={continuation_token}"
+
+            try:
+                response = requests.get(page_url, headers=self.headers, timeout=30)
+
+                # Handle rate limiting (429) with bounded backoff
+                if response.status_code == 429:
+                    retry_after = int(response.headers.get("Retry-After", 60))
+                    logger.warning(f"Rate limited, waiting {retry_after}s")
+                    time.sleep(min(retry_after, 120))  # Cap at 2 minutes
+                    continue
+
+                response.raise_for_status()
+
+                continuation_token = response.headers.get("x-ms-continuationtoken")
+                data = response.json()
+                threads = data.get("value", [])
+                all_threads.extend(threads)
+
+                if not continuation_token:
+                    break
+
+            except (RequestException, HTTPError) as e:
+                raise ExtractionError(
+                    f"Failed to fetch threads for PR {pull_request_id}: {e}"
+                ) from e
+
+            time.sleep(self.config.rate_limit_sleep_seconds)
+
+        logger.debug(
+            f"Fetched {len(all_threads)} threads for PR {repository_id}/{pull_request_id}"
+        )
+        return all_threads
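
For reviewers who want to exercise the 2.7.4 client end to end, here is a minimal usage sketch. It is not code from the package: it assumes APIConfig accepts as keyword arguments the fields this client reads (base_url, version, max_retries, retry_delay_seconds, retry_backoff_multiplier, rate_limit_sleep_seconds), since the actual constructor lives in ado_git_repo_insights/config.py and is outside this diff, and it assumes the PR payload keys pullRequestId and repository.id of the standard Azure DevOps REST response shape.

from datetime import date

from ado_git_repo_insights.config import APIConfig
from ado_git_repo_insights.extractor.ado_client import ADOClient, ExtractionError

# Hypothetical configuration values; the field names mirror the attributes
# the client dereferences above, not a documented constructor signature.
config = APIConfig(
    base_url="https://dev.azure.com",
    version="7.1",
    max_retries=3,
    retry_delay_seconds=1.0,
    retry_backoff_multiplier=2.0,
    rate_limit_sleep_seconds=0.2,
)
client = ADOClient(organization="my-org", pat="<pat-with-code-read>", config=config)

try:
    client.test_connection("MyProject")
    for pr in client.get_pull_requests("MyProject", date(2024, 1, 1), date(2024, 1, 7)):
        # New in 2.x: per-PR review threads (see get_pr_threads above).
        threads = client.get_pr_threads(
            "MyProject", pr["repository"]["id"], pr["pullRequestId"]
        )
        print(pr["pullRequestId"], len(threads))
except ExtractionError as exc:
    # Adjustment 4: any partial failure fails the whole run.
    raise SystemExit(f"Extraction failed: {exc}") from exc

The sketch also illustrates the fail-fast contract: a single failed date or page surfaces as ExtractionError rather than yielding a silently incomplete dataset.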