github-forker 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1000 @@
1
+ """
2
+ pygithub_fork.forker
3
+ ~~~~~~~~~~~~~~~~~~~~
4
+ Core GitHubForker class.
5
+
6
+ Key capabilities on top of a bare PyGithub create_fork() call
7
+ --------------------------------------------------------------
8
+ 1. **Idempotency** — detects pre-existing forks so re-runs are safe.
9
+ 2. **Retry + exponential backoff with jitter** — handles 5xx, network
10
+ timeouts, and GitHub secondary rate limits (403 "abuse" / 429).
11
+ 3. **Fork-readiness polling** — GitHub's fork endpoint is asynchronous;
12
+ we poll until the fork is actually populated, not just created.
13
+ 4. **Thread pool** — fork_many() can run up to `pool_workers` forks
14
+ concurrently via ThreadPoolExecutor.
15
+ 5. **Background / fire-and-forget** — fork_async() returns a ForkJob
16
+ whose .result() / .status / .done can be queried from another thread
17
+ without blocking the caller.
18
+ 6. **Post-fork upstream remote** — optionally runs
19
+ `git remote add upstream <url>` in a local clone.
20
+ 7. **Post-fork webhook** — optionally registers a GitHub webhook on the
21
+ freshly created fork.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ import os
27
+ import random
28
+ import subprocess
29
+ import time
30
+ from concurrent.futures import Future, ThreadPoolExecutor, as_completed
31
+ from datetime import datetime, timezone
32
+ from typing import Iterable, Iterator, Optional, Union
33
+
34
+ from github import Github, GithubException
35
+ from github.Repository import Repository
36
+ from github.Organization import Organization
37
+ from github.GithubObject import NotSet
38
+
39
+ try:
40
+ from github import RateLimitExceededError as _RateLimitExc
41
+ except ImportError:
42
+ from github import RateLimitExceededException as _RateLimitExc # type: ignore[attr-defined]
43
+
44
+ from .exceptions import (
45
+ ForkError,
46
+ ForkPermissionError,
47
+ ForkTimeoutError,
48
+ RepositoryNotFoundError,
49
+ UpstreamRemoteError,
50
+ WebhookError,
51
+ )
52
+ from .models import ForkRequest, ForkResult, ForkStatus, ForkerConfig
53
+
54
+ logger = logging.getLogger(__name__)
55
+
56
+
57
+ # ============================================================================
58
+ # ForkJob — a thin wrapper around concurrent.futures.Future
59
+ # ============================================================================
60
+
61
class ForkJob:
    """
    Handle returned by fork_async() for a fork running in the background.

    Wraps a ``concurrent.futures.Future`` so the fork can be started in one
    place and inspected later from another part of the code, or from a
    different thread.

    Example::

        job = forker.fork_async("octocat/Hello-World")
        # ... do other things ...
        result = job.wait()          # blocks until done
        print(result.status)

        # Or poll without blocking:
        if job.done:
            print(job.result)        # ForkResult, never raises
        else:
            print("still running, status =", job.status)
    """

    def __init__(self, future: Future, source_full_name: str) -> None:
        self._future = future
        self.source_full_name = source_full_name

    # ---- internal helpers ------------------------------------------------ #

    def _failure(self) -> Optional[BaseException]:
        """
        Return the error that ended a *finished* job, or None on success.

        ``Future.exception()`` raises ``CancelledError`` on a cancelled
        future, so cancellation is checked first and reported as a ForkError.
        This keeps the non-blocking accessors from ever raising.
        """
        if self._future.cancelled():
            return ForkError("Fork job was cancelled before it started.")
        return self._future.exception()

    # ---- non-blocking accessors ----------------------------------------- #

    @property
    def done(self) -> bool:
        """True if the fork operation has finished (success, failure or cancel)."""
        return self._future.done()

    @property
    def status(self) -> ForkStatus:
        """
        Best-effort current status without blocking.

        Returns PENDING while the future is not done, FAILED if it ended with
        an exception or was cancelled, otherwise the status of its ForkResult.
        """
        if not self._future.done():
            return ForkStatus.PENDING
        if self._failure() is not None:
            return ForkStatus.FAILED
        return self._future.result().status

    @property
    def result(self) -> Optional[ForkResult]:
        """
        The ForkResult if finished, None if still running.

        Never raises: a job that failed or was cancelled yields a FAILED
        ForkResult whose ``error`` carries the exception.
        """
        if not self._future.done():
            return None
        failure = self._failure()
        if failure is not None:
            return ForkResult(
                source_full_name=self.source_full_name,
                fork=None,
                status=ForkStatus.FAILED,
                already_existed=False,
                error=failure,
            )
        return self._future.result()

    # ---- blocking accessor ----------------------------------------------- #

    def wait(self, timeout: Optional[float] = None) -> ForkResult:
        """
        Block until the fork completes and return the ForkResult.

        Args:
            timeout: seconds to wait; None = wait forever.

        Raises:
            concurrent.futures.TimeoutError if timeout elapses before done.
            concurrent.futures.CancelledError if the job was cancelled.
            Whatever exception the background fork raised (ForkError etc.).
        """
        return self._future.result(timeout=timeout)

    def __repr__(self) -> str:  # pragma: no cover
        return (
            f"ForkJob(source={self.source_full_name!r}, "
            f"status={self.status.value}, done={self.done})"
        )
145
+
146
+
147
+ # ============================================================================
148
+ # GitHubForker
149
+ # ============================================================================
150
+
151
+ class GitHubForker:
152
+ """
153
+ Production-ready GitHub repository forker.
154
+
155
+ Parameters
156
+ ----------
157
+ client : github.Github
158
+ An authenticated PyGithub client (token, app auth, etc.).
159
+ config : ForkerConfig, optional
160
+ Tunable settings; ForkerConfig() defaults are sane for most use cases.
161
+
162
+ Quick start::
163
+
164
+ from github import Github
165
+ from pygithub_fork import GitHubForker
166
+
167
+ gh = Github("ghp_xxx")
168
+ forker = GitHubForker(gh)
169
+
170
+ # --- synchronous, blocks until fork is ready ---
171
+ result = forker.fork("octocat/Hello-World")
172
+ print(result.status, result.clone_url)
173
+
174
+ # --- fire-and-forget, check later ---
175
+ job = forker.fork_async("PyGithub/PyGithub")
176
+ # ... other work ...
177
+ result = job.wait()
178
+
179
+ # --- bulk fork with thread pool ---
180
+ results = forker.fork_many(["owner/a", "owner/b", "owner/c"])
181
+ """
182
+
183
def __init__(self, client: Github, config: Optional[ForkerConfig] = None) -> None:
    """
    Bind the forker to a PyGithub client and a configuration.

    Raises TypeError when *client* is not a ``github.Github`` instance.
    A missing (or falsy) *config* is replaced by a default ForkerConfig().
    """
    if not isinstance(client, Github):
        raise TypeError("client must be an authenticated github.Github instance")

    settings = config if config else ForkerConfig()

    self._gh = client
    self.config = settings
    # The worker pool is built on first use (see the _executor property) and
    # is reused until shutdown() is called.
    self._pool: Optional[ThreadPoolExecutor] = None
190
+
191
+ # ------------------------------------------------------------------ #
192
+ # Context-manager support (shuts down the thread pool cleanly)
193
+ # ------------------------------------------------------------------ #
194
+
195
def __enter__(self) -> "GitHubForker":
    """Enter the ``with`` block; the forker itself is the context value."""
    return self
197
+
198
def __exit__(self, *exc_info: object) -> None:
    """Leave the ``with`` block: shut the pool down; exceptions are not suppressed."""
    self.shutdown()
200
+
201
def shutdown(self, wait: bool = True) -> None:
    """
    Shut down the internal thread pool, if one was ever created.

    *wait* is forwarded to ``ThreadPoolExecutor.shutdown``. Calling this
    again (or before any pool exists) does nothing.
    """
    pool = self._pool
    if pool is None:
        return
    pool.shutdown(wait=wait)
    # Forget the pool so a later _executor access builds a fresh one.
    self._pool = None
206
+
207
@property
def _executor(self) -> ThreadPoolExecutor:
    """
    The shared ThreadPoolExecutor, created on first access.

    Sized by ``config.pool_workers``; the same instance is returned until
    shutdown() clears it.
    """
    pool = self._pool
    if pool is None:
        pool = ThreadPoolExecutor(
            max_workers=self.config.pool_workers,
            thread_name_prefix="pygithub_fork",
        )
        self._pool = pool
    return pool
215
+
216
+ # ================================================================== #
217
+ # Public API
218
+ # ================================================================== #
219
+
220
+ # ---- 1. fork() — synchronous, blocking ----------------------------- #
221
+
222
def fork(
    self,
    source: Union[str, Repository],
    *,
    organization: Optional[Union[str, Organization]] = None,
    name: Optional[str] = None,
    default_branch_only: Optional[bool] = None,
    # Per-call overrides (use config for global defaults)
    add_upstream_remote: Optional[bool] = None,
    local_path: Optional[str] = None,
    register_webhook: Optional[bool] = None,
    webhook_url: Optional[str] = None,
    webhook_events: Optional[list[str]] = None,
) -> ForkResult:
    """
    Fork one repository and block until the whole pipeline has run.

    Steps, in order: resolve the source, reuse a pre-existing fork if one
    is found (otherwise create it with retries), optionally poll until the
    fork is populated (``config.wait_for_ready``), optionally add an
    ``upstream`` git remote, optionally register a webhook, then invoke
    ``config.on_fork_done`` (exceptions from that callback are swallowed).

    Parameters
    ----------
    source:
        ``"owner/repo"`` string **or** a PyGithub Repository object.
    organization:
        Org login or Organization object to fork into; None means the
        authenticated user's account.
    name:
        Custom name passed to the fork-creation call.
        NOTE(review): the pre-existing-fork lookup is made without this
        name, so a renamed fork is presumably not detected — confirm.
    default_branch_only:
        Passed through to the fork-creation call when not None.
    add_upstream_remote:
        Overrides ``config.add_upstream_remote`` when not None.
    local_path:
        Local clone used for the upstream remote; falls back to
        ``config.local_clone_path``.
    register_webhook:
        Overrides ``config.register_webhook`` when not None.
    webhook_url:
        Falls back to ``config.webhook_url``.
    webhook_events:
        Falls back to ``config.webhook_events``.

    Returns
    -------
    ForkResult
        If readiness polling runs out of time, this is reported through
        ``status == ForkStatus.TIMED_OUT_WAITING``; this method does not
        raise for that case.

    Raises
    ------
    ValueError
        *source* is neither a Repository nor an ``"owner/repo"`` string.
    RepositoryNotFoundError
        Source repo not found or token can't see it.
    ForkPermissionError
        Token is not allowed to read or fork the source.
    UpstreamRemoteError
        The upstream-remote step failed.
    WebhookError
        The webhook step failed or no webhook URL was available.
    ForkError
        Any other unrecoverable error (including retries exhausted).
    """
    began = time.monotonic()
    upstream = self._resolve_source(source)
    upstream_name = upstream.full_name

    # ---- obtain the fork: reuse or create ---------------------------- #
    prior = self._find_existing_fork(upstream, organization)
    if prior is None:
        created, tries = self._create_with_retry(
            upstream,
            organization=organization,
            name=name,
            default_branch_only=default_branch_only,
        )
        result = ForkResult(
            source_full_name=upstream_name,
            fork=created,
            status=ForkStatus.CREATED,
            already_existed=False,
            attempts=tries,
            elapsed_seconds=time.monotonic() - began,
        )
    else:
        logger.info(
            "Fork of '%s' already exists at '%s'; reusing.",
            upstream_name, prior.full_name,
        )
        result = ForkResult(
            source_full_name=upstream_name,
            fork=prior,
            status=ForkStatus.ALREADY_EXISTED,
            already_existed=True,
            elapsed_seconds=time.monotonic() - began,
        )

    # ---- readiness polling ------------------------------------------- #
    if self.config.wait_for_ready and result.fork is not None:
        self._await_ready(result, began)

    # ---- post-fork actions ------------------------------------------- #
    if result.fork is not None:
        # An explicit per-call True/False wins; None defers to the config.
        if add_upstream_remote is None:
            want_upstream = self.config.add_upstream_remote
        else:
            want_upstream = add_upstream_remote
        clone_dir = local_path or self.config.local_clone_path
        if want_upstream:
            self._add_upstream_remote(result, upstream, clone_dir)

        if register_webhook is None:
            want_webhook = self.config.register_webhook
        else:
            want_webhook = register_webhook
        hook_url = webhook_url or self.config.webhook_url
        hook_events = webhook_events or self.config.webhook_events
        if want_webhook:
            self._register_webhook(result, hook_url, hook_events)

    # ---- bookkeeping ------------------------------------------------- #
    result.elapsed_seconds = time.monotonic() - began
    fork_label = result.fork.full_name if result.fork else "n/a"
    logger.info(
        "fork '%s' → '%s' status=%s attempts=%d %.1fs",
        upstream_name,
        fork_label,
        result.status.value,
        result.attempts,
        result.elapsed_seconds,
    )

    done_hook = self.config.on_fork_done
    if done_hook:
        try:
            done_hook(result)
        except Exception:
            logger.debug("on_fork_done callback raised; ignoring.", exc_info=True)

    return result
357
+
358
+ # ---- 2. fork_async() — fire-and-forget, returns ForkJob ------------- #
359
+
360
def fork_async(
    self,
    source: Union[str, Repository],
    *,
    organization: Optional[Union[str, Organization]] = None,
    name: Optional[str] = None,
    default_branch_only: Optional[bool] = None,
    add_upstream_remote: Optional[bool] = None,
    local_path: Optional[str] = None,
    register_webhook: Optional[bool] = None,
    webhook_url: Optional[str] = None,
    webhook_events: Optional[list[str]] = None,
) -> ForkJob:
    """
    Queue a fork on the shared thread pool and return at once.

    The work submitted is exactly :meth:`fork` with the same arguments, so
    the same retry, readiness-polling and post-fork steps apply. The
    returned :class:`ForkJob` can be polled or waited on:

    .. code-block:: python

        job = forker.fork_async("octocat/Hello-World")

        # do other things ...

        # poll without blocking:
        print(job.done, job.status)

        # or block when you actually need the result:
        result = job.wait()

    Parameters
    ----------
    Same as :meth:`fork`.

    Returns
    -------
    ForkJob
    """
    # Label used by ForkJob when it has to report a failure without a result.
    if isinstance(source, str):
        label = source
    else:
        label = getattr(source, "full_name", str(source))

    call_options = {
        "organization": organization,
        "name": name,
        "default_branch_only": default_branch_only,
        "add_upstream_remote": add_upstream_remote,
        "local_path": local_path,
        "register_webhook": register_webhook,
        "webhook_url": webhook_url,
        "webhook_events": webhook_events,
    }
    pending = self._executor.submit(self.fork, source, **call_options)
    return ForkJob(pending, label)
420
+
421
+ # ---- 3. fork_many() — batch, optionally pooled ---------------------- #
422
+
423
def fork_many(
    self,
    sources: Iterable[Union[str, Repository, ForkRequest]],
    *,
    organization: Optional[Union[str, Organization]] = None,
    name: Optional[str] = None,
    default_branch_only: Optional[bool] = None,
    add_upstream_remote: Optional[bool] = None,
    local_path: Optional[str] = None,
    register_webhook: Optional[bool] = None,
    webhook_url: Optional[str] = None,
    webhook_events: Optional[list[str]] = None,
    parallel: bool = True,
    stop_on_error: bool = False,
) -> list[ForkResult]:
    """
    Fork a batch of repositories.

    Every item is first normalised into a ForkRequest, then the batch is
    handed to the pooled runner (``parallel=True``, the default) or to the
    one-at-a-time runner (``parallel=False``).

    Each item in *sources* can be:

    * A plain ``"owner/repo"`` string — uses the shared keyword args below.
    * A PyGithub ``Repository`` object — same.
    * A :class:`~pygithub_fork.models.ForkRequest` — its non-None fields
      take precedence over the shared args for that one item.

    Parameters
    ----------
    sources : iterable
        Repositories to fork.
    parallel : bool
        Run forks on the shared thread pool (default True).
    stop_on_error : bool
        Forwarded to the runner; stop the batch at the first failed item
        (default False).
    organization, name, default_branch_only, add_upstream_remote,
    local_path, register_webhook, webhook_url, webhook_events :
        Shared defaults for all items; overridden per-item by ForkRequest.

    Returns
    -------
    list[ForkResult]
        Produced by the selected runner; per-item failures are carried in
        the results rather than raised.
    """
    shared_defaults = {
        "organization": organization,
        "name": name,
        "default_branch_only": default_branch_only,
        "add_upstream_remote": add_upstream_remote,
        "local_path": local_path,
        "register_webhook": register_webhook,
        "webhook_url": webhook_url,
        "webhook_events": webhook_events,
    }
    batch = self._normalize_requests(sources, **shared_defaults)

    runner = self._run_parallel if parallel else self._run_sequential
    return runner(batch, stop_on_error=stop_on_error)
490
+
491
+ # ---- 4. fork_iter() — streaming generator for large batches --------- #
492
+
493
def fork_iter(
    self,
    sources: Iterable[Union[str, Repository, ForkRequest]],
    *,
    organization: Optional[Union[str, Organization]] = None,
    name: Optional[str] = None,
    default_branch_only: Optional[bool] = None,
    add_upstream_remote: Optional[bool] = None,
    local_path: Optional[str] = None,
    register_webhook: Optional[bool] = None,
    webhook_url: Optional[str] = None,
    webhook_events: Optional[list[str]] = None,
) -> Iterator[ForkResult]:
    """
    Stream ForkResults for a batch in completion order.

    All requests are submitted to the shared thread pool, and each result
    is yielded as soon as its fork finishes (not in submission order).
    Being a generator, nothing is submitted until iteration starts.

    .. code-block:: python

        for result in forker.fork_iter(["owner/a", "owner/b", "owner/c"]):
            print(result.source_full_name, result.status)
    """
    shared_defaults = {
        "organization": organization,
        "name": name,
        "default_branch_only": default_branch_only,
        "add_upstream_remote": add_upstream_remote,
        "local_path": local_path,
        "register_webhook": register_webhook,
        "webhook_url": webhook_url,
        "webhook_events": webhook_events,
    }
    batch = self._normalize_requests(sources, **shared_defaults)

    in_flight = [
        self._executor.submit(self._run_one_request, item) for item in batch
    ]
    for finished in as_completed(in_flight):
        # _run_one_request converts failures into ForkResult objects, so
        # .result() is not expected to raise here.
        yield finished.result()
534
+
535
+ # ================================================================== #
536
+ # Internals — fork lifecycle
537
+ # ================================================================== #
538
+
539
def _resolve_source(self, source: Union[str, Repository]) -> Repository:
    """
    Turn *source* into a Repository object.

    A Repository is returned untouched; an ``"owner/repo"`` string is
    fetched through the client. HTTP failures are mapped onto the
    package's exceptions: 404 → RepositoryNotFoundError, 401/403 →
    ForkPermissionError, anything else → ForkError. Any other input raises
    ValueError.
    """
    if isinstance(source, Repository):
        return source

    looks_like_slug = isinstance(source, str) and "/" in source
    if not looks_like_slug:
        raise ValueError(
            f"source must be 'owner/repo' string or Repository object, got {source!r}"
        )

    try:
        return self._gh.get_repo(source)
    except GithubException as exc:
        http_status = exc.status
        if http_status == 404:
            raise RepositoryNotFoundError(
                f"Repository '{source}' not found or not accessible "
                f"with the current token."
            ) from exc
        if http_status in (401, 403):
            raise ForkPermissionError(
                f"Not authorized to access '{source}': {exc.data}"
            ) from exc
        raise ForkError(f"Failed to fetch '{source}': {exc.data}") from exc
559
+
560
def _find_existing_fork(
    self,
    repo: Repository,
    organization: Optional[Union[str, Organization]],
) -> Optional[Repository]:
    """
    Look for a fork of *repo* already owned by the target account.

    The candidate is ``<target login>/<repo.name>``. It is returned only
    if it is flagged as a fork and its parent's full name equals
    ``repo.full_name``. Every lookup failure (including failing to
    determine the target login) results in None.
    """
    try:
        owner = self._target_login(organization)
    except GithubException:
        return None

    try:
        found = self._gh.get_repo(f"{owner}/{repo.name}")
    except GithubException as exc:
        if exc.status != 404:
            logger.debug("Existing-fork lookup failed non-fatally: %s", exc)
        return None

    flagged_as_fork = getattr(found, "fork", False)
    origin = getattr(found, "parent", None)
    if not flagged_as_fork or origin is None:
        return None
    # A same-named fork of some *other* repository does not count.
    if origin.full_name != repo.full_name:
        return None
    return found
584
+
585
def _target_login(
    self, organization: Optional[Union[str, Organization]]
) -> str:
    """
    Return the login of the account the fork will live under.

    None → the authenticated user's login (fetched through the client);
    an Organization object → its ``login``; anything else → ``str()`` of it.
    """
    if organization is None:
        account = self._gh.get_user()
        return account.login
    if not isinstance(organization, Organization):
        return str(organization)
    return organization.login
593
+
594
def _create_with_retry(
    self,
    repo: Repository,
    *,
    organization: Optional[Union[str, Organization]],
    name: Optional[str],
    default_branch_only: Optional[bool],
) -> tuple[Repository, int]:
    """
    Call ``repo.create_fork`` with retries and return ``(fork, attempts)``.

    Up to ``config.max_retries`` attempts are made. Failures are handled as:

    * primary rate limit (``_RateLimitExc``) — wait until the reset time;
    * 401/403 recognised as a secondary rate limit, 429, 5xx, or any
      non-GitHub exception — wait with exponential backoff + jitter;
    * 404 → RepositoryNotFoundError, other 401/403 → ForkPermissionError,
      422 or any other status → ForkError; these are raised immediately.

    After the *final* attempt no wait is performed and ``on_retry`` is not
    notified, because there is nothing left to retry; the method raises
    ForkError straight away (chained to the last underlying error).

    Arguments left as None are sent to PyGithub as ``NotSet``.
    """
    cfg = self.config
    kwargs: dict = {
        "organization": organization if organization is not None else NotSet,
        "name": name if name is not None else NotSet,
        "default_branch_only": (
            default_branch_only if default_branch_only is not None else NotSet
        ),
    }

    last_exc: Optional[Exception] = None
    for attempt in range(1, cfg.max_retries + 1):
        try:
            return repo.create_fork(**kwargs), attempt

        # Must precede GithubException: the rate-limit class is one of its
        # subclasses and needs the reset-time based delay.
        except _RateLimitExc as exc:
            last_exc = exc
            sleep_for = self._seconds_until_reset(exc)

        except GithubException as exc:
            last_exc = exc
            if exc.status == 404:
                raise RepositoryNotFoundError(
                    f"Repository '{repo.full_name}' disappeared mid-fork."
                ) from exc
            if exc.status in (401, 403):
                if not self._is_secondary_rate_limit(exc):
                    raise ForkPermissionError(
                        f"Not permitted to fork '{repo.full_name}' "
                        f"(target: {organization or 'your account'}): {exc.data}"
                    ) from exc
                # Secondary rate limit: fall through to the backoff below.
            elif exc.status == 422:
                raise ForkError(
                    f"GitHub rejected fork of '{repo.full_name}': {exc.data}"
                ) from exc
            elif not (exc.status >= 500 or exc.status == 429):
                raise ForkError(
                    f"Unexpected error (HTTP {exc.status}) forking "
                    f"'{repo.full_name}': {exc.data}"
                ) from exc
            sleep_for = self._backoff(attempt, cfg)

        except Exception as exc:
            # Non-GitHub failures (e.g. network errors) are treated as transient.
            last_exc = exc
            sleep_for = self._backoff(attempt, cfg)

        # Only announce and wait when another attempt will actually follow;
        # sleeping (possibly until the rate-limit reset) just to give up
        # afterwards would delay the error for no benefit.
        if attempt >= cfg.max_retries:
            break
        self._notify_retry(attempt, last_exc, sleep_for)
        time.sleep(sleep_for)

    raise ForkError(
        f"Max retries ({cfg.max_retries}) exceeded forking "
        f"'{repo.full_name}'. Last error: {last_exc}"
    ) from last_exc
662
+
663
def _await_ready(self, result: ForkResult, start_time: float) -> None:
    """
    Poll the fork until it looks populated or the timeout runs out.

    The fork is re-fetched by full name every
    ``config.ready_poll_interval_seconds``. As soon as ``_is_populated``
    accepts it, ``result.fork`` is replaced by the refreshed object and
    ``result.status`` becomes READY. If ``config.ready_timeout_seconds``
    passes first, the status is set to TIMED_OUT_WAITING and a warning is
    logged; nothing is raised.

    *start_time* is only used to report the total elapsed time in that
    warning. GitHub errors during polling are ignored (non-404 ones are
    logged at debug level).
    """
    cfg = self.config
    give_up_at = time.monotonic() + cfg.ready_timeout_seconds
    fork_name = result.fork.full_name  # type: ignore[union-attr]

    logger.debug("Waiting for fork '%s' to be ready …", fork_name)

    while True:
        if not time.monotonic() < give_up_at:
            break
        try:
            latest = self._gh.get_repo(fork_name)
            if self._is_populated(latest):
                result.fork = latest
                result.status = ForkStatus.READY
                return
        except GithubException as exc:
            # 404 is expected while the fork is still materialising.
            if exc.status != 404:
                logger.debug("Non-fatal poll error: %s", exc)
        time.sleep(cfg.ready_poll_interval_seconds)

    elapsed = time.monotonic() - start_time
    result.status = ForkStatus.TIMED_OUT_WAITING
    logger.warning(
        "Fork '%s' created but not confirmed ready after %.0fs.",
        fork_name, elapsed,
    )
697
+
698
+ # ================================================================== #
699
+ # Post-fork actions
700
+ # ================================================================== #
701
+
702
def _add_upstream_remote(
    self,
    result: ForkResult,
    source_repo: Repository,
    local_path: Optional[str],
) -> None:
    """
    Run ``git remote add upstream <source_clone_url>`` in *local_path*.

    The remote is named ``upstream`` and points at the source repository's
    HTTPS clone URL. The operation is idempotent:

    * no *local_path* → a warning is logged and nothing happens;
    * ``upstream`` already points at the expected URL →
      ``result.upstream_remote_added`` is set to True, nothing is run;
    * ``upstream`` points somewhere else → a warning is logged and the
      remote is left unchanged (the flag is not set);
    * otherwise the remote is added and the flag is set to True.

    Raises
    ------
    UpstreamRemoteError
        If *local_path* is not a directory, the git executable cannot be
        started (e.g. git is not installed), or ``git remote add`` exits
        with a non-zero status.
    """
    if not local_path:
        logger.warning(
            "add_upstream_remote=True but no local_path provided; skipping."
        )
        return

    if not os.path.isdir(local_path):
        raise UpstreamRemoteError(
            f"local_path '{local_path}' does not exist or is not a directory."
        )

    def run_git(*args: str) -> subprocess.CompletedProcess:
        # subprocess.run raises OSError (FileNotFoundError, PermissionError)
        # when the executable cannot be launched; surface that as the
        # documented UpstreamRemoteError instead of a raw OS error.
        try:
            return subprocess.run(
                ["git", *args],
                cwd=local_path,
                capture_output=True,
                text=True,
            )
        except OSError as exc:
            raise UpstreamRemoteError(
                f"Could not run git in '{local_path}': {exc}"
            ) from exc

    upstream_url = source_repo.clone_url  # HTTPS; swap for ssh_url if preferred

    # Check whether 'upstream' already exists to keep the operation idempotent.
    check = run_git("remote", "get-url", "upstream")
    if check.returncode == 0:
        existing_url = check.stdout.strip()
        if existing_url == upstream_url:
            logger.debug(
                "Remote 'upstream' already points to '%s'; nothing to do.",
                upstream_url,
            )
            result.upstream_remote_added = True
        else:
            logger.warning(
                "Remote 'upstream' exists but points to '%s' (expected '%s'); "
                "leaving it unchanged.",
                existing_url, upstream_url,
            )
        return

    proc = run_git("remote", "add", "upstream", upstream_url)
    if proc.returncode != 0:
        raise UpstreamRemoteError(
            f"git remote add upstream failed in '{local_path}': "
            f"{proc.stderr.strip()}"
        )

    logger.info(
        "Added remote 'upstream' → '%s' in '%s'.",
        upstream_url, local_path,
    )
    result.upstream_remote_added = True
775
+
776
def _register_webhook(
    self,
    result: ForkResult,
    webhook_url: Optional[str],
    events: Optional[list[str]],
) -> None:
    """
    Make sure the fork has a webhook pointing at *webhook_url*.

    Existing hooks are listed first; if one already targets the same URL
    its id is stored in ``result.webhook_id`` and nothing is created (the
    match is on URL only — subscribed events are not compared). If listing
    hooks fails, a warning is logged and creation is attempted anyway.

    A new hook is created as an active ``"web"`` hook using the content
    type, insecure-SSL flag and optional secret from the config. *events*
    falls back to ``config.webhook_events`` and then ``["push", "fork"]``.

    Raises
    ------
    WebhookError
        If no webhook_url is given or the create call fails.
    """
    if not webhook_url:
        raise WebhookError(
            "register_webhook=True but no webhook_url configured. "
            "Set ForkerConfig.webhook_url or pass webhook_url=... to fork()."
        )
    target = result.fork
    if not target:
        return

    cfg = self.config
    events = events or cfg.webhook_events or ["push", "fork"]

    # De-duplicate: reuse a hook that already delivers to this URL.
    try:
        for existing in target.get_hooks():
            if existing.config.get("url") != webhook_url:
                continue
            logger.debug(
                "Webhook for '%s' already registered (id=%d); skipping.",
                webhook_url, existing.id,
            )
            result.webhook_id = existing.id
            return
    except GithubException as exc:
        logger.warning("Could not list existing hooks: %s; proceeding.", exc)

    payload_settings: dict = {
        "url": webhook_url,
        "content_type": cfg.webhook_content_type,
        "insecure_ssl": "1" if cfg.webhook_insecure_ssl else "0",
    }
    if cfg.webhook_secret:
        payload_settings["secret"] = cfg.webhook_secret

    try:
        created = target.create_hook(
            name="web",
            config=payload_settings,
            events=events,
            active=True,
        )
        result.webhook_id = created.id
        logger.info(
            "Registered webhook id=%d on '%s' for events %s.",
            created.id, target.full_name, events,
        )
    except GithubException as exc:
        raise WebhookError(
            f"Failed to register webhook on '{target.full_name}': "
            f"{exc.data}"
        ) from exc
843
+
844
+ # ================================================================== #
845
+ # Internals — batch helpers
846
+ # ================================================================== #
847
+
848
@staticmethod
def _normalize_requests(
    sources: Iterable[Union[str, Repository, ForkRequest]],
    **defaults,
) -> list[ForkRequest]:
    """
    Convert every batch item into a ForkRequest.

    For an item that already is a ForkRequest, each option keeps the
    item's own value unless that value is None, in which case the shared
    default (if any) is used. Any other item becomes the ``source`` of a
    new ForkRequest built from the shared defaults that are not None.
    """
    option_names = (
        "organization",
        "name",
        "default_branch_only",
        "add_upstream_remote",
        "local_path",
        "register_webhook",
        "webhook_url",
        "webhook_events",
    )
    provided = {key: value for key, value in defaults.items() if value is not None}

    normalized = []
    for item in sources:
        if not isinstance(item, ForkRequest):
            normalized.append(ForkRequest(source=item, **provided))
            continue

        merged = {}
        for option in option_names:
            own_value = getattr(item, option)
            # Only an explicit (non-None) per-item value beats the default.
            merged[option] = (
                own_value if own_value is not None else defaults.get(option)
            )
        normalized.append(ForkRequest(source=item.source, **merged))
    return normalized
873
+
874
def _run_one_request(self, req: ForkRequest) -> ForkResult:
    """
    Execute one ForkRequest through fork() without letting errors escape.

    On success the ForkResult from fork() is returned as is. Any exception
    is logged and converted into a FAILED ForkResult carrying it in
    ``error``; unexpected (non-ForkError) exceptions are logged with a
    traceback.
    """
    origin = req.source
    if isinstance(origin, str):
        label = origin
    else:
        label = getattr(origin, "full_name", str(origin))

    def failed(err: Exception) -> ForkResult:
        return ForkResult(
            source_full_name=str(label),
            fork=None,
            status=ForkStatus.FAILED,
            already_existed=False,
            error=err,
        )

    try:
        return self.fork(
            origin,
            organization=req.organization,
            name=req.name,
            default_branch_only=req.default_branch_only,
            add_upstream_remote=req.add_upstream_remote,
            local_path=req.local_path,
            register_webhook=req.register_webhook,
            webhook_url=req.webhook_url,
            webhook_events=req.webhook_events,
        )
    except ForkError as exc:
        logger.error("Failed to fork '%s': %s", label, exc)
        return failed(exc)
    except Exception as exc:
        logger.error("Unexpected error forking '%s': %s", label, exc, exc_info=True)
        return failed(exc)
910
+
911
def _run_parallel(
    self, requests: list[ForkRequest], *, stop_on_error: bool
) -> list[ForkResult]:
    """
    Run all requests on the shared pool; return results in request order.

    Results are collected in submission order. With ``stop_on_error`` the
    first failed result stops the batch: every later request that has not
    started yet is cancelled and reported as a FAILED "Cancelled" result.
    Later requests that were already running or finished cannot be
    cancelled — their forks really happened (or are happening) — so their
    true results are awaited and reported instead of being mislabelled as
    cancelled.
    """
    futures: list[Future] = [
        self._executor.submit(self._run_one_request, req) for req in requests
    ]
    results: list[Optional[ForkResult]] = [None] * len(requests)

    for i, future in enumerate(futures):
        result = future.result()  # _run_one_request never raises
        results[i] = result
        if stop_on_error and not result.succeeded:
            # First try to cancel *everything* still queued, so idle
            # workers cannot pick up more work while we wait below.
            not_cancellable = [
                j for j in range(i + 1, len(futures)) if not futures[j].cancel()
            ]
            # cancel() returning False means running or already done:
            # keep the real outcome for those slots.
            for j in not_cancellable:
                results[j] = futures[j].result()
            break

    # Fill the slots of requests that were genuinely cancelled.
    for i, slot in enumerate(results):
        if slot is not None:
            continue
        src = requests[i].source
        label = src if isinstance(src, str) else getattr(src, "full_name", str(src))
        results[i] = ForkResult(
            source_full_name=str(label),
            fork=None,
            status=ForkStatus.FAILED,
            already_existed=False,
            error=ForkError("Cancelled due to stop_on_error."),
        )
    return results  # type: ignore[return-value]
941
+
942
def _run_sequential(
    self, requests: list[ForkRequest], *, stop_on_error: bool
) -> list[ForkResult]:
    """
    Run the requests one after another, in order.

    With ``stop_on_error`` the loop ends right after the first result whose
    ``succeeded`` is false; requests after it are not run and get no entry
    in the returned list.
    """
    collected = []
    for req in requests:
        outcome = self._run_one_request(req)
        collected.append(outcome)
        if not stop_on_error:
            continue
        if not outcome.succeeded:
            break
    return collected
952
+
953
+ # ================================================================== #
954
+ # Internals — utilities
955
+ # ================================================================== #
956
+
957
@staticmethod
def _is_populated(repo: Repository) -> bool:
    """
    Heuristic for "the fork has content".

    True when the repository reports a ``pushed_at`` value; otherwise True
    only if listing its branches yields at least one. Any error while
    listing branches counts as "not populated".
    """
    if getattr(repo, "pushed_at", None) is None:
        try:
            branches = repo.get_branches()
            return branches.totalCount > 0
        except Exception:
            return False
    return True
965
+
966
@staticmethod
def _is_secondary_rate_limit(exc: GithubException) -> bool:
    """
    Tell whether a GitHub error looks like a secondary (abuse) rate limit.

    The check is textual: the error message must contain
    ``"secondary rate limit"`` or ``"abuse"`` (case-insensitive). The
    message is taken from ``exc.data["message"]`` when the payload is a
    dict; a plain-string payload is inspected directly. Any other payload
    shape (None, list, ...) is treated as "not a rate limit" instead of
    raising, so the caller's error handling is never masked by an
    AttributeError.
    """
    payload = exc.data
    if isinstance(payload, dict):
        message = payload.get("message", "")
    elif isinstance(payload, str):
        message = payload
    else:
        message = ""
    text = str(message).lower()
    return "secondary rate limit" in text or "abuse" in text
971
+
972
@staticmethod
def _seconds_until_reset(exc: Exception) -> float:
    """
    Compute how long to sleep after a primary rate-limit error.

    Reads the ``x-ratelimit-reset`` entry (epoch seconds) from
    ``exc.headers``. The result is the time remaining until then, floored
    at one second, plus one extra second of slack. When the header is
    missing, empty or unparsable, 30 seconds is returned.
    """
    fallback = 30.0
    try:
        header_map = getattr(exc, "headers", {}) or {}
        raw_reset = header_map.get("x-ratelimit-reset")
        if not raw_reset:
            return fallback
        reset_moment = datetime.fromtimestamp(int(raw_reset), tz=timezone.utc)
        remaining = (reset_moment - datetime.now(timezone.utc)).total_seconds()
        return max(remaining, 1.0) + 1.0
    except Exception:
        return fallback
984
+
985
@staticmethod
def _backoff(attempt: int, cfg: ForkerConfig) -> float:
    """
    Exponential backoff delay with jitter for the given 1-based attempt.

    The base delay doubles with each attempt, starting at
    ``cfg.base_backoff_seconds`` and capped at ``cfg.max_backoff_seconds``.
    A random extra of up to 25% of that capped value is added on top.
    """
    ceiling = min(
        cfg.base_backoff_seconds * (2 ** (attempt - 1)),
        cfg.max_backoff_seconds,
    )
    jitter = random.uniform(0.0, ceiling * 0.25)
    return ceiling + jitter
990
+
991
def _notify_retry(self, attempt: int, exc: Exception, sleep_for: float) -> None:
    """
    Report a failed attempt that is about to be retried.

    Logs a warning and, when ``config.on_retry`` is set, calls it with
    ``(attempt, exc, sleep_for)``. An exception raised by that callback is
    logged at debug level and otherwise ignored.
    """
    cfg = self.config
    logger.warning(
        "Attempt %d/%d failed (%s); retrying in %.1fs.",
        attempt, cfg.max_retries, exc, sleep_for,
    )
    callback = cfg.on_retry
    if not callback:
        return
    try:
        callback(attempt, exc, sleep_for)
    except Exception:
        logger.debug("on_retry callback raised; ignoring.", exc_info=True)