astrocyte-ingestion-github 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- astrocyte_ingestion_github/__init__.py +5 -0
- astrocyte_ingestion_github/source.py +278 -0
- astrocyte_ingestion_github-0.8.0.dist-info/METADATA +49 -0
- astrocyte_ingestion_github-0.8.0.dist-info/RECORD +6 -0
- astrocyte_ingestion_github-0.8.0.dist-info/WHEEL +4 -0
- astrocyte_ingestion_github-0.8.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""GitHub REST API poll :class:`~astrocyte.ingest.source.IngestSource` — repository issues."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import contextlib
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Any
|
|
9
|
+
from urllib.parse import urlsplit
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
from astrocyte.config import SourceConfig
|
|
13
|
+
from astrocyte.errors import IngestError
|
|
14
|
+
from astrocyte.ingest.bank_resolve import resolve_ingest_bank_id
|
|
15
|
+
from astrocyte.ingest.logutil import log_ingest_event
|
|
16
|
+
from astrocyte.ingest.webhook import RetainCallable
|
|
17
|
+
from astrocyte.types import AstrocyteContext, HealthStatus
|
|
18
|
+
|
|
19
|
+
# Module-level logger used by the helpers and the poll source defined in this file.
logger = logging.getLogger("astrocyte_ingestion_github")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _token(cfg: SourceConfig) -> str:
    """Return the whitespace-stripped ``auth.token`` value from *cfg*.

    Raises:
        IngestError: if ``auth`` has no ``token`` entry or the token is blank.
    """
    credentials = cfg.auth if cfg.auth else {}
    value = credentials.get("token")
    # A missing token and a whitespace-only token are rejected the same way.
    cleaned = "" if value is None else str(value).strip()
    if not cleaned:
        raise IngestError("GitHub poll requires auth.token")
    return cleaned
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _api_base(cfg: SourceConfig) -> str:
    """Return the API root URL to use for *cfg*.

    A non-blank ``cfg.url`` is returned with surrounding whitespace and any
    trailing slashes removed; otherwise the public ``https://api.github.com``
    root is used.
    """
    configured = (cfg.url or "").strip()
    return configured.rstrip("/") if configured else "https://api.github.com"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _owner_repo(cfg: SourceConfig) -> tuple[str, str]:
    """Split ``cfg.path`` (``owner/repo``) into a stripped ``(owner, repo)`` pair.

    Raises:
        IngestError: if the path is blank, does not contain exactly one ``/``,
            or either side of the slash is empty after stripping.
    """
    spec = (cfg.path or "").strip()
    owner_part, slash, repo_part = spec.partition("/")
    # Exactly one separator is allowed: none at all, or a second one in the
    # remainder, both mean the value is not of the form owner/repo.
    if not slash or "/" in repo_part:
        raise IngestError("GitHub poll requires path: owner/repo")
    owner_name = owner_part.strip()
    repo_name = repo_part.strip()
    if not (owner_name and repo_name):
        raise IngestError("GitHub poll path owner/repo must be non-empty")
    return owner_name, repo_name
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _int_header(raw: object) -> int | None:
    """Convert a numeric GitHub response header to ``int``; return ``None`` when it does not parse."""
    with contextlib.suppress(ValueError):
        return int(str(raw).strip())
    return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class GithubPollIngestSource:
    """Poll ``GET /repos/{owner}/{repo}/issues`` and retain new/updated issues (not pull requests).

    One background task polls the API every ``interval_seconds`` and passes each
    new or changed issue to the ``retain`` callable. The ``since`` cursor and the
    per-issue ``updated_at`` cache are plain instance attributes, so they are
    kept in memory only.
    """

    # Upper bound on pages followed during one incremental poll, so a large
    # backlog cannot exhaust the API rate limit in a single cycle.
    _MAX_PAGES = 20

    def __init__(
        self,
        source_id: str,
        config: SourceConfig,
        *,
        retain: RetainCallable,
    ) -> None:
        """Store the configuration; no network activity happens until :meth:`start`.

        Args:
            source_id: Identifier reported in logs and passed to ``retain`` as ``source``.
            config: Source configuration (``path``, ``auth``, ``interval_seconds``, ...).
            retain: Awaitable callable invoked once per new or updated issue.
        """
        self._source_id = source_id
        self._config = config
        self._retain = retain
        self._task: asyncio.Task[None] | None = None
        self._stop = asyncio.Event()
        self._client: httpx.AsyncClient | None = None
        self._running = False
        self._last_error: str | None = None
        # GitHub issue id -> last ingested updated_at (ISO) to avoid duplicate retains
        self._seen_updated: dict[int, str] = {}
        # Newest updated_at seen so far; sent as the ``since`` query parameter.
        self._since: str | None = None

    @property
    def source_id(self) -> str:
        """Identifier this source was constructed with."""
        return self._source_id

    @property
    def source_type(self) -> str:
        """Always ``"poll"``."""
        return "poll"

    @property
    def config(self) -> SourceConfig:
        """The configuration object this source was constructed with."""
        return self._config

    def _interval_s(self) -> float:
        """Return the poll interval in seconds.

        Raises:
            IngestError: if ``interval_seconds`` is unset or below 60.
        """
        n = self._config.interval_seconds
        if n is None or int(n) < 60:
            raise IngestError("poll requires interval_seconds >= 60")
        return float(int(n))

    async def start(self) -> None:
        """Create the HTTP client and launch the background poll task.

        Calling this while a task is already registered is a no-op.

        Raises:
            IngestError: if ``auth.token`` is missing or blank.
        """
        if self._task is not None:
            return
        # Resolve configuration first so a failure leaves no half-started state.
        base = _api_base(self._config)
        token = _token(self._config)
        self._client = httpx.AsyncClient(
            base_url=base,
            headers={
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {token}",
                "User-Agent": "astrocyte-ingestion-github",
            },
            timeout=60.0,
        )
        # Enterprise often uses self-signed or custom CA; keep verify=True by default
        host = urlsplit(base).hostname or ""
        # Compare the host exactly (or as a subdomain) rather than by substring,
        # so e.g. "mygithub.com" is still reported as a custom API base.
        if host and host != "github.com" and not host.endswith(".github.com"):
            logger.info("github poll using API base %s", base)
        self._stop.clear()
        self._last_error = None
        self._running = True
        self._task = asyncio.create_task(self._run_loop(), name=f"astrocyte-github-poll-{self._source_id}")

    async def stop(self) -> None:
        """Cancel the poll task and close the HTTP client.

        Cleanup always runs: a task that already ended with an error is logged
        instead of re-raised, so the task reference is cleared and the client
        is closed in every case.
        """
        self._running = False
        self._stop.set()
        task = self._task
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
            except Exception:
                logger.exception("github poll task for %s ended with an error", self._source_id)
            finally:
                self._task = None
        client = self._client
        if client is not None:
            self._client = None
            await client.aclose()

    async def health_check(self) -> HealthStatus:
        """Report whether the poll loop is alive and its last cycle succeeded."""
        task = self._task
        if not self._running or task is None:
            return HealthStatus(healthy=False, message="github poll source stopped")
        if task.done():
            # The loop exited although stop() was never called.
            return HealthStatus(
                healthy=False,
                message=self._last_error or "github poll loop exited unexpectedly",
            )
        if self._last_error:
            return HealthStatus(healthy=False, message=self._last_error)
        return HealthStatus(healthy=True, message="github poll loop running")

    async def _run_loop(self) -> None:
        """Poll repeatedly until the stop event is set, sleeping ``interval`` between cycles."""
        try:
            interval = self._interval_s()
        except Exception as e:
            # Record the configuration error so health_check can report it,
            # instead of letting the task die unnoticed.
            self._last_error = str(e)
            log_ingest_event(
                logger,
                "github_poll_cycle_failed",
                source_id=self._source_id,
                error=str(e),
            )
            logger.exception("github poll could not start for %s", self._source_id)
            return
        while not self._stop.is_set():
            try:
                await self._poll_once()
            except asyncio.CancelledError:
                raise
            except Exception as e:
                self._last_error = str(e)
                log_ingest_event(
                    logger,
                    "github_poll_cycle_failed",
                    source_id=self._source_id,
                    error=str(e),
                )
                logger.exception("github poll failed for %s", self._source_id)
            # interruptible sleep: wake up early when stop() sets the event
            try:
                await asyncio.wait_for(self._stop.wait(), timeout=interval)
            except asyncio.TimeoutError:
                continue
            break

    async def _poll_once(self) -> None:
        """Run one poll cycle: fetch issues, retain the new ones, advance the cursor.

        The cursor only moves after every issue in the cycle was handled, so a
        failure part-way through makes the next cycle fetch the same window again.

        Raises:
            IngestError: if the client is not started or the response is not a JSON array.
            httpx.HTTPStatusError: if the API answers with an error status.
        """
        client = self._client
        if client is None:
            raise IngestError("GitHub poll client is not started")
        owner, repo = _owner_repo(self._config)
        issues = await self._fetch_issues(client, f"/repos/{owner}/{repo}/issues")

        # Pull requests count towards the cursor too, exactly like issues.
        cursor_max = max(
            (
                item["updated_at"]
                for item in issues
                if isinstance(item, dict) and isinstance(item.get("updated_at"), str)
            ),
            default=None,
        )

        for issue in issues:
            if not isinstance(issue, dict) or issue.get("pull_request"):
                continue
            await self._retain_issue(issue)

        if cursor_max:
            self._since = cursor_max
            self._prune_seen(cursor_max)
        self._last_error = None

    async def _fetch_issues(self, client: httpx.AsyncClient, path: str) -> list[Any]:
        """Fetch the issue list, following ``Link: rel="next"`` pages on incremental polls.

        The first poll (no cursor yet) reads a single page of the most recently
        updated issues. Later polls send ``since`` and follow pagination, up to
        ``_MAX_PAGES`` pages, so updates beyond the first page are not skipped.
        """
        params: dict[str, Any] = {
            "state": "all",
            "per_page": 50,
            "sort": "updated",
            "direction": "desc",
        }
        incremental = bool(self._since)
        if incremental:
            params["since"] = self._since

        collected: list[Any] = []
        url: str = path
        query: dict[str, Any] | None = params
        for _ in range(self._MAX_PAGES):
            r = await client.get(url, params=query)
            self._note_rate_limit(r)
            try:
                r.raise_for_status()
            except httpx.HTTPStatusError as e:
                log_ingest_event(
                    logger,
                    "github_poll_http_error",
                    source_id=self._source_id,
                    status_code=e.response.status_code,
                    url=str(e.request.url),
                )
                raise
            page = r.json()
            if not isinstance(page, list):
                raise IngestError("GitHub issues response must be a JSON array")
            collected.extend(page)
            if not incremental:
                return collected
            next_url = r.links.get("next", {}).get("url")
            if not next_url:
                return collected
            # The "next" link already carries the full query string.
            url, query = next_url, None

        log_ingest_event(
            logger,
            "github_poll_page_limit_reached",
            source_id=self._source_id,
            pages=self._MAX_PAGES,
        )
        return collected

    def _note_rate_limit(self, response: httpx.Response) -> None:
        """Emit a log event when fewer than 20 API calls remain in the rate-limit window."""
        rem_raw = response.headers.get("x-ratelimit-remaining")
        if rem_raw is None:
            return
        rem = _int_header(rem_raw)
        if rem is not None and rem < 20:
            log_ingest_event(
                logger,
                "github_poll_rate_limit_low",
                source_id=self._source_id,
                remaining=rem,
                reset=response.headers.get("x-ratelimit-reset"),
            )

    @staticmethod
    def _issue_text(number: int, title: str, body: str) -> str:
        """Render the retained text: ``[GitHub #N] title``, a blank line, then the body.

        An issue whose title and body are both blank is rendered as
        ``[GitHub #N] (empty)``.
        """
        if not title.strip() and not body.strip():
            return f"[GitHub #{number}] (empty)"
        return f"[GitHub #{number}] {title}\n\n{body}".strip()

    def _prune_seen(self, cursor: str) -> None:
        """Drop cache entries older than *cursor*.

        Later requests always send ``since=cursor``, so an issue is only listed
        again with an ``updated_at`` at or after the cursor; older cached values
        can never match and would only accumulate.
        """
        self._seen_updated = {
            issue_id: stamp for issue_id, stamp in self._seen_updated.items() if stamp >= cursor
        }

    async def _retain_issue(self, issue: dict[str, Any]) -> None:
        """Retain one issue unless it is malformed, unchanged, or has no resolvable bank."""
        iid = issue.get("id")
        upd = issue.get("updated_at")
        if not isinstance(iid, int) or not isinstance(upd, str):
            return
        if self._seen_updated.get(iid) == upd:
            return

        title = issue.get("title")
        body = issue.get("body")
        num = issue.get("number")
        html_url = issue.get("html_url")
        t = title if isinstance(title, str) else ""
        b = body if isinstance(body, str) else ""
        n = num if isinstance(num, int) else 0
        u = html_url if isinstance(html_url, str) else ""
        text = self._issue_text(n, t, b)

        user = issue.get("user")
        login = None
        if isinstance(user, dict):
            lg = user.get("login")
            if isinstance(lg, str):
                login = lg

        # A configured principal wins; otherwise fall back to the issue author.
        eff_principal = self._config.principal
        if isinstance(eff_principal, str) and eff_principal.strip():
            p = eff_principal.strip()
        elif login:
            p = f"github:{login}"
        else:
            p = None

        try:
            bank_id = resolve_ingest_bank_id(self._config, principal=p)
        except IngestError as e:
            logger.warning("github poll %s skip issue %s: %s", self._source_id, iid, e)
            return

        profile = self._config.extraction_profile
        ctx = AstrocyteContext(principal=p) if p else None
        metadata = {
            "github": {
                "issue_id": iid,
                "number": n,
                "html_url": u,
                "updated_at": upd,
                "author": login,
            }
        }

        await self._retain(
            text,
            bank_id,
            metadata=metadata,
            content_type="text",
            extraction_profile=profile,
            source=self._source_id,
            context=ctx,
        )
        # Mark as seen only after a successful retain so failures are retried.
        self._seen_updated[iid] = upd
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: astrocyte-ingestion-github
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: GitHub Issues API poll IngestSource adapter for Astrocyte
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: astrocyte<0.9,>=0.7.0
|
|
8
|
+
Requires-Dist: httpx>=0.27
|
|
9
|
+
Provides-Extra: dev
|
|
10
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# astrocyte-ingestion-github
|
|
15
|
+
|
|
16
|
+
**Poll** driver for Astrocyte `sources:` — ingests **GitHub repository issues** (not pull requests) via the [REST API](https://docs.github.com/en/rest/issues/issues#list-repository-issues).
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install astrocyte-ingestion-github
|
|
22
|
+
# or
|
|
23
|
+
pip install 'astrocyte[poll]'
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Config (`astrocyte.yaml`)
|
|
27
|
+
|
|
28
|
+
```yaml
|
|
29
|
+
sources:
|
|
30
|
+
gh_issues:
|
|
31
|
+
type: poll
|
|
32
|
+
driver: github
|
|
33
|
+
path: octocat/Hello-World # owner/repo
|
|
34
|
+
interval_seconds: 120 # >= 60 (GitHub API rate limits)
|
|
35
|
+
target_bank: engineering
|
|
36
|
+
auth:
|
|
37
|
+
token: ${GITHUB_TOKEN} # classic PAT or fine-grained token (issues read)
|
|
38
|
+
extraction_profile: builtin_text # optional
|
|
39
|
+
# Optional: GitHub Enterprise Server API root
|
|
40
|
+
# url: https://github.example.com/api/v3
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The adapter sets `Authorization: Bearer …` and uses `since` (max `updated_at` from the last response) to limit traffic. Each issue is retained as text `[GitHub #N] title` plus body; metadata includes `github.issue_id`, `number`, `html_url`, `updated_at`, `author`.
|
|
44
|
+
|
|
45
|
+
Principal for bank resolution: `sources.*.principal` if set; otherwise `github:<author_login>` from the issue.
|
|
46
|
+
|
|
47
|
+
## Entry point
|
|
48
|
+
|
|
49
|
+
Registers as **`github`** under **`astrocyte.ingest_poll_drivers`** (same discovery pattern as stream drivers).
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
astrocyte_ingestion_github/__init__.py,sha256=PQBCYli4fcWKZif6cnAYS258mJs0e3fuVYJ-oVpe2cE,197
|
|
2
|
+
astrocyte_ingestion_github/source.py,sha256=xFYK8BwJq9vHgkgjhsKSiI9cRkBOgvUVlL-tZrbURJc,9598
|
|
3
|
+
astrocyte_ingestion_github-0.8.0.dist-info/METADATA,sha256=juYL4awau9ZC90Zw1txbC-gBekrVb2HDvYhEy4eHuRI,1733
|
|
4
|
+
astrocyte_ingestion_github-0.8.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
5
|
+
astrocyte_ingestion_github-0.8.0.dist-info/entry_points.txt,sha256=-YuT3uNXcPGhafoc8sXepqoTgE3B-WOpvNAx2x283Bk,98
|
|
6
|
+
astrocyte_ingestion_github-0.8.0.dist-info/RECORD,,
|