airbyte-source-github 1.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. airbyte_source_github-1.5.7.dist-info/METADATA +144 -0
  2. airbyte_source_github-1.5.7.dist-info/RECORD +88 -0
  3. airbyte_source_github-1.5.7.dist-info/WHEEL +5 -0
  4. airbyte_source_github-1.5.7.dist-info/entry_points.txt +2 -0
  5. airbyte_source_github-1.5.7.dist-info/top_level.txt +3 -0
  6. integration_tests/__init__.py +0 -0
  7. integration_tests/abnormal_state.json +237 -0
  8. integration_tests/acceptance.py +16 -0
  9. integration_tests/configured_catalog.json +435 -0
  10. integration_tests/configured_catalog_full_refresh_test.json +415 -0
  11. integration_tests/invalid_config.json +5 -0
  12. integration_tests/sample_config.json +5 -0
  13. integration_tests/sample_state.json +137 -0
  14. source_github/__init__.py +27 -0
  15. source_github/config_migrations.py +106 -0
  16. source_github/constants.py +9 -0
  17. source_github/github_schema.py +41034 -0
  18. source_github/graphql.py +327 -0
  19. source_github/run.py +17 -0
  20. source_github/schemas/assignees.json +63 -0
  21. source_github/schemas/branches.json +48 -0
  22. source_github/schemas/collaborators.json +80 -0
  23. source_github/schemas/comments.json +104 -0
  24. source_github/schemas/commit_comment_reactions.json +4 -0
  25. source_github/schemas/commit_comments.json +53 -0
  26. source_github/schemas/commits.json +126 -0
  27. source_github/schemas/contributor_activity.json +109 -0
  28. source_github/schemas/deployments.json +77 -0
  29. source_github/schemas/events.json +63 -0
  30. source_github/schemas/issue_comment_reactions.json +4 -0
  31. source_github/schemas/issue_events.json +335 -0
  32. source_github/schemas/issue_labels.json +30 -0
  33. source_github/schemas/issue_milestones.json +61 -0
  34. source_github/schemas/issue_reactions.json +28 -0
  35. source_github/schemas/issue_timeline_events.json +1056 -0
  36. source_github/schemas/issues.json +281 -0
  37. source_github/schemas/organizations.json +197 -0
  38. source_github/schemas/project_cards.json +50 -0
  39. source_github/schemas/project_columns.json +38 -0
  40. source_github/schemas/projects.json +50 -0
  41. source_github/schemas/projects_v2.json +80 -0
  42. source_github/schemas/pull_request_comment_reactions.json +28 -0
  43. source_github/schemas/pull_request_commits.json +122 -0
  44. source_github/schemas/pull_request_stats.json +84 -0
  45. source_github/schemas/pull_requests.json +363 -0
  46. source_github/schemas/releases.json +126 -0
  47. source_github/schemas/repositories.json +313 -0
  48. source_github/schemas/review_comments.json +118 -0
  49. source_github/schemas/reviews.json +69 -0
  50. source_github/schemas/shared/events/comment.json +188 -0
  51. source_github/schemas/shared/events/commented.json +118 -0
  52. source_github/schemas/shared/events/committed.json +56 -0
  53. source_github/schemas/shared/events/cross_referenced.json +784 -0
  54. source_github/schemas/shared/events/reviewed.json +139 -0
  55. source_github/schemas/shared/reaction.json +27 -0
  56. source_github/schemas/shared/reactions.json +35 -0
  57. source_github/schemas/shared/user.json +59 -0
  58. source_github/schemas/shared/user_graphql.json +26 -0
  59. source_github/schemas/stargazers.json +19 -0
  60. source_github/schemas/tags.json +32 -0
  61. source_github/schemas/team_members.json +66 -0
  62. source_github/schemas/team_memberships.json +24 -0
  63. source_github/schemas/teams.json +50 -0
  64. source_github/schemas/users.json +63 -0
  65. source_github/schemas/workflow_jobs.json +109 -0
  66. source_github/schemas/workflow_runs.json +449 -0
  67. source_github/schemas/workflows.json +41 -0
  68. source_github/source.py +339 -0
  69. source_github/spec.json +179 -0
  70. source_github/streams.py +1678 -0
  71. source_github/utils.py +152 -0
  72. unit_tests/__init__.py +3 -0
  73. unit_tests/conftest.py +29 -0
  74. unit_tests/projects_v2_pull_requests_query.json +3 -0
  75. unit_tests/pull_request_stats_query.json +3 -0
  76. unit_tests/responses/contributor_activity_response.json +33 -0
  77. unit_tests/responses/graphql_reviews_responses.json +405 -0
  78. unit_tests/responses/issue_timeline_events.json +166 -0
  79. unit_tests/responses/issue_timeline_events_response.json +170 -0
  80. unit_tests/responses/projects_v2_response.json +45 -0
  81. unit_tests/responses/pull_request_comment_reactions.json +744 -0
  82. unit_tests/responses/pull_request_stats_response.json +317 -0
  83. unit_tests/test_migrations/test_config.json +8 -0
  84. unit_tests/test_migrations/test_new_config.json +8 -0
  85. unit_tests/test_multiple_token_authenticator.py +160 -0
  86. unit_tests/test_source.py +326 -0
  87. unit_tests/test_stream.py +1471 -0
  88. unit_tests/utils.py +78 -0
@@ -0,0 +1,1678 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import time
6
+ from abc import ABC, abstractmethod
7
+ from typing import Any, Iterable, List, Mapping, MutableMapping, Optional
8
+ from urllib import parse
9
+
10
+ import pendulum
11
+ import requests
12
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode
13
+ from airbyte_cdk.models import Type as MessageType
14
+ from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy
15
+ from airbyte_cdk.sources.streams.http import HttpStream
16
+ from airbyte_cdk.sources.streams.http.exceptions import DefaultBackoffException
17
+ from requests.exceptions import HTTPError
18
+
19
+ from . import constants
20
+ from .graphql import (
21
+ CursorStorage,
22
+ QueryReactions,
23
+ get_query_issue_reactions,
24
+ get_query_projectsV2,
25
+ get_query_pull_requests,
26
+ get_query_reviews,
27
+ )
28
+ from .utils import GitHubAPILimitException, getter
29
+
30
+
31
+ class GithubStreamABC(HttpStream, ABC):
32
+
33
+ primary_key = "id"
34
+
35
+ # Detect streams with high API load
36
+ large_stream = False
37
+
38
+ stream_base_params = {}
39
+
40
+ def __init__(self, api_url: str = "https://api.github.com", access_token_type: str = "", **kwargs):
41
+ if kwargs.get("authenticator"):
42
+ kwargs["authenticator"].max_time = self.max_time
43
+ super().__init__(**kwargs)
44
+
45
+ self.access_token_type = access_token_type
46
+ self.api_url = api_url
47
+
48
+ @property
49
+ def url_base(self) -> str:
50
+ return self.api_url
51
+
52
+ @property
53
+ def availability_strategy(self) -> Optional["AvailabilityStrategy"]:
54
+ return None
55
+
56
+ def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
57
+ links = response.links
58
+ if "next" in links:
59
+ next_link = links["next"]["url"]
60
+ parsed_link = parse.urlparse(next_link)
61
+ page = dict(parse.parse_qsl(parsed_link.query)).get("page")
62
+ return {"page": page}
63
+
64
+ def request_params(
65
+ self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
66
+ ) -> MutableMapping[str, Any]:
67
+
68
+ params = {"per_page": self.page_size}
69
+
70
+ if next_page_token:
71
+ params.update(next_page_token)
72
+
73
+ params.update(self.stream_base_params)
74
+
75
+ return params
76
+
77
+ def request_headers(self, **kwargs) -> Mapping[str, Any]:
78
+ # Without sending `User-Agent` header we will be getting `403 Client Error: Forbidden for url` error.
79
+ return {"User-Agent": "PostmanRuntime/7.28.0"}
80
+
81
+ def parse_response(
82
+ self,
83
+ response: requests.Response,
84
+ stream_state: Mapping[str, Any],
85
+ stream_slice: Mapping[str, Any] = None,
86
+ next_page_token: Mapping[str, Any] = None,
87
+ ) -> Iterable[Mapping]:
88
+ for record in response.json(): # GitHub puts records in an array.
89
+ yield self.transform(record=record, stream_slice=stream_slice)
90
+
91
+ def should_retry(self, response: requests.Response) -> bool:
92
+ if super().should_retry(response):
93
+ return True
94
+
95
+ retry_flag = (
96
+ # The GitHub GraphQL API has limitations
97
+ # https://docs.github.com/en/graphql/overview/resource-limitations
98
+ (response.headers.get("X-RateLimit-Resource") == "graphql" and self.check_graphql_rate_limited(response.json()))
99
+ # Rate limit HTTP headers
100
+ # https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limit-http-headers
101
+ or (response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0")
102
+ # Secondary rate limits
103
+ # https://docs.github.com/en/rest/overview/resources-in-the-rest-api#secondary-rate-limits
104
+ or "Retry-After" in response.headers
105
+ )
106
+ if retry_flag:
107
+ headers = [
108
+ "X-RateLimit-Resource",
109
+ "X-RateLimit-Remaining",
110
+ "X-RateLimit-Reset",
111
+ "X-RateLimit-Limit",
112
+ "X-RateLimit-Used",
113
+ "Retry-After",
114
+ ]
115
+ headers = ", ".join([f"{h}: {response.headers[h]}" for h in headers if h in response.headers])
116
+ if headers:
117
+ headers = f"HTTP headers: {headers},"
118
+
119
+ self.logger.info(
120
+ f"Rate limit handling for stream `{self.name}` for the response with {response.status_code} status code, {headers} with message: {response.text}"
121
+ )
122
+
123
+ return retry_flag
124
+
125
+ def backoff_time(self, response: requests.Response) -> Optional[float]:
126
+ # This method is called if we run into the rate limit. GitHub limits requests to 5000 per hour and provides
127
+ # `X-RateLimit-Reset` header which contains time when this hour will be finished and limits will be reset so
128
+ # we again could have 5000 per another hour.
129
+
130
+ min_backoff_time = 60.0
131
+ retry_after = response.headers.get("Retry-After")
132
+ if retry_after is not None:
133
+ backoff_time_in_seconds = max(float(retry_after), min_backoff_time)
134
+ return self.get_waiting_time(backoff_time_in_seconds)
135
+
136
+ reset_time = response.headers.get("X-RateLimit-Reset")
137
+ if reset_time:
138
+ backoff_time_in_seconds = max(float(reset_time) - time.time(), min_backoff_time)
139
+ return self.get_waiting_time(backoff_time_in_seconds)
140
+
141
+ def get_waiting_time(self, backoff_time_in_seconds):
142
+ if backoff_time_in_seconds < self.max_time:
143
+ return backoff_time_in_seconds
144
+ else:
145
+ self._session.auth.update_token() # New token will be used in next request
146
+ return 1
147
+
148
+ @staticmethod
149
+ def check_graphql_rate_limited(response_json: dict) -> bool:
150
+ errors = response_json.get("errors")
151
+ if errors:
152
+ for error in errors:
153
+ if error.get("type") == "RATE_LIMITED":
154
+ return True
155
+ return False
156
+
157
+ def read_records(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
158
+ # get out the stream_slice parts for later use.
159
+ organisation = stream_slice.get("organization", "")
160
+ repository = stream_slice.get("repository", "")
161
+ # Reading records while handling the errors
162
+ try:
163
+ yield from super().read_records(stream_slice=stream_slice, **kwargs)
164
+ except HTTPError as e:
165
+ # This whole try/except situation in `read_records()` isn't good but right now in `self._send_request()`
166
+ # function we have `response.raise_for_status()` so we don't have much choice on how to handle errors.
167
+ # Bocked on https://github.com/airbytehq/airbyte/issues/3514.
168
+ if e.response.status_code == requests.codes.NOT_FOUND:
169
+ # A lot of streams are not available for repositories owned by a user instead of an organization.
170
+ if isinstance(self, Organizations):
171
+ error_msg = f"Syncing `{self.__class__.__name__}` stream isn't available for organization `{organisation}`."
172
+ elif isinstance(self, TeamMemberships):
173
+ error_msg = f"Syncing `{self.__class__.__name__}` stream for organization `{organisation}`, team `{stream_slice.get('team_slug')}` and user `{stream_slice.get('username')}` isn't available: User has no team membership. Skipping..."
174
+ else:
175
+ error_msg = f"Syncing `{self.__class__.__name__}` stream isn't available for repository `{repository}`."
176
+ elif e.response.status_code == requests.codes.FORBIDDEN:
177
+ error_msg = str(e.response.json().get("message"))
178
+ # When using the `check_connection` method, we should raise an error if we do not have access to the repository.
179
+ if isinstance(self, Repositories):
180
+ raise e
181
+ # When `403` for the stream, that has no access to the organization's teams, based on OAuth Apps Restrictions:
182
+ # https://docs.github.com/en/organizations/restricting-access-to-your-organizations-data/enabling-oauth-app-access-restrictions-for-your-organization
183
+ # For all `Organisation` based streams
184
+ elif isinstance(self, Organizations) or isinstance(self, Teams) or isinstance(self, Users):
185
+ error_msg = (
186
+ f"Syncing `{self.name}` stream isn't available for organization `{organisation}`. Full error message: {error_msg}"
187
+ )
188
+ # For all other `Repository` base streams
189
+ else:
190
+ error_msg = (
191
+ f"Syncing `{self.name}` stream isn't available for repository `{repository}`. Full error message: {error_msg}"
192
+ )
193
+ elif e.response.status_code == requests.codes.UNAUTHORIZED:
194
+ if self.access_token_type == constants.PERSONAL_ACCESS_TOKEN_TITLE:
195
+ error_msg = str(e.response.json().get("message"))
196
+ self.logger.error(f"{self.access_token_type} renewal is required: {error_msg}")
197
+ raise e
198
+ elif e.response.status_code == requests.codes.GONE and isinstance(self, Projects):
199
+ # Some repos don't have projects enabled and we we get "410 Client Error: Gone for
200
+ # url: https://api.github.com/repos/xyz/projects?per_page=100" error.
201
+ error_msg = f"Syncing `Projects` stream isn't available for repository `{stream_slice['repository']}`."
202
+ elif e.response.status_code == requests.codes.CONFLICT:
203
+ error_msg = (
204
+ f"Syncing `{self.name}` stream isn't available for repository "
205
+ f"`{stream_slice['repository']}`, it seems like this repository is empty."
206
+ )
207
+ elif e.response.status_code == requests.codes.SERVER_ERROR and isinstance(self, WorkflowRuns):
208
+ error_msg = f"Syncing `{self.name}` stream isn't available for repository `{stream_slice['repository']}`."
209
+ elif e.response.status_code == requests.codes.BAD_GATEWAY:
210
+ error_msg = f"Stream {self.name} temporary failed. Try to re-run sync later"
211
+ else:
212
+ # most probably here we're facing a 500 server error and a risk to get a non-json response, so lets output response.text
213
+ self.logger.error(f"Undefined error while reading records: {e.response.text}")
214
+ raise e
215
+
216
+ self.logger.warning(error_msg)
217
+ except GitHubAPILimitException:
218
+ self.logger.warning(
219
+ f"Stream: `{self.name}`, slice: `{stream_slice}`. Limits for all provided tokens are reached, please try again later"
220
+ )
221
+
222
+
223
class GithubStream(GithubStreamABC):
    """Base for repository-scoped streams: one slice per configured repository,
    with the endpoint path derived from the stream name."""

    def __init__(self, repositories: List[str], page_size_for_large_streams: int, **kwargs):
        super().__init__(**kwargs)
        self.repositories = repositories
        # GitHub pagination could be from 1 to 100.
        # This parameter is deprecated and in future will be used sane default, page_size: 10
        if self.large_stream:
            self.page_size = page_size_for_large_streams
        else:
            self.page_size = constants.DEFAULT_PAGE_SIZE

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/{self.name}"

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        # Emit one slice per repository from the connector configuration.
        yield from ({"repository": repository} for repository in self.repositories)

    def get_error_display_message(self, exception: BaseException) -> Optional[str]:
        """For 502s on large streams, suggest lowering the page size; otherwise defer to the base class."""
        is_retryable_bad_gateway = (
            isinstance(exception, DefaultBackoffException) and exception.response.status_code == requests.codes.BAD_GATEWAY
        )
        if is_retryable_bad_gateway and self.large_stream and self.page_size > 1:
            return f'Please try to decrease the "Page size for large streams" below {self.page_size}. The stream "{self.name}" is a large stream, such streams can fail with 502 for high "page_size" values.'
        return super().get_error_display_message(exception)

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        # Tag every record with the repository it came from.
        record["repository"] = stream_slice["repository"]
        return record
251
+
252
+
253
class SemiIncrementalMixin:
    """
    Semi incremental streams are also incremental but with one difference, they:
    - read all records;
    - output only new records.
    This means that semi incremental streams read all records (like full_refresh streams) but do filtering directly
    in the code and output only latest records (like incremental streams).
    """

    cursor_field = "updated_at"

    # This flag is used to indicate that current stream supports `sort` and `direction` request parameters and that
    # we should break processing records if possible. If `sort` is set to `updated` and `direction` is set to `desc`
    # this means that latest records will be at the beginning of the response and after we processed those latest
    # records we can just stop and not process other record. This will increase speed of each incremental stream
    # which supports those 2 request parameters. Currently only `IssueMilestones` and `PullRequests` streams are
    # supporting this.
    is_sorted = False

    def __init__(self, start_date: str = "", **kwargs):
        super().__init__(**kwargs)
        self._start_date = start_date
        # Per-slice cache of starting points; cleared at the beginning of each sync in `stream_slices`.
        self._starting_point_cache = {}

    @property
    def slice_keys(self):
        """Slice keys used to address state: repository-scoped streams use `repository`, others `organization`."""
        if hasattr(self, "repositories"):
            return ["repository"]
        return ["organization"]

    # Alias bound to the same property descriptor, so `self.record_slice_key` resolves
    # through the getter above — records are addressed by the same key(s) as slices.
    record_slice_key = slice_keys

    def convert_cursor_value(self, value):
        # Identity by default; subclasses may override to normalize cursor values before comparison.
        return value

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        # Checkpoint once per page, but only when records arrive in ascending cursor
        # order (otherwise an intermediate state value would not be safe to save).
        if self.is_sorted == "asc":
            return self.page_size

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        """
        Return the latest state by comparing the cursor value in the latest record with the stream's most recent state
        object and returning an updated state object.
        """
        slice_value = getter(latest_record, self.record_slice_key)
        updated_state = self.convert_cursor_value(latest_record[self.cursor_field])
        stream_state_value = current_stream_state.get(slice_value, {}).get(self.cursor_field)
        if stream_state_value:
            updated_state = max(updated_state, stream_state_value)
        current_stream_state.setdefault(slice_value, {})[self.cursor_field] = updated_state
        return current_stream_state

    def _get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str:
        """Return the later of the configured start date and the saved state for this slice."""
        if stream_state:
            state_path = [stream_slice[k] for k in self.slice_keys] + [self.cursor_field]
            stream_state_value = getter(stream_state, state_path, strict=False)
            if stream_state_value:
                if self._start_date:
                    return max(self._start_date, stream_state_value)
                return stream_state_value
        return self._start_date

    def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str:
        """Memoized wrapper around `_get_starting_point`, keyed by the slice values."""
        cache_key = tuple([stream_slice[k] for k in self.slice_keys])
        if cache_key not in self._starting_point_cache:
            self._starting_point_cache[cache_key] = self._get_starting_point(stream_state, stream_slice)
        return self._starting_point_cache[cache_key]

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Read all records for the slice but yield only those newer than the starting point."""
        start_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
        for record in super().read_records(
            sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
        ):
            cursor_value = self.convert_cursor_value(record[self.cursor_field])
            if not start_point or cursor_value > start_point:
                yield record
            elif self.is_sorted == "desc" and cursor_value < start_point:
                # Records are sorted newest-first, so every remaining record is older too.
                break

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        # Drop memoized starting points so a new sync recomputes them from fresh state.
        self._starting_point_cache.clear()
        yield from super().stream_slices(**kwargs)
342
+
343
+
344
class IncrementalMixin(SemiIncrementalMixin):
    """Fully incremental variant: pushes the starting point to the API via the `since` query parameter."""

    def request_params(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]:
        params = super().request_params(stream_state=stream_state, **kwargs)
        start_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
        if start_point:
            params["since"] = start_point
        return params
351
+
352
+
353
+ # Below are full refresh streams
354
+
355
+
356
class RepositoryStats(GithubStream):
    """
    This stream is technical and not intended for the user, we use it for checking connection with the repository.
    API docs: https://docs.github.com/en/rest/reference/repos#get-a-repository
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}"

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
        # The endpoint returns a single repository object rather than an array.
        yield response.json()
367
+
368
+
369
class Assignees(GithubStream):
    """
    Lists available assignees for each repository. Relies entirely on the base
    `GithubStream` behavior; the endpoint path is derived from the stream name.

    API docs: https://docs.github.com/en/rest/issues/assignees?apiVersion=2022-11-28#list-assignees
    """
373
+
374
+
375
class Branches(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/branches/branches?apiVersion=2022-11-28#list-branches
    """

    # Branches have no numeric id; identify them by repository + branch name.
    primary_key = ["repository", "name"]

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/branches"
384
+
385
+
386
class Collaborators(GithubStream):
    """
    Lists collaborators of each repository. Relies entirely on the base
    `GithubStream` behavior; the endpoint path is derived from the stream name.

    API docs: https://docs.github.com/en/rest/collaborators/collaborators?apiVersion=2022-11-28#list-repository-collaborators
    """
390
+
391
+
392
class IssueLabels(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/issues/labels?apiVersion=2022-11-28#list-labels-for-a-repository
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/labels"
399
+
400
+
401
class Organizations(GithubStreamABC):
    """
    API docs: https://docs.github.com/en/rest/orgs/orgs?apiVersion=2022-11-28#list-organizations
    """

    # GitHub pagination could be from 1 to 100.
    page_size = 100

    def __init__(self, organizations: List[str], access_token_type: str = "", **kwargs):
        super().__init__(**kwargs)
        self.organizations = organizations
        self.access_token_type = access_token_type

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        # Emit one slice per configured organization.
        yield from ({"organization": organization} for organization in self.organizations)

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        organization = stream_slice["organization"]
        return f"orgs/{organization}"

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        # The endpoint returns a single organization object, not an array.
        yield response.json()

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        # Tag every record with the organization it came from.
        record["organization"] = stream_slice["organization"]
        return record
427
+
428
+
429
class Repositories(SemiIncrementalMixin, Organizations):
    """
    API docs: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-organization-repositories
    """

    # Results come back newest-first, which lets the semi-incremental read stop early.
    is_sorted = "desc"
    stream_base_params = {
        "sort": "updated",
        "direction": "desc",
    }

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        organization = stream_slice["organization"]
        return f"orgs/{organization}/repos"

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
        # GitHub returns an array of repository objects.
        yield from (self.transform(record=record, stream_slice=stream_slice) for record in response.json())
446
+
447
+
448
class Tags(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags
    """

    # Tags have no numeric id; identify them by repository + tag name.
    primary_key = ["repository", "name"]

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/tags"
457
+
458
+
459
class Teams(Organizations):
    """
    API docs: https://docs.github.com/en/rest/teams/teams?apiVersion=2022-11-28#list-teams
    """

    # Responses are cached because other streams are derived from this one.
    use_cache = True

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        organization = stream_slice["organization"]
        return f"orgs/{organization}/teams"

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
        # Unlike the parent class, this endpoint returns an array of team objects.
        yield from (self.transform(record=record, stream_slice=stream_slice) for record in response.json())
472
+
473
+
474
class Users(Organizations):
    """
    API docs: https://docs.github.com/en/rest/orgs/members?apiVersion=2022-11-28#list-organization-members
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        organization = stream_slice["organization"]
        return f"orgs/{organization}/members"

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
        # Unlike the parent class, this endpoint returns an array of member objects.
        yield from (self.transform(record=record, stream_slice=stream_slice) for record in response.json())
485
+
486
+
487
+ # Below are semi incremental streams
488
+
489
+
490
class Releases(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#list-releases
    """

    cursor_field = "created_at"

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """Flatten each asset's nested `uploader` object down to `uploader_id`."""
        record = super().transform(record=record, stream_slice=stream_slice)

        for asset in record.get("assets", []):
            uploader = asset.pop("uploader", None)
            asset["uploader_id"] = uploader.get("id") if uploader else None

        return record
506
+
507
+
508
class Events(SemiIncrementalMixin, GithubStream):
    """
    Repository events stream; inherits the default `GithubStream` path derived
    from the stream name.

    API docs: https://docs.github.com/en/rest/activity/events?apiVersion=2022-11-28#list-repository-events
    """

    # Events use `created_at` as the cursor instead of the mixin default `updated_at`.
    cursor_field = "created_at"
514
+
515
+
516
class PullRequests(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/pulls/pulls?apiVersion=2022-11-28#list-pull-requests

    On the very first sync (no state) records are read in ascending order so
    that state can be checkpointed if the sync fails halfway; once state
    exists, records are read descending so the semi-incremental early-stop
    applies.
    """

    use_cache = True
    large_stream = True

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._first_read = True

    def read_records(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
        """
        Decide if this a first read or not by the presence of the state object
        """
        self._first_read = not bool(stream_state)
        yield from super().read_records(stream_state=stream_state, **kwargs)

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"repos/{stream_slice['repository']}/pulls"

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """Replace the bulky `repo` object inside both `head` and `base` with just its id.

        Bug fix: the previous code popped `repo` from `record["head"]` on BOTH
        iterations, so `base["repo_id"]` was wrong (always None, taken from the
        already-popped head) and `base["repo"]` was never removed. Pop from the
        entry that is actually being processed.
        """
        record = super().transform(record=record, stream_slice=stream_slice)

        for nested in ("head", "base"):
            entry = record.get(nested, {})
            # `repo` can be explicitly null (e.g. a deleted fork), hence `or {}`.
            entry["repo_id"] = (entry.pop("repo", {}) or {}).get("id")

        return record

    def request_params(self, **kwargs) -> MutableMapping[str, Any]:
        base_params = super().request_params(**kwargs)
        # The very first time we read this stream we want to read ascending so we can save state in case of
        # a halfway failure. But if there is state, we read descending to allow incremental behavior.
        params = {"state": "all", "sort": "updated", "direction": self.is_sorted}

        return {**base_params, **params}

    @property
    def is_sorted(self) -> str:
        """
        Depending if there any state we read stream in ascending or descending order.
        """
        if self._first_read:
            return "asc"
        return "desc"
563
+
564
+
565
class CommitComments(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/commits/comments?apiVersion=2022-11-28#list-commit-comments-for-a-repository
    """

    use_cache = True

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/comments"
574
+
575
+
576
class IssueMilestones(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/issues/milestones?apiVersion=2022-11-28#list-milestones
    """

    # Results come back newest-first, which lets the semi-incremental read stop early.
    is_sorted = "desc"
    stream_base_params = {
        "state": "all",
        "sort": "updated",
        "direction": "desc",
    }

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/milestones"
590
+
591
+
592
class Stargazers(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/activity/starring?apiVersion=2022-11-28#list-stargazers
    """

    primary_key = "user_id"
    cursor_field = "starred_at"

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        # We need to send below header if we want to get `starred_at` field. See docs (Alternative response with
        # star creation timestamps) - https://docs.github.com/en/rest/reference/activity#list-stargazers.
        headers = dict(super().request_headers(**kwargs))
        headers["Accept"] = "application/vnd.github.v3.star+json"
        return headers

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """
        We need to provide the "user_id" for the primary_key attribute
        and don't remove the whole "user" block from the record.
        """
        record = super().transform(record=record, stream_slice=stream_slice)
        user = record.get("user")
        record["user_id"] = user.get("id")
        return record
616
+
617
+
618
class Projects(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/projects/projects?apiVersion=2022-11-28#list-repository-projects
    """

    use_cache = True
    stream_base_params = {
        "state": "all",
    }

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        # Projects stream requires sending following `Accept` header. If we won't sent it
        # we'll get `415 Client Error: Unsupported Media Type` error.
        headers = dict(super().request_headers(**kwargs))
        headers["Accept"] = "application/vnd.github.inertia-preview+json"
        return headers
635
+
636
+
637
class IssueEvents(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/issues/events?apiVersion=2022-11-28#list-issue-events-for-a-repository
    """

    # Issue events use `created_at` as the cursor instead of the mixin default `updated_at`.
    cursor_field = "created_at"

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/issues/events"
646
+
647
+
648
+ # Below are incremental streams
649
+
650
+
651
class Comments(IncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments-for-a-repository
    """

    use_cache = True
    large_stream = True
    # Comments can be a heavy stream; allow a few extra retries.
    max_retries = 7

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        return f"repos/{repository}/issues/comments"
662
+
663
+
664
class Commits(IncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#list-commits

    Pull commits from each branch of each repository, tracking state for each branch
    """

    primary_key = "sha"
    # `created_at` is synthesized in `transform` from `commit.author.date`; it is not a
    # top-level field of the raw API response.
    cursor_field = "created_at"
    slice_keys = ["repository", "branch"]

    def __init__(self, branches_to_pull: Mapping[str, List[str]], default_branches: Mapping[str, str], **kwargs):
        """
        :param branches_to_pull: repository full name -> list of branch names to sync
        :param default_branches: repository full name -> default branch name
        """
        super().__init__(**kwargs)
        self.branches_to_pull = branches_to_pull
        self.default_branches = default_branches

    def request_params(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]:
        # Deliberately skip IncrementalMixin.request_params in the MRO: state here is
        # nested per repository *and* branch, so `since` is computed below instead of by
        # the mixin's flat-state logic.
        params = super(IncrementalMixin, self).request_params(stream_state=stream_state, stream_slice=stream_slice, **kwargs)
        since = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
        if since:
            params["since"] = since
        # `sha` selects which branch the commits are listed from.
        params["sha"] = stream_slice["branch"]
        return params

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        # Expand each repository slice into one slice per configured branch.
        for stream_slice in super().stream_slices(**kwargs):
            repository = stream_slice["repository"]
            for branch in self.branches_to_pull.get(repository, []):
                yield {"branch": branch, "repository": repository}

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        record = super().transform(record=record, stream_slice=stream_slice)

        # Record of the `commits` stream doesn't have an updated_at/created_at field at the top level (so we could
        # just write `record["updated_at"]` or `record["created_at"]`). Instead each record has such value in
        # `commit.author.date`. So the easiest way is to just enrich the record returned from API with top level
        # field `created_at` and use it as cursor_field.
        # Include the branch in the record
        record["created_at"] = record["commit"]["author"]["date"]
        record["branch"] = stream_slice["branch"]

        return record

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        # State layout: {repository: {branch: {cursor_field: value}}} — keep the max
        # cursor value seen per branch.
        repository = latest_record["repository"]
        branch = latest_record["branch"]
        updated_state = latest_record[self.cursor_field]
        stream_state_value = current_stream_state.get(repository, {}).get(branch, {}).get(self.cursor_field)
        if stream_state_value:
            updated_state = max(updated_state, stream_state_value)
        current_stream_state.setdefault(repository, {}).setdefault(branch, {})[self.cursor_field] = updated_state
        return current_stream_state
716
+
717
+
718
class Issues(IncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues
    """

    use_cache = True
    large_stream = True
    is_sorted = "asc"

    # Request every issue (open and closed), ordered by update time ascending so the
    # incremental cursor can advance monotonically.
    stream_base_params = {"state": "all", "sort": "updated", "direction": "asc"}
732
+
733
+
734
class ReviewComments(IncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/pulls/comments?apiVersion=2022-11-28#list-review-comments-in-a-repository
    """

    use_cache = True
    large_stream = True

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        # Repository-wide pull-request review comments endpoint.
        repository = stream_slice["repository"]
        return f"repos/{repository}/pulls/comments"
744
+
745
+
746
class GitHubGraphQLStream(GithubStream, ABC):
    """Base class for streams backed by the GitHub GraphQL API (single POST /graphql endpoint)."""

    http_method = "POST"

    def path(
        self, *, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> str:
        # Every GraphQL query goes to the same endpoint; the query body selects the data.
        return "graphql"

    def should_retry(self, response: requests.Response) -> bool:
        # On gateway errors, halve the page size before retrying — oversized pages are a
        # likely cause of 502/504 responses from the GraphQL API.
        if response.status_code in (requests.codes.BAD_GATEWAY, requests.codes.GATEWAY_TIMEOUT):
            self.page_size = int(self.page_size / 2)
            return True
        # Otherwise restore the default page size, and also retry when the body carries
        # an `errors` key (GraphQL reports errors inside an HTTP 200 response).
        self.page_size = constants.DEFAULT_PAGE_SIZE_FOR_LARGE_STREAM if self.large_stream else constants.DEFAULT_PAGE_SIZE
        return super().should_retry(response) or response.json().get("errors")

    def _get_repository_name(self, repository: Mapping[str, Any]) -> str:
        # Build the "owner/name" full repository name from a GraphQL repository node.
        return repository["owner"]["login"] + "/" + repository["name"]

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        # Pagination state lives inside the GraphQL query body, not in URL params.
        return {}
769
+
770
+
771
class PullRequestStats(SemiIncrementalMixin, GitHubGraphQLStream):
    """
    API docs: https://docs.github.com/en/graphql/reference/objects#pullrequest
    """

    large_stream = True
    is_sorted = "asc"

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        # `repository` is null when the repo is inaccessible; yield nothing in that case.
        repository = response.json()["data"]["repository"]
        if repository:
            nodes = repository["pullRequests"]["nodes"]
            for record in nodes:
                # Flatten GraphQL count objects into plain integers for REST-like output.
                record["review_comments"] = sum([node["comments"]["totalCount"] for node in record["review_comments"]["nodes"]])
                record["comments"] = record["comments"]["totalCount"]
                record["commits"] = record["commits"]["totalCount"]
                record["repository"] = self._get_repository_name(repository)
                if record["merged_by"]:
                    # Rename GraphQL `__typename` to the REST-style `type` key.
                    record["merged_by"]["type"] = record["merged_by"].pop("__typename")
                yield record

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Cursor-based pagination over the pullRequests connection.
        repository = response.json()["data"]["repository"]
        if repository:
            pageInfo = repository["pullRequests"]["pageInfo"]
            if pageInfo["hasNextPage"]:
                return {"after": pageInfo["endCursor"]}

    def request_body_json(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Mapping]:
        # Build the GraphQL query for one repository slice, resuming at the page cursor.
        organization, name = stream_slice["repository"].split("/")
        if next_page_token:
            next_page_token = next_page_token["after"]
        query = get_query_pull_requests(
            owner=organization, name=name, first=self.page_size, after=next_page_token, direction=self.is_sorted.upper()
        )
        return {"query": query}

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        base_headers = super().request_headers(**kwargs)
        # https://docs.github.com/en/graphql/overview/schema-previews#merge-info-preview
        headers = {"Accept": "application/vnd.github.merge-info-preview+json"}
        return {**base_headers, **headers}
818
+
819
+
820
class Reviews(SemiIncrementalMixin, GitHubGraphQLStream):
    """
    API docs: https://docs.github.com/en/rest/pulls/reviews?apiVersion=2022-11-28#list-reviews-for-a-pull-request
    """

    is_sorted = False
    cursor_field = "updated_at"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # repository name -> endCursor of the outer pullRequests connection.
        self.pull_requests_cursor = {}
        # repository name -> {pull request number -> endCursor of its reviews connection}.
        self.reviews_cursors = {}

    def _get_records(self, pull_request, repository_name):
        """Yield review records from one pull request node."""
        for record in pull_request["reviews"]["nodes"]:
            record["repository"] = repository_name
            record["pull_request_url"] = pull_request["url"]
            if record["commit"]:
                record["commit_id"] = record.pop("commit")["oid"]
            if record["user"]:
                record["user"]["type"] = record["user"].pop("__typename")
            # for backward compatibility with REST API response
            record["_links"] = {
                "html": {"href": record["html_url"]},
                "pull_request": {"href": record["pull_request_url"]},
            }
            yield record

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        # Two query shapes: a page of pull requests ("pullRequests") on the first pass,
        # or a single pull request ("pullRequest") when draining leftover review pages.
        repository = response.json()["data"]["repository"]
        if repository:
            repository_name = self._get_repository_name(repository)
            if "pullRequests" in repository:
                for pull_request in repository["pullRequests"]["nodes"]:
                    yield from self._get_records(pull_request, repository_name)
            elif "pullRequest" in repository:
                yield from self._get_records(repository["pullRequest"], repository_name)

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Record all pending inner (reviews) cursors, then drain them one at a time;
        # only after they are exhausted advance the outer pullRequests cursor.
        repository = response.json()["data"]["repository"]
        if repository:
            repository_name = self._get_repository_name(repository)
            reviews_cursors = self.reviews_cursors.setdefault(repository_name, {})
            if "pullRequests" in repository:
                if repository["pullRequests"]["pageInfo"]["hasNextPage"]:
                    self.pull_requests_cursor[repository_name] = repository["pullRequests"]["pageInfo"]["endCursor"]
                for pull_request in repository["pullRequests"]["nodes"]:
                    if pull_request["reviews"]["pageInfo"]["hasNextPage"]:
                        pull_request_number = pull_request["number"]
                        reviews_cursors[pull_request_number] = pull_request["reviews"]["pageInfo"]["endCursor"]
            elif "pullRequest" in repository:
                if repository["pullRequest"]["reviews"]["pageInfo"]["hasNextPage"]:
                    pull_request_number = repository["pullRequest"]["number"]
                    reviews_cursors[pull_request_number] = repository["pullRequest"]["reviews"]["pageInfo"]["endCursor"]
            if reviews_cursors:
                number, after = reviews_cursors.popitem()
                return {"after": after, "number": number}
            if repository_name in self.pull_requests_cursor:
                return {"after": self.pull_requests_cursor.pop(repository_name)}

    def request_body_json(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Mapping]:
        # When `number` is present in the token, get_query_reviews targets a single pull
        # request; otherwise it pages over all pull requests from `after`.
        organization, name = stream_slice["repository"].split("/")
        if not next_page_token:
            next_page_token = {"after": None}
        query = get_query_reviews(owner=organization, name=name, first=self.page_size, **next_page_token)
        return {"query": query}
892
+
893
+
894
class PullRequestCommits(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/pulls/pulls?apiVersion=2022-11-28#list-commits-on-a-pull-request
    """

    primary_key = "sha"

    def __init__(self, parent: HttpStream, **kwargs):
        super().__init__(**kwargs)
        # Parent stream providing pull-request records to derive slices from.
        self.parent = parent

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        repository = stream_slice["repository"]
        pull_number = stream_slice["pull_number"]
        return f"repos/{repository}/pulls/{pull_number}/commits"

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        # Emit one slice per pull request read from the parent stream.
        parent_slices = self.parent.stream_slices(
            sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state
        )
        for parent_slice in parent_slices:
            pull_requests = self.parent.read_records(
                sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=parent_slice, stream_state=stream_state
            )
            for pull_request in pull_requests:
                yield {"repository": pull_request["repository"], "pull_number": pull_request["number"]}

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        # Attach the pull request number so each commit can be traced to its PR.
        transformed = super().transform(record=record, stream_slice=stream_slice)
        transformed["pull_number"] = stream_slice["pull_number"]
        return transformed
925
+
926
+
927
class ProjectsV2(SemiIncrementalMixin, GitHubGraphQLStream):
    """
    API docs: https://docs.github.com/en/graphql/reference/objects#projectv2
    """

    is_sorted = "asc"

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        # `repository` is null for inaccessible repos; yield nothing then.
        repository = response.json()["data"]["repository"]
        if repository:
            nodes = repository["projectsV2"]["nodes"]
            for record in nodes:
                # Flatten the nested owner object to its id only.
                record["owner_id"] = record.pop("owner").get("id")
                record["repository"] = self._get_repository_name(repository)
                yield record

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Cursor-based pagination over the projectsV2 connection.
        repository = response.json()["data"]["repository"]
        if repository:
            page_info = repository["projectsV2"]["pageInfo"]
            if page_info["hasNextPage"]:
                return {"after": page_info["endCursor"]}

    def request_body_json(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Mapping]:
        # Build the GraphQL query for one repository slice, resuming at the cursor.
        organization, name = stream_slice["repository"].split("/")
        if next_page_token:
            next_page_token = next_page_token["after"]
        query = get_query_projectsV2(
            owner=organization, name=name, first=self.page_size, after=next_page_token, direction=self.is_sorted.upper()
        )
        return {"query": query}
963
+
964
+
965
+ # Reactions streams
966
+
967
+
968
class ReactionStream(GithubStream, ABC):
    """
    Base class for reaction streams: for every record of a parent (comment) stream,
    fetch the reactions attached to it. State is nested per repository and parent id.
    """

    # Field of the parent record whose value identifies the reaction list in the URL.
    parent_key = "id"
    # Key under which the parent id is copied into slices and output records.
    copy_parent_key = "comment_id"
    cursor_field = "created_at"

    def __init__(self, start_date: str = "", **kwargs):
        super().__init__(**kwargs)
        # The parent stream is built with the same kwargs plus the start date.
        kwargs["start_date"] = start_date
        self._parent_stream = self.parent_entity(**kwargs)
        self._start_date = start_date

    @property
    @abstractmethod
    def parent_entity(self):
        """
        Specify the class of the parent stream for which receive reactions
        """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        # Append "/<parent id>/reactions" to the parent stream's path.
        parent_path = self._parent_stream.path(stream_slice=stream_slice, **kwargs)
        return f"{parent_path}/{stream_slice[self.copy_parent_key]}/reactions"

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        # One slice per parent record within each repository slice.
        for stream_slice in super().stream_slices(**kwargs):
            for parent_record in self._parent_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice):
                yield {self.copy_parent_key: parent_record[self.parent_key], "repository": stream_slice["repository"]}

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        # State layout: {repository: {parent_id: {cursor_field: value}}} — keep the max.
        repository = latest_record["repository"]
        parent_id = str(latest_record[self.copy_parent_key])
        updated_state = latest_record[self.cursor_field]
        stream_state_value = current_stream_state.get(repository, {}).get(parent_id, {}).get(self.cursor_field)
        if stream_state_value:
            updated_state = max(updated_state, stream_state_value)
        current_stream_state.setdefault(repository, {}).setdefault(parent_id, {})[self.cursor_field] = updated_state
        return current_stream_state

    def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str:
        # Resume point for a slice: the stored cursor, never earlier than start_date.
        if stream_state:
            repository = stream_slice["repository"]
            parent_id = str(stream_slice[self.copy_parent_key])
            stream_state_value = stream_state.get(repository, {}).get(parent_id, {}).get(self.cursor_field)
            if stream_state_value:
                if self._start_date:
                    return max(self._start_date, stream_state_value)
                return stream_state_value
        return self._start_date

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # Incremental filtering is done client-side by comparing the cursor value.
        starting_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
        for record in super().read_records(
            sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
        ):
            if not starting_point or record[self.cursor_field] > starting_point:
                yield record

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        # Copy the parent id into the record so reactions can be joined back.
        record = super().transform(record, stream_slice)
        record[self.copy_parent_key] = stream_slice[self.copy_parent_key]
        return record
1035
+
1036
+
1037
class CommitCommentReactions(ReactionStream):
    """
    Reactions attached to commit comments.

    API docs: https://docs.github.com/en/rest/reference/reactions?apiVersion=2022-11-28#list-reactions-for-a-commit-comment
    """

    # Parent stream supplying the commit comments whose reactions are fetched.
    parent_entity = CommitComments
1043
+
1044
+
1045
class IssueCommentReactions(ReactionStream):
    """
    Reactions attached to issue comments.

    API docs: https://docs.github.com/en/rest/reactions/reactions?apiVersion=2022-11-28#list-reactions-for-an-issue-comment
    """

    # Parent stream supplying the issue comments whose reactions are fetched.
    parent_entity = Comments
1051
+
1052
+
1053
class IssueReactions(SemiIncrementalMixin, GitHubGraphQLStream):
    """
    https://docs.github.com/en/graphql/reference/objects#issue
    https://docs.github.com/en/graphql/reference/objects#reaction
    """

    cursor_field = "created_at"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # repository name -> endCursor of the outer issues connection.
        self.issues_cursor = {}
        # repository name -> {issue number -> endCursor of its reactions connection}.
        self.reactions_cursors = {}

    def _get_reactions_from_issue(self, issue, repository_name):
        # Enrich each reaction with repository / issue context expected downstream.
        for reaction in issue["reactions"]["nodes"]:
            reaction["repository"] = repository_name
            reaction["issue_number"] = issue["number"]
            reaction["user"]["type"] = "User"
            yield reaction

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        # Two query shapes: a page of issues ("issues") on the first pass, or a single
        # issue ("issue") when draining leftover reaction pages.
        repository = response.json()["data"]["repository"]
        if repository:
            repository_name = self._get_repository_name(repository)
            if "issues" in repository:
                for issue in repository["issues"]["nodes"]:
                    yield from self._get_reactions_from_issue(issue, repository_name)
            elif "issue" in repository:
                yield from self._get_reactions_from_issue(repository["issue"], repository_name)

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Record pending inner (reactions) cursors and drain them one by one; only after
        # they are exhausted advance the outer issues cursor.
        repository = response.json()["data"]["repository"]
        if repository:
            repository_name = self._get_repository_name(repository)
            reactions_cursors = self.reactions_cursors.setdefault(repository_name, {})
            if "issues" in repository:
                if repository["issues"]["pageInfo"]["hasNextPage"]:
                    self.issues_cursor[repository_name] = repository["issues"]["pageInfo"]["endCursor"]
                for issue in repository["issues"]["nodes"]:
                    if issue["reactions"]["pageInfo"]["hasNextPage"]:
                        issue_number = issue["number"]
                        reactions_cursors[issue_number] = issue["reactions"]["pageInfo"]["endCursor"]
            elif "issue" in repository:
                if repository["issue"]["reactions"]["pageInfo"]["hasNextPage"]:
                    issue_number = repository["issue"]["number"]
                    reactions_cursors[issue_number] = repository["issue"]["reactions"]["pageInfo"]["endCursor"]
            if reactions_cursors:
                number, after = reactions_cursors.popitem()
                return {"after": after, "number": number}
            if repository_name in self.issues_cursor:
                return {"after": self.issues_cursor.pop(repository_name)}

    def request_body_json(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Mapping]:
        # When `number` is present in the token, the query targets a single issue;
        # otherwise it pages over all issues from `after`.
        organization, name = stream_slice["repository"].split("/")
        if not next_page_token:
            next_page_token = {"after": None}
        query = get_query_issue_reactions(owner=organization, name=name, first=self.page_size, **next_page_token)
        return {"query": query}
1116
+
1117
+
1118
class PullRequestCommentReactions(SemiIncrementalMixin, GitHubGraphQLStream):
    """
    API docs:
    https://docs.github.com/en/graphql/reference/objects#pullrequestreviewcomment
    https://docs.github.com/en/graphql/reference/objects#reaction

    Traverses a four-level GraphQL hierarchy:
    repository -> pullRequests -> reviews -> comments -> reactions,
    with cursor bookkeeping delegated to CursorStorage / QueryReactions.
    """

    cursor_field = "created_at"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Cursor bookkeeping for every level of the nested connections.
        self.cursor_storage = CursorStorage(["PullRequest", "PullRequestReview", "PullRequestReviewComment", "Reaction"])
        self.query_reactions = QueryReactions()

    def _get_reactions_from_comment(self, comment, repository):
        # Enrich each reaction with repository / comment context.
        for reaction in comment["reactions"]["nodes"]:
            reaction["repository"] = self._get_repository_name(repository)
            reaction["comment_id"] = comment["id"]
            if reaction["user"]:
                reaction["user"]["type"] = "User"
            yield reaction

    def _get_reactions_from_review(self, review, repository):
        for comment in review["comments"]["nodes"]:
            yield from self._get_reactions_from_comment(comment, repository)

    def _get_reactions_from_pull_request(self, pull_request, repository):
        for review in pull_request["reviews"]["nodes"]:
            yield from self._get_reactions_from_review(review, repository)

    def _get_reactions_from_repository(self, repository):
        for pull_request in repository["pullRequests"]["nodes"]:
            yield from self._get_reactions_from_pull_request(pull_request, repository)

    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
        # The root query returns "repository"; follow-up (cursor resume) queries return a
        # "node" typed at whatever level the resume happens.
        data = response.json()["data"]
        repository = data.get("repository")
        if repository:
            yield from self._get_reactions_from_repository(repository)

        node = data.get("node")
        if node:
            if node["__typename"] == "PullRequest":
                yield from self._get_reactions_from_pull_request(node, node["repository"])
            elif node["__typename"] == "PullRequestReview":
                yield from self._get_reactions_from_review(node, node["repository"])
            elif node["__typename"] == "PullRequestReviewComment":
                yield from self._get_reactions_from_comment(node, node["repository"])

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        # Register a cursor for every connection that still has pages, at every level of
        # the hierarchy, then let CursorStorage pick the next one to drain.
        data = response.json()["data"]
        repository = data.get("repository")
        if repository:
            self._add_cursor(repository, "pullRequests")
            for pull_request in repository["pullRequests"]["nodes"]:
                self._add_cursor(pull_request, "reviews")
                for review in pull_request["reviews"]["nodes"]:
                    self._add_cursor(review, "comments")
                    for comment in review["comments"]["nodes"]:
                        self._add_cursor(comment, "reactions")

        node = data.get("node")
        if node:
            if node["__typename"] == "PullRequest":
                self._add_cursor(node, "reviews")
                for review in node["reviews"]["nodes"]:
                    self._add_cursor(review, "comments")
                    for comment in review["comments"]["nodes"]:
                        self._add_cursor(comment, "reactions")
            elif node["__typename"] == "PullRequestReview":
                self._add_cursor(node, "comments")
                for comment in node["comments"]["nodes"]:
                    self._add_cursor(comment, "reactions")
            elif node["__typename"] == "PullRequestReviewComment":
                self._add_cursor(node, "reactions")

        return self.cursor_storage.get_cursor()

    def _add_cursor(self, node, link):
        # Map a connection field name to the GraphQL object type it pages over.
        link_to_object = {
            "reactions": "Reaction",
            "comments": "PullRequestReviewComment",
            "reviews": "PullRequestReview",
            "pullRequests": "PullRequest",
        }

        pageInfo = node[link]["pageInfo"]
        if pageInfo["hasNextPage"]:
            self.cursor_storage.add_cursor(
                link_to_object[link], pageInfo["endCursor"], node[link]["totalCount"], parent_id=node.get("node_id")
            )

    def request_body_json(
        self,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Optional[Mapping]:
        # Build the query rooted at the level the stored cursor points to; with no
        # token, start from the repository root.
        organization, name = stream_slice["repository"].split("/")
        if next_page_token:
            after = next_page_token["cursor"]
            # Never request more items than remain in the connection.
            page_size = min(self.page_size, next_page_token["total_count"])
            if next_page_token["typename"] == "PullRequest":
                query = self.query_reactions.get_query_root_repository(owner=organization, name=name, first=page_size, after=after)
            elif next_page_token["typename"] == "PullRequestReview":
                query = self.query_reactions.get_query_root_pull_request(node_id=next_page_token["parent_id"], first=page_size, after=after)
            elif next_page_token["typename"] == "PullRequestReviewComment":
                query = self.query_reactions.get_query_root_review(node_id=next_page_token["parent_id"], first=page_size, after=after)
            elif next_page_token["typename"] == "Reaction":
                query = self.query_reactions.get_query_root_comment(node_id=next_page_token["parent_id"], first=page_size, after=after)
        else:
            query = self.query_reactions.get_query_root_repository(owner=organization, name=name, first=self.page_size)

        return {"query": query}
1232
+
1233
+
1234
class Deployments(SemiIncrementalMixin, GithubStream):
    """
    API docs: https://docs.github.com/en/rest/deployments/deployments?apiVersion=2022-11-28#list-deployments
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        # Repository-scoped deployments endpoint.
        repository = stream_slice["repository"]
        return f"repos/{repository}/deployments"
1241
+
1242
+
1243
class ProjectColumns(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/projects/columns?apiVersion=2022-11-28#list-project-columns

    Child stream: one slice per project of the parent stream (presumably `Projects` —
    confirm against the source wiring). State is nested per repository and project id.
    """

    use_cache = True
    cursor_field = "updated_at"

    def __init__(self, parent: HttpStream, start_date: str, **kwargs):
        super().__init__(**kwargs)
        self.parent = parent
        self._start_date = start_date

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"projects/{stream_slice['project_id']}/columns"

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        # One slice per project record read from the parent stream.
        parent_stream_slices = self.parent.stream_slices(
            sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state
        )
        for stream_slice in parent_stream_slices:
            parent_records = self.parent.read_records(
                sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
            )
            for record in parent_records:
                yield {"repository": record["repository"], "project_id": record["id"]}

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # Incremental filtering is done client-side by comparing the cursor value.
        starting_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
        for record in super().read_records(
            sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
        ):
            if not starting_point or record[self.cursor_field] > starting_point:
                yield record

    def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str:
        # Resume point for a slice: the stored cursor, never earlier than start_date.
        if stream_state:
            repository = stream_slice["repository"]
            project_id = str(stream_slice["project_id"])
            stream_state_value = stream_state.get(repository, {}).get(project_id, {}).get(self.cursor_field)
            if stream_state_value:
                if self._start_date:
                    return max(self._start_date, stream_state_value)
                return stream_state_value
        return self._start_date

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
        # State layout: {repository: {project_id: {cursor_field: value}}} — keep the max.
        repository = latest_record["repository"]
        project_id = str(latest_record["project_id"])
        updated_state = latest_record[self.cursor_field]
        stream_state_value = current_stream_state.get(repository, {}).get(project_id, {}).get(self.cursor_field)
        if stream_state_value:
            updated_state = max(updated_state, stream_state_value)
        current_stream_state.setdefault(repository, {}).setdefault(project_id, {})[self.cursor_field] = updated_state
        return current_stream_state

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        # Attach the owning project id to every column record.
        record = super().transform(record=record, stream_slice=stream_slice)
        record["project_id"] = stream_slice["project_id"]
        return record
1311
+
1312
+
1313
+ class ProjectCards(GithubStream):
1314
+ """
1315
+ API docs: https://docs.github.com/en/rest/projects/cards?apiVersion=2022-11-28#list-project-cards
1316
+ """
1317
+
1318
+ cursor_field = "updated_at"
1319
+ stream_base_params = {"archived_state": "all"}
1320
+
1321
+ def __init__(self, parent: HttpStream, start_date: str, **kwargs):
1322
+ super().__init__(**kwargs)
1323
+ self.parent = parent
1324
+ self._start_date = start_date
1325
+
1326
+ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
1327
+ return f"projects/columns/{stream_slice['column_id']}/cards"
1328
+
1329
+ def stream_slices(
1330
+ self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
1331
+ ) -> Iterable[Optional[Mapping[str, Any]]]:
1332
+ parent_stream_slices = self.parent.stream_slices(
1333
+ sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state
1334
+ )
1335
+ for stream_slice in parent_stream_slices:
1336
+ parent_records = self.parent.read_records(
1337
+ sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
1338
+ )
1339
+ for record in parent_records:
1340
+ yield {"repository": record["repository"], "project_id": record["project_id"], "column_id": record["id"]}
1341
+
1342
+ def read_records(
1343
+ self,
1344
+ sync_mode: SyncMode,
1345
+ cursor_field: List[str] = None,
1346
+ stream_slice: Mapping[str, Any] = None,
1347
+ stream_state: Mapping[str, Any] = None,
1348
+ ) -> Iterable[Mapping[str, Any]]:
1349
+ starting_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
1350
+ for record in super().read_records(
1351
+ sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
1352
+ ):
1353
+ if not starting_point or record[self.cursor_field] > starting_point:
1354
+ yield record
1355
+
1356
def get_starting_point(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any]) -> str:
    """Return the cursor lower bound for this (repository, project, column) slice.

    Picks the later of the saved state value and the configured start date;
    falls back to the start date when no state exists for the slice.
    """
    if stream_state:
        state_value = (
            stream_state.get(stream_slice["repository"], {})
            .get(str(stream_slice["project_id"]), {})
            .get(str(stream_slice["column_id"]), {})
            .get(self.cursor_field)
        )
        if state_value:
            return max(self._start_date, state_value) if self._start_date else state_value
    return self._start_date
def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]):
    """Fold `latest_record`'s cursor value into the nested repo/project/column state, keeping the max."""
    repo = latest_record["repository"]
    project = str(latest_record["project_id"])
    column = str(latest_record["column_id"])
    candidate = latest_record[self.cursor_field]
    existing = current_stream_state.get(repo, {}).get(project, {}).get(column, {}).get(self.cursor_field)
    if existing:
        candidate = max(candidate, existing)
    column_state = current_stream_state.setdefault(repo, {}).setdefault(project, {}).setdefault(column, {})
    column_state[self.cursor_field] = candidate
    return current_stream_state
def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
    """Enrich the card record with the project and column ids taken from its slice."""
    record = super().transform(record=record, stream_slice=stream_slice)
    record.update({"project_id": stream_slice["project_id"], "column_id": stream_slice["column_id"]})
    return record
class Workflows(SemiIncrementalMixin, GithubStream):
    """
    Get all workflows of a GitHub repository
    API documentation: https://docs.github.com/en/rest/actions/workflows?apiVersion=2022-11-28#list-repository-workflows
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"repos/{stream_slice['repository']}/actions/workflows"

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
        # Records are nested under the "workflows" key of the payload. Default to an
        # empty list so an unexpected payload yields nothing instead of raising
        # TypeError when iterating None (fixes `.get("workflows")` without default).
        for record in response.json().get("workflows", []):
            yield self.transform(record=record, stream_slice=stream_slice)

    def convert_cursor_value(self, value):
        # Normalize the API timestamp to a UTC ISO-8601 string so cursor comparisons
        # across records are consistent.
        return pendulum.parse(value).in_tz(tz="UTC").format("YYYY-MM-DDTHH:mm:ss[Z]")
class WorkflowRuns(SemiIncrementalMixin, GithubStream):
    """
    Get all workflow runs for a GitHub repository
    API documentation: https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-repository
    """

    # key for accessing slice value from record
    record_slice_key = ["repository", "full_name"]

    # Number of days after which a workflow run can no longer be re-run, i.e. its
    # records can no longer change.
    # https://docs.github.com/en/actions/managing-workflow-runs/re-running-workflows-and-jobs
    re_run_period = 32  # days

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"repos/{stream_slice['repository']}/actions/runs"

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
        # Records are nested under the "workflow_runs" key of the payload.
        response = response.json().get("workflow_runs")
        for record in response:
            yield record

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # Records in the workflows_runs stream are naturally descending sorted by `created_at` field.
        # On first sight this is not big deal because cursor_field is `updated_at`.
        # But we still can use `created_at` as a breakpoint because after 30 days period
        # https://docs.github.com/en/actions/managing-workflow-runs/re-running-workflows-and-jobs
        # workflows_runs records cannot be updated. It means if we initially fully synced stream on subsequent incremental sync we need
        # only to look behind on 30 days to find all records which were updated.
        start_point = self.get_starting_point(stream_state=stream_state, stream_slice=stream_slice)
        break_point = None
        if start_point:
            # Runs created before this moment can no longer be re-run, so they cannot
            # have been updated after `start_point`.
            break_point = (pendulum.parse(start_point) - pendulum.duration(days=self.re_run_period)).to_iso8601_string()
        # NOTE: super(SemiIncrementalMixin, self) deliberately bypasses the mixin's
        # filtering read_records; the cursor/break-point filtering below replaces it.
        for record in super(SemiIncrementalMixin, self).read_records(
            sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
        ):
            cursor_value = record[self.cursor_field]
            created_at = record["created_at"]
            if not start_point or cursor_value > start_point:
                yield record
            # Records arrive sorted by created_at descending, so once we are past the
            # break point no remaining record can be relevant — stop paginating.
            if break_point and created_at < break_point:
                break
class WorkflowJobs(SemiIncrementalMixin, GithubStream):
    """
    Get all workflow jobs for a workflow run
    API documentation: https://docs.github.com/pt/rest/actions/workflow-jobs?apiVersion=2022-11-28#list-jobs-for-a-workflow-run
    """

    # Jobs are cursored on completion time; records lacking it are skipped in parse_response.
    cursor_field = "completed_at"

    def __init__(self, parent: WorkflowRuns, **kwargs):
        super().__init__(**kwargs)
        self.parent = parent

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"repos/{stream_slice['repository']}/actions/runs/{stream_slice['run_id']}/jobs"

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """Iterate the parent WorkflowRuns stream and read the jobs of each run."""
        parent_stream_state = None
        if stream_state is not None:
            # Translate this stream's state (keyed by our cursor_field) into state the
            # parent stream understands (keyed by the parent's own cursor_field).
            parent_stream_state = {repository: {self.parent.cursor_field: v[self.cursor_field]} for repository, v in stream_state.items()}
        parent_stream_slices = self.parent.stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=parent_stream_state)
        for stream_slice in parent_stream_slices:
            parent_records = self.parent.read_records(
                sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=parent_stream_state
            )
            for record in parent_records:
                # Reuse the parent slice dict, adding the run id the jobs endpoint needs.
                stream_slice["run_id"] = record["id"]
                yield from super().read_records(
                    sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
                )

    def parse_response(
        self,
        response: requests.Response,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Iterable[Mapping]:
        for record in response.json()["jobs"]:
            # Skip jobs without a completion timestamp — they carry no cursor value.
            if record.get(self.cursor_field):
                yield self.transform(record=record, stream_slice=stream_slice)

    def request_params(
        self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        params = super().request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token)
        # Per the workflow-jobs API, filter=all includes jobs from all run attempts,
        # not only the latest attempt.
        params["filter"] = "all"
        return params
class TeamMembers(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/teams/members?apiVersion=2022-11-28#list-team-members
    """

    use_cache = True
    primary_key = ["id", "team_slug"]

    def __init__(self, parent: Teams, **kwargs):
        """Child stream of `parent` (Teams); one request per team slug."""
        super().__init__(**kwargs)
        self.parent = parent

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        org, team = stream_slice["organization"], stream_slice["team_slug"]
        return f"orgs/{org}/teams/{team}/members"

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """Yield one slice per team produced by the parent stream."""
        for team_slice in self.parent.stream_slices(
            sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state
        ):
            teams = self.parent.read_records(
                sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=team_slice, stream_state=stream_state
            )
            for team in teams:
                yield {"organization": team["organization"], "team_slug": team["slug"]}

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """Attach the slice's organization and team slug to each member record."""
        record.update({"organization": stream_slice["organization"], "team_slug": stream_slice["team_slug"]})
        return record
class TeamMemberships(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/teams/members?apiVersion=2022-11-28#get-team-membership-for-a-user
    """

    primary_key = ["url"]

    def __init__(self, parent: TeamMembers, **kwargs):
        """Child stream of `parent` (TeamMembers); one request per member."""
        super().__init__(**kwargs)
        self.parent = parent

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"orgs/{stream_slice['organization']}/teams/{stream_slice['team_slug']}/memberships/{stream_slice['username']}"

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """Yield one slice per team member produced by the parent stream."""
        for member_slice in self.parent.stream_slices(
            sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state
        ):
            members = self.parent.read_records(
                sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=member_slice, stream_state=stream_state
            )
            for member in members:
                yield {"organization": member["organization"], "team_slug": member["team_slug"], "username": member["login"]}

    def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any], **kwargs) -> Iterable[Mapping]:
        # This endpoint returns a single membership object rather than a list.
        yield self.transform(response.json(), stream_slice=stream_slice)

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """Copy the identifying slice values onto the membership record."""
        for key in ("organization", "team_slug", "username"):
            record[key] = stream_slice[key]
        return record
class ContributorActivity(GithubStream):
    """
    API docs: https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#get-all-contributor-commit-activity
    """

    def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
        return f"repos/{stream_slice['repository']}/stats/contributors"

    def request_headers(self, **kwargs) -> Mapping[str, Any]:
        params = super().request_headers(**kwargs)
        params.update({"Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28"})
        return params

    def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
        # Flatten the nested "author" object into the top-level record.
        record["repository"] = stream_slice["repository"]
        record.update(record.pop("author"))
        return record

    def should_retry(self, response: requests.Response) -> bool:
        """
        If the data hasn't been cached when you query a repository's statistics, you'll receive a 202 response, need to retry to get results
        see for more info https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#a-word-about-caching
        """
        # Fix: the original returned True or fell through to an implicit None.
        # Return an explicit bool (None was falsy, so behavior is unchanged).
        return super().should_retry(response) or response.status_code == requests.codes.ACCEPTED

    def backoff_time(self, response: requests.Response) -> Optional[float]:
        # GitHub is still computing the stats (202): wait 90s before polling again.
        return 90 if response.status_code == requests.codes.ACCEPTED else super().backoff_time(response)

    def parse_response(
        self,
        response: requests.Response,
        stream_state: Mapping[str, Any],
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None,
    ) -> Iterable[Mapping]:
        if response.status_code == requests.codes.NO_CONTENT:
            # 204 means the repository has no contributor statistics at all.
            self.logger.warning(f"Empty response received for {self.name} stats in repository {stream_slice.get('repository')}")
        else:
            yield from super().parse_response(
                response, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token
            )

    def read_records(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
        """Read contributor stats; a persistent 202 (stats never became ready) is reported
        as a log message instead of failing the sync."""
        repository = stream_slice.get("repository", "")
        try:
            yield from super().read_records(stream_slice=stream_slice, **kwargs)
        except HTTPError as e:
            if e.response.status_code == requests.codes.ACCEPTED:
                # Retries were exhausted while GitHub was still generating the stats.
                yield AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.INFO,
                        message=f"Syncing `{self.__class__.__name__}` " f"stream isn't available for repository `{repository}`.",
                    ),
                )
            else:
                raise e
+ class IssueTimelineEvents(GithubStream):
1641
+ """
1642
+ API docs https://docs.github.com/en/rest/issues/timeline?apiVersion=2022-11-28#list-timeline-events-for-an-issue
1643
+ """
1644
+
1645
+ primary_key = ["repository", "issue_number"]
1646
+
1647
def __init__(self, **kwargs):
    """Build the parent `Issues` stream that supplies the issues whose timelines are read."""
    super().__init__(**kwargs)
    self.parent = Issues(**kwargs)
def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
    """Endpoint for the timeline events of one issue."""
    repo, number = stream_slice["repository"], stream_slice["number"]
    return f"repos/{repo}/issues/{number}/timeline"
def stream_slices(
    self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
) -> Iterable[Optional[Mapping[str, Any]]]:
    """Yield one slice per issue emitted by the parent stream."""
    for issue_slice in self.parent.stream_slices(
        sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_state=stream_state
    ):
        issues = self.parent.read_records(
            sync_mode=SyncMode.full_refresh, cursor_field=cursor_field, stream_slice=issue_slice, stream_state=stream_state
        )
        for issue in issues:
            yield {"repository": issue["repository"], "number": issue["number"]}
def parse_response(
    self,
    response: requests.Response,
    stream_state: Mapping[str, Any],
    stream_slice: Mapping[str, Any] = None,
    next_page_token: Mapping[str, Any] = None,
) -> Iterable[Mapping]:
    """Collapse an issue's timeline into a single record keyed by event type.

    NOTE(review): if an issue has multiple events of the same type, later events
    overwrite earlier ones in the record — presumably intentional (one record per
    issue); confirm against the stream's JSON schema.
    """
    events_list = response.json()
    record = {"repository": stream_slice["repository"], "issue_number": stream_slice["number"]}
    for event in events_list:
        record[event["event"]] = event
    yield record