airbyte-source-github 1.7.0__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-source-github
3
- Version: 1.7.0
3
+ Version: 1.7.1
4
4
  Summary: Source implementation for GitHub.
5
5
  Home-page: https://airbyte.com
6
6
  License: MIT
@@ -52,11 +52,11 @@ source_github/schemas/users.json,sha256=xASJmm56AqLYxSCfn5qlPy0xUVJOW8K3gWlwRr4J
52
52
  source_github/schemas/workflow_jobs.json,sha256=ORowQYqvJhJE2EEV1jXyQSPCFmtO6NyhJZGTgpXte1Q,2089
53
53
  source_github/schemas/workflow_runs.json,sha256=JWK1p1HQI2dDnutF4rd7gPG7Nx1_RJL2VXIka4KQwMQ,10171
54
54
  source_github/schemas/workflows.json,sha256=zvtOslS-veNo5_iXmMxMNlY8OOt8DdvTZ3hjtdJbdvY,753
55
- source_github/source.py,sha256=MSAwf6DCNMhu6Pu6ADPR7EfOIYYDnYf8bnx1LHG6LY4,15612
56
- source_github/spec.json,sha256=_L8sFYPPMeUTRqXOza0IISyR3c2E5u4aHThKMQA2r4s,7096
57
- source_github/streams.py,sha256=M-N-JOmz5AixLvu2aAmrkG5yhg20ShdbvPl-sngu3kM,75191
55
+ source_github/source.py,sha256=jOGHJLL6ys4NRAjGs-Lw1RJxK25NxA5XS8n-uH2Gico,13879
56
+ source_github/spec.json,sha256=tsRjwqInYQjvqhm-Yzdn7_VC5QyInCGAToFJnUrCnOU,7074
57
+ source_github/streams.py,sha256=vcP2P0vCvShaa3z8M9dnmbY6awKR8vAIw3XgR8PlZtk,77006
58
58
  source_github/utils.py,sha256=DfAHFjsF8hzDXeSCR6qtfs7W_av6o2BkkEVhtHpWbis,5462
59
- airbyte_source_github-1.7.0.dist-info/METADATA,sha256=fVACNnJU7rCR7m8QVdx5BKYvMl4H46_i-2XpUiiQo_o,5228
60
- airbyte_source_github-1.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
61
- airbyte_source_github-1.7.0.dist-info/entry_points.txt,sha256=gYhqVrTAZvMwuYByg0b_-o115yUFLLcfNxMrLZmiW9k,55
62
- airbyte_source_github-1.7.0.dist-info/RECORD,,
59
+ airbyte_source_github-1.7.1.dist-info/METADATA,sha256=fr12eeFOb6XmyXnxebMc5CnwuBKsQZieK9wmbGt_aEo,5228
60
+ airbyte_source_github-1.7.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
61
+ airbyte_source_github-1.7.1.dist-info/entry_points.txt,sha256=gYhqVrTAZvMwuYByg0b_-o115yUFLLcfNxMrLZmiW9k,55
62
+ airbyte_source_github-1.7.1.dist-info/RECORD,,
source_github/source.py CHANGED
@@ -3,7 +3,7 @@
3
3
  #
4
4
 
5
5
  from os import getenv
6
- from typing import Any, Dict, List, Mapping, MutableMapping, Tuple
6
+ from typing import Any, List, Mapping, MutableMapping, Optional, Tuple
7
7
  from urllib.parse import urlparse
8
8
 
9
9
  from airbyte_cdk import AirbyteLogger
@@ -65,7 +65,9 @@ class SourceGithub(AbstractSource):
65
65
  continue_sync_on_stream_failure = True
66
66
 
67
67
  @staticmethod
68
- def _get_org_repositories(config: Mapping[str, Any], authenticator: MultipleTokenAuthenticator) -> Tuple[List[str], List[str]]:
68
+ def _get_org_repositories(
69
+ config: Mapping[str, Any], authenticator: MultipleTokenAuthenticator
70
+ ) -> Tuple[List[str], List[str], Optional[str]]:
69
71
  """
70
72
  Parse config/repositories and produce two lists (organizations, repositories) plus an optional regex pattern built from wildcard entries.
71
73
  Args:
@@ -78,16 +80,19 @@ class SourceGithub(AbstractSource):
78
80
  organizations = set()
79
81
  unchecked_repos = set()
80
82
  unchecked_orgs = set()
83
+ pattern = None
81
84
 
82
85
  for org_repos in config_repositories:
83
- org, _, repos = org_repos.partition("/")
84
- if repos == "*":
85
- unchecked_orgs.add(org)
86
+ _, _, repos = org_repos.partition("/")
87
+ if "*" in repos:
88
+ unchecked_orgs.add(org_repos)
86
89
  else:
87
90
  unchecked_repos.add(org_repos)
88
91
 
89
92
  if unchecked_orgs:
90
- stream = Repositories(authenticator=authenticator, organizations=unchecked_orgs, api_url=config.get("api_url"))
93
+ org_names = [org.split("/")[0] for org in unchecked_orgs]
94
+ pattern = "|".join([f"({org.replace('*', '.*')})" for org in unchecked_orgs])
95
+ stream = Repositories(authenticator=authenticator, organizations=org_names, api_url=config.get("api_url"), pattern=pattern)
91
96
  for record in read_full_refresh(stream):
92
97
  repositories.add(record["full_name"])
93
98
  organizations.add(record["organization"])
@@ -96,7 +101,7 @@ class SourceGithub(AbstractSource):
96
101
  if unchecked_repos:
97
102
  stream = RepositoryStats(
98
103
  authenticator=authenticator,
99
- repositories=unchecked_repos,
104
+ repositories=list(unchecked_repos),
100
105
  api_url=config.get("api_url"),
101
106
  # This parameter is deprecated; in the future a sane default (page_size: 10) will be used instead
102
107
  page_size_for_large_streams=config.get("page_size_for_large_streams", constants.DEFAULT_PAGE_SIZE_FOR_LARGE_STREAM),
@@ -107,7 +112,7 @@ class SourceGithub(AbstractSource):
107
112
  if organization:
108
113
  organizations.add(organization)
109
114
 
110
- return list(organizations), list(repositories)
115
+ return list(organizations), list(repositories), pattern
111
116
 
112
117
  @staticmethod
113
118
  def get_access_token(config: Mapping[str, Any]):
@@ -169,45 +174,6 @@ class SourceGithub(AbstractSource):
169
174
  def _is_http_allowed() -> bool:
170
175
  return getenv("DEPLOYMENT_MODE", "").upper() != "CLOUD"
171
176
 
172
- @staticmethod
173
- def _get_branches_data(
174
- selected_branches: List, full_refresh_args: Dict[str, Any] = None
175
- ) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
176
- selected_branches = set(selected_branches)
177
-
178
- # Get the default branch for each repository
179
- default_branches = {}
180
- repository_stats_stream = RepositoryStats(**full_refresh_args)
181
- for stream_slice in repository_stats_stream.stream_slices(sync_mode=SyncMode.full_refresh):
182
- default_branches.update(
183
- {
184
- repo_stats["full_name"]: repo_stats["default_branch"]
185
- for repo_stats in repository_stats_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice)
186
- }
187
- )
188
-
189
- all_branches = []
190
- branches_stream = Branches(**full_refresh_args)
191
- for stream_slice in branches_stream.stream_slices(sync_mode=SyncMode.full_refresh):
192
- for branch in branches_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice):
193
- all_branches.append(f"{branch['repository']}/{branch['name']}")
194
-
195
- # Create mapping of repository to list of branches to pull commits for
196
- # If no branches are specified for a repo, use its default branch
197
- branches_to_pull: Dict[str, List[str]] = {}
198
- for repo in full_refresh_args["repositories"]:
199
- repo_branches = []
200
- for branch in selected_branches:
201
- branch_parts = branch.split("/", 2)
202
- if "/".join(branch_parts[:2]) == repo and branch in all_branches:
203
- repo_branches.append(branch_parts[-1])
204
- if not repo_branches:
205
- repo_branches = [default_branches[repo]]
206
-
207
- branches_to_pull[repo] = repo_branches
208
-
209
- return default_branches, branches_to_pull
210
-
211
177
  def user_friendly_error_message(self, message: str) -> str:
212
178
  user_message = ""
213
179
  if "404 Client Error: Not Found for url: https://api.github.com/repos/" in message:
@@ -229,7 +195,7 @@ class SourceGithub(AbstractSource):
229
195
  config = self._validate_and_transform_config(config)
230
196
  try:
231
197
  authenticator = self._get_authenticator(config)
232
- _, repositories = self._get_org_repositories(config=config, authenticator=authenticator)
198
+ _, repositories, _ = self._get_org_repositories(config=config, authenticator=authenticator)
233
199
  if not repositories:
234
200
  return (
235
201
  False,
@@ -246,7 +212,7 @@ class SourceGithub(AbstractSource):
246
212
  authenticator = self._get_authenticator(config)
247
213
  config = self._validate_and_transform_config(config)
248
214
  try:
249
- organizations, repositories = self._get_org_repositories(config=config, authenticator=authenticator)
215
+ organizations, repositories, pattern = self._get_org_repositories(config=config, authenticator=authenticator)
250
216
  except Exception as e:
251
217
  message = repr(e)
252
218
  user_message = self.user_friendly_error_message(message)
@@ -291,7 +257,6 @@ class SourceGithub(AbstractSource):
291
257
  }
292
258
  repository_args_with_start_date = {**repository_args, "start_date": start_date}
293
259
 
294
- default_branches, branches_to_pull = self._get_branches_data(config.get("branch", []), repository_args)
295
260
  pull_requests_stream = PullRequests(**repository_args_with_start_date)
296
261
  projects_stream = Projects(**repository_args_with_start_date)
297
262
  project_columns_stream = ProjectColumns(projects_stream, **repository_args_with_start_date)
@@ -307,7 +272,7 @@ class SourceGithub(AbstractSource):
307
272
  Comments(**repository_args_with_start_date),
308
273
  CommitCommentReactions(**repository_args_with_start_date),
309
274
  CommitComments(**repository_args_with_start_date),
310
- Commits(**repository_args_with_start_date, branches_to_pull=branches_to_pull, default_branches=default_branches),
275
+ Commits(**repository_args_with_start_date, branches_to_pull=config.get("branches", [])),
311
276
  ContributorActivity(**repository_args),
312
277
  Deployments(**repository_args_with_start_date),
313
278
  Events(**repository_args_with_start_date),
@@ -327,7 +292,7 @@ class SourceGithub(AbstractSource):
327
292
  ProjectsV2(**repository_args_with_start_date),
328
293
  pull_requests_stream,
329
294
  Releases(**repository_args_with_start_date),
330
- Repositories(**organization_args_with_start_date),
295
+ Repositories(**organization_args_with_start_date, pattern=pattern),
331
296
  ReviewComments(**repository_args_with_start_date),
332
297
  Reviews(**repository_args_with_start_date),
333
298
  Stargazers(**repository_args_with_start_date),
source_github/spec.json CHANGED
@@ -81,18 +81,19 @@
81
81
  "type": "array",
82
82
  "items": {
83
83
  "type": "string",
84
- "pattern": "^([\\w.-]+/(\\*|[\\w.-]+(?<!\\.git))\\s+)*[\\w.-]+/(\\*|[\\w.-]+(?<!\\.git))$"
84
+ "pattern": "^[\\w.-]+/(([\\w.-]*\\*)|[\\w.-]+(?<!\\.git))$"
85
85
  },
86
86
  "minItems": 1,
87
87
  "examples": [
88
- "airbytehq/airbyte airbytehq/another-repo",
88
+ "airbytehq/airbyte",
89
+ "airbytehq/another-repo",
89
90
  "airbytehq/*",
90
- "airbytehq/airbyte"
91
+ "airbytehq/a*"
91
92
  ],
92
93
  "title": "GitHub Repositories",
93
- "description": "List of GitHub organizations/repositories, e.g. `airbytehq/airbyte` for single repository, `airbytehq/*` for get all repositories from organization and `airbytehq/airbyte airbytehq/another-repo` for multiple repositories.",
94
+ "description": "List of GitHub organizations/repositories, e.g. `airbytehq/airbyte` for a single repository, `airbytehq/*` to get all repositories from an organization and `airbytehq/a*` for matching multiple repositories by pattern.",
94
95
  "order": 1,
95
- "pattern_descriptor": "org/repo org/another-repo org/*"
96
+ "pattern_descriptor": "org/repo org/another-repo org/* org/a*"
96
97
  },
97
98
  "start_date": {
98
99
  "type": "string",
@@ -126,7 +127,7 @@
126
127
  "type": "string"
127
128
  },
128
129
  "title": "Branches",
129
- "examples": ["airbytehq/airbyte/master airbytehq/airbyte/my-branch"],
130
+ "examples": ["airbytehq/airbyte/master", "airbytehq/airbyte/my-branch"],
130
131
  "description": "List of GitHub repository branches to pull commits for, e.g. `airbytehq/airbyte/master`. If no branches are specified for a repository, the default branch will be pulled.",
131
132
  "order": 4,
132
133
  "pattern_descriptor": "org/repo/branch1 org/repo/branch2"
source_github/streams.py CHANGED
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ import re
5
6
  import time
6
7
  from abc import ABC, abstractmethod
7
8
  from typing import Any, Iterable, List, Mapping, MutableMapping, Optional
@@ -441,12 +442,18 @@ class Repositories(SemiIncrementalMixin, Organizations):
441
442
  "direction": "desc",
442
443
  }
443
444
 
445
+ def __init__(self, *args, pattern: Optional[str] = None, **kwargs):
446
+ self._pattern = re.compile(pattern) if pattern else pattern
447
+ super().__init__(*args, **kwargs)
448
+
444
449
  def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
445
450
  return f"orgs/{stream_slice['organization']}/repos"
446
451
 
447
452
  def parse_response(self, response: requests.Response, stream_slice: Mapping[str, Any] = None, **kwargs) -> Iterable[Mapping]:
448
453
  for record in response.json(): # GitHub puts records in an array.
449
- yield self.transform(record=record, stream_slice=stream_slice)
454
+ record = self.transform(record=record, stream_slice=stream_slice)
455
+ if not self._pattern or self._pattern.match(record["full_name"]):
456
+ yield record
450
457
 
451
458
 
452
459
  class Tags(GithubStream):
@@ -676,10 +683,13 @@ class Commits(IncrementalMixin, GithubStream):
676
683
  cursor_field = "created_at"
677
684
  slice_keys = ["repository", "branch"]
678
685
 
679
- def __init__(self, branches_to_pull: Mapping[str, List[str]], default_branches: Mapping[str, str], **kwargs):
686
+ def __init__(self, branches_to_pull: List[str], **kwargs):
680
687
  super().__init__(**kwargs)
681
- self.branches_to_pull = branches_to_pull
682
- self.default_branches = default_branches
688
+ kwargs.pop("start_date")
689
+ self.branches_to_repos = {}
690
+ self.branches_to_pull = set(branches_to_pull)
691
+ self.branches_stream = Branches(**kwargs)
692
+ self.repositories_stream = RepositoryStats(**kwargs)
683
693
 
684
694
  def request_params(self, stream_state: Mapping[str, Any], stream_slice: Mapping[str, Any] = None, **kwargs) -> MutableMapping[str, Any]:
685
695
  params = super(IncrementalMixin, self).request_params(stream_state=stream_state, stream_slice=stream_slice, **kwargs)
@@ -690,9 +700,10 @@ class Commits(IncrementalMixin, GithubStream):
690
700
  return params
691
701
 
692
702
  def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
703
+ self._validate_branches_to_pull()
693
704
  for stream_slice in super().stream_slices(**kwargs):
694
705
  repository = stream_slice["repository"]
695
- for branch in self.branches_to_pull.get(repository, []):
706
+ for branch in self.branches_to_repos.get(repository, []):
696
707
  yield {"branch": branch, "repository": repository}
697
708
 
698
709
  def transform(self, record: MutableMapping[str, Any], stream_slice: Mapping[str, Any]) -> MutableMapping[str, Any]:
@@ -718,6 +729,30 @@ class Commits(IncrementalMixin, GithubStream):
718
729
  current_stream_state.setdefault(repository, {}).setdefault(branch, {})[self.cursor_field] = updated_state
719
730
  return current_stream_state
720
731
 
732
+ def _validate_branches_to_pull(self):
733
+ # Get the default branch for each repository
734
+ default_branches = {}
735
+ for stream_slice in self.repositories_stream.stream_slices(sync_mode=SyncMode.full_refresh):
736
+ for repo_stats in self.repositories_stream.read_records(stream_slice=stream_slice, sync_mode=SyncMode.full_refresh):
737
+ default_branches[repo_stats["full_name"]] = repo_stats["default_branch"]
738
+
739
+ all_branches = []
740
+ for stream_slice in self.branches_stream.stream_slices(sync_mode=SyncMode.full_refresh):
741
+ for branch in self.branches_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice):
742
+ all_branches.append(f"{branch['repository']}/{branch['name']}")
743
+
744
+ # Create mapping of repository to list of branches to pull commits for
745
+ # If no branches are specified for a repo, use its default branch
746
+ for repo in self.repositories:
747
+ repo_branches = []
748
+ for branch in self.branches_to_pull:
749
+ branch_parts = branch.split("/", 2)
750
+ if "/".join(branch_parts[:2]) == repo and branch in all_branches:
751
+ repo_branches.append(branch_parts[-1])
752
+ if not repo_branches:
753
+ repo_branches = [default_branches[repo]]
754
+ self.branches_to_repos[repo] = repo_branches
755
+
721
756
 
722
757
  class Issues(IncrementalMixin, GithubStream):
723
758
  """