cartography 0.95.0__py3-none-any.whl → 0.96.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cartography might be problematic. Click here for more details.

Files changed (42) hide show
  1. cartography/cli.py +15 -0
  2. cartography/client/core/tx.py +1 -1
  3. cartography/config.py +6 -2
  4. cartography/data/indexes.cypher +1 -2
  5. cartography/data/jobs/cleanup/aws_import_identity_center_cleanup.json +16 -0
  6. cartography/data/jobs/cleanup/{github_users_cleanup.json → github_org_and_users_cleanup.json} +5 -0
  7. cartography/data/jobs/cleanup/github_repos_cleanup.json +25 -0
  8. cartography/graph/querybuilder.py +4 -0
  9. cartography/intel/aws/apigateway.py +3 -3
  10. cartography/intel/aws/ec2/auto_scaling_groups.py +147 -185
  11. cartography/intel/aws/ec2/instances.py +2 -0
  12. cartography/intel/aws/ec2/network_acls.py +209 -0
  13. cartography/intel/aws/ec2/subnets.py +2 -0
  14. cartography/intel/aws/iam.py +4 -3
  15. cartography/intel/aws/identitycenter.py +307 -0
  16. cartography/intel/aws/resources.py +4 -0
  17. cartography/intel/cve/__init__.py +1 -1
  18. cartography/intel/cve/feed.py +10 -7
  19. cartography/intel/github/repos.py +176 -27
  20. cartography/intel/github/users.py +156 -39
  21. cartography/intel/okta/users.py +2 -1
  22. cartography/intel/semgrep/__init__.py +1 -1
  23. cartography/intel/semgrep/dependencies.py +54 -22
  24. cartography/models/aws/ec2/auto_scaling_groups.py +204 -0
  25. cartography/models/aws/ec2/launch_configurations.py +55 -0
  26. cartography/models/aws/ec2/network_acl_rules.py +98 -0
  27. cartography/models/aws/ec2/network_acls.py +86 -0
  28. cartography/models/aws/identitycenter/__init__.py +0 -0
  29. cartography/models/aws/identitycenter/awsidentitycenter.py +44 -0
  30. cartography/models/aws/identitycenter/awspermissionset.py +84 -0
  31. cartography/models/aws/identitycenter/awsssouser.py +68 -0
  32. cartography/models/core/common.py +18 -1
  33. cartography/models/github/orgs.py +26 -0
  34. cartography/models/github/users.py +119 -0
  35. cartography/models/semgrep/dependencies.py +13 -0
  36. cartography-0.96.0.dist-info/METADATA +53 -0
  37. {cartography-0.95.0.dist-info → cartography-0.96.0.dist-info}/RECORD +41 -28
  38. {cartography-0.95.0.dist-info → cartography-0.96.0.dist-info}/WHEEL +1 -1
  39. cartography-0.95.0.dist-info/METADATA +0 -53
  40. {cartography-0.95.0.dist-info → cartography-0.96.0.dist-info}/LICENSE +0 -0
  41. {cartography-0.95.0.dist-info → cartography-0.96.0.dist-info}/entry_points.txt +0 -0
  42. {cartography-0.95.0.dist-info → cartography-0.96.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import configparser
2
2
  import logging
3
+ from collections import namedtuple
3
4
  from string import Template
4
5
  from typing import Any
5
6
  from typing import Dict
@@ -12,11 +13,26 @@ from packaging.requirements import Requirement
12
13
  from packaging.utils import canonicalize_name
13
14
 
14
15
  from cartography.intel.github.util import fetch_all
16
+ from cartography.intel.github.util import PaginatedGraphqlData
15
17
  from cartography.util import run_cleanup_job
16
18
  from cartography.util import timeit
17
19
 
18
20
  logger = logging.getLogger(__name__)
19
21
 
22
+
23
+ # Representation of a user's permission level and affiliation to a GitHub repo. See:
24
+ # - Permission: https://docs.github.com/en/graphql/reference/enums#repositorypermission
25
+ # - Affiliation: https://docs.github.com/en/graphql/reference/enums#collaboratoraffiliation
26
+ UserAffiliationAndRepoPermission = namedtuple(
27
+ 'UserAffiliationAndRepoPermission',
28
+ [
29
+ 'user', # Dict
30
+ 'permission', # 'WRITE', 'MAINTAIN', 'ADMIN', etc
31
+ 'affiliation', # 'OUTSIDE', 'DIRECT'
32
+ ],
33
+ )
34
+
35
+
20
36
  GITHUB_ORG_REPOS_PAGINATED_GRAPHQL = """
21
37
  query($login: String!, $cursor: String) {
22
38
  organization(login: $login)
@@ -59,17 +75,11 @@ GITHUB_ORG_REPOS_PAGINATED_GRAPHQL = """
59
75
  login
60
76
  __typename
61
77
  }
62
- collaborators(affiliation: OUTSIDE, first: 50) {
63
- edges {
64
- permission
65
- }
66
- nodes {
67
- url
68
- login
69
- name
70
- email
71
- company
72
- }
78
+ directCollaborators: collaborators(first: 100, affiliation: DIRECT) {
79
+ totalCount
80
+ }
81
+ outsideCollaborators: collaborators(first: 100, affiliation: OUTSIDE) {
82
+ totalCount
73
83
  }
74
84
  requirements:object(expression: "HEAD:requirements.txt") {
75
85
  ... on Blob {
@@ -89,6 +99,111 @@ GITHUB_ORG_REPOS_PAGINATED_GRAPHQL = """
89
99
  # Note: In the above query, `HEAD` references the default branch.
90
100
  # See https://stackoverflow.com/questions/48935381/github-graphql-api-default-branch-in-repository
91
101
 
102
+ GITHUB_REPO_COLLABS_PAGINATED_GRAPHQL = """
103
+ query($login: String!, $repo: String!, $affiliation: CollaboratorAffiliation!, $cursor: String) {
104
+ organization(login: $login) {
105
+ url
106
+ login
107
+ repository(name: $repo){
108
+ name
109
+ collaborators(first: 50, affiliation: $affiliation, after: $cursor) {
110
+ edges {
111
+ permission
112
+ }
113
+ nodes {
114
+ url
115
+ login
116
+ name
117
+ email
118
+ company
119
+ }
120
+ pageInfo{
121
+ endCursor
122
+ hasNextPage
123
+ }
124
+ }
125
+ }
126
+ }
127
+ rateLimit {
128
+ limit
129
+ cost
130
+ remaining
131
+ resetAt
132
+ }
133
+ }
134
+ """
135
+
136
+
137
+ def _get_repo_collaborators_for_multiple_repos(
138
+ repo_raw_data: list[dict[str, Any]],
139
+ affiliation: str,
140
+ org: str,
141
+ api_url: str,
142
+ token: str,
143
+ ) -> dict[str, List[UserAffiliationAndRepoPermission]]:
144
+ """
145
+ For every repo in the given list, retrieve the collaborators.
146
+ :param repo_raw_data: A list of dicts representing repos. See tests.data.github.repos.GET_REPOS for data shape.
147
+ :param affiliation: The type of affiliation to retrieve collaborators for. Either 'DIRECT' or 'OUTSIDE'.
148
+ See https://docs.github.com/en/graphql/reference/enums#collaboratoraffiliation
149
+ :param org: The name of the target Github organization as string.
150
+ :param api_url: The Github v4 API endpoint as string.
151
+ :param token: The Github API token as string.
152
+ :return: A dictionary of repo URL to list of UserAffiliationAndRepoPermission
153
+ """
154
+ result: dict[str, List[UserAffiliationAndRepoPermission]] = {}
155
+ for repo in repo_raw_data:
156
+ repo_name = repo['name']
157
+ repo_url = repo['url']
158
+
159
+ if ((affiliation == 'OUTSIDE' and repo['outsideCollaborators']['totalCount'] == 0) or
160
+ (affiliation == 'DIRECT' and repo['directCollaborators']['totalCount'] == 0)):
161
+ # repo has no collabs of the affiliation type we're looking for, so don't waste time making an API call
162
+ result[repo_url] = []
163
+ continue
164
+
165
+ collab_users = []
166
+ collab_permission = []
167
+ collaborators = _get_repo_collaborators(token, api_url, org, repo_name, affiliation)
168
+ # nodes and edges are expected to always be present given that we only call for them if totalCount is > 0
169
+ for collab in collaborators.nodes:
170
+ collab_users.append(collab)
171
+ for perm in collaborators.edges:
172
+ collab_permission.append(perm['permission'])
173
+
174
+ result[repo_url] = [
175
+ UserAffiliationAndRepoPermission(user, permission, affiliation)
176
+ for user, permission in zip(collab_users, collab_permission)
177
+ ]
178
+ return result
179
+
180
+
181
+ def _get_repo_collaborators(
182
+ token: str, api_url: str, organization: str, repo: str, affiliation: str,
183
+ ) -> PaginatedGraphqlData:
184
+ """
185
+ Retrieve a list of collaborators for a given repository, as described in
186
+ https://docs.github.com/en/graphql/reference/objects#repositorycollaboratorconnection.
187
+ :param token: The Github API token as string.
188
+ :param api_url: The Github v4 API endpoint as string.
189
+ :param organization: The name of the target Github organization as string.
190
+ :param repo: The name of the target Github repository as string.
191
+ :param affiliation: The type of affiliation to retrieve collaborators for. Either 'DIRECT' or 'OUTSIDE'.
192
+ See https://docs.github.com/en/graphql/reference/enums#collaboratoraffiliation
193
+ :return: PaginatedGraphqlData holding the repo's collaborators (nodes and edges). See tests.data.github.repos.
194
+ """
195
+ collaborators, _ = fetch_all(
196
+ token,
197
+ api_url,
198
+ organization,
199
+ GITHUB_REPO_COLLABS_PAGINATED_GRAPHQL,
200
+ 'repository',
201
+ resource_inner_type='collaborators',
202
+ repo=repo,
203
+ affiliation=affiliation,
204
+ )
205
+ return collaborators
206
+
92
207
 
93
208
  @timeit
94
209
  def get(token: str, api_url: str, organization: str) -> List[Dict]:
@@ -111,11 +226,18 @@ def get(token: str, api_url: str, organization: str) -> List[Dict]:
111
226
  return repos.nodes
112
227
 
113
228
 
114
- def transform(repos_json: List[Dict]) -> Dict:
229
+ def transform(
230
+ repos_json: List[Dict], direct_collaborators: dict[str, List[UserAffiliationAndRepoPermission]],
231
+ outside_collaborators: dict[str, List[UserAffiliationAndRepoPermission]],
232
+ ) -> Dict:
115
233
  """
116
234
  Parses the JSON returned from GitHub API to create data for graph ingestion
117
- :param repos_json: the list of individual repository nodes from GitHub. See tests.data.github.repos.GET_REPOS for
118
- data shape.
235
+ :param repos_json: the list of individual repository nodes from GitHub.
236
+ See tests.data.github.repos.GET_REPOS for data shape.
237
+ :param direct_collaborators: dict of repo URL to list of direct collaborators.
238
+ See tests.data.github.repos.DIRECT_COLLABORATORS for data shape.
239
+ :param outside_collaborators: dict of repo URL to list of outside collaborators.
240
+ See tests.data.github.repos.OUTSIDE_COLLABORATORS for data shape.
119
241
  :return: Dict containing the repos, repo->language mapping, owners->repo mapping, outside collaborators->repo
120
242
  mapping, and Python requirements files (if any) in a repo.
121
243
  """
@@ -123,7 +245,10 @@ def transform(repos_json: List[Dict]) -> Dict:
123
245
  transformed_repo_languages: List[Dict] = []
124
246
  transformed_repo_owners: List[Dict] = []
125
247
  # See https://docs.github.com/en/graphql/reference/enums#repositorypermission
126
- transformed_collaborators: Dict[str, List[Any]] = {
248
+ transformed_outside_collaborators: Dict[str, List[Any]] = {
249
+ 'ADMIN': [], 'MAINTAIN': [], 'READ': [], 'TRIAGE': [], 'WRITE': [],
250
+ }
251
+ transformed_direct_collaborators: Dict[str, List[Any]] = {
127
252
  'ADMIN': [], 'MAINTAIN': [], 'READ': [], 'TRIAGE': [], 'WRITE': [],
128
253
  }
129
254
  transformed_requirements_files: List[Dict] = []
@@ -131,14 +256,22 @@ def transform(repos_json: List[Dict]) -> Dict:
131
256
  _transform_repo_languages(repo_object['url'], repo_object, transformed_repo_languages)
132
257
  _transform_repo_objects(repo_object, transformed_repo_list)
133
258
  _transform_repo_owners(repo_object['owner']['url'], repo_object, transformed_repo_owners)
134
- _transform_collaborators(repo_object['collaborators'], repo_object['url'], transformed_collaborators)
259
+ _transform_collaborators(
260
+ repo_object['url'], outside_collaborators[repo_object['url']],
261
+ transformed_outside_collaborators,
262
+ )
263
+ _transform_collaborators(
264
+ repo_object['url'], direct_collaborators[repo_object['url']],
265
+ transformed_direct_collaborators,
266
+ )
135
267
  _transform_requirements_txt(repo_object['requirements'], repo_object['url'], transformed_requirements_files)
136
268
  _transform_setup_cfg_requirements(repo_object['setupCfg'], repo_object['url'], transformed_requirements_files)
137
269
  results = {
138
270
  'repos': transformed_repo_list,
139
271
  'repo_languages': transformed_repo_languages,
140
272
  'repo_owners': transformed_repo_owners,
141
- 'repo_collaborators': transformed_collaborators,
273
+ 'repo_outside_collaborators': transformed_outside_collaborators,
274
+ 'repo_direct_collaborators': transformed_direct_collaborators,
142
275
  'python_requirements': transformed_requirements_files,
143
276
  }
144
277
  return results
@@ -229,11 +362,15 @@ def _transform_repo_languages(repo_url: str, repo: Dict, repo_languages: List[Di
229
362
  })
230
363
 
231
364
 
232
- def _transform_collaborators(collaborators: Dict, repo_url: str, transformed_collaborators: Dict) -> None:
365
+ def _transform_collaborators(
366
+ repo_url: str, collaborators: List[UserAffiliationAndRepoPermission], transformed_collaborators: Dict,
367
+ ) -> None:
233
368
  """
234
- Performs data adjustments for outside collaborators in a GitHub repo.
369
+ Performs data adjustments for collaborators in a GitHub repo.
235
370
  Output data shape = [{permission, repo_url, url (the user's URL), login, name}, ...]
236
- :param collaborators: See cartography.tests.data.github.repos for data shape.
371
+ :param collaborators: For data shape, see
372
+ cartography.tests.data.github.repos.DIRECT_COLLABORATORS
373
+ cartography.tests.data.github.repos.OUTSIDE_COLLABORATORS
237
374
  :param repo_url: The URL of the GitHub repo.
238
375
  :param transformed_collaborators: Output dict. Data shape =
239
376
  {'ADMIN': [{ user }, ...], 'MAINTAIN': [{ user }, ...], 'READ': [ ... ], 'TRIAGE': [ ... ], 'WRITE': [ ... ]}
@@ -241,10 +378,11 @@ def _transform_collaborators(collaborators: Dict, repo_url: str, transformed_col
241
378
  """
242
379
  # `collaborators` is sometimes None
243
380
  if collaborators:
244
- for idx, user in enumerate(collaborators['nodes']):
245
- user_permission = collaborators['edges'][idx]['permission']
381
+ for collaborator in collaborators:
382
+ user = collaborator.user
246
383
  user['repo_url'] = repo_url
247
- transformed_collaborators[user_permission].append(user)
384
+ user['affiliation'] = collaborator.affiliation
385
+ transformed_collaborators[collaborator.permission].append(user)
248
386
 
249
387
 
250
388
  def _transform_requirements_txt(
@@ -482,7 +620,7 @@ def load_github_owners(neo4j_session: neo4j.Session, update_tag: int, repo_owner
482
620
 
483
621
 
484
622
  @timeit
485
- def load_collaborators(neo4j_session: neo4j.Session, update_tag: int, collaborators: Dict) -> None:
623
+ def load_collaborators(neo4j_session: neo4j.Session, update_tag: int, collaborators: Dict, affiliation: str) -> None:
486
624
  query = Template("""
487
625
  UNWIND $UserData as user
488
626
 
@@ -502,7 +640,7 @@ def load_collaborators(neo4j_session: neo4j.Session, update_tag: int, collaborat
502
640
  SET o.lastupdated = $UpdateTag
503
641
  """)
504
642
  for collab_type in collaborators.keys():
505
- relationship_label = f"OUTSIDE_COLLAB_{collab_type}"
643
+ relationship_label = f"{affiliation}_COLLAB_{collab_type}"
506
644
  neo4j_session.run(
507
645
  query.safe_substitute(rel_label=relationship_label),
508
646
  UserData=collaborators[collab_type],
@@ -515,7 +653,12 @@ def load(neo4j_session: neo4j.Session, common_job_parameters: Dict, repo_data: D
515
653
  load_github_repos(neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['repos'])
516
654
  load_github_owners(neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['repo_owners'])
517
655
  load_github_languages(neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['repo_languages'])
518
- load_collaborators(neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['repo_collaborators'])
656
+ load_collaborators(
657
+ neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['repo_direct_collaborators'], 'DIRECT',
658
+ )
659
+ load_collaborators(
660
+ neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['repo_outside_collaborators'], 'OUTSIDE',
661
+ )
519
662
  load_python_requirements(neo4j_session, common_job_parameters['UPDATE_TAG'], repo_data['python_requirements'])
520
663
 
521
664
 
@@ -561,6 +704,12 @@ def sync(
561
704
  """
562
705
  logger.info("Syncing GitHub repos")
563
706
  repos_json = get(github_api_key, github_url, organization)
564
- repo_data = transform(repos_json)
707
+ direct_collabs = _get_repo_collaborators_for_multiple_repos(
708
+ repos_json, "DIRECT", organization, github_url, github_api_key,
709
+ )
710
+ outside_collabs = _get_repo_collaborators_for_multiple_repos(
711
+ repos_json, "OUTSIDE", organization, github_url, github_api_key,
712
+ )
713
+ repo_data = transform(repos_json, direct_collabs, outside_collabs)
565
714
  load(neo4j_session, common_job_parameters, repo_data)
566
715
  run_cleanup_job('github_repos_cleanup.json', neo4j_session, common_job_parameters)
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from copy import deepcopy
2
3
  from typing import Any
3
4
  from typing import Dict
4
5
  from typing import List
@@ -6,7 +7,11 @@ from typing import Tuple
6
7
 
7
8
  import neo4j
8
9
 
10
+ from cartography.client.core.tx import load
9
11
  from cartography.intel.github.util import fetch_all
12
+ from cartography.models.github.orgs import GitHubOrganizationSchema
13
+ from cartography.models.github.users import GitHubOrganizationUserSchema
14
+ from cartography.models.github.users import GitHubUnaffiliatedUserSchema
10
15
  from cartography.stats import get_stats_client
11
16
  from cartography.util import merge_module_sync_metadata
12
17
  from cartography.util import run_cleanup_job
@@ -44,17 +49,46 @@ GITHUB_ORG_USERS_PAGINATED_GRAPHQL = """
44
49
  }
45
50
  """
46
51
 
52
+ GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL = """
53
+ query($login: String!, $cursor: String) {
54
+ organization(login: $login)
55
+ {
56
+ url
57
+ login
58
+ enterpriseOwners(first:100, after: $cursor){
59
+ edges {
60
+ node {
61
+ url
62
+ login
63
+ name
64
+ isSiteAdmin
65
+ email
66
+ company
67
+ }
68
+ organizationRole
69
+ }
70
+ pageInfo{
71
+ endCursor
72
+ hasNextPage
73
+ }
74
+ }
75
+ }
76
+ }
77
+ """
78
+
47
79
 
48
80
  @timeit
49
- def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
81
+ def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
50
82
  """
51
83
  Retrieve a list of users from the given GitHub organization as described in
52
84
  https://docs.github.com/en/graphql/reference/objects#organizationmemberedge.
53
85
  :param token: The Github API token as string.
54
86
  :param api_url: The Github v4 API endpoint as string.
55
87
  :param organization: The name of the target Github organization as string.
56
- :return: A 2-tuple containing 1. a list of dicts representing users - see tests.data.github.users.GITHUB_USER_DATA
57
- for shape, and 2. data on the owning GitHub organization - see tests.data.github.users.GITHUB_ORG_DATA for shape.
88
+ :return: A 2-tuple containing
89
+ 1. a list of dicts representing users and
90
+ 2. data on the owning GitHub organization
91
+ see tests.data.github.users.GITHUB_USER_DATA for shape of both
58
92
  """
59
93
  users, org = fetch_all(
60
94
  token,
@@ -66,56 +100,139 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
66
100
  return users.edges, org
67
101
 
68
102
 
103
+ def get_enterprise_owners(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
104
+ """
105
+ Retrieve a list of enterprise owners from the given GitHub organization as described in
106
+ https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge.
107
+ :param token: The Github API token as string.
108
+ :param api_url: The Github v4 API endpoint as string.
109
+ :param organization: The name of the target Github organization as string.
110
+ :return: A 2-tuple containing
111
+ 1. a list of dicts representing users who are enterprise owners
112
+ 2. data on the owning GitHub organization
113
+ see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
114
+ """
115
+ owners, org = fetch_all(
116
+ token,
117
+ api_url,
118
+ organization,
119
+ GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL,
120
+ 'enterpriseOwners',
121
+ )
122
+ return owners.edges, org
123
+
124
+
69
125
  @timeit
70
- def load_organization_users(
71
- neo4j_session: neo4j.Session, user_data: List[Dict], org_data: Dict,
126
+ def transform_users(user_data: List[Dict], owners_data: List[Dict], org_data: Dict) -> Tuple[List[Dict], List[Dict]]:
127
+ """
128
+ Taking raw user and owner data, return two lists of processed user data:
129
+ * organization users aka affiliated users (users directly affiliated with an organization)
130
+ * unaffiliated users (users who, for example, are enterprise owners but not members of the target organization).
131
+
132
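+ :param user_data: A list of dicts representing organization users (each with 'node', 'role', 'hasTwoFactorEnabled').
133
+ :param owners_data: A list of dicts representing enterprise owners (each with 'node' and 'organizationRole').
134
+ :param org_data: A dict of data on the owning GitHub organization; its 'url' is attached to each processed user.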
135
+ :return: A 2-tuple containing
136
+ 1. a list of dicts representing users who are affiliated with the target org
137
+ see tests.data.github.users.GITHUB_USER_DATA for shape
138
+ 2. a list of dicts representing users who are not affiliated (e.g. enterprise owners who are not also in
139
+ the target org) — see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
140
+ 3. data on the owning GitHub organization
141
+ """
142
+
143
+ users_dict = {}
144
+ for user in user_data:
145
+ processed_user = deepcopy(user['node'])
146
+ processed_user['role'] = user['role']
147
+ processed_user['hasTwoFactorEnabled'] = user['hasTwoFactorEnabled']
148
+ processed_user['MEMBER_OF'] = org_data['url']
149
+ users_dict[processed_user['url']] = processed_user
150
+
151
+ owners_dict = {}
152
+ for owner in owners_data:
153
+ processed_owner = deepcopy(owner['node'])
154
+ processed_owner['isEnterpriseOwner'] = True
155
+ if owner['organizationRole'] == 'UNAFFILIATED':
156
+ processed_owner['UNAFFILIATED'] = org_data['url']
157
+ else:
158
+ processed_owner['MEMBER_OF'] = org_data['url']
159
+ owners_dict[processed_owner['url']] = processed_owner
160
+
161
+ affiliated_users = [] # users affiliated with the target org
162
+ for url, user in users_dict.items():
163
+ user['isEnterpriseOwner'] = url in owners_dict
164
+ affiliated_users.append(user)
165
+
166
+ unaffiliated_users = [] # users not affiliated with the target org
167
+ for url, owner in owners_dict.items():
168
+ if url not in users_dict:
169
+ unaffiliated_users.append(owner)
170
+
171
+ return affiliated_users, unaffiliated_users
172
+
173
+
174
+ @timeit
175
+ def load_users(
176
+ neo4j_session: neo4j.Session,
177
+ node_schema: GitHubOrganizationUserSchema | GitHubUnaffiliatedUserSchema,
178
+ user_data: List[Dict],
179
+ org_data: Dict,
72
180
  update_tag: int,
73
181
  ) -> None:
74
- query = """
75
- MERGE (org:GitHubOrganization{id: $OrgUrl})
76
- ON CREATE SET org.firstseen = timestamp()
77
- SET org.username = $OrgLogin,
78
- org.lastupdated = $UpdateTag
79
- WITH org
80
-
81
- UNWIND $UserData as user
82
-
83
- MERGE (u:GitHubUser{id: user.node.url})
84
- ON CREATE SET u.firstseen = timestamp()
85
- SET u.fullname = user.node.name,
86
- u.username = user.node.login,
87
- u.has_2fa_enabled = user.hasTwoFactorEnabled,
88
- u.role = user.role,
89
- u.is_site_admin = user.node.isSiteAdmin,
90
- u.email = user.node.email,
91
- u.company = user.node.company,
92
- u.lastupdated = $UpdateTag
93
-
94
- MERGE (u)-[r:MEMBER_OF]->(org)
95
- ON CREATE SET r.firstseen = timestamp()
96
- SET r.lastupdated = $UpdateTag
97
- """
98
- neo4j_session.run(
99
- query,
100
- OrgUrl=org_data['url'],
101
- OrgLogin=org_data['login'],
102
- UserData=user_data,
103
- UpdateTag=update_tag,
182
+ logger.info(f"Loading {len(user_data)} GitHub users to the graph")
183
+ load(
184
+ neo4j_session,
185
+ node_schema,
186
+ user_data,
187
+ lastupdated=update_tag,
188
+ org_url=org_data['url'],
189
+ )
190
+
191
+
192
+ @timeit
193
+ def load_organization(
194
+ neo4j_session: neo4j.Session,
195
+ node_schema: GitHubOrganizationSchema,
196
+ org_data: List[Dict[str, Any]],
197
+ update_tag: int,
198
+ ) -> None:
199
+ logger.info(f"Loading {len(org_data)} GitHub organization to the graph")
200
+ load(
201
+ neo4j_session,
202
+ node_schema,
203
+ org_data,
204
+ lastupdated=update_tag,
104
205
  )
105
206
 
106
207
 
107
208
  @timeit
108
209
  def sync(
109
210
  neo4j_session: neo4j.Session,
110
- common_job_parameters: Dict[str, Any],
211
+ common_job_parameters: Dict,
111
212
  github_api_key: str,
112
213
  github_url: str,
113
214
  organization: str,
114
215
  ) -> None:
115
216
  logger.info("Syncing GitHub users")
116
- user_data, org_data = get(github_api_key, github_url, organization)
117
- load_organization_users(neo4j_session, user_data, org_data, common_job_parameters['UPDATE_TAG'])
118
- run_cleanup_job('github_users_cleanup.json', neo4j_session, common_job_parameters)
217
+ user_data, org_data = get_users(github_api_key, github_url, organization)
218
+ owners_data, org_data = get_enterprise_owners(github_api_key, github_url, organization)
219
+ processed_affiliated_user_data, processed_unaffiliated_user_data = (
220
+ transform_users(user_data, owners_data, org_data)
221
+ )
222
+ load_organization(
223
+ neo4j_session, GitHubOrganizationSchema(), [org_data],
224
+ common_job_parameters['UPDATE_TAG'],
225
+ )
226
+ load_users(
227
+ neo4j_session, GitHubOrganizationUserSchema(), processed_affiliated_user_data, org_data,
228
+ common_job_parameters['UPDATE_TAG'],
229
+ )
230
+ load_users(
231
+ neo4j_session, GitHubUnaffiliatedUserSchema(), processed_unaffiliated_user_data, org_data,
232
+ common_job_parameters['UPDATE_TAG'],
233
+ )
234
+ # no automated cleanup job for users because user node has no sub_resource_relationship
235
+ run_cleanup_job('github_org_and_users_cleanup.json', neo4j_session, common_job_parameters)
119
236
  merge_module_sync_metadata(
120
237
  neo4j_session,
121
238
  group_type='GitHubOrganization',
@@ -150,7 +150,8 @@ def _load_okta_users(
150
150
  new_user.okta_last_updated = user_data.okta_last_updated,
151
151
  new_user.password_changed = user_data.password_changed,
152
152
  new_user.transition_to_status = user_data.transition_to_status,
153
- new_user.lastupdated = $okta_update_tag
153
+ new_user.lastupdated = $okta_update_tag,
154
+ new_user :UserAccount
154
155
  WITH new_user, org
155
156
  MERGE (org)-[org_r:RESOURCE]->(new_user)
156
157
  ON CREATE SET org_r.firstseen = timestamp()
@@ -26,5 +26,5 @@ def start_semgrep_ingestion(
26
26
  # sync_deployment must be called first since it populates common_job_parameters
27
27
  # with the deployment ID and slug, which are required by the other sync functions
28
28
  sync_deployment(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)
29
- sync_dependencies(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)
29
+ sync_dependencies(neo4j_session, config.semgrep_app_token, config.semgrep_dependency_ecosystems, config.update_tag, common_job_parameters) # noqa: E501
30
30
  sync_findings(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)