cartography 0.95.0rc1__py3-none-any.whl → 0.96.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartography/cli.py +15 -0
- cartography/config.py +4 -0
- cartography/data/indexes.cypher +1 -2
- cartography/data/jobs/cleanup/aws_import_identity_center_cleanup.json +16 -0
- cartography/data/jobs/cleanup/{github_users_cleanup.json → github_org_and_users_cleanup.json} +5 -0
- cartography/graph/querybuilder.py +4 -0
- cartography/intel/aws/ec2/network_acls.py +208 -0
- cartography/intel/aws/identitycenter.py +307 -0
- cartography/intel/aws/resources.py +4 -0
- cartography/intel/github/users.py +156 -39
- cartography/intel/okta/users.py +2 -1
- cartography/intel/semgrep/__init__.py +9 -2
- cartography/intel/semgrep/dependencies.py +233 -0
- cartography/intel/semgrep/deployment.py +67 -0
- cartography/intel/semgrep/findings.py +22 -53
- cartography/models/aws/ec2/network_acl_rules.py +97 -0
- cartography/models/aws/ec2/network_acls.py +86 -0
- cartography/models/core/common.py +18 -1
- cartography/models/github/orgs.py +26 -0
- cartography/models/github/users.py +119 -0
- cartography/models/semgrep/dependencies.py +90 -0
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/METADATA +1 -1
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/RECORD +27 -17
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/WHEEL +1 -1
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/LICENSE +0 -0
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/entry_points.txt +0 -0
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/top_level.txt +0 -0
cartography/intel/github/users.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from typing import Any
 from typing import Dict
 from typing import List
@@ -6,7 +7,11 @@ from typing import Tuple
 
 import neo4j
 
+from cartography.client.core.tx import load
 from cartography.intel.github.util import fetch_all
+from cartography.models.github.orgs import GitHubOrganizationSchema
+from cartography.models.github.users import GitHubOrganizationUserSchema
+from cartography.models.github.users import GitHubUnaffiliatedUserSchema
 from cartography.stats import get_stats_client
 from cartography.util import merge_module_sync_metadata
 from cartography.util import run_cleanup_job
@@ -44,17 +49,46 @@ GITHUB_ORG_USERS_PAGINATED_GRAPHQL = """
     }
 """
 
+GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL = """
+    query($login: String!, $cursor: String) {
+    organization(login: $login)
+        {
+            url
+            login
+            enterpriseOwners(first:100, after: $cursor){
+                edges {
+                    node {
+                        url
+                        login
+                        name
+                        isSiteAdmin
+                        email
+                        company
+                    }
+                    organizationRole
+                }
+                pageInfo{
+                    endCursor
+                    hasNextPage
+                }
+            }
+        }
+    }
+"""
+
 
 @timeit
-def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
+def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
     """
     Retrieve a list of users from the given GitHub organization as described in
     https://docs.github.com/en/graphql/reference/objects#organizationmemberedge.
     :param token: The Github API token as string.
     :param api_url: The Github v4 API endpoint as string.
     :param organization: The name of the target Github organization as string.
-    :return: A 2-tuple containing
-
+    :return: A 2-tuple containing
+        1. a list of dicts representing users and
+        2. data on the owning GitHub organization
+        see tests.data.github.users.GITHUB_USER_DATA for shape of both
     """
     users, org = fetch_all(
         token,
@@ -66,56 +100,139 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
     return users.edges, org
 
 
+def get_enterprise_owners(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
+    """
+    Retrieve a list of enterprise owners from the given GitHub organization as described in
+    https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge.
+    :param token: The Github API token as string.
+    :param api_url: The Github v4 API endpoint as string.
+    :param organization: The name of the target Github organization as string.
+    :return: A 2-tuple containing
+        1. a list of dicts representing users who are enterprise owners
+        3. data on the owning GitHub organization
+        see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
+    """
+    owners, org = fetch_all(
+        token,
+        api_url,
+        organization,
+        GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL,
+        'enterpriseOwners',
+    )
+    return owners.edges, org
+
+
 @timeit
-def
-
+def transform_users(user_data: List[Dict], owners_data: List[Dict], org_data: Dict) -> Tuple[List[Dict], List[Dict]]:
+    """
+    Taking raw user and owner data, return two lists of processed user data:
+     * organization users aka affiliated users (users directly affiliated with an organization)
+     * unaffiliated users (user who, for example, are enterprise owners but not members of the target organization).
+
+    :param token: The Github API token as string.
+    :param api_url: The Github v4 API endpoint as string.
+    :param organization: The name of the target Github organization as string.
+    :return: A 2-tuple containing
+        1. a list of dicts representing users who are affiliated with the target org
+           see tests.data.github.users.GITHUB_USER_DATA for shape
+        2. a list of dicts representing users who are not affiliated (e.g. enterprise owners who are not also in
+           the target org) — see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
+        3. data on the owning GitHub organization
+    """
+
+    users_dict = {}
+    for user in user_data:
+        processed_user = deepcopy(user['node'])
+        processed_user['role'] = user['role']
+        processed_user['hasTwoFactorEnabled'] = user['hasTwoFactorEnabled']
+        processed_user['MEMBER_OF'] = org_data['url']
+        users_dict[processed_user['url']] = processed_user
+
+    owners_dict = {}
+    for owner in owners_data:
+        processed_owner = deepcopy(owner['node'])
+        processed_owner['isEnterpriseOwner'] = True
+        if owner['organizationRole'] == 'UNAFFILIATED':
+            processed_owner['UNAFFILIATED'] = org_data['url']
+        else:
+            processed_owner['MEMBER_OF'] = org_data['url']
+        owners_dict[processed_owner['url']] = processed_owner
+
+    affiliated_users = []  # users affiliated with the target org
+    for url, user in users_dict.items():
+        user['isEnterpriseOwner'] = url in owners_dict
+        affiliated_users.append(user)
+
+    unaffiliated_users = []  # users not affiliated with the target org
+    for url, owner in owners_dict.items():
+        if url not in users_dict:
+            unaffiliated_users.append(owner)
+
+    return affiliated_users, unaffiliated_users
+
+
+@timeit
+def load_users(
+    neo4j_session: neo4j.Session,
+    node_schema: GitHubOrganizationUserSchema | GitHubUnaffiliatedUserSchema,
+    user_data: List[Dict],
+    org_data: Dict,
     update_tag: int,
 ) -> None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    """
-    neo4j_session.run(
-        query,
-        OrgUrl=org_data['url'],
-        OrgLogin=org_data['login'],
-        UserData=user_data,
-        UpdateTag=update_tag,
+    logger.info(f"Loading {len(user_data)} GitHub users to the graph")
+    load(
+        neo4j_session,
+        node_schema,
+        user_data,
+        lastupdated=update_tag,
+        org_url=org_data['url'],
+    )
+
+
+@timeit
+def load_organization(
+    neo4j_session: neo4j.Session,
+    node_schema: GitHubOrganizationSchema,
+    org_data: List[Dict[str, Any]],
+    update_tag: int,
+) -> None:
+    logger.info(f"Loading {len(org_data)} GitHub organization to the graph")
+    load(
+        neo4j_session,
+        node_schema,
+        org_data,
+        lastupdated=update_tag,
     )
 
 
 @timeit
 def sync(
     neo4j_session: neo4j.Session,
-    common_job_parameters: Dict
+    common_job_parameters: Dict,
     github_api_key: str,
     github_url: str,
     organization: str,
 ) -> None:
     logger.info("Syncing GitHub users")
-    user_data, org_data =
-
-
+    user_data, org_data = get_users(github_api_key, github_url, organization)
+    owners_data, org_data = get_enterprise_owners(github_api_key, github_url, organization)
+    processed_affiliated_user_data, processed_unaffiliated_user_data = (
+        transform_users(user_data, owners_data, org_data)
+    )
+    load_organization(
+        neo4j_session, GitHubOrganizationSchema(), [org_data],
+        common_job_parameters['UPDATE_TAG'],
+    )
+    load_users(
+        neo4j_session, GitHubOrganizationUserSchema(), processed_affiliated_user_data, org_data,
+        common_job_parameters['UPDATE_TAG'],
+    )
+    load_users(
+        neo4j_session, GitHubUnaffiliatedUserSchema(), processed_unaffiliated_user_data, org_data,
+        common_job_parameters['UPDATE_TAG'],
+    )
+    # no automated cleanup job for users because user node has no sub_resource_relationship
+    run_cleanup_job('github_org_and_users_cleanup.json', neo4j_session, common_job_parameters)
     merge_module_sync_metadata(
         neo4j_session,
         group_type='GitHubOrganization',
cartography/intel/okta/users.py
CHANGED
@@ -150,7 +150,8 @@ def _load_okta_users(
     new_user.okta_last_updated = user_data.okta_last_updated,
     new_user.password_changed = user_data.password_changed,
     new_user.transition_to_status = user_data.transition_to_status,
-    new_user.lastupdated = $okta_update_tag
+    new_user.lastupdated = $okta_update_tag,
+    new_user :UserAccount
     WITH new_user, org
     MERGE (org)-[org_r:RESOURCE]->(new_user)
     ON CREATE SET org_r.firstseen = timestamp()
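The added SET clause gives each OktaUser node an extra generic UserAccount label. A hedged sketch of checking the effect from Python with the neo4j driver; the connection URI and credentials below are placeholders.

import neo4j

# Placeholder URI and credentials; point these at the graph cartography writes to.
driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    # After an Okta sync with 0.96.0rc2, OktaUser nodes also match on :UserAccount.
    result = session.run("MATCH (u:UserAccount) RETURN labels(u) AS labels, count(*) AS n")
    for record in result:
        print(record["labels"], record["n"])
driver.close()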
cartography/intel/semgrep/__init__.py
CHANGED

@@ -3,7 +3,9 @@ import logging
 import neo4j
 
 from cartography.config import Config
-from cartography.intel.semgrep.
+from cartography.intel.semgrep.dependencies import sync_dependencies
+from cartography.intel.semgrep.deployment import sync_deployment
+from cartography.intel.semgrep.findings import sync_findings
 from cartography.util import timeit
 
 
@@ -20,4 +22,9 @@ def start_semgrep_ingestion(
     if not config.semgrep_app_token:
         logger.info('Semgrep import is not configured - skipping this module. See docs to configure.')
         return
-
+
+    # sync_deployment must be called first since it populates common_job_parameters
+    # with the deployment ID and slug, which are required by the other sync functions
+    sync_deployment(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)
+    sync_dependencies(neo4j_session, config.semgrep_app_token, config.semgrep_dependency_ecosystems, config.update_tag, common_job_parameters)  # noqa: E501
+    sync_findings(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)
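A minimal sketch of the new call order for anyone driving these syncs directly, assuming an existing Neo4j instance and a Semgrep app token; the connection details, token, and update tag below are placeholders.

import neo4j

from cartography.intel.semgrep.dependencies import sync_dependencies
from cartography.intel.semgrep.deployment import sync_deployment
from cartography.intel.semgrep.findings import sync_findings

driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
semgrep_app_token = "YOUR_SEMGREP_APP_TOKEN"  # placeholder
update_tag = 1700000000  # placeholder update tag

with driver.session() as neo4j_session:
    common_job_parameters = {"UPDATE_TAG": update_tag}
    # Must run first: stores DEPLOYMENT_ID and DEPLOYMENT_SLUG in common_job_parameters.
    sync_deployment(neo4j_session, semgrep_app_token, update_tag, common_job_parameters)
    # The remaining syncs rely on those keys (see the comment added to start_semgrep_ingestion).
    sync_dependencies(neo4j_session, semgrep_app_token, "gomod,npm", update_tag, common_job_parameters)
    sync_findings(neo4j_session, semgrep_app_token, update_tag, common_job_parameters)
driver.close()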
cartography/intel/semgrep/dependencies.py

@@ -0,0 +1,233 @@
+import logging
+from typing import Any
+from typing import Callable
+from typing import Dict
+from typing import List
+
+import neo4j
+import requests
+from requests.exceptions import HTTPError
+from requests.exceptions import ReadTimeout
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.semgrep.dependencies import SemgrepGoLibrarySchema
+from cartography.models.semgrep.dependencies import SemgrepNpmLibrarySchema
+from cartography.stats import get_stats_client
+from cartography.util import merge_module_sync_metadata
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+stat_handler = get_stats_client(__name__)
+_PAGE_SIZE = 10000
+_TIMEOUT = (60, 60)
+_MAX_RETRIES = 3
+
+# The keys in this dictionary must be in Semgrep's list of supported ecosystems, defined here:
+# https://semgrep.dev/api/v1/docs/#tag/SupplyChainService/operation/semgrep_app.products.sca.handlers.dependency.list_dependencies_conexxion
+ECOSYSTEM_TO_SCHEMA: Dict = {
+    'gomod': SemgrepGoLibrarySchema,
+    'npm': SemgrepNpmLibrarySchema,
+}
+
+
+def parse_and_validate_semgrep_ecosystems(ecosystems: str) -> List[str]:
+    validated_ecosystems: List[str] = []
+    for ecosystem in ecosystems.split(','):
+        ecosystem = ecosystem.strip().lower()
+
+        if ecosystem in ECOSYSTEM_TO_SCHEMA:
+            validated_ecosystems.append(ecosystem)
+        else:
+            valid_ecosystems: str = ','.join(ECOSYSTEM_TO_SCHEMA.keys())
+            raise ValueError(
+                f'Error parsing `semgrep-dependency-ecosystems`. You specified "{ecosystems}". '
+                f'Please check that your input is formatted as comma-separated values, e.g. "gomod,npm". '
+                f'Full list of supported ecosystems: {valid_ecosystems}.',
+            )
+    return validated_ecosystems
+
+
+@timeit
+def get_dependencies(semgrep_app_token: str, deployment_id: str, ecosystem: str) -> List[Dict[str, Any]]:
+    """
+    Gets all dependencies for the given ecosystem within the given Semgrep deployment ID.
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    param: deployment_id: The Semgrep deployment ID to use for retrieving dependencies.
+    param: ecosystem: The ecosystem to import dependencies from, e.g. "gomod" or "npm".
+    """
+    all_deps = []
+    deps_url = f"https://semgrep.dev/api/v1/deployments/{deployment_id}/dependencies"
+    has_more = True
+    page = 0
+    retries = 0
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {semgrep_app_token}",
+    }
+
+    request_data: dict[str, Any] = {
+        "pageSize": _PAGE_SIZE,
+        "dependencyFilter": {
+            "ecosystem": [ecosystem],
+        },
+    }
+
+    logger.info(f"Retrieving Semgrep {ecosystem} dependencies for deployment '{deployment_id}'.")
+    while has_more:
+        try:
+            response = requests.post(deps_url, json=request_data, headers=headers, timeout=_TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+        except (ReadTimeout, HTTPError):
+            logger.warning(f"Failed to retrieve Semgrep {ecosystem} dependencies for page {page}. Retrying...")
+            retries += 1
+            if retries >= _MAX_RETRIES:
+                raise
+            continue
+        deps = data.get("dependencies", [])
+        has_more = data.get("hasMore", False)
+        logger.info(f"Processed page {page} of Semgrep {ecosystem} dependencies.")
+        all_deps.extend(deps)
+        retries = 0
+        page += 1
+        request_data["cursor"] = data.get("cursor")
+
+    logger.info(f"Retrieved {len(all_deps)} Semgrep {ecosystem} dependencies in {page} pages.")
+    return all_deps
+
+
+def transform_dependencies(raw_deps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Transforms the raw dependencies response from Semgrep API into a list of dicts
+    that can be used to create the Dependency nodes.
+    """
+
+    """
+    sample raw_dep as of November 2024:
+    {
+        "repositoryId": "123456",
+        "definedAt": {
+            "path": "go.mod",
+            "startLine": "6",
+            "endLine": "6",
+            "url": "https://github.com/org/repo-name/blob/00000000000000000000000000000000/go.mod#L6",
+            "committedAt": "1970-01-01T00:00:00Z",
+            "startCol": "0",
+            "endCol": "0"
+        },
+        "transitivity": "DIRECT",
+        "package": {
+            "name": "github.com/foo/bar",
+            "versionSpecifier": "1.2.3"
+        },
+        "ecosystem": "gomod",
+        "licenses": [],
+        "pathToTransitivity": []
+    },
+    """
+    deps = []
+    for raw_dep in raw_deps:
+
+        # We could call a different endpoint to get all repo IDs and store a mapping of repo ID to URL,
+        # but it's much simpler to just extract the URL from the definedAt field.
+        repo_url = raw_dep["definedAt"]["url"].split("/blob/", 1)[0]
+
+        name = raw_dep["package"]["name"]
+        version = raw_dep["package"]["versionSpecifier"]
+        id = f"{name}|{version}"
+
+        # As of November 2024, Semgrep does not import dependencies with version specifiers such as >, <, etc.
+        # For now, hardcode the specifier to ==<version> to align with GitHub-sourced Python dependencies.
+        # If Semgrep eventually supports version specifiers, update this line accordingly.
+        specifier = f"=={version}"
+
+        deps.append({
+            # existing dependency properties:
+            "id": id,
+            "name": name,
+            "specifier": specifier,
+            "version": version,
+            "repo_url": repo_url,
+
+            # Semgrep-specific properties:
+            "ecosystem": raw_dep["ecosystem"],
+            "transitivity": raw_dep["transitivity"].lower(),
+            "url": raw_dep["definedAt"]["url"],
+        })
+
+    return deps
+
+
+@timeit
+def load_dependencies(
+    neo4j_session: neo4j.Session,
+    dependency_schema: Callable,
+    dependencies: List[Dict],
+    deployment_id: str,
+    update_tag: int,
+) -> None:
+    logger.info(f"Loading {len(dependencies)} {dependency_schema().label} objects into the graph.")
+    load(
+        neo4j_session,
+        dependency_schema(),
+        dependencies,
+        lastupdated=update_tag,
+        DEPLOYMENT_ID=deployment_id,
+    )
+
+
+@timeit
+def cleanup(
+    neo4j_session: neo4j.Session,
+    dependency_schema: Callable,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+    logger.info(f"Running Semgrep Dependencies cleanup job for {dependency_schema().label}.")
+    GraphJob.from_node_schema(dependency_schema(), common_job_parameters).run(neo4j_session)
+
+
+@timeit
+def sync_dependencies(
+    neo4j_session: neo4j.Session,
+    semgrep_app_token: str,
+    ecosystems_str: str,
+    update_tag: int,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+
+    deployment_id = common_job_parameters.get("DEPLOYMENT_ID")
+    if not deployment_id:
+        logger.warning(
+            "Missing Semgrep deployment ID, ensure that sync_deployment() has been called. "
+            "Skipping Semgrep dependencies sync job.",
+        )
+        return
+
+    if not ecosystems_str:
+        logger.warning(
+            "Semgrep is not configured to import dependencies for any ecosystems, see docs to configure. "
+            "Skipping Semgrep dependencies sync job.",
+        )
+        return
+
+    # We don't expect an error here since we've already validated the input in cli.py
+    ecosystems = parse_and_validate_semgrep_ecosystems(ecosystems_str)
+
+    logger.info("Running Semgrep dependencies sync job.")
+
+    for ecosystem in ecosystems:
+        schema = ECOSYSTEM_TO_SCHEMA[ecosystem]
+        raw_deps = get_dependencies(semgrep_app_token, deployment_id, ecosystem)
+        deps = transform_dependencies(raw_deps)
+        load_dependencies(neo4j_session, schema, deps, deployment_id, update_tag)
+        cleanup(neo4j_session, schema, common_job_parameters)
+
+    merge_module_sync_metadata(
+        neo4j_session=neo4j_session,
+        group_type='Semgrep',
+        group_id=deployment_id,
+        synced_type='SemgrepDependency',
+        update_tag=update_tag,
+        stat_handler=stat_handler,
+    )
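To make the transform output concrete, a small sketch that validates an ecosystems string and feeds transform_dependencies() the sample payload documented in its docstring, trimmed to the fields the function actually reads.

from cartography.intel.semgrep.dependencies import parse_and_validate_semgrep_ecosystems
from cartography.intel.semgrep.dependencies import transform_dependencies

print(parse_and_validate_semgrep_ecosystems("gomod, NPM"))  # -> ['gomod', 'npm']

raw_deps = [{
    "definedAt": {"url": "https://github.com/org/repo-name/blob/00000000000000000000000000000000/go.mod#L6"},
    "transitivity": "DIRECT",
    "package": {"name": "github.com/foo/bar", "versionSpecifier": "1.2.3"},
    "ecosystem": "gomod",
}]
print(transform_dependencies(raw_deps))
# [{'id': 'github.com/foo/bar|1.2.3', 'name': 'github.com/foo/bar', 'specifier': '==1.2.3',
#   'version': '1.2.3', 'repo_url': 'https://github.com/org/repo-name', 'ecosystem': 'gomod',
#   'transitivity': 'direct', 'url': <the same definedAt url as above>}]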
cartography/intel/semgrep/deployment.py

@@ -0,0 +1,67 @@
+import logging
+from typing import Any
+from typing import Dict
+
+import neo4j
+import requests
+
+from cartography.client.core.tx import load
+from cartography.models.semgrep.deployment import SemgrepDeploymentSchema
+from cartography.stats import get_stats_client
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+stat_handler = get_stats_client(__name__)
+_TIMEOUT = (60, 60)
+
+
+@timeit
+def get_deployment(semgrep_app_token: str) -> Dict[str, Any]:
+    """
+    Gets the deployment associated with the passed Semgrep App token.
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    """
+    deployment = {}
+    deployment_url = "https://semgrep.dev/api/v1/deployments"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {semgrep_app_token}",
+    }
+    response = requests.get(deployment_url, headers=headers, timeout=_TIMEOUT)
+    response.raise_for_status()
+
+    data = response.json()
+    deployment["id"] = data["deployments"][0]["id"]
+    deployment["name"] = data["deployments"][0]["name"]
+    deployment["slug"] = data["deployments"][0]["slug"]
+
+    return deployment
+
+
+@timeit
+def load_semgrep_deployment(
+    neo4j_session: neo4j.Session, deployment: Dict[str, Any], update_tag: int,
+) -> None:
+    logger.info(f"Loading SemgrepDeployment {deployment} into the graph.")
+    load(
+        neo4j_session,
+        SemgrepDeploymentSchema(),
+        [deployment],
+        lastupdated=update_tag,
+    )
+
+
+@timeit
+def sync_deployment(
+    neo4j_session: neo4j.Session,
+    semgrep_app_token: str,
+    update_tag: int,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+
+    semgrep_deployment = get_deployment(semgrep_app_token)
+    deployment_id = semgrep_deployment["id"]
+    deployment_slug = semgrep_deployment["slug"]
+    load_semgrep_deployment(neo4j_session, semgrep_deployment, update_tag)
+    common_job_parameters["DEPLOYMENT_ID"] = deployment_id
+    common_job_parameters["DEPLOYMENT_SLUG"] = deployment_slug
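Finally, a hedged sketch of the deployment helper on its own. The token is a placeholder, and the call hits the https://semgrep.dev/api/v1/deployments endpoint shown above.

from cartography.intel.semgrep.deployment import get_deployment

semgrep_app_token = "YOUR_SEMGREP_APP_TOKEN"  # placeholder

# Returns {"id": ..., "name": ..., "slug": ...} for the first deployment tied to the token,
# which sync_deployment() then copies into common_job_parameters as DEPLOYMENT_ID / DEPLOYMENT_SLUG.
deployment = get_deployment(semgrep_app_token)
print(deployment["id"], deployment["name"], deployment["slug"])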