cartography 0.118.0__py3-none-any.whl → 0.119.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartography/_version.py +2 -2
- cartography/cli.py +20 -0
- cartography/client/core/tx.py +19 -3
- cartography/config.py +9 -0
- cartography/data/indexes.cypher +0 -6
- cartography/graph/job.py +7 -5
- cartography/intel/aws/__init__.py +21 -9
- cartography/intel/aws/ecr.py +7 -0
- cartography/intel/aws/ecr_image_layers.py +143 -42
- cartography/intel/aws/inspector.py +65 -33
- cartography/intel/aws/resourcegroupstaggingapi.py +1 -1
- cartography/intel/gcp/compute.py +3 -3
- cartography/intel/github/repos.py +23 -5
- cartography/intel/gsuite/__init__.py +12 -8
- cartography/intel/gsuite/groups.py +291 -0
- cartography/intel/gsuite/users.py +142 -0
- cartography/intel/okta/awssaml.py +1 -1
- cartography/intel/okta/users.py +1 -1
- cartography/intel/ontology/__init__.py +44 -0
- cartography/intel/ontology/devices.py +54 -0
- cartography/intel/ontology/users.py +54 -0
- cartography/intel/ontology/utils.py +121 -0
- cartography/models/airbyte/user.py +4 -0
- cartography/models/anthropic/user.py +4 -0
- cartography/models/aws/ecr/image.py +47 -0
- cartography/models/aws/iam/group_membership.py +3 -2
- cartography/models/aws/identitycenter/awsssouser.py +3 -1
- cartography/models/bigfix/bigfix_computer.py +1 -1
- cartography/models/cloudflare/member.py +4 -0
- cartography/models/crowdstrike/hosts.py +1 -1
- cartography/models/duo/endpoint.py +1 -1
- cartography/models/duo/phone.py +2 -2
- cartography/models/duo/user.py +4 -0
- cartography/models/entra/user.py +2 -1
- cartography/models/github/users.py +4 -0
- cartography/models/gsuite/__init__.py +0 -0
- cartography/models/gsuite/group.py +218 -0
- cartography/models/gsuite/tenant.py +29 -0
- cartography/models/gsuite/user.py +107 -0
- cartography/models/kandji/device.py +1 -2
- cartography/models/keycloak/user.py +4 -0
- cartography/models/lastpass/user.py +4 -0
- cartography/models/ontology/__init__.py +0 -0
- cartography/models/ontology/device.py +125 -0
- cartography/models/ontology/mapping/__init__.py +16 -0
- cartography/models/ontology/mapping/data/__init__.py +1 -0
- cartography/models/ontology/mapping/data/devices.py +160 -0
- cartography/models/ontology/mapping/data/users.py +239 -0
- cartography/models/ontology/mapping/specs.py +65 -0
- cartography/models/ontology/user.py +52 -0
- cartography/models/openai/user.py +4 -0
- cartography/models/scaleway/iam/user.py +4 -0
- cartography/models/snipeit/asset.py +1 -0
- cartography/models/snipeit/user.py +4 -0
- cartography/models/tailscale/device.py +1 -1
- cartography/models/tailscale/user.py +6 -1
- cartography/rules/data/frameworks/mitre_attack/requirements/t1098_account_manipulation/__init__.py +176 -89
- cartography/sync.py +3 -0
- cartography/util.py +44 -17
- {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/METADATA +1 -1
- {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/RECORD +65 -50
- cartography/data/jobs/cleanup/gsuite_ingest_groups_cleanup.json +0 -23
- cartography/data/jobs/cleanup/gsuite_ingest_users_cleanup.json +0 -11
- cartography/intel/gsuite/api.py +0 -355
- {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/WHEEL +0 -0
- {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/entry_points.txt +0 -0
- {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/licenses/LICENSE +0 -0
- {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/top_level.txt +0 -0
cartography/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.118.0'
-__version_tuple__ = version_tuple = (0, 118, 0)
+__version__ = version = '0.119.0'
+__version_tuple__ = version_tuple = (0, 119, 0)
 
 __commit_id__ = commit_id = None
cartography/cli.py
CHANGED
@@ -730,6 +730,26 @@ class CLI:
                 "Required if you are using the Trivy module. Ignored otherwise."
             ),
         )
+        parser.add_argument(
+            "--ontology-users-source",
+            type=str,
+            default=None,
+            help=(
+                "Comma-separated list of sources of truth for user data in the ontology. "
+                "'User' nodes will only be created for users that exist in one of the sources. "
+                "Required if you are using the ontology module. Ignored otherwise."
+            ),
+        )
+        parser.add_argument(
+            "--ontology-devices-source",
+            type=str,
+            default=None,
+            help=(
+                "Comma-separated list of sources of truth for client computer data in the ontology. "
+                "'Device' nodes will only be created for groups that exist in one of the sources. "
+                "Required if you are using the ontology module. Ignored otherwise."
+            ),
+        )
         parser.add_argument(
             "--trivy-results-dir",
             type=str,
cartography/client/core/tx.py
CHANGED
@@ -249,6 +249,7 @@ def load_graph_data(
     neo4j_session: neo4j.Session,
     query: str,
     dict_list: List[Dict[str, Any]],
+    batch_size: int = 10000,
     **kwargs,
 ) -> None:
     """
@@ -257,10 +258,13 @@ def load_graph_data(
     :param query: The Neo4j write query to run. This query is not meant to be handwritten, rather it should be generated
         with cartography.graph.querybuilder.build_ingestion_query().
     :param dict_list: The data to load to the graph represented as a list of dicts.
+    :param batch_size: The number of items to process per transaction. Defaults to 10000.
     :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
     :return: None
     """
-    for data_batch in batch(dict_list, size=10000):
+    if batch_size <= 0:
+        raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
+    for data_batch in batch(dict_list, size=batch_size):
         neo4j_session.write_transaction(
             write_list_of_dicts_tx,
             query,
@@ -316,6 +320,7 @@ def load(
     neo4j_session: neo4j.Session,
     node_schema: CartographyNodeSchema,
     dict_list: List[Dict[str, Any]],
+    batch_size: int = 10000,
     **kwargs,
 ) -> None:
     """
@@ -324,21 +329,27 @@ def load(
     :param neo4j_session: The Neo4j session
     :param node_schema: The CartographyNodeSchema object to create indexes for and generate a query.
     :param dict_list: The data to load to the graph represented as a list of dicts.
+    :param batch_size: The number of items to process per transaction. Defaults to 10000.
     :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
     :return: None
    """
+    if batch_size <= 0:
+        raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
     if len(dict_list) == 0:
         # If there is no data to load, save some time.
         return
     ensure_indexes(neo4j_session, node_schema)
     ingestion_query = build_ingestion_query(node_schema)
-    load_graph_data(neo4j_session, ingestion_query, dict_list, **kwargs)
+    load_graph_data(
+        neo4j_session, ingestion_query, dict_list, batch_size=batch_size, **kwargs
+    )
 
 
 def load_matchlinks(
     neo4j_session: neo4j.Session,
     rel_schema: CartographyRelSchema,
     dict_list: list[dict[str, Any]],
+    batch_size: int = 10000,
     **kwargs,
 ) -> None:
     """
@@ -347,9 +358,12 @@ def load_matchlinks(
     :param rel_schema: The CartographyRelSchema object to generate a query.
     :param dict_list: The data to load to the graph represented as a list of dicts. The dicts must contain the source and
         target node ids.
+    :param batch_size: The number of items to process per transaction. Defaults to 10000.
     :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
     :return: None
     """
+    if batch_size <= 0:
+        raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
     if len(dict_list) == 0:
         # If there is no data to load, save some time.
         return
@@ -369,4 +383,6 @@ def load_matchlinks(
     ensure_indexes_for_matchlinks(neo4j_session, rel_schema)
     matchlink_query = build_matchlink_query(rel_schema)
     logger.debug(f"Matchlink query: {matchlink_query}")
-    load_graph_data(neo4j_session, matchlink_query, dict_list, **kwargs)
+    load_graph_data(
+        neo4j_session, matchlink_query, dict_list, batch_size=batch_size, **kwargs
+    )
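The batching contract is easiest to see in isolation. Below is a minimal runnable sketch; batch() here is a stand-in for the chunking helper that load_graph_data() actually uses from cartography.util, and everything else mirrors the diff above.

from itertools import islice
from typing import Any, Dict, Iterator, List

def batch(items: List[Dict[str, Any]], size: int = 10000) -> Iterator[List[Dict[str, Any]]]:
    # Stand-in for cartography.util.batch: yield successive size-sized chunks.
    it = iter(items)
    while chunk := list(islice(it, size)):
        yield chunk

def load_graph_data_sketch(dict_list: List[Dict[str, Any]], batch_size: int = 10000) -> None:
    # Mirrors the new validation and per-transaction chunking in load_graph_data().
    if batch_size <= 0:
        raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
    for data_batch in batch(dict_list, size=batch_size):
        print(f"one write transaction with {len(data_batch)} rows")

load_graph_data_sketch([{"id": i} for i in range(25000)], batch_size=1000)  # 25 transactions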
cartography/config.py
CHANGED
@@ -161,6 +161,11 @@ class Config:
     :param trivy_s3_bucket: The S3 bucket name containing Trivy scan results. Optional.
     :type trivy_s3_prefix: str
     :param trivy_s3_prefix: The S3 prefix path containing Trivy scan results. Optional.
+    :type ontology_users_source: str
+    :param ontology_users_source: Comma-separated list of sources of truth for user data in the ontology. Optional.
+    :type ontology_devices_source: str
+    :param ontology_devices_source: Comma-separated list of sources of truth for client computers data in the ontology.
+        Optional.
     :type trivy_results_dir: str
     :param trivy_results_dir: Local directory containing Trivy scan results. Optional.
     :type scaleway_access_key: str
@@ -266,6 +271,8 @@ class Config:
         airbyte_api_url=None,
         trivy_s3_bucket=None,
         trivy_s3_prefix=None,
+        ontology_users_source=None,
+        ontology_devices_source=None,
         trivy_results_dir=None,
         scaleway_access_key=None,
         scaleway_secret_key=None,
@@ -359,6 +366,8 @@ class Config:
         self.airbyte_api_url = airbyte_api_url
         self.trivy_s3_bucket = trivy_s3_bucket
         self.trivy_s3_prefix = trivy_s3_prefix
+        self.ontology_users_source = ontology_users_source
+        self.ontology_devices_source = ontology_devices_source
         self.trivy_results_dir = trivy_results_dir
         self.scaleway_access_key = scaleway_access_key
         self.scaleway_secret_key = scaleway_secret_key
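For callers that construct a Config object directly rather than going through the CLI, the new fields are plain keyword arguments. A minimal sketch — the source names below are hypothetical placeholders; the accepted values are defined by the new ontology module.

from cartography.config import Config

config = Config(
    neo4j_uri="bolt://localhost:7687",
    ontology_users_source="gsuite,github",       # hypothetical source names
    ontology_devices_source="kandji,tailscale",  # hypothetical source names
)
print(config.ontology_users_source.split(","))   # ['gsuite', 'github']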
cartography/data/indexes.cypher
CHANGED
@@ -102,12 +102,6 @@ CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.id);
 CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.lastupdated);
 CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.id);
 CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.lastupdated);
-CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.email);
-CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.id);
-CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.lastupdated);
-CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.email);
-CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.id);
-CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.lastupdated);
 CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.id);
 CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.ip);
 CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.lastupdated);
cartography/graph/job.py
CHANGED
@@ -125,11 +125,13 @@ class GraphJob:
         }
 
     @classmethod
-    def from_json(cls, blob: str, short_name: Optional[str] = None) -> "GraphJob":
+    def from_json(
+        cls, blob: Union[str, dict], short_name: Optional[str] = None
+    ) -> "GraphJob":
         """
-        Create a job from a JSON blob.
+        Create a job from a JSON dict or blob.
         """
-        data = json.loads(blob)
+        data = json.loads(blob) if isinstance(blob, str) else blob
         statements = _get_statements_from_json(data, short_name)
         name = data["name"]
         return cls(name, statements, short_name)
@@ -242,12 +244,12 @@ class GraphJob:
     def run_from_json(
         cls,
         neo4j_session: neo4j.Session,
-        blob: str,
+        blob: Union[str, dict],
         parameters: Dict,
         short_name: Optional[str] = None,
    ) -> None:
        """
-        Run a job from a JSON blob. This will deserialize the job and execute all statements sequentially.
+        Run a job from a JSON dict or blob. This will deserialize the job and execute all statements sequentially.
        """
        if not parameters:
            parameters = {}
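With the signatures widened to Union[str, dict], a pre-parsed job definition no longer needs re-serializing before use. A sketch with a hypothetical one-statement job; the statement keys follow the shape of cartography's cleanup-job JSON files, so treat them as an assumption rather than a documented schema.

import json

from cartography.graph.job import GraphJob

job_spec = {
    "name": "example cleanup job",  # hypothetical job definition
    "statements": [
        {
            "query": "MATCH (n:Example) WHERE n.lastupdated <> $UPDATE_TAG DETACH DELETE n",
            "iterative": False,
        },
    ],
}
job_from_dict = GraphJob.from_json(job_spec)             # dict is used as-is
job_from_str = GraphJob.from_json(json.dumps(job_spec))  # str goes through json.loads()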
cartography/intel/aws/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Dict
 from typing import Iterable
 from typing import List
 
+import aioboto3
 import boto3
 import botocore.exceptions
 import neo4j
@@ -49,12 +50,13 @@ def _build_aws_sync_kwargs(
 
 def _sync_one_account(
     neo4j_session: neo4j.Session,
-    boto3_session: boto3.session.Session,
+    boto3_session: boto3.Session,
     current_aws_account_id: str,
     update_tag: int,
     common_job_parameters: Dict[str, Any],
     regions: list[str] | None = None,
     aws_requested_syncs: Iterable[str] = RESOURCE_FUNCTIONS.keys(),
+    aioboto3_session: aioboto3.Session = aioboto3.Session(),
 ) -> None:
     # Autodiscover the regions supported by the account unless the user has specified the regions to sync.
     if not regions:
@@ -72,13 +74,20 @@ def _sync_one_account(
     for func_name in aws_requested_syncs:
         if func_name in RESOURCE_FUNCTIONS:
             # Skip permission relationships and tags for now because they rely on data already being in the graph
-            if func_name not in [
-                "permission_relationships",
-                "resourcegroupstaggingapi",
-            ]:
-                RESOURCE_FUNCTIONS[func_name](**sync_args)
-            else:
+            if func_name == "ecr:image_layers":
+                # has a different signature than the other functions (aioboto3_session replaces boto3_session)
+                RESOURCE_FUNCTIONS[func_name](
+                    neo4j_session,
+                    aioboto3_session,
+                    regions,
+                    current_aws_account_id,
+                    update_tag,
+                    common_job_parameters,
+                )
+            elif func_name in ["permission_relationships", "resourcegroupstaggingapi"]:
                 continue
+            else:
+                RESOURCE_FUNCTIONS[func_name](**sync_args)
         else:
             raise ValueError(
                 f'AWS sync function "{func_name}" was specified but does not exist. Did you misspell it?',
@@ -115,7 +124,7 @@ def _sync_one_account(
 
 
 def _autodiscover_account_regions(
-    boto3_session: boto3.session.Session,
+    boto3_session: boto3.Session,
     account_id: str,
 ) -> List[str]:
     regions: List[str] = []
@@ -136,7 +145,7 @@ def _autodiscover_account_regions(
 
 def _autodiscover_accounts(
     neo4j_session: neo4j.Session,
-    boto3_session: boto3.session.Session,
+    boto3_session: boto3.Session,
     account_id: str,
     sync_tag: int,
     common_job_parameters: Dict,
@@ -197,8 +206,10 @@ def _sync_multiple_accounts(
     if num_accounts == 1:
         # Use the default boto3 session because boto3 gets confused if you give it a profile name with 1 account
         boto3_session = boto3.Session()
+        aioboto3_session = aioboto3.Session()
     else:
         boto3_session = boto3.Session(profile_name=profile_name)
+        aioboto3_session = aioboto3.Session(profile_name=profile_name)
 
     _autodiscover_accounts(
         neo4j_session,
@@ -217,6 +228,7 @@ def _sync_multiple_accounts(
                 common_job_parameters,
                 regions=regions,
                 aws_requested_syncs=aws_requested_syncs,  # Could be replaced later with per-account requested syncs
+                aioboto3_session=aioboto3_session,
             )
         except Exception as e:
             if aws_best_effort_mode:
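A sketch of the aioboto3 pattern this wiring enables, assuming default credentials and a hypothetical region: the session is created once per run (optionally with profile_name) and opened as an async client per region, which is how the ecr:image_layers sync consumes it.

import asyncio

import aioboto3

async def list_repository_names(session: aioboto3.Session, region: str) -> list[str]:
    # Async ECR client scoped to one region; closed automatically on exit.
    async with session.client("ecr", region_name=region) as ecr:
        resp = await ecr.describe_repositories()
        return [repo["repositoryName"] for repo in resp["repositories"]]

names = asyncio.run(list_repository_names(aioboto3.Session(), "us-east-1"))
print(names)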
cartography/intel/aws/ecr.py
CHANGED
@@ -248,12 +248,19 @@ def transform_ecr_repository_images(repo_data: Dict) -> tuple[List[Dict], List[Dict]]:
 
         # Create ECRImage for the manifest list itself
         if digest not in ecr_images_dict:
+            # Extract child image digests (excluding attestations for CONTAINS_IMAGE relationship)
+            child_digests = [
+                m["digest"]
+                for m in manifest_images
+                if m.get("type") != "attestation"
+            ]
             ecr_images_dict[digest] = {
                 "imageDigest": digest,
                 "type": "manifest_list",
                 "architecture": None,
                 "os": None,
                 "variant": None,
+                "child_image_digests": child_digests if child_digests else None,
             }
 
         # Create ECRImage nodes for each image in the manifest list
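The child_image_digests derivation is a plain filter over the manifest-list entries. A self-contained sketch with hypothetical digests, mirroring the comprehension above:

manifest_images = [
    {"digest": "sha256:aaa...", "type": "image"},        # e.g. linux/amd64, hypothetical
    {"digest": "sha256:bbb...", "type": "image"},        # e.g. linux/arm64, hypothetical
    {"digest": "sha256:ccc...", "type": "attestation"},  # provenance entry, excluded
]
child_digests = [m["digest"] for m in manifest_images if m.get("type") != "attestation"]
assert child_digests == ["sha256:aaa...", "sha256:bbb..."]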
cartography/intel/aws/ecr_image_layers.py
CHANGED
@@ -12,7 +12,6 @@ from typing import Any
 from typing import Optional
 
 import aioboto3
-import boto3
 import httpx
 import neo4j
 from botocore.exceptions import ClientError
@@ -334,6 +333,7 @@ def transform_ecr_image_layers(
     image_layers_data: dict[str, dict[str, list[str]]],
     image_digest_map: dict[str, str],
     image_attestation_map: Optional[dict[str, dict[str, str]]] = None,
+    existing_properties_map: Optional[dict[str, dict[str, Any]]] = None,
 ) -> tuple[list[dict], list[dict]]:
     """
     Transform image layer data into format suitable for Neo4j ingestion.
@@ -342,10 +342,13 @@ def transform_ecr_image_layers(
     :param image_layers_data: Map of image URI to platform to diff_ids
     :param image_digest_map: Map of image URI to image digest
     :param image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
+    :param existing_properties_map: Map of image digest to existing ECRImage properties (type, architecture, etc.)
     :return: List of layer objects ready for ingestion
     """
     if image_attestation_map is None:
         image_attestation_map = {}
+    if existing_properties_map is None:
+        existing_properties_map = {}
     layers_by_diff_id: dict[str, dict[str, Any]] = {}
     memberships_by_digest: dict[str, dict[str, Any]] = {}
 
@@ -353,6 +356,16 @@ def transform_ecr_image_layers(
         # fetch_image_layers_async guarantees every uri in image_layers_data has a digest
         image_digest = image_digest_map[image_uri]
 
+        # Check if this is a manifest list
+        is_manifest_list = False
+        if image_digest in existing_properties_map:
+            image_type = existing_properties_map[image_digest].get("type")
+            is_manifest_list = image_type == "manifest_list"
+
+        # Skip creating layer relationships for manifest lists
+        if is_manifest_list:
+            continue
+
         ordered_layers_for_image: Optional[list[str]] = None
 
         for _, diff_ids in platforms.items():
@@ -391,6 +404,10 @@ def transform_ecr_image_layers(
             "layer_diff_ids": ordered_layers_for_image,
         }
 
+        # Preserve existing ECRImage properties (type, architecture, os, variant, etc.)
+        if image_digest in existing_properties_map:
+            membership.update(existing_properties_map[image_digest])
+
         # Add attestation data if available for this image
         if image_uri in image_attestation_map:
             attestation = image_attestation_map[image_uri]
@@ -433,7 +450,12 @@ def load_ecr_image_layers(
     current_aws_account_id: str,
     aws_update_tag: int,
 ) -> None:
-    """
+    """
+    Load image layers into Neo4j.
+
+    Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
+    since layer objects can contain large arrays of relationships.
+    """
     logger.info(
         f"Loading {len(image_layers)} image layers for region {region} into graph.",
     )
@@ -442,6 +464,7 @@ def load_ecr_image_layers(
         neo4j_session,
         ECRImageLayerSchema(),
         image_layers,
+        batch_size=1000,
         lastupdated=aws_update_tag,
         AWS_ID=current_aws_account_id,
     )
@@ -455,10 +478,17 @@ def load_ecr_image_layer_memberships(
     current_aws_account_id: str,
     aws_update_tag: int,
 ) -> None:
+    """
+    Load image layer memberships into Neo4j.
+
+    Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
+    since membership objects can contain large arrays of layer diff_ids.
+    """
     load(
         neo4j_session,
         ECRImageSchema(),
         memberships,
+        batch_size=1000,
         lastupdated=aws_update_tag,
         Region=region,
         AWS_ID=current_aws_account_id,
@@ -527,8 +557,15 @@ async def fetch_image_layers_async(
     async def fetch_single_image_layers(
         repo_image: dict,
         http_client: httpx.AsyncClient,
-    ) -> Optional[
-        tuple[str, str, dict[str, list[str]], Optional[dict[str, str]]]]:
+    ) -> Optional[
+        tuple[str, str, dict[str, list[str]], Optional[dict[str, dict[str, str]]]]
+    ]:
+        """
+        Fetch layers for a single image and extract attestation if present.
+
+        Returns tuple of (uri, digest, platform_layers, attestations_by_child_digest) where
+        attestations_by_child_digest maps child image digest to parent image info
+        """
         async with semaphore:
             # Caller guarantees these fields exist in every repo_image
            uri = repo_image["uri"]
@@ -551,13 +588,13 @@ async def fetch_image_layers_async(
 
             manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
             platform_layers: dict[str, list[str]] = {}
-            attestation_data: Optional[dict[str, str]] = None
+            attestation_data: Optional[dict[str, dict[str, str]]] = None
 
             if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
 
                 async def _process_child_manifest(
                     manifest_ref: dict,
-                ) -> tuple[dict[str, list[str]], Optional[dict[str, str]]]:
+                ) -> tuple[dict[str, list[str]], Optional[tuple[str, dict[str, str]]]]:
                     # Check if this is an attestation manifest
                     if (
                         manifest_ref.get("annotations", {}).get(
@@ -565,18 +602,27 @@ async def fetch_image_layers_async(
                         )
                         == "attestation-manifest"
                     ):
+                        # Extract which child image this attestation is for
+                        attests_child_digest = manifest_ref.get("annotations", {}).get(
+                            "vnd.docker.reference.digest"
+                        )
+                        if not attests_child_digest:
+                            return {}, None
+
                         # Extract base image from attestation
-
-                        if manifest_ref.get("digest"):
+                        attestation_digest = manifest_ref.get("digest")
+                        if attestation_digest:
                             attestation_info = (
                                 await _extract_parent_image_from_attestation(
                                     ecr_client,
                                     repo_name,
-                                    manifest_ref["digest"],
+                                    attestation_digest,
                                     http_client,
                                 )
                             )
-                            return {}, attestation_info
+                            if attestation_info:
+                                # Return (attests_child_digest, parent_info) tuple
+                                return {}, (attests_child_digest, attestation_info)
                         return {}, None
 
                     child_digest = manifest_ref.get("digest")
@@ -612,14 +658,22 @@ async def fetch_image_layers_async(
                 )
 
                 # Merge results from successful child manifest processing
+                # Track attestation data by child digest for proper mapping
+                attestations_by_child_digest: dict[str, dict[str, str]] = {}
+
                 for result in child_results:
                     if isinstance(result, tuple) and len(result) == 2:
                         layer_data, attest_data = result
                         if layer_data:
                             platform_layers.update(layer_data)
-                        if attest_data:
-                            #
-                            attestation_data = attest_data
+                        if attest_data:
+                            # attest_data is (child_digest, parent_info) tuple
+                            child_digest, parent_info = attest_data
+                            attestations_by_child_digest[child_digest] = parent_info
+
+                # Build attestation_data with child digest mapping
+                if attestations_by_child_digest:
+                    attestation_data = attestations_by_child_digest
             else:
                 diff_map = await _diff_ids_for_manifest(
                     ecr_client,
@@ -630,7 +684,9 @@ async def fetch_image_layers_async(
                 )
                 platform_layers.update(diff_map)
 
-            if platform_layers:
+            # Return if we found layers or attestation data
+            # Manifest lists may have attestation_data without platform_layers
+            if platform_layers or attestation_data:
                 return uri, digest, platform_layers, attestation_data
 
             return None
@@ -670,13 +726,22 @@ async def fetch_image_layers_async(
         )
 
         if result:
-            uri, digest, layer_data, attest_data = result
+            uri, digest, layer_data, attestations_by_child_digest = result
            if not digest:
                raise ValueError(f"Empty digest returned for image {uri}")
            image_layers_data[uri] = layer_data
            image_digest_map[uri] = digest
-            if attest_data:
-                image_attestation_map[uri] = attest_data
+            if attestations_by_child_digest:
+                # Map attestation data by child digest URIs
+                repo_uri = extract_repo_uri_from_image_uri(uri)
+                for (
+                    child_digest,
+                    parent_info,
+                ) in attestations_by_child_digest.items():
+                    child_uri = f"{repo_uri}@{child_digest}"
+                    image_attestation_map[child_uri] = parent_info
+                    # Also add to digest map so transform can look up the child digest
+                    image_digest_map[child_uri] = child_digest
 
     logger.info(
         f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
@@ -698,7 +763,7 @@ def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
 @timeit
 def sync(
     neo4j_session: neo4j.Session,
-    boto3_session: boto3.Session,
+    aioboto3_session: aioboto3.Session,
     regions: list[str],
     current_aws_account_id: str,
     update_tag: int,
@@ -721,30 +786,71 @@ def sync(
         current_aws_account_id,
     )
 
-    #
-
+    # Query for ECR images with all their existing properties to preserve during layer sync
+    query = """
+    MATCH (img:ECRImage)<-[:IMAGE]-(repo_img:ECRRepositoryImage)<-[:REPO_IMAGE]-(repo:ECRRepository)
+    MATCH (repo)<-[:RESOURCE]-(:AWSAccount {id: $AWS_ID})
+    WHERE repo.region = $Region
+    RETURN DISTINCT
+        img.digest AS digest,
+        repo_img.id AS uri,
+        repo.uri AS repo_uri,
+        img.type AS type,
+        img.architecture AS architecture,
+        img.os AS os,
+        img.variant AS variant,
+        img.attestation_type AS attestation_type,
+        img.attests_digest AS attests_digest,
+        img.media_type AS media_type,
+        img.artifact_media_type AS artifact_media_type,
+        img.child_image_digests AS child_image_digests
+    """
+    from cartography.client.core.tx import read_list_of_dicts_tx
 
-    ecr_images =
+    ecr_images = neo4j_session.read_transaction(
+        read_list_of_dicts_tx, query, AWS_ID=current_aws_account_id, Region=region
+    )
 
-    #
+    # Build repo_images_list and existing_properties map
     repo_images_list = []
+    existing_properties = {}
     seen_digests = set()
 
-    for
-
+    for img_data in ecr_images:
+        digest = img_data["digest"]
+        image_type = img_data.get("type")
+
+        if digest not in seen_digests:
             seen_digests.add(digest)
-            repo_uri = extract_repo_uri_from_image_uri(uri)
 
-            #
+            # Store existing properties for ALL images to preserve during updates
+            existing_properties[digest] = {
+                "type": image_type,
+                "architecture": img_data.get("architecture"),
+                "os": img_data.get("os"),
+                "variant": img_data.get("variant"),
+                "attestation_type": img_data.get("attestation_type"),
+                "attests_digest": img_data.get("attests_digest"),
+                "media_type": img_data.get("media_type"),
+                "artifact_media_type": img_data.get("artifact_media_type"),
+                "child_image_digests": img_data.get("child_image_digests"),
+            }
+
+            repo_uri = img_data["repo_uri"]
             digest_uri = f"{repo_uri}@{digest}"
 
-            repo_images_list.append(
-                {
-                    "imageDigest": digest,
-                    "uri": digest_uri,
-                    "repo_uri": repo_uri,
-                }
-            )
+            # Fetch manifests for:
+            # - Platform-specific images (type="image") - to get their layers
+            # - Manifest lists (type="manifest_list") - to extract attestation parent image data
+            # Skip only attestations since they don't have useful layer or parent data
+            if image_type != "attestation":
+                repo_images_list.append(
+                    {
+                        "imageDigest": digest,
+                        "uri": digest_uri,
+                        "repo_uri": repo_uri,
+                    }
+                )
 
     logger.info(
         f"Found {len(repo_images_list)} distinct ECR image digests in graph for region {region}"
@@ -768,15 +874,9 @@ def sync(
         dict[str, str],
         dict[str, dict[str, str]],
     ]:
-
-        credentials = boto3_session.get_credentials()
-        session = aioboto3.Session(
-            aws_access_key_id=credentials.access_key,
-            aws_secret_access_key=credentials.secret_key,
-            aws_session_token=credentials.token,
-            region_name=region,
-        )
-        async with session.client("ecr") as ecr_client:
+        async with aioboto3_session.client(
+            "ecr", region_name=region
+        ) as ecr_client:
            return await fetch_image_layers_async(ecr_client, repo_images_list)
 
    # Use get_event_loop() + run_until_complete() to avoid tearing down loop
@@ -798,6 +898,7 @@ def sync(
         image_layers_data,
         image_digest_map,
         image_attestation_map,
+        existing_properties,
     )
     load_ecr_image_layers(
         neo4j_session,