cartography 0.117.0__py3-none-any.whl → 0.118.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (57)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +11 -0
  3. cartography/config.py +5 -0
  4. cartography/graph/job.py +6 -2
  5. cartography/graph/statement.py +4 -0
  6. cartography/intel/aws/__init__.py +1 -0
  7. cartography/intel/aws/apigateway.py +18 -5
  8. cartography/intel/aws/ec2/elastic_ip_addresses.py +3 -1
  9. cartography/intel/aws/ec2/internet_gateways.py +4 -2
  10. cartography/intel/aws/ec2/load_balancer_v2s.py +11 -5
  11. cartography/intel/aws/ec2/network_interfaces.py +4 -0
  12. cartography/intel/aws/ec2/reserved_instances.py +3 -1
  13. cartography/intel/aws/ec2/tgw.py +11 -5
  14. cartography/intel/aws/ec2/volumes.py +1 -1
  15. cartography/intel/aws/ecr.py +202 -26
  16. cartography/intel/aws/elasticsearch.py +13 -4
  17. cartography/intel/aws/identitycenter.py +93 -54
  18. cartography/intel/aws/inspector.py +26 -14
  19. cartography/intel/aws/permission_relationships.py +3 -3
  20. cartography/intel/aws/s3.py +26 -13
  21. cartography/intel/aws/ssm.py +3 -5
  22. cartography/intel/azure/compute.py +9 -4
  23. cartography/intel/azure/cosmosdb.py +31 -15
  24. cartography/intel/azure/sql.py +25 -12
  25. cartography/intel/azure/storage.py +19 -9
  26. cartography/intel/azure/subscription.py +3 -1
  27. cartography/intel/crowdstrike/spotlight.py +5 -2
  28. cartography/intel/entra/app_role_assignments.py +9 -2
  29. cartography/intel/gcp/__init__.py +26 -9
  30. cartography/intel/gcp/clients.py +8 -4
  31. cartography/intel/gcp/compute.py +39 -18
  32. cartography/intel/gcp/crm/folders.py +9 -3
  33. cartography/intel/gcp/crm/orgs.py +8 -3
  34. cartography/intel/gcp/crm/projects.py +14 -3
  35. cartography/intel/jamf/computers.py +7 -1
  36. cartography/intel/oci/iam.py +23 -9
  37. cartography/intel/oci/organizations.py +3 -1
  38. cartography/intel/oci/utils.py +28 -5
  39. cartography/intel/okta/awssaml.py +8 -7
  40. cartography/intel/pagerduty/escalation_policies.py +13 -6
  41. cartography/intel/pagerduty/schedules.py +9 -4
  42. cartography/intel/pagerduty/services.py +7 -3
  43. cartography/intel/pagerduty/teams.py +5 -2
  44. cartography/intel/pagerduty/users.py +3 -1
  45. cartography/intel/pagerduty/vendors.py +3 -1
  46. cartography/intel/trivy/__init__.py +109 -58
  47. cartography/models/aws/ec2/networkinterfaces.py +2 -0
  48. cartography/models/aws/ecr/image.py +8 -0
  49. cartography/models/aws/ecr/repository_image.py +1 -1
  50. cartography/sync.py +1 -1
  51. cartography/util.py +5 -1
  52. {cartography-0.117.0.dist-info → cartography-0.118.0.dist-info}/METADATA +3 -3
  53. {cartography-0.117.0.dist-info → cartography-0.118.0.dist-info}/RECORD +57 -57
  54. {cartography-0.117.0.dist-info → cartography-0.118.0.dist-info}/WHEEL +0 -0
  55. {cartography-0.117.0.dist-info → cartography-0.118.0.dist-info}/entry_points.txt +0 -0
  56. {cartography-0.117.0.dist-info → cartography-0.118.0.dist-info}/licenses/LICENSE +0 -0
  57. {cartography-0.117.0.dist-info → cartography-0.118.0.dist-info}/top_level.txt +0 -0
cartography/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.117.0'
- __version_tuple__ = version_tuple = (0, 117, 0)
+ __version__ = version = '0.118.0'
+ __version_tuple__ = version_tuple = (0, 118, 0)
 
  __commit_id__ = commit_id = None
cartography/cli.py CHANGED
@@ -279,6 +279,17 @@ class CLI:
          "Example: 'HIGH' will sync only HIGH and CRITICAL findings, filtering out LOW and MEDIUM severity findings."
      ),
  )
+ parser.add_argument(
+     "--experimental-aws-inspector-batch",
+     type=int,
+     default=1000,
+     help=(
+         "EXPERIMENTAL: This feature is experimental and may be removed in the future. "
+         "Batch size for AWS Inspector findings sync. Controls how many findings are fetched, processed and cleaned up at a time. "
+         "Default is 1000. Increase this value if you have a large number of findings and want to reduce API calls, "
+         "or decrease it if you're experiencing memory issues."
+     ),
+ )
  parser.add_argument(
      "--analysis-job-directory",
      type=str,
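For reference, the new flag slots into an ordinary CLI invocation like so (the Neo4j URI is a placeholder and all other options are elided):

    # Halve the default Inspector batch size to trade extra API calls for lower peak memory.
    cartography --neo4j-uri bolt://localhost:7687 --experimental-aws-inspector-batch 500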
cartography/config.py CHANGED
@@ -58,6 +58,9 @@ class Config:
  :type aws_guardduty_severity_threshold: str
  :param aws_guardduty_severity_threshold: GuardDuty severity threshold filter. Only findings at or above this
      severity level will be synced. Valid values: LOW, MEDIUM, HIGH, CRITICAL. Optional.
+ :type experimental_aws_inspector_batch: int
+ :param experimental_aws_inspector_batch: EXPERIMENTAL: Batch size for AWS Inspector findings sync. Controls how
+     many findings are fetched, processed and cleaned up at a time. Default is 1000. Optional.
  :type analysis_job_directory: str
  :param analysis_job_directory: Path to a directory tree containing analysis jobs to run. Optional.
  :type oci_sync_all_profiles: bool
@@ -195,6 +198,7 @@
      aws_regions=None,
      aws_best_effort_mode=False,
      aws_cloudtrail_management_events_lookback_hours=None,
+     experimental_aws_inspector_batch=1000,
      azure_sync_all_subscriptions=False,
      azure_sp_auth=None,
      azure_tenant_id=None,
@@ -287,6 +291,7 @@
      self.aws_cloudtrail_management_events_lookback_hours = (
          aws_cloudtrail_management_events_lookback_hours
      )
+     self.experimental_aws_inspector_batch = experimental_aws_inspector_batch
      self.azure_sync_all_subscriptions = azure_sync_all_subscriptions
      self.azure_sp_auth = azure_sp_auth
      self.azure_tenant_id = azure_tenant_id
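The same setting can also be supplied programmatically through Config. A minimal sketch (the neo4j_uri value is a placeholder; every other keyword argument keeps its default):

    from cartography.config import Config

    # Batch Inspector findings in groups of 500 instead of the default 1000.
    config = Config(
        neo4j_uri="bolt://localhost:7687",
        experimental_aws_inspector_batch=500,
    )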
cartography/graph/job.py CHANGED
@@ -139,11 +139,13 @@ class GraphJob:
      cls,
      node_schema: CartographyNodeSchema,
      parameters: Dict[str, Any],
+     iterationsize: int = 100,
  ) -> "GraphJob":
      """
      Create a cleanup job from a CartographyNodeSchema object.
      For a given node, the fields used in the node_schema.sub_resource_relationship.target_node_node_matcher.keys()
      must be provided as keys and values in the params dict.
+     :param iterationsize: The number of items to process in each iteration. Defaults to 100.
      """
      queries: List[str] = build_cleanup_queries(node_schema)
 
@@ -165,7 +167,7 @@
          query,
          parameters=parameters,
          iterative=True,
-         iterationsize=100,
+         iterationsize=iterationsize,
          parent_job_name=node_schema.label,
          parent_job_sequence_num=idx,
      )
@@ -185,6 +187,7 @@
      sub_resource_label: str,
      sub_resource_id: str,
      update_tag: int,
+     iterationsize: int = 100,
  ) -> "GraphJob":
      """
      Create a cleanup job from a CartographyRelSchema object (specifically, a MatchLink).
@@ -194,6 +197,7 @@
      - For a given rel_schema, the fields used in the rel_schema.properties._sub_resource_label.name and
        rel_schema.properties._sub_resource_id.name must be provided as keys and values in the params dict.
      - The rel_schema must have a source_node_matcher and target_node_matcher.
+     :param iterationsize: The number of items to process in each iteration. Defaults to 100.
      """
      cleanup_link_query = build_cleanup_query_for_matchlink(rel_schema)
      logger.debug(f"Cleanup query: {cleanup_link_query}")
@@ -208,7 +212,7 @@
          cleanup_link_query,
          parameters=parameters,
          iterative=True,
-         iterationsize=100,
+         iterationsize=iterationsize,
          parent_job_name=rel_schema.rel_label,
      )
 
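Both factory methods now simply thread iterationsize through to the underlying GraphStatement instead of hard-coding 100. A minimal usage sketch, with the caveat that the schema object, parameter names, and the run() call are illustrative of how cartography cleanup jobs are typically invoked rather than taken from this diff:

    from cartography.graph.job import GraphJob

    # Hypothetical: delete stale nodes 1,000 at a time instead of the old fixed 100.
    cleanup_job = GraphJob.from_node_schema(
        node_schema,  # a CartographyNodeSchema instance
        parameters={"AWS_ID": account_id, "UPDATE_TAG": update_tag},
        iterationsize=1000,
    )
    cleanup_job.run(neo4j_session)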
cartography/graph/statement.py CHANGED
@@ -52,6 +52,10 @@ class GraphStatement:
      self.parameters = parameters or {}
      self.iterative = iterative
      self.iterationsize = iterationsize
+     if iterationsize < 0:
+         raise ValueError(
+             f"iterationsize must be a positive integer, got {iterationsize}",
+         )
      self.parameters["LIMIT_SIZE"] = self.iterationsize
 
      self.parent_job_name = parent_job_name if parent_job_name else None
cartography/intel/aws/__init__.py CHANGED
@@ -312,6 +312,7 @@ def start_aws_ingestion(neo4j_session: neo4j.Session, config: Config) -> None:
      "permission_relationships_file": config.permission_relationships_file,
      "aws_guardduty_severity_threshold": config.aws_guardduty_severity_threshold,
      "aws_cloudtrail_management_events_lookback_hours": config.aws_cloudtrail_management_events_lookback_hours,
+     "experimental_aws_inspector_batch": config.experimental_aws_inspector_batch,
  }
  try:
      boto3_session = boto3.Session()
cartography/intel/aws/apigateway.py CHANGED
@@ -178,11 +178,24 @@ def get_rest_api_resources_methods_integrations(
      method["apiId"] = api["id"]
      method["httpMethod"] = http_method
      methods.append(method)
-     integration = client.get_integration(
-         restApiId=api["id"],
-         resourceId=resource_id,
-         httpMethod=http_method,
-     )
+     try:
+         integration = client.get_integration(
+             restApiId=api["id"],
+             resourceId=resource_id,
+             httpMethod=http_method,
+         )
+     except ClientError as e:
+         error_code = e.response.get("Error", {}).get("Code")
+         if error_code == "NotFoundException":
+             logger.warning(
+                 "No integration found for API %s resource %s method %s: %s",
+                 api["id"],
+                 resource_id,
+                 http_method,
+                 e,
+             )
+             continue
+         raise
      integration["resourceId"] = resource_id
      integration["apiId"] = api["id"]
      integration["integrationHttpMethod"] = integration.get("httpMethod")
cartography/intel/aws/ec2/elastic_ip_addresses.py CHANGED
@@ -6,6 +6,7 @@ import boto3
  import neo4j
  from botocore.exceptions import ClientError
 
+ from cartography.client.core.tx import run_write_query
  from cartography.util import aws_handle_regions
  from cartography.util import run_cleanup_job
  from cartography.util import timeit
@@ -83,7 +84,8 @@ def load_elastic_ip_addresses(
      SET r.lastupdated = $update_tag
      """
 
-     neo4j_session.run(
+     run_write_query(
+         neo4j_session,
          ingest_addresses,
          elastic_ip_addresses=elastic_ip_addresses,
          Region=region,
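The same mechanical change repeats across the EC2 modules below: direct neo4j_session.run(...) calls become run_write_query(neo4j_session, ...). The helper's behavior is not shown in this diff; given the cartography.client.core.tx import path, it presumably wraps the statement in a managed write transaction. The before/after shape, using this module's names:

    # 0.117.0: execute the Cypher write directly on the session.
    neo4j_session.run(ingest_addresses, elastic_ip_addresses=data, Region=region)

    # 0.118.0: hand the session and query to the shared write helper.
    run_write_query(neo4j_session, ingest_addresses, elastic_ip_addresses=data, Region=region)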
cartography/intel/aws/ec2/internet_gateways.py CHANGED
@@ -5,6 +5,7 @@ from typing import List
  import boto3
  import neo4j
 
+ from cartography.client.core.tx import run_write_query
  from cartography.util import aws_handle_regions
  from cartography.util import run_cleanup_job
  from cartography.util import timeit
@@ -63,13 +64,14 @@ def load_internet_gateways(
      SET r.lastupdated = $aws_update_tag
      """
 
-     neo4j_session.run(
+     run_write_query(
+         neo4j_session,
          query,
          internet_gateways=internet_gateways,
          region=region,
          aws_account_id=current_aws_account_id,
          aws_update_tag=update_tag,
-     ).consume()
+     )
 
 
  @timeit
cartography/intel/aws/ec2/load_balancer_v2s.py CHANGED
@@ -6,6 +6,7 @@ import boto3
  import botocore
  import neo4j
 
+ from cartography.client.core.tx import run_write_query
  from cartography.util import aws_handle_regions
  from cartography.util import run_cleanup_job
  from cartography.util import timeit
@@ -104,7 +105,8 @@ def load_load_balancer_v2s(
          logger.warning("Skipping load balancer entry with missing DNSName: %r", lb)
          continue
 
-     neo4j_session.run(
+     run_write_query(
+         neo4j_session,
          ingest_load_balancer_v2,
          ID=load_balancer_id,
          CREATED_TIME=str(lb["CreatedTime"]),
@@ -138,7 +140,8 @@
      SET r.lastupdated = $update_tag
      """
      for group in lb["SecurityGroups"]:
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              ingest_load_balancer_v2_security_group,
              ID=load_balancer_id,
              GROUP_ID=str(group),
@@ -182,7 +185,8 @@ def load_load_balancer_v2_subnets(
      SET r.lastupdated = $update_tag
      """
      for az in az_data:
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              ingest_load_balancer_subnet,
              ID=load_balancer_id,
              SubnetId=az["SubnetId"],
@@ -219,7 +223,8 @@ def load_load_balancer_v2_target_groups(
          continue
 
      for instance in target_group["Targets"]:
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              ingest_instances,
              ID=load_balancer_id,
              INSTANCE_ID=instance,
@@ -253,7 +258,8 @@ def load_load_balancer_v2_listeners(
      ON CREATE SET r.firstseen = timestamp()
      SET r.lastupdated = $update_tag
      """
-     neo4j_session.run(
+     run_write_query(
+         neo4j_session,
          ingest_listener,
          LoadBalancerId=load_balancer_id,
          Listeners=listener_data,
cartography/intel/aws/ec2/network_interfaces.py CHANGED
@@ -98,6 +98,10 @@ def transform_network_interface_data(
      "SourceDestCheck": network_interface["SourceDestCheck"],
      "Status": network_interface["Status"],
      "SubnetId": network_interface["SubnetId"],
+     "AttachTime": network_interface.get("Attachment", {}).get("AttachTime"),
+     "DeviceIndex": network_interface.get("Attachment", {}).get(
+         "DeviceIndex"
+     ),
      "ElbV1Id": elb_v1_id,
      "ElbV2Id": elb_v2_id,
  },
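The chained .get() calls with a {} default guard against interfaces that are not attached to anything, since such ENIs carry no Attachment key in the DescribeNetworkInterfaces response. A trimmed sketch of the sub-document being read (values are placeholders):

    from datetime import datetime, timezone

    # Shape of network_interface["Attachment"] when the ENI is attached;
    # boto3 returns AttachTime as a datetime object.
    attachment = {
        "AttachTime": datetime(2024, 1, 1, tzinfo=timezone.utc),
        "DeviceIndex": 0,
        "InstanceId": "i-0123456789abcdef0",
        "Status": "attached",
    }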
cartography/intel/aws/ec2/reserved_instances.py CHANGED
@@ -6,6 +6,7 @@ import boto3
  import neo4j
  from botocore.exceptions import ClientError
 
+ from cartography.client.core.tx import run_write_query
  from cartography.util import aws_handle_regions
  from cartography.util import run_cleanup_job
  from cartography.util import timeit
@@ -64,7 +65,8 @@ def load_reserved_instances(
          r_instance["Start"] = str(r_instance["Start"])
          r_instance["End"] = str(r_instance["End"])
 
-     neo4j_session.run(
+     run_write_query(
+         neo4j_session,
          ingest_reserved_instances,
          reserved_instances_list=data,
          AWS_ACCOUNT_ID=current_aws_account_id,
cartography/intel/aws/ec2/tgw.py CHANGED
@@ -6,6 +6,7 @@ import boto3
  import botocore.exceptions
  import neo4j
 
+ from cartography.client.core.tx import run_write_query
  from cartography.util import aws_handle_regions
  from cartography.util import run_cleanup_job
  from cartography.util import timeit
@@ -120,7 +121,8 @@ def load_transit_gateways(
      for tgw in data:
          tgw_id = tgw["TransitGatewayId"]
 
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              ingest_transit_gateway,
              TgwId=tgw_id,
              ARN=tgw["TransitGatewayArn"],
@@ -161,7 +163,8 @@ def _attach_shared_transit_gateway(
      """
 
      if tgw["OwnerId"] != current_aws_account_id:
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              attach_tgw,
              ARN=tgw["TransitGatewayArn"],
              TransitGatewayId=tgw["TransitGatewayId"],
@@ -202,7 +205,8 @@ def load_tgw_attachments(
      for tgwa in data:
          tgwa_id = tgwa["TransitGatewayAttachmentId"]
 
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              ingest_transit_gateway,
              TgwAttachmentId=tgwa_id,
              TransitGatewayId=tgwa["TransitGatewayId"],
@@ -261,7 +265,8 @@ def _attach_tgw_vpc_attachment_to_vpc_subnets(
      SET p.lastupdated = $update_tag
      """
 
-     neo4j_session.run(
+     run_write_query(
+         neo4j_session,
          attach_vpc_tgw_attachment_to_vpc,
          VpcId=tgw_vpc_attachment["VpcId"],
          TgwAttachmentId=tgw_vpc_attachment["TransitGatewayAttachmentId"],
@@ -269,7 +274,8 @@
      )
 
      for subnet_id in tgw_vpc_attachment["SubnetIds"]:
-         neo4j_session.run(
+         run_write_query(
+             neo4j_session,
              attach_vpc_tgw_attachment_to_subnet,
              SubnetId=subnet_id,
              TgwAttachmentId=tgw_vpc_attachment["TransitGatewayAttachmentId"],
cartography/intel/aws/ec2/volumes.py CHANGED
@@ -70,7 +70,7 @@ def transform_volumes(
 
      for attachment in active_attachments:
          vol_with_attachment = raw_vol.copy()
-         vol_with_attachment["InstanceId"] = attachment["InstanceId"]
+         vol_with_attachment["InstanceId"] = attachment.get("InstanceId")
          result.append(vol_with_attachment)
 
      return result
cartography/intel/aws/ecr.py CHANGED
@@ -1,3 +1,4 @@
+ import json
  import logging
  from typing import Any
  from typing import Dict
@@ -18,6 +19,12 @@ from cartography.util import to_synchronous
 
  logger = logging.getLogger(__name__)
 
+ # Manifest list media types
+ MANIFEST_LIST_MEDIA_TYPES = {
+     "application/vnd.docker.distribution.manifest.list.v2+json",
+     "application/vnd.oci.image.index.v1+json",
+ }
+
 
  @timeit
  @aws_handle_regions
@@ -34,6 +41,84 @@ def get_ecr_repositories(
      return ecr_repositories
 
 
+ def _get_platform_specific_digests(
+     client: Any, repository_name: str, manifest_list_digest: str
+ ) -> tuple[List[Dict[str, Any]], set[str]]:
+     """
+     Fetch manifest list and extract platform-specific image digests and attestations.
+
+     Returns:
+         - List of all images (platform-specific + attestations) with digest, type, architecture, os, variant
+         - Set of ALL digests referenced in the manifest list
+     """
+     response = client.batch_get_image(
+         repositoryName=repository_name,
+         imageIds=[{"imageDigest": manifest_list_digest}],
+         acceptedMediaTypes=list(MANIFEST_LIST_MEDIA_TYPES),
+     )
+
+     if not response.get("images"):
+         raise ValueError(
+             f"No manifest list found for digest {manifest_list_digest} in repository {repository_name}"
+         )
+
+     # batch_get_image returns a single manifest list (hence [0])
+     # The manifests[] array inside contains all platform-specific images and attestations
+     manifest_json = json.loads(response["images"][0]["imageManifest"])
+     manifests = manifest_json.get("manifests", [])
+
+     if not manifests:
+         raise ValueError(
+             f"Manifest list {manifest_list_digest} has no manifests in repository {repository_name}"
+         )
+
+     all_images = []
+     all_referenced_digests = set()
+
+     for manifest_ref in manifests:
+         digest = manifest_ref.get("digest")
+         if not digest:
+             raise ValueError(
+                 f"Manifest in list {manifest_list_digest} has no digest in repository {repository_name}"
+             )
+
+         all_referenced_digests.add(digest)
+
+         platform_info = manifest_ref.get("platform", {})
+         architecture = platform_info.get("architecture")
+         os_name = platform_info.get("os")
+
+         # Determine if this is an attestation
+         annotations = manifest_ref.get("annotations", {})
+         is_attestation = (
+             architecture == "unknown" and os_name == "unknown"
+         ) or annotations.get("vnd.docker.reference.type") == "attestation-manifest"
+
+         all_images.append(
+             {
+                 "digest": digest,
+                 "type": "attestation" if is_attestation else "image",
+                 "architecture": architecture,
+                 "os": os_name,
+                 "variant": platform_info.get("variant"),
+                 "attestation_type": (
+                     annotations.get("vnd.docker.reference.type")
+                     if is_attestation
+                     else None
+                 ),
+                 "attests_digest": (
+                     annotations.get("vnd.docker.reference.digest")
+                     if is_attestation
+                     else None
+                 ),
+                 "media_type": manifest_ref.get("mediaType"),
+                 "artifact_media_type": manifest_ref.get("artifactType"),
+             }
+         )
+
+     return all_images, all_referenced_digests
+
+
  @timeit
  @aws_handle_regions
  def get_ecr_repository_images(
@@ -46,7 +131,11 @@ def get_ecr_repository_images(
      )
      client = boto3_session.client("ecr", region_name=region)
      list_paginator = client.get_paginator("list_images")
-     ecr_repository_images: List[Dict] = []
+
+     # First pass: Collect all image details and track manifest list referenced digests
+     all_image_details: List[Dict] = []
+     manifest_list_referenced_digests: set[str] = set()
+
      for page in list_paginator.paginate(repositoryName=repository_name):
          image_ids = page["imageIds"]
          if not image_ids:
@@ -58,14 +147,37 @@
      for response in describe_response:
          image_details = response["imageDetails"]
          for detail in image_details:
-             tags = detail.get("imageTags") or []
-             if tags:
-                 for tag in tags:
-                     image_detail = {**detail, "imageTag": tag}
-                     image_detail.pop("imageTags", None)
-                     ecr_repository_images.append(image_detail)
-             else:
-                 ecr_repository_images.append({**detail})
+             # Check if this is a manifest list
+             media_type = detail.get("imageManifestMediaType")
+             if media_type in MANIFEST_LIST_MEDIA_TYPES:
+                 # Fetch all images from manifest list (platform-specific + attestations)
+                 manifest_list_digest = detail["imageDigest"]
+                 manifest_images, all_digests = _get_platform_specific_digests(
+                     client, repository_name, manifest_list_digest
+                 )
+                 detail["_manifest_images"] = manifest_images
+
+                 # Track ALL digests so we don't create ECRRepositoryImages for them
+                 manifest_list_referenced_digests.update(all_digests)
+
+             all_image_details.append(detail)
+
+     # Second pass: Only add images that should have ECRRepositoryImage nodes
+     ecr_repository_images: List[Dict] = []
+     for detail in all_image_details:
+         tags = detail.get("imageTags") or []
+         digest = detail.get("imageDigest")
+
+         if tags:
+             # Tagged images always get ECRRepositoryImage nodes (one per tag)
+             for tag in tags:
+                 image_detail = {**detail, "imageTag": tag}
+                 image_detail.pop("imageTags", None)
+                 ecr_repository_images.append(image_detail)
+         elif digest not in manifest_list_referenced_digests:
+             # Untagged images only get nodes if they're NOT part of a manifest list
+             ecr_repository_images.append({**detail})
+
      return ecr_repository_images
 
 
@@ -91,52 +203,115 @@ def load_ecr_repositories(
 
 
  @timeit
- def transform_ecr_repository_images(repo_data: Dict) -> List[Dict]:
+ def transform_ecr_repository_images(repo_data: Dict) -> tuple[List[Dict], List[Dict]]:
      """
-     Ensure that we only load ECRImage nodes to the graph if they have a defined imageDigest field.
-     Process repositories in a consistent order to handle overlapping image digests deterministically.
+     Transform ECR repository images into repo image list and ECR image list.
+     For manifest lists, creates ECR images for manifest list, platform-specific images, and attestations.
+
+     Returns:
+         - repo_images_list: List of ECRRepositoryImage nodes with imageDigests field (one-to-many)
+         - ecr_images_list: List of ECRImage nodes with type, architecture, os, variant fields
      """
      repo_images_list = []
+     ecr_images_dict: Dict[str, Dict] = {}  # Deduplicate by digest
+
      # Sort repository URIs to ensure consistent processing order
      for repo_uri in sorted(repo_data.keys()):
          repo_images = repo_data[repo_uri]
          for img in repo_images:
              digest = img.get("imageDigest")
-             if digest:
-                 tag = img.get("imageTag")
-                 uri = repo_uri + (f":{tag}" if tag else "")
-                 img["repo_uri"] = repo_uri
-                 img["uri"] = uri
-                 img["id"] = uri
-                 repo_images_list.append(img)
-             else:
+             if not digest:
                  logger.warning(
                      "Repo %s has an image that has no imageDigest. Its tag is %s. Continuing on.",
                      repo_uri,
                      img.get("imageTag"),
                  )
+                 continue
+
+             tag = img.get("imageTag")
+             uri = repo_uri + (f":{tag}" if tag else "")
+
+             # Build ECRRepositoryImage node
+             repo_image = {
+                 **img,
+                 "repo_uri": repo_uri,
+                 "uri": uri,
+                 "id": uri,
+             }
+
+             # Check if this is a manifest list with images
+             manifest_images = img.get("_manifest_images")
+             if manifest_images:
+                 # For manifest list: include manifest list digest + all referenced digests
+                 all_digests = [digest] + [m["digest"] for m in manifest_images]
+                 repo_image["imageDigests"] = all_digests
+
+                 # Create ECRImage for the manifest list itself
+                 if digest not in ecr_images_dict:
+                     ecr_images_dict[digest] = {
+                         "imageDigest": digest,
+                         "type": "manifest_list",
+                         "architecture": None,
+                         "os": None,
+                         "variant": None,
+                     }
+
+                 # Create ECRImage nodes for each image in the manifest list
+                 for manifest_img in manifest_images:
+                     manifest_digest = manifest_img["digest"]
+                     if manifest_digest not in ecr_images_dict:
+                         ecr_images_dict[manifest_digest] = {
+                             "imageDigest": manifest_digest,
+                             "type": manifest_img.get("type"),
+                             "architecture": manifest_img.get("architecture"),
+                             "os": manifest_img.get("os"),
+                             "variant": manifest_img.get("variant"),
+                             "attestation_type": manifest_img.get("attestation_type"),
+                             "attests_digest": manifest_img.get("attests_digest"),
+                             "media_type": manifest_img.get("media_type"),
+                             "artifact_media_type": manifest_img.get(
+                                 "artifact_media_type"
+                             ),
+                         }
+             else:
+                 # Regular image: single digest
+                 repo_image["imageDigests"] = [digest]
+
+                 # Create ECRImage for regular image
+                 if digest not in ecr_images_dict:
+                     ecr_images_dict[digest] = {
+                         "imageDigest": digest,
+                         "type": "image",
+                         "architecture": None,
+                         "os": None,
+                         "variant": None,
+                     }
+
+             # Remove internal field before returning
+             repo_image.pop("_manifest_images", None)
+             repo_images_list.append(repo_image)
 
-     return repo_images_list
+     ecr_images_list = list(ecr_images_dict.values())
+     return repo_images_list, ecr_images_list
 
 
  @timeit
  def load_ecr_repository_images(
      neo4j_session: neo4j.Session,
      repo_images_list: List[Dict],
+     ecr_images_list: List[Dict],
      region: str,
      current_aws_account_id: str,
      aws_update_tag: int,
  ) -> None:
      logger.info(
-         f"Loading {len(repo_images_list)} ECR repository images in {region} into graph.",
+         f"Loading {len(ecr_images_list)} ECR images and {len(repo_images_list)} ECR repository images in {region} into graph.",
      )
-     image_digests = {img["imageDigest"] for img in repo_images_list}
-     ecr_images = [{"imageDigest": d} for d in image_digests]
 
      load(
          neo4j_session,
          ECRImageSchema(),
-         ecr_images,
+         ecr_images_list,
          lastupdated=aws_update_tag,
          Region=region,
          AWS_ID=current_aws_account_id,
@@ -219,10 +394,11 @@ def sync(
          current_aws_account_id,
          update_tag,
      )
-     repo_images_list = transform_ecr_repository_images(image_data)
+     repo_images_list, ecr_images_list = transform_ecr_repository_images(image_data)
      load_ecr_repository_images(
          neo4j_session,
          repo_images_list,
+         ecr_images_list,
          region,
          current_aws_account_id,
          update_tag,
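For reference, a trimmed sketch of the parsed image-index document that _get_platform_specific_digests walks after batch_get_image (structure follows the OCI image index and BuildKit attestation conventions the code checks for; digests are shortened placeholders):

    manifest_json = {
        "mediaType": "application/vnd.oci.image.index.v1+json",
        "manifests": [
            # Platform-specific images: become ECRImage nodes with type "image".
            {"digest": "sha256:aaa...", "platform": {"architecture": "amd64", "os": "linux"}},
            {"digest": "sha256:bbb...", "platform": {"architecture": "arm64", "os": "linux"}},
            # Attestation entry: "unknown"/"unknown" platform plus BuildKit annotations;
            # becomes an ECRImage with type "attestation" whose attests_digest points
            # back at the amd64 image above.
            {
                "digest": "sha256:ccc...",
                "platform": {"architecture": "unknown", "os": "unknown"},
                "annotations": {
                    "vnd.docker.reference.type": "attestation-manifest",
                    "vnd.docker.reference.digest": "sha256:aaa...",
                },
            },
        ],
    }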