cartography-0.117.0-py3-none-any.whl → cartography-0.119.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cartography might be problematic.

Files changed (107)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +31 -0
  3. cartography/client/core/tx.py +19 -3
  4. cartography/config.py +14 -0
  5. cartography/data/indexes.cypher +0 -6
  6. cartography/graph/job.py +13 -7
  7. cartography/graph/statement.py +4 -0
  8. cartography/intel/aws/__init__.py +22 -9
  9. cartography/intel/aws/apigateway.py +18 -5
  10. cartography/intel/aws/ec2/elastic_ip_addresses.py +3 -1
  11. cartography/intel/aws/ec2/internet_gateways.py +4 -2
  12. cartography/intel/aws/ec2/load_balancer_v2s.py +11 -5
  13. cartography/intel/aws/ec2/network_interfaces.py +4 -0
  14. cartography/intel/aws/ec2/reserved_instances.py +3 -1
  15. cartography/intel/aws/ec2/tgw.py +11 -5
  16. cartography/intel/aws/ec2/volumes.py +1 -1
  17. cartography/intel/aws/ecr.py +209 -26
  18. cartography/intel/aws/ecr_image_layers.py +143 -42
  19. cartography/intel/aws/elasticsearch.py +13 -4
  20. cartography/intel/aws/identitycenter.py +93 -54
  21. cartography/intel/aws/inspector.py +90 -46
  22. cartography/intel/aws/permission_relationships.py +3 -3
  23. cartography/intel/aws/resourcegroupstaggingapi.py +1 -1
  24. cartography/intel/aws/s3.py +26 -13
  25. cartography/intel/aws/ssm.py +3 -5
  26. cartography/intel/azure/compute.py +9 -4
  27. cartography/intel/azure/cosmosdb.py +31 -15
  28. cartography/intel/azure/sql.py +25 -12
  29. cartography/intel/azure/storage.py +19 -9
  30. cartography/intel/azure/subscription.py +3 -1
  31. cartography/intel/crowdstrike/spotlight.py +5 -2
  32. cartography/intel/entra/app_role_assignments.py +9 -2
  33. cartography/intel/gcp/__init__.py +26 -9
  34. cartography/intel/gcp/clients.py +8 -4
  35. cartography/intel/gcp/compute.py +42 -21
  36. cartography/intel/gcp/crm/folders.py +9 -3
  37. cartography/intel/gcp/crm/orgs.py +8 -3
  38. cartography/intel/gcp/crm/projects.py +14 -3
  39. cartography/intel/github/repos.py +23 -5
  40. cartography/intel/gsuite/__init__.py +12 -8
  41. cartography/intel/gsuite/groups.py +291 -0
  42. cartography/intel/gsuite/users.py +142 -0
  43. cartography/intel/jamf/computers.py +7 -1
  44. cartography/intel/oci/iam.py +23 -9
  45. cartography/intel/oci/organizations.py +3 -1
  46. cartography/intel/oci/utils.py +28 -5
  47. cartography/intel/okta/awssaml.py +9 -8
  48. cartography/intel/okta/users.py +1 -1
  49. cartography/intel/ontology/__init__.py +44 -0
  50. cartography/intel/ontology/devices.py +54 -0
  51. cartography/intel/ontology/users.py +54 -0
  52. cartography/intel/ontology/utils.py +121 -0
  53. cartography/intel/pagerduty/escalation_policies.py +13 -6
  54. cartography/intel/pagerduty/schedules.py +9 -4
  55. cartography/intel/pagerduty/services.py +7 -3
  56. cartography/intel/pagerduty/teams.py +5 -2
  57. cartography/intel/pagerduty/users.py +3 -1
  58. cartography/intel/pagerduty/vendors.py +3 -1
  59. cartography/intel/trivy/__init__.py +109 -58
  60. cartography/models/airbyte/user.py +4 -0
  61. cartography/models/anthropic/user.py +4 -0
  62. cartography/models/aws/ec2/networkinterfaces.py +2 -0
  63. cartography/models/aws/ecr/image.py +55 -0
  64. cartography/models/aws/ecr/repository_image.py +1 -1
  65. cartography/models/aws/iam/group_membership.py +3 -2
  66. cartography/models/aws/identitycenter/awsssouser.py +3 -1
  67. cartography/models/bigfix/bigfix_computer.py +1 -1
  68. cartography/models/cloudflare/member.py +4 -0
  69. cartography/models/crowdstrike/hosts.py +1 -1
  70. cartography/models/duo/endpoint.py +1 -1
  71. cartography/models/duo/phone.py +2 -2
  72. cartography/models/duo/user.py +4 -0
  73. cartography/models/entra/user.py +2 -1
  74. cartography/models/github/users.py +4 -0
  75. cartography/models/gsuite/__init__.py +0 -0
  76. cartography/models/gsuite/group.py +218 -0
  77. cartography/models/gsuite/tenant.py +29 -0
  78. cartography/models/gsuite/user.py +107 -0
  79. cartography/models/kandji/device.py +1 -2
  80. cartography/models/keycloak/user.py +4 -0
  81. cartography/models/lastpass/user.py +4 -0
  82. cartography/models/ontology/__init__.py +0 -0
  83. cartography/models/ontology/device.py +125 -0
  84. cartography/models/ontology/mapping/__init__.py +16 -0
  85. cartography/models/ontology/mapping/data/__init__.py +1 -0
  86. cartography/models/ontology/mapping/data/devices.py +160 -0
  87. cartography/models/ontology/mapping/data/users.py +239 -0
  88. cartography/models/ontology/mapping/specs.py +65 -0
  89. cartography/models/ontology/user.py +52 -0
  90. cartography/models/openai/user.py +4 -0
  91. cartography/models/scaleway/iam/user.py +4 -0
  92. cartography/models/snipeit/asset.py +1 -0
  93. cartography/models/snipeit/user.py +4 -0
  94. cartography/models/tailscale/device.py +1 -1
  95. cartography/models/tailscale/user.py +6 -1
  96. cartography/rules/data/frameworks/mitre_attack/requirements/t1098_account_manipulation/__init__.py +176 -89
  97. cartography/sync.py +4 -1
  98. cartography/util.py +49 -18
  99. {cartography-0.117.0.dist-info → cartography-0.119.0.dist-info}/METADATA +3 -3
  100. {cartography-0.117.0.dist-info → cartography-0.119.0.dist-info}/RECORD +104 -89
  101. cartography/data/jobs/cleanup/gsuite_ingest_groups_cleanup.json +0 -23
  102. cartography/data/jobs/cleanup/gsuite_ingest_users_cleanup.json +0 -11
  103. cartography/intel/gsuite/api.py +0 -355
  104. {cartography-0.117.0.dist-info → cartography-0.119.0.dist-info}/WHEEL +0 -0
  105. {cartography-0.117.0.dist-info → cartography-0.119.0.dist-info}/entry_points.txt +0 -0
  106. {cartography-0.117.0.dist-info → cartography-0.119.0.dist-info}/licenses/LICENSE +0 -0
  107. {cartography-0.117.0.dist-info → cartography-0.119.0.dist-info}/top_level.txt +0 -0
File: cartography/intel/aws/ecr.py
@@ -1,3 +1,4 @@
1
+ import json
1
2
  import logging
2
3
  from typing import Any
3
4
  from typing import Dict
@@ -18,6 +19,12 @@ from cartography.util import to_synchronous
18
19
 
19
20
  logger = logging.getLogger(__name__)
20
21
 
22
+ # Manifest list media types
23
+ MANIFEST_LIST_MEDIA_TYPES = {
24
+ "application/vnd.docker.distribution.manifest.list.v2+json",
25
+ "application/vnd.oci.image.index.v1+json",
26
+ }
27
+
21
28
 
22
29
  @timeit
23
30
  @aws_handle_regions
@@ -34,6 +41,84 @@ def get_ecr_repositories(
34
41
  return ecr_repositories
35
42
 
36
43
 
44
+ def _get_platform_specific_digests(
45
+ client: Any, repository_name: str, manifest_list_digest: str
46
+ ) -> tuple[List[Dict[str, Any]], set[str]]:
47
+ """
48
+ Fetch manifest list and extract platform-specific image digests and attestations.
49
+
50
+ Returns:
51
+ - List of all images (platform-specific + attestations) with digest, type, architecture, os, variant
52
+ - Set of ALL digests referenced in the manifest list
53
+ """
54
+ response = client.batch_get_image(
55
+ repositoryName=repository_name,
56
+ imageIds=[{"imageDigest": manifest_list_digest}],
57
+ acceptedMediaTypes=list(MANIFEST_LIST_MEDIA_TYPES),
58
+ )
59
+
60
+ if not response.get("images"):
61
+ raise ValueError(
62
+ f"No manifest list found for digest {manifest_list_digest} in repository {repository_name}"
63
+ )
64
+
65
+ # batch_get_image returns a single manifest list (hence [0])
66
+ # The manifests[] array inside contains all platform-specific images and attestations
67
+ manifest_json = json.loads(response["images"][0]["imageManifest"])
68
+ manifests = manifest_json.get("manifests", [])
69
+
70
+ if not manifests:
71
+ raise ValueError(
72
+ f"Manifest list {manifest_list_digest} has no manifests in repository {repository_name}"
73
+ )
74
+
75
+ all_images = []
76
+ all_referenced_digests = set()
77
+
78
+ for manifest_ref in manifests:
79
+ digest = manifest_ref.get("digest")
80
+ if not digest:
81
+ raise ValueError(
82
+ f"Manifest in list {manifest_list_digest} has no digest in repository {repository_name}"
83
+ )
84
+
85
+ all_referenced_digests.add(digest)
86
+
87
+ platform_info = manifest_ref.get("platform", {})
88
+ architecture = platform_info.get("architecture")
89
+ os_name = platform_info.get("os")
90
+
91
+ # Determine if this is an attestation
92
+ annotations = manifest_ref.get("annotations", {})
93
+ is_attestation = (
94
+ architecture == "unknown" and os_name == "unknown"
95
+ ) or annotations.get("vnd.docker.reference.type") == "attestation-manifest"
96
+
97
+ all_images.append(
98
+ {
99
+ "digest": digest,
100
+ "type": "attestation" if is_attestation else "image",
101
+ "architecture": architecture,
102
+ "os": os_name,
103
+ "variant": platform_info.get("variant"),
104
+ "attestation_type": (
105
+ annotations.get("vnd.docker.reference.type")
106
+ if is_attestation
107
+ else None
108
+ ),
109
+ "attests_digest": (
110
+ annotations.get("vnd.docker.reference.digest")
111
+ if is_attestation
112
+ else None
113
+ ),
114
+ "media_type": manifest_ref.get("mediaType"),
115
+ "artifact_media_type": manifest_ref.get("artifactType"),
116
+ }
117
+ )
118
+
119
+ return all_images, all_referenced_digests
120
+
121
+
37
122
  @timeit
38
123
  @aws_handle_regions
39
124
  def get_ecr_repository_images(
@@ -46,7 +131,11 @@ def get_ecr_repository_images(
46
131
  )
47
132
  client = boto3_session.client("ecr", region_name=region)
48
133
  list_paginator = client.get_paginator("list_images")
49
- ecr_repository_images: List[Dict] = []
134
+
135
+ # First pass: Collect all image details and track manifest list referenced digests
136
+ all_image_details: List[Dict] = []
137
+ manifest_list_referenced_digests: set[str] = set()
138
+
50
139
  for page in list_paginator.paginate(repositoryName=repository_name):
51
140
  image_ids = page["imageIds"]
52
141
  if not image_ids:
@@ -58,14 +147,37 @@ def get_ecr_repository_images(
58
147
  for response in describe_response:
59
148
  image_details = response["imageDetails"]
60
149
  for detail in image_details:
61
- tags = detail.get("imageTags") or []
62
- if tags:
63
- for tag in tags:
64
- image_detail = {**detail, "imageTag": tag}
65
- image_detail.pop("imageTags", None)
66
- ecr_repository_images.append(image_detail)
67
- else:
68
- ecr_repository_images.append({**detail})
150
+ # Check if this is a manifest list
151
+ media_type = detail.get("imageManifestMediaType")
152
+ if media_type in MANIFEST_LIST_MEDIA_TYPES:
153
+ # Fetch all images from manifest list (platform-specific + attestations)
154
+ manifest_list_digest = detail["imageDigest"]
155
+ manifest_images, all_digests = _get_platform_specific_digests(
156
+ client, repository_name, manifest_list_digest
157
+ )
158
+ detail["_manifest_images"] = manifest_images
159
+
160
+ # Track ALL digests so we don't create ECRRepositoryImages for them
161
+ manifest_list_referenced_digests.update(all_digests)
162
+
163
+ all_image_details.append(detail)
164
+
165
+ # Second pass: Only add images that should have ECRRepositoryImage nodes
166
+ ecr_repository_images: List[Dict] = []
167
+ for detail in all_image_details:
168
+ tags = detail.get("imageTags") or []
169
+ digest = detail.get("imageDigest")
170
+
171
+ if tags:
172
+ # Tagged images always get ECRRepositoryImage nodes (one per tag)
173
+ for tag in tags:
174
+ image_detail = {**detail, "imageTag": tag}
175
+ image_detail.pop("imageTags", None)
176
+ ecr_repository_images.append(image_detail)
177
+ elif digest not in manifest_list_referenced_digests:
178
+ # Untagged images only get nodes if they're NOT part of a manifest list
179
+ ecr_repository_images.append({**detail})
180
+
69
181
  return ecr_repository_images
70
182
 
71
183
 
@@ -91,52 +203,122 @@ def load_ecr_repositories(
91
203
 
92
204
 
93
205
  @timeit
94
- def transform_ecr_repository_images(repo_data: Dict) -> List[Dict]:
206
+ def transform_ecr_repository_images(repo_data: Dict) -> tuple[List[Dict], List[Dict]]:
95
207
  """
96
- Ensure that we only load ECRImage nodes to the graph if they have a defined imageDigest field.
97
- Process repositories in a consistent order to handle overlapping image digests deterministically.
208
+ Transform ECR repository images into repo image list and ECR image list.
209
+ For manifest lists, creates ECR images for manifest list, platform-specific images, and attestations.
210
+
211
+ Returns:
212
+ - repo_images_list: List of ECRRepositoryImage nodes with imageDigests field (one-to-many)
213
+ - ecr_images_list: List of ECRImage nodes with type, architecture, os, variant fields
98
214
  """
99
215
  repo_images_list = []
216
+ ecr_images_dict: Dict[str, Dict] = {} # Deduplicate by digest
217
+
100
218
  # Sort repository URIs to ensure consistent processing order
101
219
  for repo_uri in sorted(repo_data.keys()):
102
220
  repo_images = repo_data[repo_uri]
103
221
  for img in repo_images:
104
222
  digest = img.get("imageDigest")
105
- if digest:
106
- tag = img.get("imageTag")
107
- uri = repo_uri + (f":{tag}" if tag else "")
108
- img["repo_uri"] = repo_uri
109
- img["uri"] = uri
110
- img["id"] = uri
111
- repo_images_list.append(img)
112
- else:
223
+ if not digest:
113
224
  logger.warning(
114
225
  "Repo %s has an image that has no imageDigest. Its tag is %s. Continuing on.",
115
226
  repo_uri,
116
227
  img.get("imageTag"),
117
228
  )
229
+ continue
230
+
231
+ tag = img.get("imageTag")
232
+ uri = repo_uri + (f":{tag}" if tag else "")
233
+
234
+ # Build ECRRepositoryImage node
235
+ repo_image = {
236
+ **img,
237
+ "repo_uri": repo_uri,
238
+ "uri": uri,
239
+ "id": uri,
240
+ }
241
+
242
+ # Check if this is a manifest list with images
243
+ manifest_images = img.get("_manifest_images")
244
+ if manifest_images:
245
+ # For manifest list: include manifest list digest + all referenced digests
246
+ all_digests = [digest] + [m["digest"] for m in manifest_images]
247
+ repo_image["imageDigests"] = all_digests
248
+
249
+ # Create ECRImage for the manifest list itself
250
+ if digest not in ecr_images_dict:
251
+ # Extract child image digests (excluding attestations for CONTAINS_IMAGE relationship)
252
+ child_digests = [
253
+ m["digest"]
254
+ for m in manifest_images
255
+ if m.get("type") != "attestation"
256
+ ]
257
+ ecr_images_dict[digest] = {
258
+ "imageDigest": digest,
259
+ "type": "manifest_list",
260
+ "architecture": None,
261
+ "os": None,
262
+ "variant": None,
263
+ "child_image_digests": child_digests if child_digests else None,
264
+ }
265
+
266
+ # Create ECRImage nodes for each image in the manifest list
267
+ for manifest_img in manifest_images:
268
+ manifest_digest = manifest_img["digest"]
269
+ if manifest_digest not in ecr_images_dict:
270
+ ecr_images_dict[manifest_digest] = {
271
+ "imageDigest": manifest_digest,
272
+ "type": manifest_img.get("type"),
273
+ "architecture": manifest_img.get("architecture"),
274
+ "os": manifest_img.get("os"),
275
+ "variant": manifest_img.get("variant"),
276
+ "attestation_type": manifest_img.get("attestation_type"),
277
+ "attests_digest": manifest_img.get("attests_digest"),
278
+ "media_type": manifest_img.get("media_type"),
279
+ "artifact_media_type": manifest_img.get(
280
+ "artifact_media_type"
281
+ ),
282
+ }
283
+ else:
284
+ # Regular image: single digest
285
+ repo_image["imageDigests"] = [digest]
286
+
287
+ # Create ECRImage for regular image
288
+ if digest not in ecr_images_dict:
289
+ ecr_images_dict[digest] = {
290
+ "imageDigest": digest,
291
+ "type": "image",
292
+ "architecture": None,
293
+ "os": None,
294
+ "variant": None,
295
+ }
296
+
297
+ # Remove internal field before returning
298
+ repo_image.pop("_manifest_images", None)
299
+ repo_images_list.append(repo_image)
118
300
 
119
- return repo_images_list
301
+ ecr_images_list = list(ecr_images_dict.values())
302
+ return repo_images_list, ecr_images_list
120
303
 
121
304
 
122
305
  @timeit
123
306
  def load_ecr_repository_images(
124
307
  neo4j_session: neo4j.Session,
125
308
  repo_images_list: List[Dict],
309
+ ecr_images_list: List[Dict],
126
310
  region: str,
127
311
  current_aws_account_id: str,
128
312
  aws_update_tag: int,
129
313
  ) -> None:
130
314
  logger.info(
131
- f"Loading {len(repo_images_list)} ECR repository images in {region} into graph.",
315
+ f"Loading {len(ecr_images_list)} ECR images and {len(repo_images_list)} ECR repository images in {region} into graph.",
132
316
  )
133
- image_digests = {img["imageDigest"] for img in repo_images_list}
134
- ecr_images = [{"imageDigest": d} for d in image_digests]
135
317
 
136
318
  load(
137
319
  neo4j_session,
138
320
  ECRImageSchema(),
139
- ecr_images,
321
+ ecr_images_list,
140
322
  lastupdated=aws_update_tag,
141
323
  Region=region,
142
324
  AWS_ID=current_aws_account_id,
@@ -219,10 +401,11 @@ def sync(
219
401
  current_aws_account_id,
220
402
  update_tag,
221
403
  )
222
- repo_images_list = transform_ecr_repository_images(image_data)
404
+ repo_images_list, ecr_images_list = transform_ecr_repository_images(image_data)
223
405
  load_ecr_repository_images(
224
406
  neo4j_session,
225
407
  repo_images_list,
408
+ ecr_images_list,
226
409
  region,
227
410
  current_aws_account_id,
228
411
  update_tag,
File: cartography/intel/aws/ecr_image_layers.py
@@ -12,7 +12,6 @@ from typing import Any
12
12
  from typing import Optional
13
13
 
14
14
  import aioboto3
15
- import boto3
16
15
  import httpx
17
16
  import neo4j
18
17
  from botocore.exceptions import ClientError
@@ -334,6 +333,7 @@ def transform_ecr_image_layers(
334
333
  image_layers_data: dict[str, dict[str, list[str]]],
335
334
  image_digest_map: dict[str, str],
336
335
  image_attestation_map: Optional[dict[str, dict[str, str]]] = None,
336
+ existing_properties_map: Optional[dict[str, dict[str, Any]]] = None,
337
337
  ) -> tuple[list[dict], list[dict]]:
338
338
  """
339
339
  Transform image layer data into format suitable for Neo4j ingestion.
@@ -342,10 +342,13 @@ def transform_ecr_image_layers(
342
342
  :param image_layers_data: Map of image URI to platform to diff_ids
343
343
  :param image_digest_map: Map of image URI to image digest
344
344
  :param image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
345
+ :param existing_properties_map: Map of image digest to existing ECRImage properties (type, architecture, etc.)
345
346
  :return: List of layer objects ready for ingestion
346
347
  """
347
348
  if image_attestation_map is None:
348
349
  image_attestation_map = {}
350
+ if existing_properties_map is None:
351
+ existing_properties_map = {}
349
352
  layers_by_diff_id: dict[str, dict[str, Any]] = {}
350
353
  memberships_by_digest: dict[str, dict[str, Any]] = {}
351
354
 
@@ -353,6 +356,16 @@ def transform_ecr_image_layers(
353
356
  # fetch_image_layers_async guarantees every uri in image_layers_data has a digest
354
357
  image_digest = image_digest_map[image_uri]
355
358
 
359
+ # Check if this is a manifest list
360
+ is_manifest_list = False
361
+ if image_digest in existing_properties_map:
362
+ image_type = existing_properties_map[image_digest].get("type")
363
+ is_manifest_list = image_type == "manifest_list"
364
+
365
+ # Skip creating layer relationships for manifest lists
366
+ if is_manifest_list:
367
+ continue
368
+
356
369
  ordered_layers_for_image: Optional[list[str]] = None
357
370
 
358
371
  for _, diff_ids in platforms.items():
@@ -391,6 +404,10 @@ def transform_ecr_image_layers(
391
404
  "layer_diff_ids": ordered_layers_for_image,
392
405
  }
393
406
 
407
+ # Preserve existing ECRImage properties (type, architecture, os, variant, etc.)
408
+ if image_digest in existing_properties_map:
409
+ membership.update(existing_properties_map[image_digest])
410
+
394
411
  # Add attestation data if available for this image
395
412
  if image_uri in image_attestation_map:
396
413
  attestation = image_attestation_map[image_uri]
@@ -433,7 +450,12 @@ def load_ecr_image_layers(
433
450
  current_aws_account_id: str,
434
451
  aws_update_tag: int,
435
452
  ) -> None:
436
- """Load image layers into Neo4j."""
453
+ """
454
+ Load image layers into Neo4j.
455
+
456
+ Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
457
+ since layer objects can contain large arrays of relationships.
458
+ """
437
459
  logger.info(
438
460
  f"Loading {len(image_layers)} image layers for region {region} into graph.",
439
461
  )
@@ -442,6 +464,7 @@ def load_ecr_image_layers(
442
464
  neo4j_session,
443
465
  ECRImageLayerSchema(),
444
466
  image_layers,
467
+ batch_size=1000,
445
468
  lastupdated=aws_update_tag,
446
469
  AWS_ID=current_aws_account_id,
447
470
  )
@@ -455,10 +478,17 @@ def load_ecr_image_layer_memberships(
455
478
  current_aws_account_id: str,
456
479
  aws_update_tag: int,
457
480
  ) -> None:
481
+ """
482
+ Load image layer memberships into Neo4j.
483
+
484
+ Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
485
+ since membership objects can contain large arrays of layer diff_ids.
486
+ """
458
487
  load(
459
488
  neo4j_session,
460
489
  ECRImageSchema(),
461
490
  memberships,
491
+ batch_size=1000,
462
492
  lastupdated=aws_update_tag,
463
493
  Region=region,
464
494
  AWS_ID=current_aws_account_id,
@@ -527,8 +557,15 @@ async def fetch_image_layers_async(
527
557
  async def fetch_single_image_layers(
528
558
  repo_image: dict,
529
559
  http_client: httpx.AsyncClient,
530
- ) -> Optional[tuple[str, str, dict[str, list[str]], Optional[dict[str, str]]]]:
531
- """Fetch layers for a single image and extract attestation if present."""
560
+ ) -> Optional[
561
+ tuple[str, str, dict[str, list[str]], Optional[dict[str, dict[str, str]]]]
562
+ ]:
563
+ """
564
+ Fetch layers for a single image and extract attestation if present.
565
+
566
+ Returns tuple of (uri, digest, platform_layers, attestations_by_child_digest) where
567
+ attestations_by_child_digest maps child image digest to parent image info
568
+ """
532
569
  async with semaphore:
533
570
  # Caller guarantees these fields exist in every repo_image
534
571
  uri = repo_image["uri"]
@@ -551,13 +588,13 @@ async def fetch_image_layers_async(
551
588
 
552
589
  manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
553
590
  platform_layers: dict[str, list[str]] = {}
554
- attestation_data: Optional[dict[str, str]] = None
591
+ attestation_data: Optional[dict[str, dict[str, str]]] = None
555
592
 
556
593
  if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
557
594
 
558
595
  async def _process_child_manifest(
559
596
  manifest_ref: dict,
560
- ) -> tuple[dict[str, list[str]], Optional[dict[str, str]]]:
597
+ ) -> tuple[dict[str, list[str]], Optional[tuple[str, dict[str, str]]]]:
561
598
  # Check if this is an attestation manifest
562
599
  if (
563
600
  manifest_ref.get("annotations", {}).get(
@@ -565,18 +602,27 @@ async def fetch_image_layers_async(
565
602
  )
566
603
  == "attestation-manifest"
567
604
  ):
605
+ # Extract which child image this attestation is for
606
+ attests_child_digest = manifest_ref.get("annotations", {}).get(
607
+ "vnd.docker.reference.digest"
608
+ )
609
+ if not attests_child_digest:
610
+ return {}, None
611
+
568
612
  # Extract base image from attestation
569
- child_digest = manifest_ref.get("digest")
570
- if child_digest:
613
+ attestation_digest = manifest_ref.get("digest")
614
+ if attestation_digest:
571
615
  attestation_info = (
572
616
  await _extract_parent_image_from_attestation(
573
617
  ecr_client,
574
618
  repo_name,
575
- child_digest,
619
+ attestation_digest,
576
620
  http_client,
577
621
  )
578
622
  )
579
- return {}, attestation_info
623
+ if attestation_info:
624
+ # Return (attests_child_digest, parent_info) tuple
625
+ return {}, (attests_child_digest, attestation_info)
580
626
  return {}, None
581
627
 
582
628
  child_digest = manifest_ref.get("digest")
@@ -612,14 +658,22 @@ async def fetch_image_layers_async(
612
658
  )
613
659
 
614
660
  # Merge results from successful child manifest processing
661
+ # Track attestation data by child digest for proper mapping
662
+ attestations_by_child_digest: dict[str, dict[str, str]] = {}
663
+
615
664
  for result in child_results:
616
665
  if isinstance(result, tuple) and len(result) == 2:
617
666
  layer_data, attest_data = result
618
667
  if layer_data:
619
668
  platform_layers.update(layer_data)
620
- if attest_data and not attestation_data:
621
- # Use first attestation found
622
- attestation_data = attest_data
669
+ if attest_data:
670
+ # attest_data is (child_digest, parent_info) tuple
671
+ child_digest, parent_info = attest_data
672
+ attestations_by_child_digest[child_digest] = parent_info
673
+
674
+ # Build attestation_data with child digest mapping
675
+ if attestations_by_child_digest:
676
+ attestation_data = attestations_by_child_digest
623
677
  else:
624
678
  diff_map = await _diff_ids_for_manifest(
625
679
  ecr_client,
@@ -630,7 +684,9 @@ async def fetch_image_layers_async(
630
684
  )
631
685
  platform_layers.update(diff_map)
632
686
 
633
- if platform_layers:
687
+ # Return if we found layers or attestation data
688
+ # Manifest lists may have attestation_data without platform_layers
689
+ if platform_layers or attestation_data:
634
690
  return uri, digest, platform_layers, attestation_data
635
691
 
636
692
  return None
@@ -670,13 +726,22 @@ async def fetch_image_layers_async(
670
726
  )
671
727
 
672
728
  if result:
673
- uri, digest, layer_data, attestation_data = result
729
+ uri, digest, layer_data, attestations_by_child_digest = result
674
730
  if not digest:
675
731
  raise ValueError(f"Empty digest returned for image {uri}")
676
732
  image_layers_data[uri] = layer_data
677
733
  image_digest_map[uri] = digest
678
- if attestation_data:
679
- image_attestation_map[uri] = attestation_data
734
+ if attestations_by_child_digest:
735
+ # Map attestation data by child digest URIs
736
+ repo_uri = extract_repo_uri_from_image_uri(uri)
737
+ for (
738
+ child_digest,
739
+ parent_info,
740
+ ) in attestations_by_child_digest.items():
741
+ child_uri = f"{repo_uri}@{child_digest}"
742
+ image_attestation_map[child_uri] = parent_info
743
+ # Also add to digest map so transform can look up the child digest
744
+ image_digest_map[child_uri] = child_digest
680
745
 
681
746
  logger.info(
682
747
  f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
@@ -698,7 +763,7 @@ def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
698
763
  @timeit
699
764
  def sync(
700
765
  neo4j_session: neo4j.Session,
701
- boto3_session: boto3.session.Session,
766
+ aioboto3_session: aioboto3.Session,
702
767
  regions: list[str],
703
768
  current_aws_account_id: str,
704
769
  update_tag: int,
@@ -721,30 +786,71 @@ def sync(
721
786
  current_aws_account_id,
722
787
  )
723
788
 
724
- # Get ECR images from graph using standard client function
725
- from cartography.client.aws.ecr import get_ecr_images
789
+ # Query for ECR images with all their existing properties to preserve during layer sync
790
+ query = """
791
+ MATCH (img:ECRImage)<-[:IMAGE]-(repo_img:ECRRepositoryImage)<-[:REPO_IMAGE]-(repo:ECRRepository)
792
+ MATCH (repo)<-[:RESOURCE]-(:AWSAccount {id: $AWS_ID})
793
+ WHERE repo.region = $Region
794
+ RETURN DISTINCT
795
+ img.digest AS digest,
796
+ repo_img.id AS uri,
797
+ repo.uri AS repo_uri,
798
+ img.type AS type,
799
+ img.architecture AS architecture,
800
+ img.os AS os,
801
+ img.variant AS variant,
802
+ img.attestation_type AS attestation_type,
803
+ img.attests_digest AS attests_digest,
804
+ img.media_type AS media_type,
805
+ img.artifact_media_type AS artifact_media_type,
806
+ img.child_image_digests AS child_image_digests
807
+ """
808
+ from cartography.client.core.tx import read_list_of_dicts_tx
726
809
 
727
- ecr_images = get_ecr_images(neo4j_session, current_aws_account_id)
810
+ ecr_images = neo4j_session.read_transaction(
811
+ read_list_of_dicts_tx, query, AWS_ID=current_aws_account_id, Region=region
812
+ )
728
813
 
729
- # Filter by region and deduplicate by digest
814
+ # Build repo_images_list and existing_properties map
730
815
  repo_images_list = []
816
+ existing_properties = {}
731
817
  seen_digests = set()
732
818
 
733
- for region_name, _, uri, _, digest in ecr_images:
734
- if region_name == region and digest not in seen_digests:
819
+ for img_data in ecr_images:
820
+ digest = img_data["digest"]
821
+ image_type = img_data.get("type")
822
+
823
+ if digest not in seen_digests:
735
824
  seen_digests.add(digest)
736
- repo_uri = extract_repo_uri_from_image_uri(uri)
737
825
 
738
- # Create digest-based URI for manifest fetching
826
+ # Store existing properties for ALL images to preserve during updates
827
+ existing_properties[digest] = {
828
+ "type": image_type,
829
+ "architecture": img_data.get("architecture"),
830
+ "os": img_data.get("os"),
831
+ "variant": img_data.get("variant"),
832
+ "attestation_type": img_data.get("attestation_type"),
833
+ "attests_digest": img_data.get("attests_digest"),
834
+ "media_type": img_data.get("media_type"),
835
+ "artifact_media_type": img_data.get("artifact_media_type"),
836
+ "child_image_digests": img_data.get("child_image_digests"),
837
+ }
838
+
839
+ repo_uri = img_data["repo_uri"]
739
840
  digest_uri = f"{repo_uri}@{digest}"
740
841
 
741
- repo_images_list.append(
742
- {
743
- "imageDigest": digest,
744
- "uri": digest_uri,
745
- "repo_uri": repo_uri,
746
- }
747
- )
842
+ # Fetch manifests for:
843
+ # - Platform-specific images (type="image") - to get their layers
844
+ # - Manifest lists (type="manifest_list") - to extract attestation parent image data
845
+ # Skip only attestations since they don't have useful layer or parent data
846
+ if image_type != "attestation":
847
+ repo_images_list.append(
848
+ {
849
+ "imageDigest": digest,
850
+ "uri": digest_uri,
851
+ "repo_uri": repo_uri,
852
+ }
853
+ )
748
854
 
749
855
  logger.info(
750
856
  f"Found {len(repo_images_list)} distinct ECR image digests in graph for region {region}"
@@ -768,15 +874,9 @@ def sync(
768
874
  dict[str, str],
769
875
  dict[str, dict[str, str]],
770
876
  ]:
771
- # Use credentials from the existing boto3 session
772
- credentials = boto3_session.get_credentials()
773
- session = aioboto3.Session(
774
- aws_access_key_id=credentials.access_key,
775
- aws_secret_access_key=credentials.secret_key,
776
- aws_session_token=credentials.token,
777
- region_name=region,
778
- )
779
- async with session.client("ecr") as ecr_client:
877
+ async with aioboto3_session.client(
878
+ "ecr", region_name=region
879
+ ) as ecr_client:
780
880
  return await fetch_image_layers_async(ecr_client, repo_images_list)
781
881
 
782
882
  # Use get_event_loop() + run_until_complete() to avoid tearing down loop
@@ -798,6 +898,7 @@ def sync(
798
898
  image_layers_data,
799
899
  image_digest_map,
800
900
  image_attestation_map,
901
+ existing_properties,
801
902
  )
802
903
  load_ecr_image_layers(
803
904
  neo4j_session,