cartography 0.118.0__py3-none-any.whl → 0.119.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cartography might be problematic; consult the package registry's advisory page for details.

Files changed (68)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +20 -0
  3. cartography/client/core/tx.py +19 -3
  4. cartography/config.py +9 -0
  5. cartography/data/indexes.cypher +0 -6
  6. cartography/graph/job.py +7 -5
  7. cartography/intel/aws/__init__.py +21 -9
  8. cartography/intel/aws/ecr.py +7 -0
  9. cartography/intel/aws/ecr_image_layers.py +143 -42
  10. cartography/intel/aws/inspector.py +65 -33
  11. cartography/intel/aws/resourcegroupstaggingapi.py +1 -1
  12. cartography/intel/gcp/compute.py +3 -3
  13. cartography/intel/github/repos.py +23 -5
  14. cartography/intel/gsuite/__init__.py +12 -8
  15. cartography/intel/gsuite/groups.py +291 -0
  16. cartography/intel/gsuite/users.py +142 -0
  17. cartography/intel/okta/awssaml.py +1 -1
  18. cartography/intel/okta/users.py +1 -1
  19. cartography/intel/ontology/__init__.py +44 -0
  20. cartography/intel/ontology/devices.py +54 -0
  21. cartography/intel/ontology/users.py +54 -0
  22. cartography/intel/ontology/utils.py +121 -0
  23. cartography/models/airbyte/user.py +4 -0
  24. cartography/models/anthropic/user.py +4 -0
  25. cartography/models/aws/ecr/image.py +47 -0
  26. cartography/models/aws/iam/group_membership.py +3 -2
  27. cartography/models/aws/identitycenter/awsssouser.py +3 -1
  28. cartography/models/bigfix/bigfix_computer.py +1 -1
  29. cartography/models/cloudflare/member.py +4 -0
  30. cartography/models/crowdstrike/hosts.py +1 -1
  31. cartography/models/duo/endpoint.py +1 -1
  32. cartography/models/duo/phone.py +2 -2
  33. cartography/models/duo/user.py +4 -0
  34. cartography/models/entra/user.py +2 -1
  35. cartography/models/github/users.py +4 -0
  36. cartography/models/gsuite/__init__.py +0 -0
  37. cartography/models/gsuite/group.py +218 -0
  38. cartography/models/gsuite/tenant.py +29 -0
  39. cartography/models/gsuite/user.py +107 -0
  40. cartography/models/kandji/device.py +1 -2
  41. cartography/models/keycloak/user.py +4 -0
  42. cartography/models/lastpass/user.py +4 -0
  43. cartography/models/ontology/__init__.py +0 -0
  44. cartography/models/ontology/device.py +125 -0
  45. cartography/models/ontology/mapping/__init__.py +16 -0
  46. cartography/models/ontology/mapping/data/__init__.py +1 -0
  47. cartography/models/ontology/mapping/data/devices.py +160 -0
  48. cartography/models/ontology/mapping/data/users.py +239 -0
  49. cartography/models/ontology/mapping/specs.py +65 -0
  50. cartography/models/ontology/user.py +52 -0
  51. cartography/models/openai/user.py +4 -0
  52. cartography/models/scaleway/iam/user.py +4 -0
  53. cartography/models/snipeit/asset.py +1 -0
  54. cartography/models/snipeit/user.py +4 -0
  55. cartography/models/tailscale/device.py +1 -1
  56. cartography/models/tailscale/user.py +6 -1
  57. cartography/rules/data/frameworks/mitre_attack/requirements/t1098_account_manipulation/__init__.py +176 -89
  58. cartography/sync.py +3 -0
  59. cartography/util.py +44 -17
  60. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/METADATA +1 -1
  61. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/RECORD +65 -50
  62. cartography/data/jobs/cleanup/gsuite_ingest_groups_cleanup.json +0 -23
  63. cartography/data/jobs/cleanup/gsuite_ingest_users_cleanup.json +0 -11
  64. cartography/intel/gsuite/api.py +0 -355
  65. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/WHEEL +0 -0
  66. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/entry_points.txt +0 -0
  67. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/licenses/LICENSE +0 -0
  68. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/top_level.txt +0 -0
cartography/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.118.0'
32
- __version_tuple__ = version_tuple = (0, 118, 0)
31
+ __version__ = version = '0.119.0'
32
+ __version_tuple__ = version_tuple = (0, 119, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
cartography/cli.py CHANGED
@@ -730,6 +730,26 @@ class CLI:
730
730
  "Required if you are using the Trivy module. Ignored otherwise."
731
731
  ),
732
732
  )
733
+ parser.add_argument(
734
+ "--ontology-users-source",
735
+ type=str,
736
+ default=None,
737
+ help=(
738
+ "Comma-separated list of sources of truth for user data in the ontology. "
739
+ "'User' nodes will only be created for users that exist in one of the sources. "
740
+ "Required if you are using the ontology module. Ignored otherwise."
741
+ ),
742
+ )
743
+ parser.add_argument(
744
+ "--ontology-devices-source",
745
+ type=str,
746
+ default=None,
747
+ help=(
748
+ "Comma-separated list of sources of truth for client computer data in the ontology. "
749
+ "'Device' nodes will only be created for devices that exist in one of the sources. "
750
+ "Required if you are using the ontology module. Ignored otherwise."
751
+ ),
752
+ )
733
753
  parser.add_argument(
734
754
  "--trivy-results-dir",
735
755
  type=str,
@@ -249,6 +249,7 @@ def load_graph_data(
249
249
  neo4j_session: neo4j.Session,
250
250
  query: str,
251
251
  dict_list: List[Dict[str, Any]],
252
+ batch_size: int = 10000,
252
253
  **kwargs,
253
254
  ) -> None:
254
255
  """
@@ -257,10 +258,13 @@ def load_graph_data(
257
258
  :param query: The Neo4j write query to run. This query is not meant to be handwritten, rather it should be generated
258
259
  with cartography.graph.querybuilder.build_ingestion_query().
259
260
  :param dict_list: The data to load to the graph represented as a list of dicts.
261
+ :param batch_size: The number of items to process per transaction. Defaults to 10000.
260
262
  :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
261
263
  :return: None
262
264
  """
263
- for data_batch in batch(dict_list, size=10000):
265
+ if batch_size <= 0:
266
+ raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
267
+ for data_batch in batch(dict_list, size=batch_size):
264
268
  neo4j_session.write_transaction(
265
269
  write_list_of_dicts_tx,
266
270
  query,
@@ -316,6 +320,7 @@ def load(
316
320
  neo4j_session: neo4j.Session,
317
321
  node_schema: CartographyNodeSchema,
318
322
  dict_list: List[Dict[str, Any]],
323
+ batch_size: int = 10000,
319
324
  **kwargs,
320
325
  ) -> None:
321
326
  """
@@ -324,21 +329,27 @@ def load(
324
329
  :param neo4j_session: The Neo4j session
325
330
  :param node_schema: The CartographyNodeSchema object to create indexes for and generate a query.
326
331
  :param dict_list: The data to load to the graph represented as a list of dicts.
332
+ :param batch_size: The number of items to process per transaction. Defaults to 10000.
327
333
  :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
328
334
  :return: None
329
335
  """
336
+ if batch_size <= 0:
337
+ raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
330
338
  if len(dict_list) == 0:
331
339
  # If there is no data to load, save some time.
332
340
  return
333
341
  ensure_indexes(neo4j_session, node_schema)
334
342
  ingestion_query = build_ingestion_query(node_schema)
335
- load_graph_data(neo4j_session, ingestion_query, dict_list, **kwargs)
343
+ load_graph_data(
344
+ neo4j_session, ingestion_query, dict_list, batch_size=batch_size, **kwargs
345
+ )
336
346
 
337
347
 
338
348
  def load_matchlinks(
339
349
  neo4j_session: neo4j.Session,
340
350
  rel_schema: CartographyRelSchema,
341
351
  dict_list: list[dict[str, Any]],
352
+ batch_size: int = 10000,
342
353
  **kwargs,
343
354
  ) -> None:
344
355
  """
@@ -347,9 +358,12 @@ def load_matchlinks(
347
358
  :param rel_schema: The CartographyRelSchema object to generate a query.
348
359
  :param dict_list: The data to load to the graph represented as a list of dicts. The dicts must contain the source and
349
360
  target node ids.
361
+ :param batch_size: The number of items to process per transaction. Defaults to 10000.
350
362
  :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
351
363
  :return: None
352
364
  """
365
+ if batch_size <= 0:
366
+ raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
353
367
  if len(dict_list) == 0:
354
368
  # If there is no data to load, save some time.
355
369
  return
@@ -369,4 +383,6 @@ def load_matchlinks(
369
383
  ensure_indexes_for_matchlinks(neo4j_session, rel_schema)
370
384
  matchlink_query = build_matchlink_query(rel_schema)
371
385
  logger.debug(f"Matchlink query: {matchlink_query}")
372
- load_graph_data(neo4j_session, matchlink_query, dict_list, **kwargs)
386
+ load_graph_data(
387
+ neo4j_session, matchlink_query, dict_list, batch_size=batch_size, **kwargs
388
+ )
cartography/config.py CHANGED
@@ -161,6 +161,11 @@ class Config:
161
161
  :param trivy_s3_bucket: The S3 bucket name containing Trivy scan results. Optional.
162
162
  :type trivy_s3_prefix: str
163
163
  :param trivy_s3_prefix: The S3 prefix path containing Trivy scan results. Optional.
164
+ :type ontology_users_source: str
165
+ :param ontology_users_source: Comma-separated list of sources of truth for user data in the ontology. Optional.
166
+ :type ontology_devices_source: str
167
+ :param ontology_devices_source: Comma-separated list of sources of truth for client computers data in the ontology.
168
+ Optional.
164
169
  :type trivy_results_dir: str
165
170
  :param trivy_results_dir: Local directory containing Trivy scan results. Optional.
166
171
  :type scaleway_access_key: str
@@ -266,6 +271,8 @@ class Config:
266
271
  airbyte_api_url=None,
267
272
  trivy_s3_bucket=None,
268
273
  trivy_s3_prefix=None,
274
+ ontology_users_source=None,
275
+ ontology_devices_source=None,
269
276
  trivy_results_dir=None,
270
277
  scaleway_access_key=None,
271
278
  scaleway_secret_key=None,
@@ -359,6 +366,8 @@ class Config:
359
366
  self.airbyte_api_url = airbyte_api_url
360
367
  self.trivy_s3_bucket = trivy_s3_bucket
361
368
  self.trivy_s3_prefix = trivy_s3_prefix
369
+ self.ontology_users_source = ontology_users_source
370
+ self.ontology_devices_source = ontology_devices_source
362
371
  self.trivy_results_dir = trivy_results_dir
363
372
  self.scaleway_access_key = scaleway_access_key
364
373
  self.scaleway_secret_key = scaleway_secret_key
@@ -102,12 +102,6 @@ CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.id);
102
102
  CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.lastupdated);
103
103
  CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.id);
104
104
  CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.lastupdated);
105
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.email);
106
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.id);
107
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.lastupdated);
108
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.email);
109
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.id);
110
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.lastupdated);
111
105
  CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.id);
112
106
  CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.ip);
113
107
  CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.lastupdated);
cartography/graph/job.py CHANGED
@@ -125,11 +125,13 @@ class GraphJob:
125
125
  }
126
126
 
127
127
  @classmethod
128
- def from_json(cls, blob: str, short_name: Optional[str] = None) -> "GraphJob":
128
+ def from_json(
129
+ cls, blob: Union[str, dict], short_name: Optional[str] = None
130
+ ) -> "GraphJob":
129
131
  """
130
- Create a job from a JSON blob.
132
+ Create a job from a JSON dict or blob.
131
133
  """
132
- data: Dict = json.loads(blob)
134
+ data = json.loads(blob) if isinstance(blob, str) else blob
133
135
  statements = _get_statements_from_json(data, short_name)
134
136
  name = data["name"]
135
137
  return cls(name, statements, short_name)
@@ -242,12 +244,12 @@ class GraphJob:
242
244
  def run_from_json(
243
245
  cls,
244
246
  neo4j_session: neo4j.Session,
245
- blob: str,
247
+ blob: Union[str, dict],
246
248
  parameters: Dict,
247
249
  short_name: Optional[str] = None,
248
250
  ) -> None:
249
251
  """
250
- Run a job from a JSON blob. This will deserialize the job and execute all statements sequentially.
252
+ Run a job from a JSON dict or blob. This will deserialize the job and execute all statements sequentially.
251
253
  """
252
254
  if not parameters:
253
255
  parameters = {}
@@ -6,6 +6,7 @@ from typing import Dict
6
6
  from typing import Iterable
7
7
  from typing import List
8
8
 
9
+ import aioboto3
9
10
  import boto3
10
11
  import botocore.exceptions
11
12
  import neo4j
@@ -49,12 +50,13 @@ def _build_aws_sync_kwargs(
49
50
 
50
51
  def _sync_one_account(
51
52
  neo4j_session: neo4j.Session,
52
- boto3_session: boto3.session.Session,
53
+ boto3_session: boto3.Session,
53
54
  current_aws_account_id: str,
54
55
  update_tag: int,
55
56
  common_job_parameters: Dict[str, Any],
56
57
  regions: list[str] | None = None,
57
58
  aws_requested_syncs: Iterable[str] = RESOURCE_FUNCTIONS.keys(),
59
+ aioboto3_session: aioboto3.Session = aioboto3.Session(),
58
60
  ) -> None:
59
61
  # Autodiscover the regions supported by the account unless the user has specified the regions to sync.
60
62
  if not regions:
@@ -72,13 +74,20 @@ def _sync_one_account(
72
74
  for func_name in aws_requested_syncs:
73
75
  if func_name in RESOURCE_FUNCTIONS:
74
76
  # Skip permission relationships and tags for now because they rely on data already being in the graph
75
- if func_name not in [
76
- "permission_relationships",
77
- "resourcegroupstaggingapi",
78
- ]:
79
- RESOURCE_FUNCTIONS[func_name](**sync_args)
80
- else:
77
+ if func_name == "ecr:image_layers":
78
+ # has a different signature than the other functions (aioboto3_session replaces boto3_session)
79
+ RESOURCE_FUNCTIONS[func_name](
80
+ neo4j_session,
81
+ aioboto3_session,
82
+ regions,
83
+ current_aws_account_id,
84
+ update_tag,
85
+ common_job_parameters,
86
+ )
87
+ elif func_name in ["permission_relationships", "resourcegroupstaggingapi"]:
81
88
  continue
89
+ else:
90
+ RESOURCE_FUNCTIONS[func_name](**sync_args)
82
91
  else:
83
92
  raise ValueError(
84
93
  f'AWS sync function "{func_name}" was specified but does not exist. Did you misspell it?',
@@ -115,7 +124,7 @@ def _sync_one_account(
115
124
 
116
125
 
117
126
  def _autodiscover_account_regions(
118
- boto3_session: boto3.session.Session,
127
+ boto3_session: boto3.Session,
119
128
  account_id: str,
120
129
  ) -> List[str]:
121
130
  regions: List[str] = []
@@ -136,7 +145,7 @@ def _autodiscover_account_regions(
136
145
 
137
146
  def _autodiscover_accounts(
138
147
  neo4j_session: neo4j.Session,
139
- boto3_session: boto3.session.Session,
148
+ boto3_session: boto3.Session,
140
149
  account_id: str,
141
150
  sync_tag: int,
142
151
  common_job_parameters: Dict,
@@ -197,8 +206,10 @@ def _sync_multiple_accounts(
197
206
  if num_accounts == 1:
198
207
  # Use the default boto3 session because boto3 gets confused if you give it a profile name with 1 account
199
208
  boto3_session = boto3.Session()
209
+ aioboto3_session = aioboto3.Session()
200
210
  else:
201
211
  boto3_session = boto3.Session(profile_name=profile_name)
212
+ aioboto3_session = aioboto3.Session(profile_name=profile_name)
202
213
 
203
214
  _autodiscover_accounts(
204
215
  neo4j_session,
@@ -217,6 +228,7 @@ def _sync_multiple_accounts(
217
228
  common_job_parameters,
218
229
  regions=regions,
219
230
  aws_requested_syncs=aws_requested_syncs, # Could be replaced later with per-account requested syncs
231
+ aioboto3_session=aioboto3_session,
220
232
  )
221
233
  except Exception as e:
222
234
  if aws_best_effort_mode:
@@ -248,12 +248,19 @@ def transform_ecr_repository_images(repo_data: Dict) -> tuple[List[Dict], List[D
248
248
 
249
249
  # Create ECRImage for the manifest list itself
250
250
  if digest not in ecr_images_dict:
251
+ # Extract child image digests (excluding attestations for CONTAINS_IMAGE relationship)
252
+ child_digests = [
253
+ m["digest"]
254
+ for m in manifest_images
255
+ if m.get("type") != "attestation"
256
+ ]
251
257
  ecr_images_dict[digest] = {
252
258
  "imageDigest": digest,
253
259
  "type": "manifest_list",
254
260
  "architecture": None,
255
261
  "os": None,
256
262
  "variant": None,
263
+ "child_image_digests": child_digests if child_digests else None,
257
264
  }
258
265
 
259
266
  # Create ECRImage nodes for each image in the manifest list
@@ -12,7 +12,6 @@ from typing import Any
12
12
  from typing import Optional
13
13
 
14
14
  import aioboto3
15
- import boto3
16
15
  import httpx
17
16
  import neo4j
18
17
  from botocore.exceptions import ClientError
@@ -334,6 +333,7 @@ def transform_ecr_image_layers(
334
333
  image_layers_data: dict[str, dict[str, list[str]]],
335
334
  image_digest_map: dict[str, str],
336
335
  image_attestation_map: Optional[dict[str, dict[str, str]]] = None,
336
+ existing_properties_map: Optional[dict[str, dict[str, Any]]] = None,
337
337
  ) -> tuple[list[dict], list[dict]]:
338
338
  """
339
339
  Transform image layer data into format suitable for Neo4j ingestion.
@@ -342,10 +342,13 @@ def transform_ecr_image_layers(
342
342
  :param image_layers_data: Map of image URI to platform to diff_ids
343
343
  :param image_digest_map: Map of image URI to image digest
344
344
  :param image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
345
+ :param existing_properties_map: Map of image digest to existing ECRImage properties (type, architecture, etc.)
345
346
  :return: List of layer objects ready for ingestion
346
347
  """
347
348
  if image_attestation_map is None:
348
349
  image_attestation_map = {}
350
+ if existing_properties_map is None:
351
+ existing_properties_map = {}
349
352
  layers_by_diff_id: dict[str, dict[str, Any]] = {}
350
353
  memberships_by_digest: dict[str, dict[str, Any]] = {}
351
354
 
@@ -353,6 +356,16 @@ def transform_ecr_image_layers(
353
356
  # fetch_image_layers_async guarantees every uri in image_layers_data has a digest
354
357
  image_digest = image_digest_map[image_uri]
355
358
 
359
+ # Check if this is a manifest list
360
+ is_manifest_list = False
361
+ if image_digest in existing_properties_map:
362
+ image_type = existing_properties_map[image_digest].get("type")
363
+ is_manifest_list = image_type == "manifest_list"
364
+
365
+ # Skip creating layer relationships for manifest lists
366
+ if is_manifest_list:
367
+ continue
368
+
356
369
  ordered_layers_for_image: Optional[list[str]] = None
357
370
 
358
371
  for _, diff_ids in platforms.items():
@@ -391,6 +404,10 @@ def transform_ecr_image_layers(
391
404
  "layer_diff_ids": ordered_layers_for_image,
392
405
  }
393
406
 
407
+ # Preserve existing ECRImage properties (type, architecture, os, variant, etc.)
408
+ if image_digest in existing_properties_map:
409
+ membership.update(existing_properties_map[image_digest])
410
+
394
411
  # Add attestation data if available for this image
395
412
  if image_uri in image_attestation_map:
396
413
  attestation = image_attestation_map[image_uri]
@@ -433,7 +450,12 @@ def load_ecr_image_layers(
433
450
  current_aws_account_id: str,
434
451
  aws_update_tag: int,
435
452
  ) -> None:
436
- """Load image layers into Neo4j."""
453
+ """
454
+ Load image layers into Neo4j.
455
+
456
+ Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
457
+ since layer objects can contain large arrays of relationships.
458
+ """
437
459
  logger.info(
438
460
  f"Loading {len(image_layers)} image layers for region {region} into graph.",
439
461
  )
@@ -442,6 +464,7 @@ def load_ecr_image_layers(
442
464
  neo4j_session,
443
465
  ECRImageLayerSchema(),
444
466
  image_layers,
467
+ batch_size=1000,
445
468
  lastupdated=aws_update_tag,
446
469
  AWS_ID=current_aws_account_id,
447
470
  )
@@ -455,10 +478,17 @@ def load_ecr_image_layer_memberships(
455
478
  current_aws_account_id: str,
456
479
  aws_update_tag: int,
457
480
  ) -> None:
481
+ """
482
+ Load image layer memberships into Neo4j.
483
+
484
+ Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
485
+ since membership objects can contain large arrays of layer diff_ids.
486
+ """
458
487
  load(
459
488
  neo4j_session,
460
489
  ECRImageSchema(),
461
490
  memberships,
491
+ batch_size=1000,
462
492
  lastupdated=aws_update_tag,
463
493
  Region=region,
464
494
  AWS_ID=current_aws_account_id,
@@ -527,8 +557,15 @@ async def fetch_image_layers_async(
527
557
  async def fetch_single_image_layers(
528
558
  repo_image: dict,
529
559
  http_client: httpx.AsyncClient,
530
- ) -> Optional[tuple[str, str, dict[str, list[str]], Optional[dict[str, str]]]]:
531
- """Fetch layers for a single image and extract attestation if present."""
560
+ ) -> Optional[
561
+ tuple[str, str, dict[str, list[str]], Optional[dict[str, dict[str, str]]]]
562
+ ]:
563
+ """
564
+ Fetch layers for a single image and extract attestation if present.
565
+
566
+ Returns tuple of (uri, digest, platform_layers, attestations_by_child_digest) where
567
+ attestations_by_child_digest maps child image digest to parent image info
568
+ """
532
569
  async with semaphore:
533
570
  # Caller guarantees these fields exist in every repo_image
534
571
  uri = repo_image["uri"]
@@ -551,13 +588,13 @@ async def fetch_image_layers_async(
551
588
 
552
589
  manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
553
590
  platform_layers: dict[str, list[str]] = {}
554
- attestation_data: Optional[dict[str, str]] = None
591
+ attestation_data: Optional[dict[str, dict[str, str]]] = None
555
592
 
556
593
  if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
557
594
 
558
595
  async def _process_child_manifest(
559
596
  manifest_ref: dict,
560
- ) -> tuple[dict[str, list[str]], Optional[dict[str, str]]]:
597
+ ) -> tuple[dict[str, list[str]], Optional[tuple[str, dict[str, str]]]]:
561
598
  # Check if this is an attestation manifest
562
599
  if (
563
600
  manifest_ref.get("annotations", {}).get(
@@ -565,18 +602,27 @@ async def fetch_image_layers_async(
565
602
  )
566
603
  == "attestation-manifest"
567
604
  ):
605
+ # Extract which child image this attestation is for
606
+ attests_child_digest = manifest_ref.get("annotations", {}).get(
607
+ "vnd.docker.reference.digest"
608
+ )
609
+ if not attests_child_digest:
610
+ return {}, None
611
+
568
612
  # Extract base image from attestation
569
- child_digest = manifest_ref.get("digest")
570
- if child_digest:
613
+ attestation_digest = manifest_ref.get("digest")
614
+ if attestation_digest:
571
615
  attestation_info = (
572
616
  await _extract_parent_image_from_attestation(
573
617
  ecr_client,
574
618
  repo_name,
575
- child_digest,
619
+ attestation_digest,
576
620
  http_client,
577
621
  )
578
622
  )
579
- return {}, attestation_info
623
+ if attestation_info:
624
+ # Return (attests_child_digest, parent_info) tuple
625
+ return {}, (attests_child_digest, attestation_info)
580
626
  return {}, None
581
627
 
582
628
  child_digest = manifest_ref.get("digest")
@@ -612,14 +658,22 @@ async def fetch_image_layers_async(
612
658
  )
613
659
 
614
660
  # Merge results from successful child manifest processing
661
+ # Track attestation data by child digest for proper mapping
662
+ attestations_by_child_digest: dict[str, dict[str, str]] = {}
663
+
615
664
  for result in child_results:
616
665
  if isinstance(result, tuple) and len(result) == 2:
617
666
  layer_data, attest_data = result
618
667
  if layer_data:
619
668
  platform_layers.update(layer_data)
620
- if attest_data and not attestation_data:
621
- # Use first attestation found
622
- attestation_data = attest_data
669
+ if attest_data:
670
+ # attest_data is (child_digest, parent_info) tuple
671
+ child_digest, parent_info = attest_data
672
+ attestations_by_child_digest[child_digest] = parent_info
673
+
674
+ # Build attestation_data with child digest mapping
675
+ if attestations_by_child_digest:
676
+ attestation_data = attestations_by_child_digest
623
677
  else:
624
678
  diff_map = await _diff_ids_for_manifest(
625
679
  ecr_client,
@@ -630,7 +684,9 @@ async def fetch_image_layers_async(
630
684
  )
631
685
  platform_layers.update(diff_map)
632
686
 
633
- if platform_layers:
687
+ # Return if we found layers or attestation data
688
+ # Manifest lists may have attestation_data without platform_layers
689
+ if platform_layers or attestation_data:
634
690
  return uri, digest, platform_layers, attestation_data
635
691
 
636
692
  return None
@@ -670,13 +726,22 @@ async def fetch_image_layers_async(
670
726
  )
671
727
 
672
728
  if result:
673
- uri, digest, layer_data, attestation_data = result
729
+ uri, digest, layer_data, attestations_by_child_digest = result
674
730
  if not digest:
675
731
  raise ValueError(f"Empty digest returned for image {uri}")
676
732
  image_layers_data[uri] = layer_data
677
733
  image_digest_map[uri] = digest
678
- if attestation_data:
679
- image_attestation_map[uri] = attestation_data
734
+ if attestations_by_child_digest:
735
+ # Map attestation data by child digest URIs
736
+ repo_uri = extract_repo_uri_from_image_uri(uri)
737
+ for (
738
+ child_digest,
739
+ parent_info,
740
+ ) in attestations_by_child_digest.items():
741
+ child_uri = f"{repo_uri}@{child_digest}"
742
+ image_attestation_map[child_uri] = parent_info
743
+ # Also add to digest map so transform can look up the child digest
744
+ image_digest_map[child_uri] = child_digest
680
745
 
681
746
  logger.info(
682
747
  f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
@@ -698,7 +763,7 @@ def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
698
763
  @timeit
699
764
  def sync(
700
765
  neo4j_session: neo4j.Session,
701
- boto3_session: boto3.session.Session,
766
+ aioboto3_session: aioboto3.Session,
702
767
  regions: list[str],
703
768
  current_aws_account_id: str,
704
769
  update_tag: int,
@@ -721,30 +786,71 @@ def sync(
721
786
  current_aws_account_id,
722
787
  )
723
788
 
724
- # Get ECR images from graph using standard client function
725
- from cartography.client.aws.ecr import get_ecr_images
789
+ # Query for ECR images with all their existing properties to preserve during layer sync
790
+ query = """
791
+ MATCH (img:ECRImage)<-[:IMAGE]-(repo_img:ECRRepositoryImage)<-[:REPO_IMAGE]-(repo:ECRRepository)
792
+ MATCH (repo)<-[:RESOURCE]-(:AWSAccount {id: $AWS_ID})
793
+ WHERE repo.region = $Region
794
+ RETURN DISTINCT
795
+ img.digest AS digest,
796
+ repo_img.id AS uri,
797
+ repo.uri AS repo_uri,
798
+ img.type AS type,
799
+ img.architecture AS architecture,
800
+ img.os AS os,
801
+ img.variant AS variant,
802
+ img.attestation_type AS attestation_type,
803
+ img.attests_digest AS attests_digest,
804
+ img.media_type AS media_type,
805
+ img.artifact_media_type AS artifact_media_type,
806
+ img.child_image_digests AS child_image_digests
807
+ """
808
+ from cartography.client.core.tx import read_list_of_dicts_tx
726
809
 
727
- ecr_images = get_ecr_images(neo4j_session, current_aws_account_id)
810
+ ecr_images = neo4j_session.read_transaction(
811
+ read_list_of_dicts_tx, query, AWS_ID=current_aws_account_id, Region=region
812
+ )
728
813
 
729
- # Filter by region and deduplicate by digest
814
+ # Build repo_images_list and existing_properties map
730
815
  repo_images_list = []
816
+ existing_properties = {}
731
817
  seen_digests = set()
732
818
 
733
- for region_name, _, uri, _, digest in ecr_images:
734
- if region_name == region and digest not in seen_digests:
819
+ for img_data in ecr_images:
820
+ digest = img_data["digest"]
821
+ image_type = img_data.get("type")
822
+
823
+ if digest not in seen_digests:
735
824
  seen_digests.add(digest)
736
- repo_uri = extract_repo_uri_from_image_uri(uri)
737
825
 
738
- # Create digest-based URI for manifest fetching
826
+ # Store existing properties for ALL images to preserve during updates
827
+ existing_properties[digest] = {
828
+ "type": image_type,
829
+ "architecture": img_data.get("architecture"),
830
+ "os": img_data.get("os"),
831
+ "variant": img_data.get("variant"),
832
+ "attestation_type": img_data.get("attestation_type"),
833
+ "attests_digest": img_data.get("attests_digest"),
834
+ "media_type": img_data.get("media_type"),
835
+ "artifact_media_type": img_data.get("artifact_media_type"),
836
+ "child_image_digests": img_data.get("child_image_digests"),
837
+ }
838
+
839
+ repo_uri = img_data["repo_uri"]
739
840
  digest_uri = f"{repo_uri}@{digest}"
740
841
 
741
- repo_images_list.append(
742
- {
743
- "imageDigest": digest,
744
- "uri": digest_uri,
745
- "repo_uri": repo_uri,
746
- }
747
- )
842
+ # Fetch manifests for:
843
+ # - Platform-specific images (type="image") - to get their layers
844
+ # - Manifest lists (type="manifest_list") - to extract attestation parent image data
845
+ # Skip only attestations since they don't have useful layer or parent data
846
+ if image_type != "attestation":
847
+ repo_images_list.append(
848
+ {
849
+ "imageDigest": digest,
850
+ "uri": digest_uri,
851
+ "repo_uri": repo_uri,
852
+ }
853
+ )
748
854
 
749
855
  logger.info(
750
856
  f"Found {len(repo_images_list)} distinct ECR image digests in graph for region {region}"
@@ -768,15 +874,9 @@ def sync(
768
874
  dict[str, str],
769
875
  dict[str, dict[str, str]],
770
876
  ]:
771
- # Use credentials from the existing boto3 session
772
- credentials = boto3_session.get_credentials()
773
- session = aioboto3.Session(
774
- aws_access_key_id=credentials.access_key,
775
- aws_secret_access_key=credentials.secret_key,
776
- aws_session_token=credentials.token,
777
- region_name=region,
778
- )
779
- async with session.client("ecr") as ecr_client:
877
+ async with aioboto3_session.client(
878
+ "ecr", region_name=region
879
+ ) as ecr_client:
780
880
  return await fetch_image_layers_async(ecr_client, repo_images_list)
781
881
 
782
882
  # Use get_event_loop() + run_until_complete() to avoid tearing down loop
@@ -798,6 +898,7 @@ def sync(
798
898
  image_layers_data,
799
899
  image_digest_map,
800
900
  image_attestation_map,
901
+ existing_properties,
801
902
  )
802
903
  load_ecr_image_layers(
803
904
  neo4j_session,