cartography 0.118.0__py3-none-any.whl → 0.119.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cartography might be problematic; consult the package registry's advisory page for details.

Files changed (68)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +20 -0
  3. cartography/client/core/tx.py +19 -3
  4. cartography/config.py +9 -0
  5. cartography/data/indexes.cypher +0 -6
  6. cartography/graph/job.py +7 -5
  7. cartography/intel/aws/__init__.py +21 -9
  8. cartography/intel/aws/ecr.py +7 -0
  9. cartography/intel/aws/ecr_image_layers.py +143 -42
  10. cartography/intel/aws/inspector.py +65 -33
  11. cartography/intel/aws/resourcegroupstaggingapi.py +1 -1
  12. cartography/intel/gcp/compute.py +3 -3
  13. cartography/intel/github/repos.py +23 -5
  14. cartography/intel/gsuite/__init__.py +12 -8
  15. cartography/intel/gsuite/groups.py +291 -0
  16. cartography/intel/gsuite/users.py +142 -0
  17. cartography/intel/okta/awssaml.py +1 -1
  18. cartography/intel/okta/users.py +1 -1
  19. cartography/intel/ontology/__init__.py +44 -0
  20. cartography/intel/ontology/devices.py +54 -0
  21. cartography/intel/ontology/users.py +54 -0
  22. cartography/intel/ontology/utils.py +121 -0
  23. cartography/models/airbyte/user.py +4 -0
  24. cartography/models/anthropic/user.py +4 -0
  25. cartography/models/aws/ecr/image.py +47 -0
  26. cartography/models/aws/iam/group_membership.py +3 -2
  27. cartography/models/aws/identitycenter/awsssouser.py +3 -1
  28. cartography/models/bigfix/bigfix_computer.py +1 -1
  29. cartography/models/cloudflare/member.py +4 -0
  30. cartography/models/crowdstrike/hosts.py +1 -1
  31. cartography/models/duo/endpoint.py +1 -1
  32. cartography/models/duo/phone.py +2 -2
  33. cartography/models/duo/user.py +4 -0
  34. cartography/models/entra/user.py +2 -1
  35. cartography/models/github/users.py +4 -0
  36. cartography/models/gsuite/__init__.py +0 -0
  37. cartography/models/gsuite/group.py +218 -0
  38. cartography/models/gsuite/tenant.py +29 -0
  39. cartography/models/gsuite/user.py +107 -0
  40. cartography/models/kandji/device.py +1 -2
  41. cartography/models/keycloak/user.py +4 -0
  42. cartography/models/lastpass/user.py +4 -0
  43. cartography/models/ontology/__init__.py +0 -0
  44. cartography/models/ontology/device.py +125 -0
  45. cartography/models/ontology/mapping/__init__.py +16 -0
  46. cartography/models/ontology/mapping/data/__init__.py +1 -0
  47. cartography/models/ontology/mapping/data/devices.py +160 -0
  48. cartography/models/ontology/mapping/data/users.py +239 -0
  49. cartography/models/ontology/mapping/specs.py +65 -0
  50. cartography/models/ontology/user.py +52 -0
  51. cartography/models/openai/user.py +4 -0
  52. cartography/models/scaleway/iam/user.py +4 -0
  53. cartography/models/snipeit/asset.py +1 -0
  54. cartography/models/snipeit/user.py +4 -0
  55. cartography/models/tailscale/device.py +1 -1
  56. cartography/models/tailscale/user.py +6 -1
  57. cartography/rules/data/frameworks/mitre_attack/requirements/t1098_account_manipulation/__init__.py +176 -89
  58. cartography/sync.py +3 -0
  59. cartography/util.py +44 -17
  60. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/METADATA +1 -1
  61. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/RECORD +65 -50
  62. cartography/data/jobs/cleanup/gsuite_ingest_groups_cleanup.json +0 -23
  63. cartography/data/jobs/cleanup/gsuite_ingest_users_cleanup.json +0 -11
  64. cartography/intel/gsuite/api.py +0 -355
  65. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/WHEEL +0 -0
  66. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/entry_points.txt +0 -0
  67. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/licenses/LICENSE +0 -0
  68. {cartography-0.118.0.dist-info → cartography-0.119.0.dist-info}/top_level.txt +0 -0
cartography/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.118.0'
32
- __version_tuple__ = version_tuple = (0, 118, 0)
31
+ __version__ = version = '0.119.0'
32
+ __version_tuple__ = version_tuple = (0, 119, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
cartography/cli.py CHANGED
@@ -730,6 +730,26 @@ class CLI:
730
730
  "Required if you are using the Trivy module. Ignored otherwise."
731
731
  ),
732
732
  )
733
+ parser.add_argument(
734
+ "--ontology-users-source",
735
+ type=str,
736
+ default=None,
737
+ help=(
738
+ "Comma-separated list of sources of truth for user data in the ontology. "
739
+ "'User' nodes will only be created for users that exist in one of the sources. "
740
+ "Required if you are using the ontology module. Ignored otherwise."
741
+ ),
742
+ )
743
+ parser.add_argument(
744
+ "--ontology-devices-source",
745
+ type=str,
746
+ default=None,
747
+ help=(
748
+ "Comma-separated list of sources of truth for client computer data in the ontology. "
749
+ "'Device' nodes will only be created for devices that exist in one of the sources. "
750
+ "Required if you are using the ontology module. Ignored otherwise."
751
+ ),
752
+ )
733
753
  parser.add_argument(
734
754
  "--trivy-results-dir",
735
755
  type=str,
@@ -249,6 +249,7 @@ def load_graph_data(
249
249
  neo4j_session: neo4j.Session,
250
250
  query: str,
251
251
  dict_list: List[Dict[str, Any]],
252
+ batch_size: int = 10000,
252
253
  **kwargs,
253
254
  ) -> None:
254
255
  """
@@ -257,10 +258,13 @@ def load_graph_data(
257
258
  :param query: The Neo4j write query to run. This query is not meant to be handwritten, rather it should be generated
258
259
  with cartography.graph.querybuilder.build_ingestion_query().
259
260
  :param dict_list: The data to load to the graph represented as a list of dicts.
261
+ :param batch_size: The number of items to process per transaction. Defaults to 10000.
260
262
  :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
261
263
  :return: None
262
264
  """
263
- for data_batch in batch(dict_list, size=10000):
265
+ if batch_size <= 0:
266
+ raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
267
+ for data_batch in batch(dict_list, size=batch_size):
264
268
  neo4j_session.write_transaction(
265
269
  write_list_of_dicts_tx,
266
270
  query,
@@ -316,6 +320,7 @@ def load(
316
320
  neo4j_session: neo4j.Session,
317
321
  node_schema: CartographyNodeSchema,
318
322
  dict_list: List[Dict[str, Any]],
323
+ batch_size: int = 10000,
319
324
  **kwargs,
320
325
  ) -> None:
321
326
  """
@@ -324,21 +329,27 @@ def load(
324
329
  :param neo4j_session: The Neo4j session
325
330
  :param node_schema: The CartographyNodeSchema object to create indexes for and generate a query.
326
331
  :param dict_list: The data to load to the graph represented as a list of dicts.
332
+ :param batch_size: The number of items to process per transaction. Defaults to 10000.
327
333
  :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
328
334
  :return: None
329
335
  """
336
+ if batch_size <= 0:
337
+ raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
330
338
  if len(dict_list) == 0:
331
339
  # If there is no data to load, save some time.
332
340
  return
333
341
  ensure_indexes(neo4j_session, node_schema)
334
342
  ingestion_query = build_ingestion_query(node_schema)
335
- load_graph_data(neo4j_session, ingestion_query, dict_list, **kwargs)
343
+ load_graph_data(
344
+ neo4j_session, ingestion_query, dict_list, batch_size=batch_size, **kwargs
345
+ )
336
346
 
337
347
 
338
348
  def load_matchlinks(
339
349
  neo4j_session: neo4j.Session,
340
350
  rel_schema: CartographyRelSchema,
341
351
  dict_list: list[dict[str, Any]],
352
+ batch_size: int = 10000,
342
353
  **kwargs,
343
354
  ) -> None:
344
355
  """
@@ -347,9 +358,12 @@ def load_matchlinks(
347
358
  :param rel_schema: The CartographyRelSchema object to generate a query.
348
359
  :param dict_list: The data to load to the graph represented as a list of dicts. The dicts must contain the source and
349
360
  target node ids.
361
+ :param batch_size: The number of items to process per transaction. Defaults to 10000.
350
362
  :param kwargs: Allows additional keyword args to be supplied to the Neo4j query.
351
363
  :return: None
352
364
  """
365
+ if batch_size <= 0:
366
+ raise ValueError(f"batch_size must be greater than 0, got {batch_size}")
353
367
  if len(dict_list) == 0:
354
368
  # If there is no data to load, save some time.
355
369
  return
@@ -369,4 +383,6 @@ def load_matchlinks(
369
383
  ensure_indexes_for_matchlinks(neo4j_session, rel_schema)
370
384
  matchlink_query = build_matchlink_query(rel_schema)
371
385
  logger.debug(f"Matchlink query: {matchlink_query}")
372
- load_graph_data(neo4j_session, matchlink_query, dict_list, **kwargs)
386
+ load_graph_data(
387
+ neo4j_session, matchlink_query, dict_list, batch_size=batch_size, **kwargs
388
+ )
cartography/config.py CHANGED
@@ -161,6 +161,11 @@ class Config:
161
161
  :param trivy_s3_bucket: The S3 bucket name containing Trivy scan results. Optional.
162
162
  :type trivy_s3_prefix: str
163
163
  :param trivy_s3_prefix: The S3 prefix path containing Trivy scan results. Optional.
164
+ :type ontology_users_source: str
165
+ :param ontology_users_source: Comma-separated list of sources of truth for user data in the ontology. Optional.
166
+ :type ontology_devices_source: str
167
+ :param ontology_devices_source: Comma-separated list of sources of truth for client computers data in the ontology.
168
+ Optional.
164
169
  :type trivy_results_dir: str
165
170
  :param trivy_results_dir: Local directory containing Trivy scan results. Optional.
166
171
  :type scaleway_access_key: str
@@ -266,6 +271,8 @@ class Config:
266
271
  airbyte_api_url=None,
267
272
  trivy_s3_bucket=None,
268
273
  trivy_s3_prefix=None,
274
+ ontology_users_source=None,
275
+ ontology_devices_source=None,
269
276
  trivy_results_dir=None,
270
277
  scaleway_access_key=None,
271
278
  scaleway_secret_key=None,
@@ -359,6 +366,8 @@ class Config:
359
366
  self.airbyte_api_url = airbyte_api_url
360
367
  self.trivy_s3_bucket = trivy_s3_bucket
361
368
  self.trivy_s3_prefix = trivy_s3_prefix
369
+ self.ontology_users_source = ontology_users_source
370
+ self.ontology_devices_source = ontology_devices_source
362
371
  self.trivy_results_dir = trivy_results_dir
363
372
  self.scaleway_access_key = scaleway_access_key
364
373
  self.scaleway_secret_key = scaleway_secret_key
@@ -102,12 +102,6 @@ CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.id);
102
102
  CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.lastupdated);
103
103
  CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.id);
104
104
  CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.lastupdated);
105
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.email);
106
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.id);
107
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.lastupdated);
108
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.email);
109
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.id);
110
- CREATE INDEX IF NOT EXISTS FOR (n:GSuiteUser) ON (n.lastupdated);
111
105
  CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.id);
112
106
  CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.ip);
113
107
  CREATE INDEX IF NOT EXISTS FOR (n:Ip) ON (n.lastupdated);
cartography/graph/job.py CHANGED
@@ -125,11 +125,13 @@ class GraphJob:
125
125
  }
126
126
 
127
127
  @classmethod
128
- def from_json(cls, blob: str, short_name: Optional[str] = None) -> "GraphJob":
128
+ def from_json(
129
+ cls, blob: Union[str, dict], short_name: Optional[str] = None
130
+ ) -> "GraphJob":
129
131
  """
130
- Create a job from a JSON blob.
132
+ Create a job from a JSON dict or blob.
131
133
  """
132
- data: Dict = json.loads(blob)
134
+ data = json.loads(blob) if isinstance(blob, str) else blob
133
135
  statements = _get_statements_from_json(data, short_name)
134
136
  name = data["name"]
135
137
  return cls(name, statements, short_name)
@@ -242,12 +244,12 @@ class GraphJob:
242
244
  def run_from_json(
243
245
  cls,
244
246
  neo4j_session: neo4j.Session,
245
- blob: str,
247
+ blob: Union[str, dict],
246
248
  parameters: Dict,
247
249
  short_name: Optional[str] = None,
248
250
  ) -> None:
249
251
  """
250
- Run a job from a JSON blob. This will deserialize the job and execute all statements sequentially.
252
+ Run a job from a JSON dict or blob. This will deserialize the job and execute all statements sequentially.
251
253
  """
252
254
  if not parameters:
253
255
  parameters = {}
@@ -6,6 +6,7 @@ from typing import Dict
6
6
  from typing import Iterable
7
7
  from typing import List
8
8
 
9
+ import aioboto3
9
10
  import boto3
10
11
  import botocore.exceptions
11
12
  import neo4j
@@ -49,12 +50,13 @@ def _build_aws_sync_kwargs(
49
50
 
50
51
  def _sync_one_account(
51
52
  neo4j_session: neo4j.Session,
52
- boto3_session: boto3.session.Session,
53
+ boto3_session: boto3.Session,
53
54
  current_aws_account_id: str,
54
55
  update_tag: int,
55
56
  common_job_parameters: Dict[str, Any],
56
57
  regions: list[str] | None = None,
57
58
  aws_requested_syncs: Iterable[str] = RESOURCE_FUNCTIONS.keys(),
59
+ aioboto3_session: aioboto3.Session = aioboto3.Session(),
58
60
  ) -> None:
59
61
  # Autodiscover the regions supported by the account unless the user has specified the regions to sync.
60
62
  if not regions:
@@ -72,13 +74,20 @@ def _sync_one_account(
72
74
  for func_name in aws_requested_syncs:
73
75
  if func_name in RESOURCE_FUNCTIONS:
74
76
  # Skip permission relationships and tags for now because they rely on data already being in the graph
75
- if func_name not in [
76
- "permission_relationships",
77
- "resourcegroupstaggingapi",
78
- ]:
79
- RESOURCE_FUNCTIONS[func_name](**sync_args)
80
- else:
77
+ if func_name == "ecr:image_layers":
78
+ # has a different signature than the other functions (aioboto3_session replaces boto3_session)
79
+ RESOURCE_FUNCTIONS[func_name](
80
+ neo4j_session,
81
+ aioboto3_session,
82
+ regions,
83
+ current_aws_account_id,
84
+ update_tag,
85
+ common_job_parameters,
86
+ )
87
+ elif func_name in ["permission_relationships", "resourcegroupstaggingapi"]:
81
88
  continue
89
+ else:
90
+ RESOURCE_FUNCTIONS[func_name](**sync_args)
82
91
  else:
83
92
  raise ValueError(
84
93
  f'AWS sync function "{func_name}" was specified but does not exist. Did you misspell it?',
@@ -115,7 +124,7 @@ def _sync_one_account(
115
124
 
116
125
 
117
126
  def _autodiscover_account_regions(
118
- boto3_session: boto3.session.Session,
127
+ boto3_session: boto3.Session,
119
128
  account_id: str,
120
129
  ) -> List[str]:
121
130
  regions: List[str] = []
@@ -136,7 +145,7 @@ def _autodiscover_account_regions(
136
145
 
137
146
  def _autodiscover_accounts(
138
147
  neo4j_session: neo4j.Session,
139
- boto3_session: boto3.session.Session,
148
+ boto3_session: boto3.Session,
140
149
  account_id: str,
141
150
  sync_tag: int,
142
151
  common_job_parameters: Dict,
@@ -197,8 +206,10 @@ def _sync_multiple_accounts(
197
206
  if num_accounts == 1:
198
207
  # Use the default boto3 session because boto3 gets confused if you give it a profile name with 1 account
199
208
  boto3_session = boto3.Session()
209
+ aioboto3_session = aioboto3.Session()
200
210
  else:
201
211
  boto3_session = boto3.Session(profile_name=profile_name)
212
+ aioboto3_session = aioboto3.Session(profile_name=profile_name)
202
213
 
203
214
  _autodiscover_accounts(
204
215
  neo4j_session,
@@ -217,6 +228,7 @@ def _sync_multiple_accounts(
217
228
  common_job_parameters,
218
229
  regions=regions,
219
230
  aws_requested_syncs=aws_requested_syncs, # Could be replaced later with per-account requested syncs
231
+ aioboto3_session=aioboto3_session,
220
232
  )
221
233
  except Exception as e:
222
234
  if aws_best_effort_mode:
@@ -248,12 +248,19 @@ def transform_ecr_repository_images(repo_data: Dict) -> tuple[List[Dict], List[D
248
248
 
249
249
  # Create ECRImage for the manifest list itself
250
250
  if digest not in ecr_images_dict:
251
+ # Extract child image digests (excluding attestations for CONTAINS_IMAGE relationship)
252
+ child_digests = [
253
+ m["digest"]
254
+ for m in manifest_images
255
+ if m.get("type") != "attestation"
256
+ ]
251
257
  ecr_images_dict[digest] = {
252
258
  "imageDigest": digest,
253
259
  "type": "manifest_list",
254
260
  "architecture": None,
255
261
  "os": None,
256
262
  "variant": None,
263
+ "child_image_digests": child_digests if child_digests else None,
257
264
  }
258
265
 
259
266
  # Create ECRImage nodes for each image in the manifest list
@@ -12,7 +12,6 @@ from typing import Any
12
12
  from typing import Optional
13
13
 
14
14
  import aioboto3
15
- import boto3
16
15
  import httpx
17
16
  import neo4j
18
17
  from botocore.exceptions import ClientError
@@ -334,6 +333,7 @@ def transform_ecr_image_layers(
334
333
  image_layers_data: dict[str, dict[str, list[str]]],
335
334
  image_digest_map: dict[str, str],
336
335
  image_attestation_map: Optional[dict[str, dict[str, str]]] = None,
336
+ existing_properties_map: Optional[dict[str, dict[str, Any]]] = None,
337
337
  ) -> tuple[list[dict], list[dict]]:
338
338
  """
339
339
  Transform image layer data into format suitable for Neo4j ingestion.
@@ -342,10 +342,13 @@ def transform_ecr_image_layers(
342
342
  :param image_layers_data: Map of image URI to platform to diff_ids
343
343
  :param image_digest_map: Map of image URI to image digest
344
344
  :param image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
345
+ :param existing_properties_map: Map of image digest to existing ECRImage properties (type, architecture, etc.)
345
346
  :return: List of layer objects ready for ingestion
346
347
  """
347
348
  if image_attestation_map is None:
348
349
  image_attestation_map = {}
350
+ if existing_properties_map is None:
351
+ existing_properties_map = {}
349
352
  layers_by_diff_id: dict[str, dict[str, Any]] = {}
350
353
  memberships_by_digest: dict[str, dict[str, Any]] = {}
351
354
 
@@ -353,6 +356,16 @@ def transform_ecr_image_layers(
353
356
  # fetch_image_layers_async guarantees every uri in image_layers_data has a digest
354
357
  image_digest = image_digest_map[image_uri]
355
358
 
359
+ # Check if this is a manifest list
360
+ is_manifest_list = False
361
+ if image_digest in existing_properties_map:
362
+ image_type = existing_properties_map[image_digest].get("type")
363
+ is_manifest_list = image_type == "manifest_list"
364
+
365
+ # Skip creating layer relationships for manifest lists
366
+ if is_manifest_list:
367
+ continue
368
+
356
369
  ordered_layers_for_image: Optional[list[str]] = None
357
370
 
358
371
  for _, diff_ids in platforms.items():
@@ -391,6 +404,10 @@ def transform_ecr_image_layers(
391
404
  "layer_diff_ids": ordered_layers_for_image,
392
405
  }
393
406
 
407
+ # Preserve existing ECRImage properties (type, architecture, os, variant, etc.)
408
+ if image_digest in existing_properties_map:
409
+ membership.update(existing_properties_map[image_digest])
410
+
394
411
  # Add attestation data if available for this image
395
412
  if image_uri in image_attestation_map:
396
413
  attestation = image_attestation_map[image_uri]
@@ -433,7 +450,12 @@ def load_ecr_image_layers(
433
450
  current_aws_account_id: str,
434
451
  aws_update_tag: int,
435
452
  ) -> None:
436
- """Load image layers into Neo4j."""
453
+ """
454
+ Load image layers into Neo4j.
455
+
456
+ Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
457
+ since layer objects can contain large arrays of relationships.
458
+ """
437
459
  logger.info(
438
460
  f"Loading {len(image_layers)} image layers for region {region} into graph.",
439
461
  )
@@ -442,6 +464,7 @@ def load_ecr_image_layers(
442
464
  neo4j_session,
443
465
  ECRImageLayerSchema(),
444
466
  image_layers,
467
+ batch_size=1000,
445
468
  lastupdated=aws_update_tag,
446
469
  AWS_ID=current_aws_account_id,
447
470
  )
@@ -455,10 +478,17 @@ def load_ecr_image_layer_memberships(
455
478
  current_aws_account_id: str,
456
479
  aws_update_tag: int,
457
480
  ) -> None:
481
+ """
482
+ Load image layer memberships into Neo4j.
483
+
484
+ Uses a smaller batch size (1000) to avoid Neo4j transaction memory limits,
485
+ since membership objects can contain large arrays of layer diff_ids.
486
+ """
458
487
  load(
459
488
  neo4j_session,
460
489
  ECRImageSchema(),
461
490
  memberships,
491
+ batch_size=1000,
462
492
  lastupdated=aws_update_tag,
463
493
  Region=region,
464
494
  AWS_ID=current_aws_account_id,
@@ -527,8 +557,15 @@ async def fetch_image_layers_async(
527
557
  async def fetch_single_image_layers(
528
558
  repo_image: dict,
529
559
  http_client: httpx.AsyncClient,
530
- ) -> Optional[tuple[str, str, dict[str, list[str]], Optional[dict[str, str]]]]:
531
- """Fetch layers for a single image and extract attestation if present."""
560
+ ) -> Optional[
561
+ tuple[str, str, dict[str, list[str]], Optional[dict[str, dict[str, str]]]]
562
+ ]:
563
+ """
564
+ Fetch layers for a single image and extract attestation if present.
565
+
566
+ Returns tuple of (uri, digest, platform_layers, attestations_by_child_digest) where
567
+ attestations_by_child_digest maps child image digest to parent image info
568
+ """
532
569
  async with semaphore:
533
570
  # Caller guarantees these fields exist in every repo_image
534
571
  uri = repo_image["uri"]
@@ -551,13 +588,13 @@ async def fetch_image_layers_async(
551
588
 
552
589
  manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
553
590
  platform_layers: dict[str, list[str]] = {}
554
- attestation_data: Optional[dict[str, str]] = None
591
+ attestation_data: Optional[dict[str, dict[str, str]]] = None
555
592
 
556
593
  if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
557
594
 
558
595
  async def _process_child_manifest(
559
596
  manifest_ref: dict,
560
- ) -> tuple[dict[str, list[str]], Optional[dict[str, str]]]:
597
+ ) -> tuple[dict[str, list[str]], Optional[tuple[str, dict[str, str]]]]:
561
598
  # Check if this is an attestation manifest
562
599
  if (
563
600
  manifest_ref.get("annotations", {}).get(
@@ -565,18 +602,27 @@ async def fetch_image_layers_async(
565
602
  )
566
603
  == "attestation-manifest"
567
604
  ):
605
+ # Extract which child image this attestation is for
606
+ attests_child_digest = manifest_ref.get("annotations", {}).get(
607
+ "vnd.docker.reference.digest"
608
+ )
609
+ if not attests_child_digest:
610
+ return {}, None
611
+
568
612
  # Extract base image from attestation
569
- child_digest = manifest_ref.get("digest")
570
- if child_digest:
613
+ attestation_digest = manifest_ref.get("digest")
614
+ if attestation_digest:
571
615
  attestation_info = (
572
616
  await _extract_parent_image_from_attestation(
573
617
  ecr_client,
574
618
  repo_name,
575
- child_digest,
619
+ attestation_digest,
576
620
  http_client,
577
621
  )
578
622
  )
579
- return {}, attestation_info
623
+ if attestation_info:
624
+ # Return (attests_child_digest, parent_info) tuple
625
+ return {}, (attests_child_digest, attestation_info)
580
626
  return {}, None
581
627
 
582
628
  child_digest = manifest_ref.get("digest")
@@ -612,14 +658,22 @@ async def fetch_image_layers_async(
612
658
  )
613
659
 
614
660
  # Merge results from successful child manifest processing
661
+ # Track attestation data by child digest for proper mapping
662
+ attestations_by_child_digest: dict[str, dict[str, str]] = {}
663
+
615
664
  for result in child_results:
616
665
  if isinstance(result, tuple) and len(result) == 2:
617
666
  layer_data, attest_data = result
618
667
  if layer_data:
619
668
  platform_layers.update(layer_data)
620
- if attest_data and not attestation_data:
621
- # Use first attestation found
622
- attestation_data = attest_data
669
+ if attest_data:
670
+ # attest_data is (child_digest, parent_info) tuple
671
+ child_digest, parent_info = attest_data
672
+ attestations_by_child_digest[child_digest] = parent_info
673
+
674
+ # Build attestation_data with child digest mapping
675
+ if attestations_by_child_digest:
676
+ attestation_data = attestations_by_child_digest
623
677
  else:
624
678
  diff_map = await _diff_ids_for_manifest(
625
679
  ecr_client,
@@ -630,7 +684,9 @@ async def fetch_image_layers_async(
630
684
  )
631
685
  platform_layers.update(diff_map)
632
686
 
633
- if platform_layers:
687
+ # Return if we found layers or attestation data
688
+ # Manifest lists may have attestation_data without platform_layers
689
+ if platform_layers or attestation_data:
634
690
  return uri, digest, platform_layers, attestation_data
635
691
 
636
692
  return None
@@ -670,13 +726,22 @@ async def fetch_image_layers_async(
670
726
  )
671
727
 
672
728
  if result:
673
- uri, digest, layer_data, attestation_data = result
729
+ uri, digest, layer_data, attestations_by_child_digest = result
674
730
  if not digest:
675
731
  raise ValueError(f"Empty digest returned for image {uri}")
676
732
  image_layers_data[uri] = layer_data
677
733
  image_digest_map[uri] = digest
678
- if attestation_data:
679
- image_attestation_map[uri] = attestation_data
734
+ if attestations_by_child_digest:
735
+ # Map attestation data by child digest URIs
736
+ repo_uri = extract_repo_uri_from_image_uri(uri)
737
+ for (
738
+ child_digest,
739
+ parent_info,
740
+ ) in attestations_by_child_digest.items():
741
+ child_uri = f"{repo_uri}@{child_digest}"
742
+ image_attestation_map[child_uri] = parent_info
743
+ # Also add to digest map so transform can look up the child digest
744
+ image_digest_map[child_uri] = child_digest
680
745
 
681
746
  logger.info(
682
747
  f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
@@ -698,7 +763,7 @@ def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
698
763
  @timeit
699
764
  def sync(
700
765
  neo4j_session: neo4j.Session,
701
- boto3_session: boto3.session.Session,
766
+ aioboto3_session: aioboto3.Session,
702
767
  regions: list[str],
703
768
  current_aws_account_id: str,
704
769
  update_tag: int,
@@ -721,30 +786,71 @@ def sync(
721
786
  current_aws_account_id,
722
787
  )
723
788
 
724
- # Get ECR images from graph using standard client function
725
- from cartography.client.aws.ecr import get_ecr_images
789
+ # Query for ECR images with all their existing properties to preserve during layer sync
790
+ query = """
791
+ MATCH (img:ECRImage)<-[:IMAGE]-(repo_img:ECRRepositoryImage)<-[:REPO_IMAGE]-(repo:ECRRepository)
792
+ MATCH (repo)<-[:RESOURCE]-(:AWSAccount {id: $AWS_ID})
793
+ WHERE repo.region = $Region
794
+ RETURN DISTINCT
795
+ img.digest AS digest,
796
+ repo_img.id AS uri,
797
+ repo.uri AS repo_uri,
798
+ img.type AS type,
799
+ img.architecture AS architecture,
800
+ img.os AS os,
801
+ img.variant AS variant,
802
+ img.attestation_type AS attestation_type,
803
+ img.attests_digest AS attests_digest,
804
+ img.media_type AS media_type,
805
+ img.artifact_media_type AS artifact_media_type,
806
+ img.child_image_digests AS child_image_digests
807
+ """
808
+ from cartography.client.core.tx import read_list_of_dicts_tx
726
809
 
727
- ecr_images = get_ecr_images(neo4j_session, current_aws_account_id)
810
+ ecr_images = neo4j_session.read_transaction(
811
+ read_list_of_dicts_tx, query, AWS_ID=current_aws_account_id, Region=region
812
+ )
728
813
 
729
- # Filter by region and deduplicate by digest
814
+ # Build repo_images_list and existing_properties map
730
815
  repo_images_list = []
816
+ existing_properties = {}
731
817
  seen_digests = set()
732
818
 
733
- for region_name, _, uri, _, digest in ecr_images:
734
- if region_name == region and digest not in seen_digests:
819
+ for img_data in ecr_images:
820
+ digest = img_data["digest"]
821
+ image_type = img_data.get("type")
822
+
823
+ if digest not in seen_digests:
735
824
  seen_digests.add(digest)
736
- repo_uri = extract_repo_uri_from_image_uri(uri)
737
825
 
738
- # Create digest-based URI for manifest fetching
826
+ # Store existing properties for ALL images to preserve during updates
827
+ existing_properties[digest] = {
828
+ "type": image_type,
829
+ "architecture": img_data.get("architecture"),
830
+ "os": img_data.get("os"),
831
+ "variant": img_data.get("variant"),
832
+ "attestation_type": img_data.get("attestation_type"),
833
+ "attests_digest": img_data.get("attests_digest"),
834
+ "media_type": img_data.get("media_type"),
835
+ "artifact_media_type": img_data.get("artifact_media_type"),
836
+ "child_image_digests": img_data.get("child_image_digests"),
837
+ }
838
+
839
+ repo_uri = img_data["repo_uri"]
739
840
  digest_uri = f"{repo_uri}@{digest}"
740
841
 
741
- repo_images_list.append(
742
- {
743
- "imageDigest": digest,
744
- "uri": digest_uri,
745
- "repo_uri": repo_uri,
746
- }
747
- )
842
+ # Fetch manifests for:
843
+ # - Platform-specific images (type="image") - to get their layers
844
+ # - Manifest lists (type="manifest_list") - to extract attestation parent image data
845
+ # Skip only attestations since they don't have useful layer or parent data
846
+ if image_type != "attestation":
847
+ repo_images_list.append(
848
+ {
849
+ "imageDigest": digest,
850
+ "uri": digest_uri,
851
+ "repo_uri": repo_uri,
852
+ }
853
+ )
748
854
 
749
855
  logger.info(
750
856
  f"Found {len(repo_images_list)} distinct ECR image digests in graph for region {region}"
@@ -768,15 +874,9 @@ def sync(
768
874
  dict[str, str],
769
875
  dict[str, dict[str, str]],
770
876
  ]:
771
- # Use credentials from the existing boto3 session
772
- credentials = boto3_session.get_credentials()
773
- session = aioboto3.Session(
774
- aws_access_key_id=credentials.access_key,
775
- aws_secret_access_key=credentials.secret_key,
776
- aws_session_token=credentials.token,
777
- region_name=region,
778
- )
779
- async with session.client("ecr") as ecr_client:
877
+ async with aioboto3_session.client(
878
+ "ecr", region_name=region
879
+ ) as ecr_client:
780
880
  return await fetch_image_layers_async(ecr_client, repo_images_list)
781
881
 
782
882
  # Use get_event_loop() + run_until_complete() to avoid tearing down loop
@@ -798,6 +898,7 @@ def sync(
798
898
  image_layers_data,
799
899
  image_digest_map,
800
900
  image_attestation_map,
901
+ existing_properties,
801
902
  )
802
903
  load_ecr_image_layers(
803
904
  neo4j_session,