cartography 0.116.0-py3-none-any.whl → 0.117.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cartography might be problematic.

cartography/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.116.0'
-__version_tuple__ = version_tuple = (0, 116, 0)
+__version__ = version = '0.117.0'
+__version_tuple__ = version_tuple = (0, 117, 0)
 
 __commit_id__ = commit_id = None
@@ -6,6 +6,7 @@ from typing import Optional
 from typing import Tuple
 from typing import Union
 
+import backoff
 import neo4j
 
 from cartography.graph.querybuilder import build_create_index_queries
@@ -14,11 +15,31 @@ from cartography.graph.querybuilder import build_ingestion_query
 from cartography.graph.querybuilder import build_matchlink_query
 from cartography.models.core.nodes import CartographyNodeSchema
 from cartography.models.core.relationships import CartographyRelSchema
+from cartography.util import backoff_handler
 from cartography.util import batch
 
 logger = logging.getLogger(__name__)
 
 
+@backoff.on_exception(  # type: ignore
+    backoff.expo,
+    (
+        ConnectionResetError,
+        neo4j.exceptions.ServiceUnavailable,
+        neo4j.exceptions.SessionExpired,
+        neo4j.exceptions.TransientError,
+    ),
+    max_tries=5,
+    on_backoff=backoff_handler,
+)
+def _run_index_query_with_retry(neo4j_session: neo4j.Session, query: str) -> None:
+    """
+    Execute an index creation query with retry logic.
+    Index creation requires autocommit transactions and can experience transient errors.
+    """
+    neo4j_session.run(query)
+
+
 def run_write_query(
     neo4j_session: neo4j.Session, query: str, **parameters: Any
 ) -> None:
@@ -269,7 +290,7 @@ def ensure_indexes(
         raise ValueError(
             'Query provided to `ensure_indexes()` does not start with "CREATE INDEX IF NOT EXISTS".',
         )
-    neo4j_session.run(query)
+    _run_index_query_with_retry(neo4j_session, query)
 
 
 def ensure_indexes_for_matchlinks(
@@ -288,7 +309,7 @@ def ensure_indexes_for_matchlinks(
         raise ValueError(
             'Query provided to `ensure_indexes_for_matchlinks()` does not start with "CREATE INDEX IF NOT EXISTS".',
        )
-    neo4j_session.run(query)
+    _run_index_query_with_retry(neo4j_session, query)
 
 
 def load(
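
For reference, the retry decorator introduced above follows the standard `backoff` pattern: exponential waits between attempts, a capped number of tries, and a hook that fires before each sleep. A minimal standalone sketch of that pattern; the `flaky_write` function and `log_retry` hook are illustrative, not part of cartography:

import logging

import backoff

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def log_retry(details: dict) -> None:
    # on_backoff hook: called before each sleep; `details` carries tries/wait/target.
    logger.info("Retry %s, sleeping %.1fs", details["tries"], details["wait"])


@backoff.on_exception(backoff.expo, ConnectionResetError, max_tries=5, on_backoff=log_retry)
def flaky_write() -> None:
    # Stand-in for neo4j_session.run(query); always fails to show the retry path.
    raise ConnectionResetError("simulated transient failure")


try:
    flaky_write()
except ConnectionResetError:
    print("gave up after 5 tries")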
@@ -170,6 +170,111 @@ async def get_blob_json_via_presigned(
     return response.json()
 
 
+async def _extract_parent_image_from_attestation(
+    ecr_client: ECRClient,
+    repo_name: str,
+    attestation_manifest_digest: str,
+    http_client: httpx.AsyncClient,
+) -> Optional[dict[str, str]]:
+    """
+    Extract parent image information from an in-toto provenance attestation.
+
+    This function fetches an attestation manifest, downloads its in-toto layer,
+    and extracts the parent image reference from the SLSA provenance materials.
+
+    :param ecr_client: ECR client for fetching manifests and layers
+    :param repo_name: ECR repository name
+    :param attestation_manifest_digest: Digest of the attestation manifest
+    :param http_client: HTTP client for downloading blobs
+    :return: Dict with parent_image_uri and parent_image_digest, or None if no parent image found
+    """
+    try:
+        attestation_manifest, _ = await batch_get_manifest(
+            ecr_client,
+            repo_name,
+            attestation_manifest_digest,
+            [ECR_OCI_MANIFEST_MT, ECR_DOCKER_MANIFEST_MT],
+        )
+
+        if not attestation_manifest:
+            logger.debug(
+                "No attestation manifest found for digest %s in repo %s",
+                attestation_manifest_digest,
+                repo_name,
+            )
+            return None
+
+        # Get the in-toto layer from the attestation manifest
+        layers = attestation_manifest.get("layers", [])
+        intoto_layer = next(
+            (
+                layer
+                for layer in layers
+                if "in-toto" in layer.get("mediaType", "").lower()
+            ),
+            None,
+        )
+
+        if not intoto_layer:
+            logger.debug(
+                "No in-toto layer found in attestation manifest %s",
+                attestation_manifest_digest,
+            )
+            return None
+
+        # Download the in-toto attestation blob
+        intoto_digest = intoto_layer.get("digest")
+        if not intoto_digest:
+            logger.debug("No digest found for in-toto layer")
+            return None
+
+        attestation_blob = await get_blob_json_via_presigned(
+            ecr_client,
+            repo_name,
+            intoto_digest,
+            http_client,
+        )
+
+        if not attestation_blob:
+            logger.debug("Failed to download attestation blob")
+            return None
+
+        # Extract parent image from SLSA provenance materials
+        materials = attestation_blob.get("predicate", {}).get("materials", [])
+        for material in materials:
+            uri = material.get("uri", "")
+            uri_l = uri.lower()
+            # Look for container image URIs that are NOT the dockerfile itself
+            is_container_ref = (
+                uri_l.startswith("pkg:docker/")
+                or uri_l.startswith("pkg:oci/")
+                or uri_l.startswith("oci://")
+            )
+            if is_container_ref and "dockerfile" not in uri_l:
+                digest_obj = material.get("digest", {})
+                sha256_digest = digest_obj.get("sha256")
+                if sha256_digest:
+                    return {
+                        "parent_image_uri": uri,
+                        "parent_image_digest": f"sha256:{sha256_digest}",
+                    }
+
+        logger.debug(
+            "No parent image found in attestation materials for %s",
+            attestation_manifest_digest,
+        )
+        return None
+
+    except Exception as e:
+        logger.warning(
+            "Error extracting parent image from attestation %s in repo %s: %s",
+            attestation_manifest_digest,
+            repo_name,
+            e,
+        )
+        return None
+
+
 async def _diff_ids_for_manifest(
     ecr_client: ECRClient,
     repo_name: str,
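
For context, a buildx-style SLSA provenance payload roughly matches the sketch below, and the materials scan above picks the first non-Dockerfile container reference. The sample payload is illustrative, not taken from a real registry:

# Minimal sketch of the materials scan, assuming a buildx-style provenance blob.
attestation_blob = {
    "predicateType": "https://slsa.dev/provenance/v0.2",
    "predicate": {
        "materials": [
            {"uri": "pkg:docker/docker/dockerfile@1"},  # build frontend, skipped
            {
                "uri": "pkg:docker/python@3.12-slim?platform=linux%2Famd64",
                "digest": {"sha256": "abc123"},
            },
        ]
    },
}

for material in attestation_blob["predicate"]["materials"]:
    uri = material.get("uri", "").lower()
    if uri.startswith(("pkg:docker/", "pkg:oci/", "oci://")) and "dockerfile" not in uri:
        print(material["uri"], "sha256:" + material["digest"]["sha256"])
        break  # first match wins, mirroring the function above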
@@ -228,6 +333,7 @@ async def _diff_ids_for_manifest(
 def transform_ecr_image_layers(
     image_layers_data: dict[str, dict[str, list[str]]],
     image_digest_map: dict[str, str],
+    image_attestation_map: Optional[dict[str, dict[str, str]]] = None,
 ) -> tuple[list[dict], list[dict]]:
     """
     Transform image layer data into format suitable for Neo4j ingestion.
@@ -235,8 +341,11 @@ def transform_ecr_image_layers(
 
     :param image_layers_data: Map of image URI to platform to diff_ids
     :param image_digest_map: Map of image URI to image digest
+    :param image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
     :return: List of layer objects ready for ingestion
     """
+    if image_attestation_map is None:
+        image_attestation_map = {}
     layers_by_diff_id: dict[str, dict[str, Any]] = {}
     memberships_by_digest: dict[str, dict[str, Any]] = {}
 
@@ -278,10 +387,20 @@ def transform_ecr_image_layers(
                 layer["tail_image_ids"].add(image_digest)
 
         if ordered_layers_for_image:
-            memberships_by_digest[image_digest] = {
+            membership: dict[str, Any] = {
                 "layer_diff_ids": ordered_layers_for_image,
             }
 
+            # Add attestation data if available for this image
+            if image_uri in image_attestation_map:
+                attestation = image_attestation_map[image_uri]
+                membership["parent_image_uri"] = attestation["parent_image_uri"]
+                membership["parent_image_digest"] = attestation["parent_image_digest"]
+                membership["from_attestation"] = True
+                membership["confidence"] = "explicit"
+
+            memberships_by_digest[image_digest] = membership
+
     # Convert sets back to lists for Neo4j ingestion
     layers = []
     for layer in layers_by_diff_id.values():
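
The membership records built here only carry provenance fields when an attestation was found for the image. A hedged sketch of the two resulting shapes (URIs and digests are made up):

# Hypothetical membership entries produced by transform_ecr_image_layers.
membership_with_attestation = {
    "layer_diff_ids": ["sha256:layer1", "sha256:layer2"],
    "parent_image_uri": "pkg:docker/python@3.12-slim",
    "parent_image_digest": "sha256:abc123",
    "from_attestation": True,
    "confidence": "explicit",  # parent link came straight from SLSA provenance
}

# Without an attestation, only the ordered layer list is present.
membership_plain = {"layer_diff_ids": ["sha256:layer1", "sha256:layer2"]}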
@@ -350,12 +469,18 @@
     ecr_client: ECRClient,
     repo_images_list: list[dict],
     max_concurrent: int = 200,
-) -> tuple[dict[str, dict[str, list[str]]], dict[str, str]]:
+) -> tuple[dict[str, dict[str, list[str]]], dict[str, str], dict[str, dict[str, str]]]:
     """
     Fetch image layers for ECR images in parallel with caching and non-blocking I/O.
+
+    Returns:
+    - image_layers_data: Map of image URI to platform to diff_ids
+    - image_digest_map: Map of image URI to image digest
+    - image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
     """
     image_layers_data: dict[str, dict[str, list[str]]] = {}
     image_digest_map: dict[str, str] = {}
+    image_attestation_map: dict[str, dict[str, str]] = {}
     semaphore = asyncio.Semaphore(max_concurrent)
 
     # Cache for manifest fetches keyed by (repo_name, imageDigest)
@@ -402,8 +527,8 @@
     async def fetch_single_image_layers(
         repo_image: dict,
         http_client: httpx.AsyncClient,
-    ) -> Optional[tuple[str, str, dict[str, list[str]]]]:
-        """Fetch layers for a single image."""
+    ) -> Optional[tuple[str, str, dict[str, list[str]], Optional[dict[str, str]]]]:
+        """Fetch layers for a single image and extract attestation if present."""
        async with semaphore:
             # Caller guarantees these fields exist in every repo_image
             uri = repo_image["uri"]
@@ -426,24 +551,37 @@
 
             manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
             platform_layers: dict[str, list[str]] = {}
+            attestation_data: Optional[dict[str, str]] = None
 
             if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
 
                 async def _process_child_manifest(
                     manifest_ref: dict,
-                ) -> dict[str, list[str]]:
-                    # Skip attestation manifests - these aren't real images
+                ) -> tuple[dict[str, list[str]], Optional[dict[str, str]]]:
+                    # Check if this is an attestation manifest
                     if (
                         manifest_ref.get("annotations", {}).get(
                             "vnd.docker.reference.type"
                         )
                         == "attestation-manifest"
                     ):
-                        return {}
+                        # Extract base image from attestation
+                        child_digest = manifest_ref.get("digest")
+                        if child_digest:
+                            attestation_info = (
+                                await _extract_parent_image_from_attestation(
+                                    ecr_client,
+                                    repo_name,
+                                    child_digest,
+                                    http_client,
+                                )
+                            )
+                            return {}, attestation_info
+                        return {}, None
 
                     child_digest = manifest_ref.get("digest")
                     if not child_digest:
-                        return {}
+                        return {}, None
 
                     # Use optimized caching for child manifest
                     child_doc, _ = await _fetch_and_cache_manifest(
@@ -452,16 +590,17 @@
                         [ECR_OCI_MANIFEST_MT, ECR_DOCKER_MANIFEST_MT],
                     )
                     if not child_doc:
-                        return {}
+                        return {}, None
 
                     platform_hint = extract_platform_from_manifest(manifest_ref)
-                    return await _diff_ids_for_manifest(
+                    diff_map = await _diff_ids_for_manifest(
                         ecr_client,
                         repo_name,
                         child_doc,
                         http_client,
                         platform_hint,
                     )
+                    return diff_map, None
 
                 # Process all child manifests in parallel
                 child_tasks = [
@@ -474,8 +613,13 @@
 
                 # Merge results from successful child manifest processing
                 for result in child_results:
-                    if isinstance(result, dict):
-                        platform_layers.update(result)
+                    if isinstance(result, tuple) and len(result) == 2:
+                        layer_data, attest_data = result
+                        if layer_data:
+                            platform_layers.update(layer_data)
+                        if attest_data and not attestation_data:
+                            # Use first attestation found
+                            attestation_data = attest_data
             else:
                 diff_map = await _diff_ids_for_manifest(
                     ecr_client,
@@ -487,7 +631,7 @@
                 platform_layers.update(diff_map)
 
             if platform_layers:
-                return uri, digest, platform_layers
+                return uri, digest, platform_layers, attestation_data
 
             return None
 
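
Because the child tasks above run under asyncio.gather(..., return_exceptions=True), exceptions come back as ordinary objects, and the isinstance(result, tuple) check in the merge loop filters them out. A self-contained sketch of that pattern, with toy coroutines standing in for cartography's:

import asyncio
from typing import Optional


async def child(n: int) -> tuple[dict, Optional[str]]:
    # Toy stand-in for _process_child_manifest: one child fails.
    if n == 2:
        raise RuntimeError("boom")
    return {f"platform-{n}": ["diff-id"]}, ("attestation" if n == 0 else None)


async def main() -> None:
    results = await asyncio.gather(*(child(n) for n in range(3)), return_exceptions=True)
    merged: dict = {}
    attestation: Optional[str] = None
    for result in results:
        # Exceptions arrive as objects, so this check skips the RuntimeError.
        if isinstance(result, tuple) and len(result) == 2:
            layers, attest = result
            merged.update(layers)
            if attest and not attestation:
                attestation = attest  # keep the first attestation found
    print(merged, attestation)


asyncio.run(main())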
@@ -507,7 +651,7 @@
         )
 
     if not tasks:
-        return image_layers_data, image_digest_map
+        return image_layers_data, image_digest_map, image_attestation_map
 
     progress_interval = max(1, min(100, total // 10 or 1))
     completed = 0
@@ -526,16 +670,22 @@
             )
 
         if result:
-            uri, digest, layer_data = result
+            uri, digest, layer_data, attestation_data = result
             if not digest:
                 raise ValueError(f"Empty digest returned for image {uri}")
             image_layers_data[uri] = layer_data
             image_digest_map[uri] = digest
+            if attestation_data:
+                image_attestation_map[uri] = attestation_data
 
     logger.info(
         f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
     )
-    return image_layers_data, image_digest_map
+    if image_attestation_map:
+        logger.info(
+            f"Found attestations with base image info for {len(image_attestation_map)} images"
+        )
+    return image_layers_data, image_digest_map, image_attestation_map
 
 
 def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
@@ -613,9 +763,11 @@ def sync(
         f"Starting to fetch layers for {len(repo_images_list)} images..."
     )
 
-    async def _fetch_with_async_client() -> (
-        tuple[dict[str, dict[str, list[str]]], dict[str, str]]
-    ):
+    async def _fetch_with_async_client() -> tuple[
+        dict[str, dict[str, list[str]]],
+        dict[str, str],
+        dict[str, dict[str, str]],
+    ]:
         # Use credentials from the existing boto3 session
         credentials = boto3_session.get_credentials()
         session = aioboto3.Session(
@@ -635,8 +787,8 @@ def sync(
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
 
-    image_layers_data, image_digest_map = loop.run_until_complete(
-        _fetch_with_async_client()
+    image_layers_data, image_digest_map, image_attestation_map = (
+        loop.run_until_complete(_fetch_with_async_client())
     )
 
     logger.info(
@@ -645,6 +797,7 @@ def sync(
     layers, memberships = transform_ecr_image_layers(
         image_layers_data,
         image_digest_map,
+        image_attestation_map,
     )
     load_ecr_image_layers(
         neo4j_session,
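
The synchronous sync() entry point drives the async fetch by creating and installing a dedicated event loop. A minimal sketch of that bridge with a stand-in coroutine; the loop.close() call is an addition for a clean exit, not shown in the hunk above:

import asyncio


async def fetch_all() -> tuple[dict, dict, dict]:
    # Stand-in for _fetch_with_async_client(): returns the three maps.
    return {"uri": {"linux/amd64": ["diff-id"]}}, {"uri": "sha256:abc"}, {}


def sync_entry_point() -> None:
    # Create and install a dedicated loop, as sync() does above.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        layers, digests, attestations = loop.run_until_complete(fetch_all())
    finally:
        loop.close()
    print(layers, digests, attestations)


sync_entry_point()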
@@ -9,7 +9,9 @@ from cartography.util import timeit
 
 from . import app_service
 from . import compute
+from . import container_instances
 from . import cosmosdb
+from . import data_lake
 from . import functions
 from . import logic_apps
 from . import resource_groups
@@ -30,6 +32,13 @@ def _sync_one_subscription(
     update_tag: int,
     common_job_parameters: Dict,
 ) -> None:
+    container_instances.sync(
+        neo4j_session,
+        credentials,
+        subscription_id,
+        update_tag,
+        common_job_parameters,
+    )
     compute.sync(
         neo4j_session,
         credentials.credential,
@@ -86,6 +95,13 @@ def _sync_one_subscription(
         update_tag,
         common_job_parameters,
     )
+    data_lake.sync(
+        neo4j_session,
+        credentials,
+        subscription_id,
+        update_tag,
+        common_job_parameters,
+    )
 
 
 def _sync_tenant(
@@ -0,0 +1,95 @@
+import logging
+from typing import Any
+
+import neo4j
+from azure.core.exceptions import ClientAuthenticationError
+from azure.core.exceptions import HttpResponseError
+from azure.mgmt.containerinstance import ContainerInstanceManagementClient
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.azure.container_instance import AzureContainerInstanceSchema
+from cartography.util import timeit
+
+from .util.credentials import Credentials
+
+logger = logging.getLogger(__name__)
+
+
+@timeit
+def get_container_instances(
+    credentials: Credentials, subscription_id: str
+) -> list[dict]:
+    try:
+        client = ContainerInstanceManagementClient(
+            credentials.credential, subscription_id
+        )
+        # NOTE: Azure Container Instances are called "Container Groups" in the SDK
+        return [cg.as_dict() for cg in client.container_groups.list()]
+    except (ClientAuthenticationError, HttpResponseError) as e:
+        logger.warning(
+            f"Failed to get Container Instances for subscription {subscription_id}: {str(e)}"
+        )
+        return []
+
+
+def transform_container_instances(container_groups: list[dict]) -> list[dict]:
+    transformed_instances: list[dict[str, Any]] = []
+    for group in container_groups:
+        transformed_instance = {
+            "id": group.get("id"),
+            "name": group.get("name"),
+            "location": group.get("location"),
+            "type": group.get("type"),
+            "provisioning_state": group.get("properties", {}).get("provisioning_state"),
+            "ip_address": ((group.get("properties") or {}).get("ip_address") or {}).get(
+                "ip"
+            ),
+            "os_type": group.get("properties", {}).get("os_type"),
+        }
+        transformed_instances.append(transformed_instance)
+    return transformed_instances
+
+
+@timeit
+def load_container_instances(
+    neo4j_session: neo4j.Session,
+    data: list[dict[str, Any]],
+    subscription_id: str,
+    update_tag: int,
+) -> None:
+    load(
+        neo4j_session,
+        AzureContainerInstanceSchema(),
+        data,
+        lastupdated=update_tag,
+        AZURE_SUBSCRIPTION_ID=subscription_id,
+    )
+
+
+@timeit
+def cleanup_container_instances(
+    neo4j_session: neo4j.Session, common_job_parameters: dict
+) -> None:
+    GraphJob.from_node_schema(
+        AzureContainerInstanceSchema(), common_job_parameters
+    ).run(neo4j_session)
+
+
+@timeit
+def sync(
+    neo4j_session: neo4j.Session,
+    credentials: Credentials,
+    subscription_id: str,
+    update_tag: int,
+    common_job_parameters: dict,
+) -> None:
+    logger.info(
+        f"Syncing Azure Container Instances for subscription {subscription_id}."
+    )
+    raw_groups = get_container_instances(credentials, subscription_id)
+    transformed_groups = transform_container_instances(raw_groups)
+    load_container_instances(
+        neo4j_session, transformed_groups, subscription_id, update_tag
+    )
+    cleanup_container_instances(neo4j_session, common_job_parameters)
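
The transform flattens the SDK's nested container-group dict into the flat properties the schema ingests. A quick illustration with a made-up container group; the import path is an assumption inferred from the `from . import container_instances` hunk above:

# Assumes the new module lands at cartography.intel.azure.container_instances.
from cartography.intel.azure.container_instances import transform_container_instances

sample_group = {  # hypothetical ContainerGroup.as_dict() output, trimmed
    "id": "/subscriptions/sub-1/resourceGroups/rg-1/providers/Microsoft.ContainerInstance/containerGroups/app",
    "name": "app",
    "location": "eastus",
    "type": "Microsoft.ContainerInstance/containerGroups",
    "properties": {
        "provisioning_state": "Succeeded",
        "ip_address": {"ip": "20.0.0.4"},
        "os_type": "Linux",
    },
}

print(transform_container_instances([sample_group]))
# -> one flat dict with id, name, location, type, provisioning_state,
#    ip_address "20.0.0.4", and os_type "Linux"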
@@ -0,0 +1,124 @@
+import logging
+from typing import Any
+
+import neo4j
+from azure.core.exceptions import ClientAuthenticationError
+from azure.core.exceptions import HttpResponseError
+from azure.mgmt.storage import StorageManagementClient
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.azure.data_lake_filesystem import AzureDataLakeFileSystemSchema
+from cartography.util import timeit
+
+from .util.credentials import Credentials
+
+logger = logging.getLogger(__name__)
+
+
+def _get_resource_group_from_id(resource_id: str) -> str:
+    """
+    Helper function to parse the resource group name from a full resource ID string.
+    """
+    parts = resource_id.lower().split("/")
+    rg_index = parts.index("resourcegroups")
+    return parts[rg_index + 1]
+
+
+@timeit
+def get_datalake_accounts(credentials: Credentials, subscription_id: str) -> list[dict]:
+    try:
+        client = StorageManagementClient(credentials.credential, subscription_id)
+        storage_accounts = [sa.as_dict() for sa in client.storage_accounts.list()]
+        return [sa for sa in storage_accounts if sa.get("is_hns_enabled")]
+    except (ClientAuthenticationError, HttpResponseError) as e:
+        logger.warning(f"Failed to get Storage Accounts for Data Lake sync: {str(e)}")
+        return []
+
+
+@timeit
+def get_filesystems_for_account(
+    client: StorageManagementClient,
+    account: dict,
+) -> list[dict]:
+    resource_group_name = _get_resource_group_from_id(account["id"])
+    try:
+        return [
+            c.as_dict()
+            for c in client.blob_containers.list(
+                resource_group_name,
+                account["name"],
+            )
+        ]
+    except (ClientAuthenticationError, HttpResponseError) as e:
+        logger.warning(
+            f"Failed to get containers for storage account {account['name']}: {str(e)}",
+        )
+        return []
+
+
+@timeit
+def transform_datalake_filesystems(filesystems_response: list[dict]) -> list[dict]:
+    transformed_filesystems: list[dict[str, Any]] = []
+    for fs in filesystems_response:
+        transformed_filesystem = {
+            "id": fs.get("id"),
+            "name": fs.get("name"),
+            "public_access": fs.get("properties", {}).get("public_access"),
+            "last_modified_time": fs.get("properties", {}).get("last_modified_time"),
+            "has_immutability_policy": fs.get("properties", {}).get(
+                "has_immutability_policy",
+            ),
+            "has_legal_hold": fs.get("properties", {}).get("has_legal_hold"),
+        }
+        transformed_filesystems.append(transformed_filesystem)
+    return transformed_filesystems
+
+
+@timeit
+def load_datalake_filesystems(
+    neo4j_session: neo4j.Session,
+    data: list[dict[str, Any]],
+    storage_account_id: str,
+    update_tag: int,
+) -> None:
+    load(
+        neo4j_session,
+        AzureDataLakeFileSystemSchema(),
+        data,
+        lastupdated=update_tag,
+        STORAGE_ACCOUNT_ID=storage_account_id,
+    )
+
+
+@timeit
+def sync(
+    neo4j_session: neo4j.Session,
+    credentials: Credentials,
+    subscription_id: str,
+    update_tag: int,
+    common_job_parameters: dict,
+) -> None:
+    logger.info(
+        f"Syncing Azure Data Lake File Systems for subscription {subscription_id}.",
+    )
+    client = StorageManagementClient(credentials.credential, subscription_id)
+
+    datalake_accounts = get_datalake_accounts(credentials, subscription_id)
+    for account in datalake_accounts:
+        account_id = account["id"]
+        raw_filesystems = get_filesystems_for_account(client, account)
+        transformed_filesystems = transform_datalake_filesystems(raw_filesystems)
+
+        load_datalake_filesystems(
+            neo4j_session,
+            transformed_filesystems,
+            account_id,
+            update_tag,
+        )
+
+        cleanup_params = common_job_parameters.copy()
+        cleanup_params["STORAGE_ACCOUNT_ID"] = account_id
+        GraphJob.from_node_schema(AzureDataLakeFileSystemSchema(), cleanup_params).run(
+            neo4j_session,
+        )
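
_get_resource_group_from_id leans on the fixed shape of ARM resource IDs (/subscriptions/<sub>/resourceGroups/<rg>/providers/...). A standalone check of the same parsing logic with a made-up ID; note that lowercasing the whole ID also lowercases the returned group name:

def get_resource_group_from_id(resource_id: str) -> str:
    # Same logic as the helper above: find the "resourcegroups" segment,
    # return the segment that follows it.
    parts = resource_id.lower().split("/")
    return parts[parts.index("resourcegroups") + 1]


sample_id = (
    "/subscriptions/0000-1111/resourceGroups/My-RG"
    "/providers/Microsoft.Storage/storageAccounts/mylake"
)
assert get_resource_group_from_id(sample_id) == "my-rg"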
@@ -84,7 +84,7 @@ def _get_teams_repos_inner_func(
     repo_urls: list[str],
     repo_permissions: list[str],
 ) -> None:
-    logger.info(f"Loading team repos for {team_name}.")
+    logger.info(f"Retrieving team repos for {team_name}.")
     team_repos = _get_team_repos(org, api_url, token, team_name)
 
     # The `or []` is because `.nodes` can be None. See:
@@ -192,7 +192,7 @@ def _get_teams_users_inner_func(
     user_urls: List[str],
     user_roles: List[str],
 ) -> None:
-    logger.info(f"Loading team users for {team_name}.")
+    logger.info(f"Retrieving team users for {team_name}.")
     team_users = _get_team_users(org, api_url, token, team_name)
     # The `or []` is because `.nodes` can be None. See:
     # https://docs.github.com/en/graphql/reference/objects#teammemberconnection
@@ -299,7 +299,7 @@ def _get_child_teams_inner_func(
     team_name: str,
     team_urls: List[str],
 ) -> None:
-    logger.info(f"Loading child teams for {team_name}.")
+    logger.info(f"Retrieving child teams for {team_name}.")
     child_teams = _get_child_teams(org, api_url, token, team_name)
     # The `or []` is because `.nodes` can be None. See:
     # https://docs.github.com/en/graphql/reference/objects#teammemberconnection