cartography 0.115.0__py3-none-any.whl → 0.116.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartography/_version.py +2 -2
- cartography/client/core/tx.py +1 -1
- cartography/intel/aws/ecr_image_layers.py +664 -0
- cartography/intel/aws/resources.py +2 -0
- cartography/intel/azure/__init__.py +8 -0
- cartography/intel/azure/resource_groups.py +82 -0
- cartography/models/aws/ecr/image.py +21 -0
- cartography/models/aws/ecr/image_layer.py +107 -0
- cartography/models/azure/resource_groups.py +52 -0
- cartography/rules/README.md +1 -0
- cartography/rules/__init__.py +0 -0
- cartography/rules/cli.py +342 -0
- cartography/rules/data/__init__.py +0 -0
- cartography/rules/data/frameworks/__init__.py +12 -0
- cartography/rules/data/frameworks/mitre_attack/__init__.py +14 -0
- cartography/rules/data/frameworks/mitre_attack/requirements/__init__.py +0 -0
- cartography/rules/data/frameworks/mitre_attack/requirements/t1190_exploit_public_facing_application/__init__.py +135 -0
- cartography/rules/formatters.py +46 -0
- cartography/rules/runners.py +338 -0
- cartography/rules/spec/__init__.py +0 -0
- cartography/rules/spec/model.py +88 -0
- cartography/rules/spec/result.py +46 -0
- {cartography-0.115.0.dist-info → cartography-0.116.0.dist-info}/METADATA +18 -4
- {cartography-0.115.0.dist-info → cartography-0.116.0.dist-info}/RECORD +28 -11
- {cartography-0.115.0.dist-info → cartography-0.116.0.dist-info}/entry_points.txt +1 -0
- {cartography-0.115.0.dist-info → cartography-0.116.0.dist-info}/WHEEL +0 -0
- {cartography-0.115.0.dist-info → cartography-0.116.0.dist-info}/licenses/LICENSE +0 -0
- {cartography-0.115.0.dist-info → cartography-0.116.0.dist-info}/top_level.txt +0 -0
cartography/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.115.0'
-__version_tuple__ = version_tuple = (0, 115, 0)
+__version__ = version = '0.116.0'
+__version_tuple__ = version_tuple = (0, 116, 0)
 
 __commit_id__ = commit_id = None
cartography/client/core/tx.py
CHANGED
@@ -42,7 +42,7 @@ def read_list_of_values_tx(
     Example usage:
     query = "MATCH (a:TestNode) RETURN a.name ORDER BY a.name"
 
-    values = neo4j_session.read_transaction(read_list_of_values_tx, query)
+    values = neo4j_session.execute_read(read_list_of_values_tx, query)
 
     :param tx: A neo4j read transaction object
     :param query: A neo4j query string that returns a list of single values. For example,
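The docstring fix above tracks the Neo4j 5.x Python driver, which replaced Session.read_transaction with Session.execute_read. A minimal sketch of the documented call, with a placeholder Bolt URI and credentials:

    from neo4j import GraphDatabase

    from cartography.client.core.tx import read_list_of_values_tx

    # Placeholder connection details for illustration only.
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
    with driver.session() as session:
        # execute_read hands the managed transaction plus our query to the callback.
        names = session.execute_read(
            read_list_of_values_tx,
            "MATCH (a:TestNode) RETURN a.name ORDER BY a.name",
        )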
cartography/intel/aws/ecr_image_layers.py
ADDED
@@ -0,0 +1,664 @@
+"""
+ECR Image Layers module - fetches and syncs detailed container image layer information.
+
+This is separate from the main ECR module to allow independent execution since layer
+fetching can be significantly slower than basic ECR repository/image syncing.
+"""
+
+import asyncio
+import json
+import logging
+from typing import Any
+from typing import Optional
+
+import aioboto3
+import boto3
+import httpx
+import neo4j
+from botocore.exceptions import ClientError
+from types_aiobotocore_ecr import ECRClient
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.aws.ecr.image import ECRImageSchema
+from cartography.models.aws.ecr.image_layer import ECRImageLayerSchema
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+
+
+EMPTY_LAYER_DIFF_ID = (
+    "sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef"
+)
+
+# ECR manifest media types
+ECR_DOCKER_INDEX_MT = "application/vnd.docker.distribution.manifest.list.v2+json"
+ECR_DOCKER_MANIFEST_MT = "application/vnd.docker.distribution.manifest.v2+json"
+ECR_OCI_INDEX_MT = "application/vnd.oci.image.index.v1+json"
+ECR_OCI_MANIFEST_MT = "application/vnd.oci.image.manifest.v1+json"
+
+ALL_ACCEPTED = [
+    ECR_OCI_INDEX_MT,
+    ECR_DOCKER_INDEX_MT,
+    ECR_OCI_MANIFEST_MT,
+    ECR_DOCKER_MANIFEST_MT,
+]
+
+INDEX_MEDIA_TYPES = {ECR_OCI_INDEX_MT, ECR_DOCKER_INDEX_MT}
+INDEX_MEDIA_TYPES_LOWER = {mt.lower() for mt in INDEX_MEDIA_TYPES}
+
+# Media types that should be skipped when processing manifests
+SKIP_CONFIG_MEDIA_TYPE_FRAGMENTS = {"buildkit", "attestation", "in-toto"}
+
+
+def extract_repo_uri_from_image_uri(image_uri: str) -> str:
+    """
+    Extract repository URI from image URI by removing tag or digest.
+
+    Examples:
+        "repo@sha256:digest" -> "repo"
+        "repo:tag" -> "repo"
+        "repo" -> "repo"
+    """
+    if "@sha256:" in image_uri:
+        return image_uri.split("@", 1)[0]
+    elif ":" in image_uri:
+        return image_uri.rsplit(":", 1)[0]
+    else:
+        return image_uri
+
+
+def extract_platform_from_manifest(manifest_ref: dict) -> str:
+    """Extract platform string from manifest reference."""
+    platform_info = manifest_ref.get("platform", {})
+    return _format_platform(
+        platform_info.get("os"),
+        platform_info.get("architecture"),
+        platform_info.get("variant"),
+    )
+
+
+def _format_platform(
+    os_name: Optional[str],
+    architecture: Optional[str],
+    variant: Optional[str] = None,
+) -> str:
+    components = [os_name or "unknown", architecture or "unknown"]
+    if variant:
+        components.append(variant)
+    return "/".join(components)
+
+
+async def batch_get_manifest(
+    ecr_client: ECRClient, repo: str, image_ref: str, accepted_media_types: list[str]
+) -> tuple[dict, str]:
+    """Get image manifest using batch_get_image API."""
+    try:
+        resp = await ecr_client.batch_get_image(
+            repositoryName=repo,
+            imageIds=(
+                [{"imageDigest": image_ref}]
+                if image_ref.startswith("sha256:")
+                else [{"imageTag": image_ref}]
+            ),
+            acceptedMediaTypes=accepted_media_types,
+        )
+    except ClientError as error:
+        error_code = error.response.get("Error", {}).get("Code", "")
+        if error_code == "ImageNotFoundException":
+            logger.warning(
+                "Image %s:%s not found while fetching manifest", repo, image_ref
+            )
+            return {}, ""
+        # Fail loudly on throttling or unexpected AWS errors
+        logger.error(
+            "Failed to get manifest for %s:%s due to AWS error %s",
+            repo,
+            image_ref,
+            error_code,
+        )
+        raise
+    except Exception:
+        logger.exception(
+            "Unexpected error fetching manifest for %s:%s", repo, image_ref
+        )
+        raise
+
+    if not resp.get("images"):
+        logger.warning(f"No image found for {repo}:{image_ref}")
+        return {}, ""
+
+    manifest_json = json.loads(resp["images"][0]["imageManifest"])
+    media_type = resp["images"][0].get("imageManifestMediaType", "")
+    return manifest_json, media_type
+
+
+async def get_blob_json_via_presigned(
+    ecr_client: ECRClient,
+    repo: str,
+    digest: str,
+    http_client: httpx.AsyncClient,
+) -> dict:
+    """Download and parse JSON blob using presigned URL."""
+    try:
+        url_response = await ecr_client.get_download_url_for_layer(
+            repositoryName=repo,
+            layerDigest=digest,
+        )
+    except ClientError as error:
+        logger.error(
+            "Failed to request download URL for layer %s in repo %s: %s",
+            digest,
+            repo,
+            error.response.get("Error", {}).get("Code", "unknown"),
+        )
+        raise
+
+    url = url_response["downloadUrl"]
+    try:
+        response = await http_client.get(url, timeout=30.0)
+        response.raise_for_status()
+    except httpx.HTTPError as error:
+        logger.error(
+            "HTTP error downloading blob %s for repo %s: %s",
+            digest,
+            repo,
+            error,
+        )
+        raise
+
+    return response.json()
+
+
+async def _diff_ids_for_manifest(
+    ecr_client: ECRClient,
+    repo_name: str,
+    manifest_doc: dict[str, Any],
+    http_client: httpx.AsyncClient,
+    platform_hint: Optional[str],
+) -> dict[str, list[str]]:
+    config = manifest_doc.get("config", {})
+    config_media_type = config.get("mediaType", "").lower()
+
+    # Skip certain media types
+    if any(
+        skip_fragment in config_media_type
+        for skip_fragment in SKIP_CONFIG_MEDIA_TYPE_FRAGMENTS
+    ):
+        return {}
+
+    layers = manifest_doc.get("layers", [])
+    if layers and all(
+        "in-toto" in layer.get("mediaType", "").lower() for layer in layers
+    ):
+        return {}
+
+    cfg_digest = config.get("digest")
+    if not cfg_digest:
+        return {}
+
+    cfg_json = await get_blob_json_via_presigned(
+        ecr_client,
+        repo_name,
+        cfg_digest,
+        http_client,
+    )
+    if not cfg_json:
+        return {}
+
+    # Docker API uses inconsistent casing - check for known variations
+    rootfs = cfg_json.get("rootfs") or cfg_json.get("RootFS") or {}
+    diff_ids = rootfs.get("diff_ids") or rootfs.get("DiffIDs") or []
+    if not diff_ids:
+        return {}
+
+    if platform_hint:
+        platform = platform_hint
+    else:
+        # Docker API uses inconsistent casing for platform components
+        platform = _format_platform(
+            cfg_json.get("os") or cfg_json.get("OS"),
+            cfg_json.get("architecture") or cfg_json.get("Architecture"),
+            cfg_json.get("variant") or cfg_json.get("Variant"),
+        )
+
+    return {platform: diff_ids}
+
+
+def transform_ecr_image_layers(
+    image_layers_data: dict[str, dict[str, list[str]]],
+    image_digest_map: dict[str, str],
+) -> tuple[list[dict], list[dict]]:
+    """
+    Transform image layer data into format suitable for Neo4j ingestion.
+    Creates linked list structure with NEXT relationships and HEAD/TAIL markers.
+
+    :param image_layers_data: Map of image URI to platform to diff_ids
+    :param image_digest_map: Map of image URI to image digest
+    :return: List of layer objects ready for ingestion
+    """
+    layers_by_diff_id: dict[str, dict[str, Any]] = {}
+    memberships_by_digest: dict[str, dict[str, Any]] = {}
+
+    for image_uri, platforms in image_layers_data.items():
+        # fetch_image_layers_async guarantees every uri in image_layers_data has a digest
+        image_digest = image_digest_map[image_uri]
+
+        ordered_layers_for_image: Optional[list[str]] = None
+
+        for _, diff_ids in platforms.items():
+            if not diff_ids:
+                continue
+
+            if ordered_layers_for_image is None:
+                ordered_layers_for_image = list(diff_ids)
+
+            # Process each layer in the chain
+            for i, diff_id in enumerate(diff_ids):
+                # Get or create layer
+                if diff_id not in layers_by_diff_id:
+                    layers_by_diff_id[diff_id] = {
+                        "diff_id": diff_id,
+                        "is_empty": diff_id == EMPTY_LAYER_DIFF_ID,
+                        "next_diff_ids": set(),
+                        "head_image_ids": set(),
+                        "tail_image_ids": set(),
+                    }
+
+                layer = layers_by_diff_id[diff_id]
+
+                # Add NEXT relationship if not the last layer
+                if i < len(diff_ids) - 1:
+                    layer["next_diff_ids"].add(diff_ids[i + 1])
+
+                # Track which images this layer is HEAD or TAIL of
+                if i == 0:
+                    layer["head_image_ids"].add(image_digest)
+                if i == len(diff_ids) - 1:
+                    layer["tail_image_ids"].add(image_digest)
+
+        if ordered_layers_for_image:
+            memberships_by_digest[image_digest] = {
+                "layer_diff_ids": ordered_layers_for_image,
+            }
+
+    # Convert sets back to lists for Neo4j ingestion
+    layers = []
+    for layer in layers_by_diff_id.values():
+        layer_dict: dict[str, Any] = {
+            "diff_id": layer["diff_id"],
+            "is_empty": layer["is_empty"],
+        }
+        if layer["next_diff_ids"]:
+            layer_dict["next_diff_ids"] = list(layer["next_diff_ids"])
+        if layer["head_image_ids"]:
+            layer_dict["head_image_ids"] = list(layer["head_image_ids"])
+        if layer["tail_image_ids"]:
+            layer_dict["tail_image_ids"] = list(layer["tail_image_ids"])
+        layers.append(layer_dict)
+
+    # Reconstruct memberships list with imageDigest field
+    memberships = [
+        {"imageDigest": digest, **membership_data}
+        for digest, membership_data in memberships_by_digest.items()
+    ]
+
+    return layers, memberships
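A worked sketch of what transform_ecr_image_layers returns for a hypothetical two-layer image (digests shortened for readability):

    layers, memberships = transform_ecr_image_layers(
        {"repo@sha256:img": {"linux/amd64": ["sha256:aaa", "sha256:bbb"]}},
        {"repo@sha256:img": "sha256:img"},
    )
    # layers == [
    #     {"diff_id": "sha256:aaa", "is_empty": False,
    #      "next_diff_ids": ["sha256:bbb"], "head_image_ids": ["sha256:img"]},
    #     {"diff_id": "sha256:bbb", "is_empty": False,
    #      "tail_image_ids": ["sha256:img"]},
    # ]
    # memberships == [
    #     {"imageDigest": "sha256:img", "layer_diff_ids": ["sha256:aaa", "sha256:bbb"]},
    # ]

The first diff_id is marked as a HEAD of the image and points to its successor via next_diff_ids; the last is marked as a TAIL; the membership record preserves the image's full ordered layer list.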
+
+
+@timeit
+def load_ecr_image_layers(
+    neo4j_session: neo4j.Session,
+    image_layers: list[dict],
+    region: str,
+    current_aws_account_id: str,
+    aws_update_tag: int,
+) -> None:
+    """Load image layers into Neo4j."""
+    logger.info(
+        f"Loading {len(image_layers)} image layers for region {region} into graph.",
+    )
+
+    load(
+        neo4j_session,
+        ECRImageLayerSchema(),
+        image_layers,
+        lastupdated=aws_update_tag,
+        AWS_ID=current_aws_account_id,
+    )
+
+
+@timeit
+def load_ecr_image_layer_memberships(
+    neo4j_session: neo4j.Session,
+    memberships: list[dict[str, Any]],
+    region: str,
+    current_aws_account_id: str,
+    aws_update_tag: int,
+) -> None:
+    load(
+        neo4j_session,
+        ECRImageSchema(),
+        memberships,
+        lastupdated=aws_update_tag,
+        Region=region,
+        AWS_ID=current_aws_account_id,
+    )
+
+
+async def fetch_image_layers_async(
+    ecr_client: ECRClient,
+    repo_images_list: list[dict],
+    max_concurrent: int = 200,
+) -> tuple[dict[str, dict[str, list[str]]], dict[str, str]]:
+    """
+    Fetch image layers for ECR images in parallel with caching and non-blocking I/O.
+    """
+    image_layers_data: dict[str, dict[str, list[str]]] = {}
+    image_digest_map: dict[str, str] = {}
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    # Cache for manifest fetches keyed by (repo_name, imageDigest)
+    manifest_cache: dict[tuple[str, str], tuple[dict, str]] = {}
+    # Lock for thread-safe cache access
+    cache_lock = asyncio.Lock()
+    # In-flight requests to coalesce duplicate fetches
+    inflight: dict[tuple[str, str], asyncio.Task] = {}
+
+    async def _fetch_and_cache_manifest(
+        repo_name: str, digest_or_tag: str, accepted: list[str]
+    ) -> tuple[dict, str]:
+        """
+        Fetch and cache manifest with double-checked locking and in-flight coalescing.
+        """
+        key = (repo_name, digest_or_tag)
+
+        # Fast path: check cache without lock
+        if key in manifest_cache:
+            return manifest_cache[key]
+
+        # Check for existing in-flight request
+        task = inflight.get(key)
+        if task is None:
+            # Create new task for this manifest
+            async def _do() -> tuple[dict, str]:
+                # Fetch without holding the lock
+                doc, mt = await batch_get_manifest(
+                    ecr_client, repo_name, digest_or_tag, accepted
+                )
+                # Store result under lock (second check to avoid races)
+                async with cache_lock:
+                    return manifest_cache.setdefault(key, (doc, mt))
+
+            task = asyncio.create_task(_do())
+            inflight[key] = task
+
+        try:
+            return await task
+        finally:
+            # Clean up inflight entry
+            inflight.pop(key, None)
+
+    async def fetch_single_image_layers(
+        repo_image: dict,
+        http_client: httpx.AsyncClient,
+    ) -> Optional[tuple[str, str, dict[str, list[str]]]]:
+        """Fetch layers for a single image."""
+        async with semaphore:
+            # Caller guarantees these fields exist in every repo_image
+            uri = repo_image["uri"]
+            digest = repo_image["imageDigest"]
+            repo_uri = repo_image["repo_uri"]
+
+            # Extract repository name
+            parts = repo_uri.split("/", 1)
+            if len(parts) != 2:
+                raise ValueError(f"Unexpected ECR repository URI format: {repo_uri}")
+            repo_name = parts[1]
+
+            # Get manifest using optimized caching
+            doc, media_type = await _fetch_and_cache_manifest(
+                repo_name, digest, ALL_ACCEPTED
+            )
+
+            if not doc:
+                return None
+
+            manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
+            platform_layers: dict[str, list[str]] = {}
+
+            if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
+
+                async def _process_child_manifest(
+                    manifest_ref: dict,
+                ) -> dict[str, list[str]]:
+                    # Skip attestation manifests - these aren't real images
+                    if (
+                        manifest_ref.get("annotations", {}).get(
+                            "vnd.docker.reference.type"
+                        )
+                        == "attestation-manifest"
+                    ):
+                        return {}
+
+                    child_digest = manifest_ref.get("digest")
+                    if not child_digest:
+                        return {}
+
+                    # Use optimized caching for child manifest
+                    child_doc, _ = await _fetch_and_cache_manifest(
+                        repo_name,
+                        child_digest,
+                        [ECR_OCI_MANIFEST_MT, ECR_DOCKER_MANIFEST_MT],
+                    )
+                    if not child_doc:
+                        return {}
+
+                    platform_hint = extract_platform_from_manifest(manifest_ref)
+                    return await _diff_ids_for_manifest(
+                        ecr_client,
+                        repo_name,
+                        child_doc,
+                        http_client,
+                        platform_hint,
+                    )
+
+                # Process all child manifests in parallel
+                child_tasks = [
+                    _process_child_manifest(manifest_ref)
+                    for manifest_ref in doc.get("manifests", [])
+                ]
+                child_results = await asyncio.gather(
+                    *child_tasks, return_exceptions=True
+                )
+
+                # Merge results from successful child manifest processing
+                for result in child_results:
+                    if isinstance(result, dict):
+                        platform_layers.update(result)
+            else:
+                diff_map = await _diff_ids_for_manifest(
+                    ecr_client,
+                    repo_name,
+                    doc,
+                    http_client,
+                    None,
+                )
+                platform_layers.update(diff_map)
+
+            if platform_layers:
+                return uri, digest, platform_layers
+
+            return None
+
+    async with httpx.AsyncClient() as http_client:
+        # Create tasks for all images
+        tasks = [
+            asyncio.create_task(
+                fetch_single_image_layers(repo_image, http_client),
+            )
+            for repo_image in repo_images_list
+        ]
+
+        # Process with progress logging
+        total = len(tasks)
+        logger.info(
+            f"Fetching layers for {total} images with {max_concurrent} concurrent connections..."
+        )
+
+        if not tasks:
+            return image_layers_data, image_digest_map
+
+        progress_interval = max(1, min(100, total // 10 or 1))
+        completed = 0
+
+        for task in asyncio.as_completed(tasks):
+            result = await task
+            completed += 1
+
+            if completed % progress_interval == 0 or completed == total:
+                percent = (completed / total) * 100
+                logger.info(
+                    "Fetched layer metadata for %d/%d images (%.1f%%)",
+                    completed,
+                    total,
+                    percent,
+                )
+
+            if result:
+                uri, digest, layer_data = result
+                if not digest:
+                    raise ValueError(f"Empty digest returned for image {uri}")
+                image_layers_data[uri] = layer_data
+                image_digest_map[uri] = digest
+
+    logger.info(
+        f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
+    )
+    return image_layers_data, image_digest_map
+
+
+def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
+    logger.debug("Running image layer cleanup job.")
+    GraphJob.from_node_schema(ECRImageLayerSchema(), common_job_parameters).run(
+        neo4j_session
+    )
+
+
+@timeit
+def sync(
+    neo4j_session: neo4j.Session,
+    boto3_session: boto3.session.Session,
+    regions: list[str],
+    current_aws_account_id: str,
+    update_tag: int,
+    common_job_parameters: dict,
+) -> None:
+    """
+    Sync ECR image layers. This fetches detailed layer information for ECR images
+    that already exist in the graph.
+
+    Prerequisites: Basic ECR data (repositories and images) must already be loaded
+    via the 'ecr' module before running this.
+
+    Layer fetching can be slow for accounts with many container images.
+    """
+
+    for region in regions:
+        logger.info(
+            "Syncing ECR image layers for region '%s' in account '%s'.",
+            region,
+            current_aws_account_id,
+        )
+
+        # Get ECR images from graph using standard client function
+        from cartography.client.aws.ecr import get_ecr_images
+
+        ecr_images = get_ecr_images(neo4j_session, current_aws_account_id)
+
+        # Filter by region and deduplicate by digest
+        repo_images_list = []
+        seen_digests = set()
+
+        for region_name, _, uri, _, digest in ecr_images:
+            if region_name == region and digest not in seen_digests:
+                seen_digests.add(digest)
+                repo_uri = extract_repo_uri_from_image_uri(uri)
+
+                # Create digest-based URI for manifest fetching
+                digest_uri = f"{repo_uri}@{digest}"
+
+                repo_images_list.append(
+                    {
+                        "imageDigest": digest,
+                        "uri": digest_uri,
+                        "repo_uri": repo_uri,
+                    }
+                )
+
+        logger.info(
+            f"Found {len(repo_images_list)} distinct ECR image digests in graph for region {region}"
+        )
+
+        if not repo_images_list:
+            logger.warning(
+                f"No ECR images found in graph for region {region}. "
+                f"Run 'ecr' sync first to populate basic ECR data."
+            )
+            continue
+
+        # Fetch and load image layers using async ECR client
+        if repo_images_list:
+            logger.info(
+                f"Starting to fetch layers for {len(repo_images_list)} images..."
+            )
+
+            async def _fetch_with_async_client() -> (
+                tuple[dict[str, dict[str, list[str]]], dict[str, str]]
+            ):
+                # Use credentials from the existing boto3 session
+                credentials = boto3_session.get_credentials()
+                session = aioboto3.Session(
+                    aws_access_key_id=credentials.access_key,
+                    aws_secret_access_key=credentials.secret_key,
+                    aws_session_token=credentials.token,
+                    region_name=region,
+                )
+                async with session.client("ecr") as ecr_client:
+                    return await fetch_image_layers_async(ecr_client, repo_images_list)
+
+            # Use get_event_loop() + run_until_complete() to avoid tearing down loop
+            try:
+                loop = asyncio.get_event_loop()
+            except RuntimeError:
+                # No event loop in current thread, create one
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+
+            image_layers_data, image_digest_map = loop.run_until_complete(
+                _fetch_with_async_client()
+            )
+
+            logger.info(
+                f"Successfully fetched layers for {len(image_layers_data)} images"
+            )
+            layers, memberships = transform_ecr_image_layers(
+                image_layers_data,
+                image_digest_map,
+            )
+            load_ecr_image_layers(
+                neo4j_session,
+                layers,
+                region,
+                current_aws_account_id,
+                update_tag,
+            )
+            load_ecr_image_layer_memberships(
+                neo4j_session,
+                memberships,
+                region,
+                current_aws_account_id,
+                update_tag,
+            )
+
+    cleanup(neo4j_session, common_job_parameters)
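End to end, sync() reads ECRImage nodes that the basic 'ecr' module already loaded, fetches manifests and config blobs concurrently, and writes ECRImageLayer nodes plus per-image layer memberships. A minimal sketch of driving the module directly; the connection details, account ID, update tag, and job-parameter keys below are placeholder assumptions:

    import time

    import boto3
    import neo4j

    from cartography.intel.aws import ecr_image_layers

    update_tag = int(time.time())
    driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
    with driver.session() as neo4j_session:
        ecr_image_layers.sync(
            neo4j_session,
            boto3.session.Session(),
            regions=["us-east-1"],
            current_aws_account_id="123456789012",
            update_tag=update_tag,
            # Keys assumed to mirror what the cleanup GraphJob expects.
            common_job_parameters={"UPDATE_TAG": update_tag, "AWS_ID": "123456789012"},
        )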
cartography/intel/aws/resources.py
CHANGED
@@ -14,6 +14,7 @@ from . import cognito
 from . import config
 from . import dynamodb
 from . import ecr
+from . import ecr_image_layers
 from . import ecs
 from . import efs
 from . import eks
@@ -88,6 +89,7 @@ RESOURCE_FUNCTIONS: Dict[str, Callable[..., None]] = {
     "ec2:volumes": sync_ebs_volumes,
     "ec2:snapshots": sync_ebs_snapshots,
     "ecr": ecr.sync,
+    "ecr:image_layers": ecr_image_layers.sync,
     "ecs": ecs.sync,
     "eks": eks.sync,
     "elasticache": elasticache.sync,
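With this registration the layer sync is individually selectable. Because ecr_image_layers.sync only enriches ECRImage nodes that already exist, the basic ecr sync must run first, e.g. (assuming cartography's existing --aws-requested-syncs CLI option):

    cartography --aws-requested-syncs ecr,ecr:image_layers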