cartography 0.116.0-py3-none-any.whl → 0.117.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cartography might be problematic.

cartography/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.116.0'
-__version_tuple__ = version_tuple = (0, 116, 0)
+__version__ = version = '0.117.0'
+__version_tuple__ = version_tuple = (0, 117, 0)
 
 __commit_id__ = commit_id = None
@@ -6,6 +6,7 @@ from typing import Optional
 from typing import Tuple
 from typing import Union
 
+import backoff
 import neo4j
 
 from cartography.graph.querybuilder import build_create_index_queries
@@ -14,11 +15,31 @@ from cartography.graph.querybuilder import build_ingestion_query
 from cartography.graph.querybuilder import build_matchlink_query
 from cartography.models.core.nodes import CartographyNodeSchema
 from cartography.models.core.relationships import CartographyRelSchema
+from cartography.util import backoff_handler
 from cartography.util import batch
 
 logger = logging.getLogger(__name__)
 
 
+@backoff.on_exception(  # type: ignore
+    backoff.expo,
+    (
+        ConnectionResetError,
+        neo4j.exceptions.ServiceUnavailable,
+        neo4j.exceptions.SessionExpired,
+        neo4j.exceptions.TransientError,
+    ),
+    max_tries=5,
+    on_backoff=backoff_handler,
+)
+def _run_index_query_with_retry(neo4j_session: neo4j.Session, query: str) -> None:
+    """
+    Execute an index creation query with retry logic.
+    Index creation requires autocommit transactions and can experience transient errors.
+    """
+    neo4j_session.run(query)
+
+
 def run_write_query(
     neo4j_session: neo4j.Session, query: str, **parameters: Any
 ) -> None:
@@ -269,7 +290,7 @@ def ensure_indexes(
         raise ValueError(
             'Query provided to `ensure_indexes()` does not start with "CREATE INDEX IF NOT EXISTS".',
         )
-    neo4j_session.run(query)
+    _run_index_query_with_retry(neo4j_session, query)
 
 
 def ensure_indexes_for_matchlinks(
@@ -288,7 +309,7 @@ def ensure_indexes_for_matchlinks(
         raise ValueError(
             'Query provided to `ensure_indexes_for_matchlinks()` does not start with "CREATE INDEX IF NOT EXISTS".',
        )
-    neo4j_session.run(query)
+    _run_index_query_with_retry(neo4j_session, query)
 
 
 def load(
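
For reference, the retry decorator introduced above follows the standard `backoff` pattern: exponential waits between attempts, a capped number of tries, and a hook that fires before each sleep. A minimal standalone sketch of that pattern; the `flaky_write` function and `log_retry` hook are illustrative, not part of cartography:

import logging

import backoff

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def log_retry(details: dict) -> None:
    # on_backoff hook: called before each sleep; `details` carries tries/wait/target.
    logger.info("Retry %s, sleeping %.1fs", details["tries"], details["wait"])


@backoff.on_exception(backoff.expo, ConnectionResetError, max_tries=5, on_backoff=log_retry)
def flaky_write() -> None:
    # Stand-in for neo4j_session.run(query); always fails to show the retry path.
    raise ConnectionResetError("simulated transient failure")


try:
    flaky_write()
except ConnectionResetError:
    print("gave up after 5 tries")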
@@ -170,6 +170,111 @@ async def get_blob_json_via_presigned(
     return response.json()
 
 
+async def _extract_parent_image_from_attestation(
+    ecr_client: ECRClient,
+    repo_name: str,
+    attestation_manifest_digest: str,
+    http_client: httpx.AsyncClient,
+) -> Optional[dict[str, str]]:
+    """
+    Extract parent image information from an in-toto provenance attestation.
+
+    This function fetches an attestation manifest, downloads its in-toto layer,
+    and extracts the parent image reference from the SLSA provenance materials.
+
+    :param ecr_client: ECR client for fetching manifests and layers
+    :param repo_name: ECR repository name
+    :param attestation_manifest_digest: Digest of the attestation manifest
+    :param http_client: HTTP client for downloading blobs
+    :return: Dict with parent_image_uri and parent_image_digest, or None if no parent image found
+    """
+    try:
+        attestation_manifest, _ = await batch_get_manifest(
+            ecr_client,
+            repo_name,
+            attestation_manifest_digest,
+            [ECR_OCI_MANIFEST_MT, ECR_DOCKER_MANIFEST_MT],
+        )
+
+        if not attestation_manifest:
+            logger.debug(
+                "No attestation manifest found for digest %s in repo %s",
+                attestation_manifest_digest,
+                repo_name,
+            )
+            return None
+
+        # Get the in-toto layer from the attestation manifest
+        layers = attestation_manifest.get("layers", [])
+        intoto_layer = next(
+            (
+                layer
+                for layer in layers
+                if "in-toto" in layer.get("mediaType", "").lower()
+            ),
+            None,
+        )
+
+        if not intoto_layer:
+            logger.debug(
+                "No in-toto layer found in attestation manifest %s",
+                attestation_manifest_digest,
+            )
+            return None
+
+        # Download the in-toto attestation blob
+        intoto_digest = intoto_layer.get("digest")
+        if not intoto_digest:
+            logger.debug("No digest found for in-toto layer")
+            return None
+
+        attestation_blob = await get_blob_json_via_presigned(
+            ecr_client,
+            repo_name,
+            intoto_digest,
+            http_client,
+        )
+
+        if not attestation_blob:
+            logger.debug("Failed to download attestation blob")
+            return None
+
+        # Extract parent image from SLSA provenance materials
+        materials = attestation_blob.get("predicate", {}).get("materials", [])
+        for material in materials:
+            uri = material.get("uri", "")
+            uri_l = uri.lower()
+            # Look for container image URIs that are NOT the dockerfile itself
+            is_container_ref = (
+                uri_l.startswith("pkg:docker/")
+                or uri_l.startswith("pkg:oci/")
+                or uri_l.startswith("oci://")
+            )
+            if is_container_ref and "dockerfile" not in uri_l:
+                digest_obj = material.get("digest", {})
+                sha256_digest = digest_obj.get("sha256")
+                if sha256_digest:
+                    return {
+                        "parent_image_uri": uri,
+                        "parent_image_digest": f"sha256:{sha256_digest}",
+                    }
+
+        logger.debug(
+            "No parent image found in attestation materials for %s",
+            attestation_manifest_digest,
+        )
+        return None
+
+    except Exception as e:
+        logger.warning(
+            "Error extracting parent image from attestation %s in repo %s: %s",
+            attestation_manifest_digest,
+            repo_name,
+            e,
+        )
+        return None
+
+
 async def _diff_ids_for_manifest(
     ecr_client: ECRClient,
     repo_name: str,
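
For context, a buildx-style SLSA provenance payload roughly matches the sketch below, and the materials scan above picks the first non-Dockerfile container reference. The sample payload is illustrative, not taken from a real registry:

# Minimal sketch of the materials scan, assuming a buildx-style provenance blob.
attestation_blob = {
    "predicateType": "https://slsa.dev/provenance/v0.2",
    "predicate": {
        "materials": [
            {"uri": "pkg:docker/docker/dockerfile@1"},  # build frontend, skipped
            {
                "uri": "pkg:docker/python@3.12-slim?platform=linux%2Famd64",
                "digest": {"sha256": "abc123"},
            },
        ]
    },
}

for material in attestation_blob["predicate"]["materials"]:
    uri = material.get("uri", "").lower()
    if uri.startswith(("pkg:docker/", "pkg:oci/", "oci://")) and "dockerfile" not in uri:
        print(material["uri"], "sha256:" + material["digest"]["sha256"])
        break  # first match wins, mirroring the function above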
@@ -228,6 +333,7 @@ async def _diff_ids_for_manifest(
 def transform_ecr_image_layers(
     image_layers_data: dict[str, dict[str, list[str]]],
     image_digest_map: dict[str, str],
+    image_attestation_map: Optional[dict[str, dict[str, str]]] = None,
 ) -> tuple[list[dict], list[dict]]:
     """
     Transform image layer data into format suitable for Neo4j ingestion.
@@ -235,8 +341,11 @@ def transform_ecr_image_layers(
 
     :param image_layers_data: Map of image URI to platform to diff_ids
     :param image_digest_map: Map of image URI to image digest
+    :param image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
     :return: List of layer objects ready for ingestion
     """
+    if image_attestation_map is None:
+        image_attestation_map = {}
     layers_by_diff_id: dict[str, dict[str, Any]] = {}
     memberships_by_digest: dict[str, dict[str, Any]] = {}
 
@@ -278,10 +387,20 @@ def transform_ecr_image_layers(
                 layer["tail_image_ids"].add(image_digest)
 
         if ordered_layers_for_image:
-            memberships_by_digest[image_digest] = {
+            membership: dict[str, Any] = {
                 "layer_diff_ids": ordered_layers_for_image,
             }
 
+            # Add attestation data if available for this image
+            if image_uri in image_attestation_map:
+                attestation = image_attestation_map[image_uri]
+                membership["parent_image_uri"] = attestation["parent_image_uri"]
+                membership["parent_image_digest"] = attestation["parent_image_digest"]
+                membership["from_attestation"] = True
+                membership["confidence"] = "explicit"
+
+            memberships_by_digest[image_digest] = membership
+
     # Convert sets back to lists for Neo4j ingestion
     layers = []
     for layer in layers_by_diff_id.values():
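
The membership records built here only carry provenance fields when an attestation was found for the image. A hedged sketch of the two resulting shapes (URIs and digests are made up):

# Hypothetical membership entries produced by transform_ecr_image_layers.
membership_with_attestation = {
    "layer_diff_ids": ["sha256:layer1", "sha256:layer2"],
    "parent_image_uri": "pkg:docker/python@3.12-slim",
    "parent_image_digest": "sha256:abc123",
    "from_attestation": True,
    "confidence": "explicit",  # parent link came straight from SLSA provenance
}

# Without an attestation, only the ordered layer list is present.
membership_plain = {"layer_diff_ids": ["sha256:layer1", "sha256:layer2"]}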
@@ -350,12 +469,18 @@
     ecr_client: ECRClient,
     repo_images_list: list[dict],
     max_concurrent: int = 200,
-) -> tuple[dict[str, dict[str, list[str]]], dict[str, str]]:
+) -> tuple[dict[str, dict[str, list[str]]], dict[str, str], dict[str, dict[str, str]]]:
     """
     Fetch image layers for ECR images in parallel with caching and non-blocking I/O.
+
+    Returns:
+    - image_layers_data: Map of image URI to platform to diff_ids
+    - image_digest_map: Map of image URI to image digest
+    - image_attestation_map: Map of image URI to attestation data (parent_image_uri, parent_image_digest)
     """
     image_layers_data: dict[str, dict[str, list[str]]] = {}
     image_digest_map: dict[str, str] = {}
+    image_attestation_map: dict[str, dict[str, str]] = {}
     semaphore = asyncio.Semaphore(max_concurrent)
 
     # Cache for manifest fetches keyed by (repo_name, imageDigest)
@@ -402,8 +527,8 @@
     async def fetch_single_image_layers(
         repo_image: dict,
         http_client: httpx.AsyncClient,
-    ) -> Optional[tuple[str, str, dict[str, list[str]]]]:
-        """Fetch layers for a single image."""
+    ) -> Optional[tuple[str, str, dict[str, list[str]], Optional[dict[str, str]]]]:
+        """Fetch layers for a single image and extract attestation if present."""
        async with semaphore:
             # Caller guarantees these fields exist in every repo_image
             uri = repo_image["uri"]
@@ -426,24 +551,37 @@
 
             manifest_media_type = (media_type or doc.get("mediaType", "")).lower()
             platform_layers: dict[str, list[str]] = {}
+            attestation_data: Optional[dict[str, str]] = None
 
             if doc.get("manifests") and manifest_media_type in INDEX_MEDIA_TYPES_LOWER:
 
                 async def _process_child_manifest(
                     manifest_ref: dict,
-                ) -> dict[str, list[str]]:
-                    # Skip attestation manifests - these aren't real images
+                ) -> tuple[dict[str, list[str]], Optional[dict[str, str]]]:
+                    # Check if this is an attestation manifest
                     if (
                         manifest_ref.get("annotations", {}).get(
                             "vnd.docker.reference.type"
                         )
                         == "attestation-manifest"
                     ):
-                        return {}
+                        # Extract base image from attestation
+                        child_digest = manifest_ref.get("digest")
+                        if child_digest:
+                            attestation_info = (
+                                await _extract_parent_image_from_attestation(
+                                    ecr_client,
+                                    repo_name,
+                                    child_digest,
+                                    http_client,
+                                )
+                            )
+                            return {}, attestation_info
+                        return {}, None
 
                     child_digest = manifest_ref.get("digest")
                     if not child_digest:
-                        return {}
+                        return {}, None
 
                     # Use optimized caching for child manifest
                     child_doc, _ = await _fetch_and_cache_manifest(
@@ -452,16 +590,17 @@
                         [ECR_OCI_MANIFEST_MT, ECR_DOCKER_MANIFEST_MT],
                     )
                     if not child_doc:
-                        return {}
+                        return {}, None
 
                     platform_hint = extract_platform_from_manifest(manifest_ref)
-                    return await _diff_ids_for_manifest(
+                    diff_map = await _diff_ids_for_manifest(
                         ecr_client,
                         repo_name,
                         child_doc,
                         http_client,
                         platform_hint,
                     )
+                    return diff_map, None
 
                 # Process all child manifests in parallel
                 child_tasks = [
@@ -474,8 +613,13 @@
 
                 # Merge results from successful child manifest processing
                 for result in child_results:
-                    if isinstance(result, dict):
-                        platform_layers.update(result)
+                    if isinstance(result, tuple) and len(result) == 2:
+                        layer_data, attest_data = result
+                        if layer_data:
+                            platform_layers.update(layer_data)
+                        if attest_data and not attestation_data:
+                            # Use first attestation found
+                            attestation_data = attest_data
             else:
                 diff_map = await _diff_ids_for_manifest(
                     ecr_client,
@@ -487,7 +631,7 @@
                 platform_layers.update(diff_map)
 
             if platform_layers:
-                return uri, digest, platform_layers
+                return uri, digest, platform_layers, attestation_data
 
             return None
 
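
Because the child tasks above run under asyncio.gather(..., return_exceptions=True), exceptions come back as ordinary objects, and the isinstance(result, tuple) check in the merge loop filters them out. A self-contained sketch of that pattern, with toy coroutines standing in for cartography's:

import asyncio
from typing import Optional


async def child(n: int) -> tuple[dict, Optional[str]]:
    # Toy stand-in for _process_child_manifest: one child fails.
    if n == 2:
        raise RuntimeError("boom")
    return {f"platform-{n}": ["diff-id"]}, ("attestation" if n == 0 else None)


async def main() -> None:
    results = await asyncio.gather(*(child(n) for n in range(3)), return_exceptions=True)
    merged: dict = {}
    attestation: Optional[str] = None
    for result in results:
        # Exceptions arrive as objects, so this check skips the RuntimeError.
        if isinstance(result, tuple) and len(result) == 2:
            layers, attest = result
            merged.update(layers)
            if attest and not attestation:
                attestation = attest  # keep the first attestation found
    print(merged, attestation)


asyncio.run(main())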
@@ -507,7 +651,7 @@
         )
 
     if not tasks:
-        return image_layers_data, image_digest_map
+        return image_layers_data, image_digest_map, image_attestation_map
 
     progress_interval = max(1, min(100, total // 10 or 1))
     completed = 0
@@ -526,16 +670,22 @@
             )
 
         if result:
-            uri, digest, layer_data = result
+            uri, digest, layer_data, attestation_data = result
             if not digest:
                 raise ValueError(f"Empty digest returned for image {uri}")
             image_layers_data[uri] = layer_data
             image_digest_map[uri] = digest
+            if attestation_data:
+                image_attestation_map[uri] = attestation_data
 
     logger.info(
         f"Successfully fetched layers for {len(image_layers_data)}/{len(repo_images_list)} images"
     )
-    return image_layers_data, image_digest_map
+    if image_attestation_map:
+        logger.info(
+            f"Found attestations with base image info for {len(image_attestation_map)} images"
+        )
+    return image_layers_data, image_digest_map, image_attestation_map
 
 
 def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict) -> None:
@@ -613,9 +763,11 @@ def sync(
         f"Starting to fetch layers for {len(repo_images_list)} images..."
     )
 
-    async def _fetch_with_async_client() -> (
-        tuple[dict[str, dict[str, list[str]]], dict[str, str]]
-    ):
+    async def _fetch_with_async_client() -> tuple[
+        dict[str, dict[str, list[str]]],
+        dict[str, str],
+        dict[str, dict[str, str]],
+    ]:
         # Use credentials from the existing boto3 session
         credentials = boto3_session.get_credentials()
         session = aioboto3.Session(
@@ -635,8 +787,8 @@ def sync(
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
 
-    image_layers_data, image_digest_map = loop.run_until_complete(
-        _fetch_with_async_client()
+    image_layers_data, image_digest_map, image_attestation_map = (
+        loop.run_until_complete(_fetch_with_async_client())
     )
 
     logger.info(
@@ -645,6 +797,7 @@ def sync(
     layers, memberships = transform_ecr_image_layers(
         image_layers_data,
         image_digest_map,
+        image_attestation_map,
     )
     load_ecr_image_layers(
         neo4j_session,
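
The synchronous sync() entry point drives the async fetch by creating and installing a dedicated event loop. A minimal sketch of that bridge with a stand-in coroutine; the loop.close() call is an addition for a clean exit, not shown in the hunk above:

import asyncio


async def fetch_all() -> tuple[dict, dict, dict]:
    # Stand-in for _fetch_with_async_client(): returns the three maps.
    return {"uri": {"linux/amd64": ["diff-id"]}}, {"uri": "sha256:abc"}, {}


def sync_entry_point() -> None:
    # Create and install a dedicated loop, as sync() does above.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        layers, digests, attestations = loop.run_until_complete(fetch_all())
    finally:
        loop.close()
    print(layers, digests, attestations)


sync_entry_point()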
@@ -9,7 +9,9 @@ from cartography.util import timeit
 
 from . import app_service
 from . import compute
+from . import container_instances
 from . import cosmosdb
+from . import data_lake
 from . import functions
 from . import logic_apps
 from . import resource_groups
@@ -30,6 +32,13 @@ def _sync_one_subscription(
     update_tag: int,
     common_job_parameters: Dict,
 ) -> None:
+    container_instances.sync(
+        neo4j_session,
+        credentials,
+        subscription_id,
+        update_tag,
+        common_job_parameters,
+    )
     compute.sync(
         neo4j_session,
         credentials.credential,
@@ -86,6 +95,13 @@ def _sync_one_subscription(
         update_tag,
         common_job_parameters,
     )
+    data_lake.sync(
+        neo4j_session,
+        credentials,
+        subscription_id,
+        update_tag,
+        common_job_parameters,
+    )
 
 
 def _sync_tenant(
@@ -0,0 +1,95 @@
+import logging
+from typing import Any
+
+import neo4j
+from azure.core.exceptions import ClientAuthenticationError
+from azure.core.exceptions import HttpResponseError
+from azure.mgmt.containerinstance import ContainerInstanceManagementClient
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.azure.container_instance import AzureContainerInstanceSchema
+from cartography.util import timeit
+
+from .util.credentials import Credentials
+
+logger = logging.getLogger(__name__)
+
+
+@timeit
+def get_container_instances(
+    credentials: Credentials, subscription_id: str
+) -> list[dict]:
+    try:
+        client = ContainerInstanceManagementClient(
+            credentials.credential, subscription_id
+        )
+        # NOTE: Azure Container Instances are called "Container Groups" in the SDK
+        return [cg.as_dict() for cg in client.container_groups.list()]
+    except (ClientAuthenticationError, HttpResponseError) as e:
+        logger.warning(
+            f"Failed to get Container Instances for subscription {subscription_id}: {str(e)}"
+        )
+        return []
+
+
+def transform_container_instances(container_groups: list[dict]) -> list[dict]:
+    transformed_instances: list[dict[str, Any]] = []
+    for group in container_groups:
+        transformed_instance = {
+            "id": group.get("id"),
+            "name": group.get("name"),
+            "location": group.get("location"),
+            "type": group.get("type"),
+            "provisioning_state": group.get("properties", {}).get("provisioning_state"),
+            "ip_address": ((group.get("properties") or {}).get("ip_address") or {}).get(
+                "ip"
+            ),
+            "os_type": group.get("properties", {}).get("os_type"),
+        }
+        transformed_instances.append(transformed_instance)
+    return transformed_instances
+
+
+@timeit
+def load_container_instances(
+    neo4j_session: neo4j.Session,
+    data: list[dict[str, Any]],
+    subscription_id: str,
+    update_tag: int,
+) -> None:
+    load(
+        neo4j_session,
+        AzureContainerInstanceSchema(),
+        data,
+        lastupdated=update_tag,
+        AZURE_SUBSCRIPTION_ID=subscription_id,
+    )
+
+
+@timeit
+def cleanup_container_instances(
+    neo4j_session: neo4j.Session, common_job_parameters: dict
+) -> None:
+    GraphJob.from_node_schema(
+        AzureContainerInstanceSchema(), common_job_parameters
+    ).run(neo4j_session)
+
+
+@timeit
+def sync(
+    neo4j_session: neo4j.Session,
+    credentials: Credentials,
+    subscription_id: str,
+    update_tag: int,
+    common_job_parameters: dict,
+) -> None:
+    logger.info(
+        f"Syncing Azure Container Instances for subscription {subscription_id}."
+    )
+    raw_groups = get_container_instances(credentials, subscription_id)
+    transformed_groups = transform_container_instances(raw_groups)
+    load_container_instances(
+        neo4j_session, transformed_groups, subscription_id, update_tag
+    )
+    cleanup_container_instances(neo4j_session, common_job_parameters)
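
The transform flattens the SDK's nested container-group dict into the flat properties the schema ingests. A quick illustration with a made-up container group; the import path is an assumption inferred from the `from . import container_instances` hunk above:

# Assumes the new module lands at cartography.intel.azure.container_instances.
from cartography.intel.azure.container_instances import transform_container_instances

sample_group = {  # hypothetical ContainerGroup.as_dict() output, trimmed
    "id": "/subscriptions/sub-1/resourceGroups/rg-1/providers/Microsoft.ContainerInstance/containerGroups/app",
    "name": "app",
    "location": "eastus",
    "type": "Microsoft.ContainerInstance/containerGroups",
    "properties": {
        "provisioning_state": "Succeeded",
        "ip_address": {"ip": "20.0.0.4"},
        "os_type": "Linux",
    },
}

print(transform_container_instances([sample_group]))
# -> one flat dict with id, name, location, type, provisioning_state,
#    ip_address "20.0.0.4", and os_type "Linux"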
@@ -0,0 +1,124 @@
+import logging
+from typing import Any
+
+import neo4j
+from azure.core.exceptions import ClientAuthenticationError
+from azure.core.exceptions import HttpResponseError
+from azure.mgmt.storage import StorageManagementClient
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.azure.data_lake_filesystem import AzureDataLakeFileSystemSchema
+from cartography.util import timeit
+
+from .util.credentials import Credentials
+
+logger = logging.getLogger(__name__)
+
+
+def _get_resource_group_from_id(resource_id: str) -> str:
+    """
+    Helper function to parse the resource group name from a full resource ID string.
+    """
+    parts = resource_id.lower().split("/")
+    rg_index = parts.index("resourcegroups")
+    return parts[rg_index + 1]
+
+
+@timeit
+def get_datalake_accounts(credentials: Credentials, subscription_id: str) -> list[dict]:
+    try:
+        client = StorageManagementClient(credentials.credential, subscription_id)
+        storage_accounts = [sa.as_dict() for sa in client.storage_accounts.list()]
+        return [sa for sa in storage_accounts if sa.get("is_hns_enabled")]
+    except (ClientAuthenticationError, HttpResponseError) as e:
+        logger.warning(f"Failed to get Storage Accounts for Data Lake sync: {str(e)}")
+        return []
+
+
+@timeit
+def get_filesystems_for_account(
+    client: StorageManagementClient,
+    account: dict,
+) -> list[dict]:
+    resource_group_name = _get_resource_group_from_id(account["id"])
+    try:
+        return [
+            c.as_dict()
+            for c in client.blob_containers.list(
+                resource_group_name,
+                account["name"],
+            )
+        ]
+    except (ClientAuthenticationError, HttpResponseError) as e:
+        logger.warning(
+            f"Failed to get containers for storage account {account['name']}: {str(e)}",
+        )
+        return []
+
+
+@timeit
+def transform_datalake_filesystems(filesystems_response: list[dict]) -> list[dict]:
+    transformed_filesystems: list[dict[str, Any]] = []
+    for fs in filesystems_response:
+        transformed_filesystem = {
+            "id": fs.get("id"),
+            "name": fs.get("name"),
+            "public_access": fs.get("properties", {}).get("public_access"),
+            "last_modified_time": fs.get("properties", {}).get("last_modified_time"),
+            "has_immutability_policy": fs.get("properties", {}).get(
+                "has_immutability_policy",
+            ),
+            "has_legal_hold": fs.get("properties", {}).get("has_legal_hold"),
+        }
+        transformed_filesystems.append(transformed_filesystem)
+    return transformed_filesystems
+
+
+@timeit
+def load_datalake_filesystems(
+    neo4j_session: neo4j.Session,
+    data: list[dict[str, Any]],
+    storage_account_id: str,
+    update_tag: int,
+) -> None:
+    load(
+        neo4j_session,
+        AzureDataLakeFileSystemSchema(),
+        data,
+        lastupdated=update_tag,
+        STORAGE_ACCOUNT_ID=storage_account_id,
+    )
+
+
+@timeit
+def sync(
+    neo4j_session: neo4j.Session,
+    credentials: Credentials,
+    subscription_id: str,
+    update_tag: int,
+    common_job_parameters: dict,
+) -> None:
+    logger.info(
+        f"Syncing Azure Data Lake File Systems for subscription {subscription_id}.",
+    )
+    client = StorageManagementClient(credentials.credential, subscription_id)
+
+    datalake_accounts = get_datalake_accounts(credentials, subscription_id)
+    for account in datalake_accounts:
+        account_id = account["id"]
+        raw_filesystems = get_filesystems_for_account(client, account)
+        transformed_filesystems = transform_datalake_filesystems(raw_filesystems)
+
+        load_datalake_filesystems(
+            neo4j_session,
+            transformed_filesystems,
+            account_id,
+            update_tag,
+        )
+
+        cleanup_params = common_job_parameters.copy()
+        cleanup_params["STORAGE_ACCOUNT_ID"] = account_id
+        GraphJob.from_node_schema(AzureDataLakeFileSystemSchema(), cleanup_params).run(
+            neo4j_session,
+        )
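
_get_resource_group_from_id leans on the fixed shape of ARM resource IDs (/subscriptions/<sub>/resourceGroups/<rg>/providers/...). A standalone check of the same parsing logic with a made-up ID; note that lowercasing the whole ID also lowercases the returned group name:

def get_resource_group_from_id(resource_id: str) -> str:
    # Same logic as the helper above: find the "resourcegroups" segment,
    # return the segment that follows it.
    parts = resource_id.lower().split("/")
    return parts[parts.index("resourcegroups") + 1]


sample_id = (
    "/subscriptions/0000-1111/resourceGroups/My-RG"
    "/providers/Microsoft.Storage/storageAccounts/mylake"
)
assert get_resource_group_from_id(sample_id) == "my-rg"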
@@ -84,7 +84,7 @@ def _get_teams_repos_inner_func(
     repo_urls: list[str],
     repo_permissions: list[str],
 ) -> None:
-    logger.info(f"Loading team repos for {team_name}.")
+    logger.info(f"Retrieving team repos for {team_name}.")
     team_repos = _get_team_repos(org, api_url, token, team_name)
 
     # The `or []` is because `.nodes` can be None. See:
@@ -192,7 +192,7 @@ def _get_teams_users_inner_func(
     user_urls: List[str],
     user_roles: List[str],
 ) -> None:
-    logger.info(f"Loading team users for {team_name}.")
+    logger.info(f"Retrieving team users for {team_name}.")
     team_users = _get_team_users(org, api_url, token, team_name)
     # The `or []` is because `.nodes` can be None. See:
     # https://docs.github.com/en/graphql/reference/objects#teammemberconnection
@@ -299,7 +299,7 @@ def _get_child_teams_inner_func(
     team_name: str,
     team_urls: List[str],
 ) -> None:
-    logger.info(f"Loading child teams for {team_name}.")
+    logger.info(f"Retrieving child teams for {team_name}.")
     child_teams = _get_child_teams(org, api_url, token, team_name)
     # The `or []` is because `.nodes` can be None. See:
     # https://docs.github.com/en/graphql/reference/objects#teammemberconnection