cartography 0.111.0__py3-none-any.whl → 0.112.0__py3-none-any.whl
This diff reflects changes between publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of cartography might be problematic.
- cartography/_version.py +2 -2
- cartography/cli.py +11 -0
- cartography/config.py +8 -0
- cartography/data/indexes.cypher +0 -2
- cartography/intel/aws/apigateway.py +126 -17
- cartography/intel/aws/ec2/instances.py +3 -1
- cartography/intel/aws/ec2/network_interfaces.py +1 -1
- cartography/intel/aws/ec2/vpc_peerings.py +262 -125
- cartography/intel/azure/__init__.py +35 -32
- cartography/intel/azure/subscription.py +2 -2
- cartography/intel/azure/tenant.py +39 -30
- cartography/intel/azure/util/credentials.py +49 -174
- cartography/intel/entra/__init__.py +47 -1
- cartography/intel/entra/applications.py +220 -170
- cartography/intel/entra/groups.py +41 -22
- cartography/intel/entra/ou.py +28 -20
- cartography/intel/entra/users.py +24 -18
- cartography/intel/gcp/__init__.py +25 -8
- cartography/intel/gcp/compute.py +47 -12
- cartography/intel/kubernetes/__init__.py +26 -0
- cartography/intel/kubernetes/eks.py +402 -0
- cartography/intel/kubernetes/rbac.py +133 -0
- cartography/models/aws/apigateway/apigatewayintegration.py +79 -0
- cartography/models/aws/apigateway/apigatewaymethod.py +74 -0
- cartography/models/aws/ec2/vpc_peering.py +157 -0
- cartography/models/azure/principal.py +44 -0
- cartography/models/azure/tenant.py +20 -0
- cartography/models/kubernetes/clusterrolebindings.py +40 -0
- cartography/models/kubernetes/groups.py +107 -0
- cartography/models/kubernetes/oidc.py +51 -0
- cartography/models/kubernetes/rolebindings.py +40 -0
- cartography/models/kubernetes/users.py +105 -0
- cartography/util.py +2 -0
- {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/METADATA +8 -5
- {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/RECORD +39 -31
- cartography/data/jobs/cleanup/aws_import_vpc_peering_cleanup.json +0 -45
- {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/WHEEL +0 -0
- {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/entry_points.txt +0 -0
- {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/licenses/LICENSE +0 -0
- {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/top_level.txt +0 -0
cartography/intel/entra/ou.py
CHANGED
@@ -1,6 +1,8 @@
 # cartography/intel/entra/ou.py
 import logging
 from typing import Any
+from typing import AsyncGenerator
+from typing import Generator

 import neo4j
 from azure.identity import ClientSecretCredential
@@ -9,7 +11,6 @@ from msgraph.generated.models.administrative_unit import AdministrativeUnit

 from cartography.client.core.tx import load
 from cartography.graph.job import GraphJob
-from cartography.intel.entra.users import load_tenant
 from cartography.models.entra.ou import EntraOUSchema
 from cartography.util import timeit

@@ -17,12 +18,12 @@ logger = logging.getLogger(__name__)


 @timeit
-async def get_entra_ous(client: GraphServiceClient) -> list[AdministrativeUnit]:
+async def get_entra_ous(
+    client: GraphServiceClient,
+) -> AsyncGenerator[AdministrativeUnit, None]:
     """
-    Get all OUs from Microsoft Graph API with pagination support
+    Get all OUs from Microsoft Graph API with pagination support using a generator
     """
-    all_units: list[AdministrativeUnit] = []
-
     # Initialize first page request
     current_request = client.directory.administrative_units

@@ -30,7 +31,8 @@ async def get_entra_ous(client: GraphServiceClient) -> list[AdministrativeUnit]:
         try:
             response = await current_request.get()
             if response and response.value:
-
+                for unit in response.value:
+                    yield unit

             # Handle next page using OData link
             if response.odata_next_link:
@@ -45,18 +47,15 @@ async def get_entra_ous(client: GraphServiceClient) -> list[AdministrativeUnit]:
             logger.error(f"Failed to retrieve administrative units: {str(e)}")
             current_request = None

-    return all_units
-

 def transform_ous(
     units: list[AdministrativeUnit], tenant_id: str
-) ->
+) -> Generator[dict[str, Any], None, None]:
     """
-    Transform the API response into the format expected by our schema
+    Transform the API response into the format expected by our schema using a generator
     """
-    result: list[dict[str, Any]] = []
     for unit in units:
-
+        yield {
             "id": unit.id,
             "display_name": unit.display_name,
             "description": unit.description,
@@ -66,8 +65,6 @@ def transform_ous(
             "deleted_date_time": unit.deleted_date_time,
             "tenant_id": tenant_id,
         }
-        result.append(transformed_unit)
-    return result


 @timeit
@@ -116,13 +113,24 @@ async def sync_entra_ous(
         credential, scopes=["https://graph.microsoft.com/.default"]
     )

-    #
-
-
+    # Process OUs in batches
+    batch_size = 100  # OUs are typically fewer than users/groups
+    units_batch = []
+
+    async for unit in get_entra_ous(client):
+        units_batch.append(unit)
+
+        if len(units_batch) >= batch_size:
+            transformed_units = list(transform_ous(units_batch, tenant_id))
+            load_ous(
+                neo4j_session, transformed_units, update_tag, common_job_parameters
+            )
+            units_batch.clear()

-    #
-
-
+    # Process any remaining OUs
+    if units_batch:
+        transformed_units = list(transform_ous(units_batch, tenant_id))
+        load_ous(neo4j_session, transformed_units, update_tag, common_job_parameters)

     # Cleanup stale data
     cleanup_ous(neo4j_session, common_job_parameters)
cartography/intel/entra/users.py
CHANGED
@@ -1,5 +1,7 @@
 import logging
 from typing import Any
+from typing import AsyncGenerator
+from typing import Generator

 import neo4j
 from azure.identity import ClientSecretCredential
@@ -71,7 +73,7 @@ async def get_tenant(client: GraphServiceClient) -> Organization:


 @timeit
-async def get_users(client: GraphServiceClient) -> list[User]:
+async def get_users(client: GraphServiceClient) -> AsyncGenerator[User, None]:
     """Fetch all users with their manager reference in as few requests as possible.

     We leverage `$expand=manager($select=id)` so the manager's *id* is hydrated
@@ -80,7 +82,6 @@ async def get_users(client: GraphServiceClient) -> list[User]:
     when a user has no manager assigned.
     """

-    all_users: list[User] = []
     request_configuration = client.users.UsersRequestBuilderGetRequestConfiguration(
         query_parameters=client.users.UsersRequestBuilderGetQueryParameters(
             top=999,
@@ -91,7 +92,9 @@ async def get_users(client: GraphServiceClient) -> list[User]:

     page = await client.users.get(request_configuration=request_configuration)
     while page:
-
+        if page.value:
+            for user in page.value:
+                yield user
         if not page.odata_next_link:
             break

@@ -104,23 +107,20 @@ async def get_users(client: GraphServiceClient) -> list[User]:
             )
             break

-    return all_users
-

 @timeit
 # The manager reference is now embedded in the user objects courtesy of the
 # `$expand` we added above, so we no longer need a separate `manager_map`.
-def transform_users(users: list[User]) -> list[dict[str, Any]]:
+def transform_users(users: list[User]) -> Generator[dict[str, Any], None, None]:
     """Convert MS Graph SDK `User` models into dicts matching our schema."""

-    result: list[dict[str, Any]] = []
     for user in users:
         manager_id: str | None = None
         if getattr(user, "manager", None) is not None:
             # The SDK materialises `manager` as a DirectoryObject (or subclass)
             manager_id = getattr(user.manager, "id", None)

-
+        yield {
             "id": user.id,
             "user_principal_name": user.user_principal_name,
             "display_name": user.display_name,
@@ -143,9 +143,6 @@ def transform_users(users: list[User]) -> list[dict[str, Any]]:
             "age_group": user.age_group,
             "manager_id": manager_id,
         }
-        result.append(transformed_user)
-
-    return result


 @timeit
@@ -240,14 +237,23 @@ async def sync_entra_users(
         credential, scopes=["https://graph.microsoft.com/.default"]
     )

-    #
-
-
+    # Process users in batches to reduce memory consumption
+    batch_size = (
+        500  # Process users in larger batches since they're simpler than groups
+    )
+    users_batch = []
+
+    async for user in get_users(client):
+        users_batch.append(user)

-
-
+        if len(users_batch) >= batch_size:
+            transformed_users = list(transform_users(users_batch))
+            load_users(neo4j_session, transformed_users, tenant_id, update_tag)
+            users_batch.clear()

-
-
+    # Process any remaining users
+    if users_batch:
+        transformed_users = list(transform_users(users_batch))
+        load_users(neo4j_session, transformed_users, tenant_id, update_tag)

     cleanup(neo4j_session, common_job_parameters)
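
The users.py diff relies on `$expand=manager($select=id)` so each returned user already carries its manager's id. Below is a small hedged sketch of reading that expanded attribute, with FakeUser/FakeManager standing in for the msgraph SDK models.

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeManager:
    id: str


@dataclass
class FakeUser:
    id: str
    user_principal_name: str
    manager: Optional[FakeManager] = None


def manager_id_of(user: FakeUser) -> Optional[str]:
    # The expanded `manager` is a directory object (or None); only its id was selected.
    if getattr(user, "manager", None) is not None:
        return getattr(user.manager, "id", None)
    return None


print(manager_id_of(FakeUser("u1", "a@example.com", FakeManager("m1"))))  # m1
print(manager_id_of(FakeUser("u2", "b@example.com")))                     # None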
cartography/intel/gcp/__init__.py
CHANGED
@@ -7,10 +7,12 @@ from typing import Optional
 from typing import Set

 import googleapiclient.discovery
+import httplib2
 import neo4j
 from google.auth import default
 from google.auth.credentials import Credentials as GoogleCredentials
 from google.auth.exceptions import DefaultCredentialsError
+from google_auth_httplib2 import AuthorizedHttp
 from googleapiclient.discovery import Resource

 from cartography.config import Config
@@ -39,6 +41,18 @@ service_names = Services(
     iam="iam.googleapis.com",
 )

+# Default HTTP timeout (seconds) for Google API clients built via discovery.build
+_GCP_HTTP_TIMEOUT = 120
+
+
+def _authorized_http_with_timeout(
+    credentials: GoogleCredentials, timeout: int = _GCP_HTTP_TIMEOUT
+) -> AuthorizedHttp:
+    """
+    Build an AuthorizedHttp with a per-request timeout, avoiding global socket timeouts.
+    """
+    return AuthorizedHttp(credentials, http=httplib2.Http(timeout=timeout))
+

 def _get_crm_resource_v1(credentials: GoogleCredentials) -> Resource:
     """
@@ -52,7 +66,7 @@ def _get_crm_resource_v1(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "cloudresourcemanager",
         "v1",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -67,7 +81,7 @@ def _get_crm_resource_v2(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "cloudresourcemanager",
         "v2",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -82,7 +96,7 @@ def _get_compute_resource(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "compute",
         "v1",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -99,7 +113,7 @@ def _get_storage_resource(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "storage",
         "v1",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -115,7 +129,7 @@ def _get_container_resource(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "container",
         "v1",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -131,7 +145,7 @@ def _get_dns_resource(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "dns",
         "v1",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -147,7 +161,7 @@ def _get_serviceusage_resource(credentials: GoogleCredentials) -> Resource:
     return googleapiclient.discovery.build(
         "serviceusage",
         "v1",
-
+        http=_authorized_http_with_timeout(credentials),
         cache_discovery=False,
     )

@@ -157,7 +171,10 @@ def _get_iam_resource(credentials: GoogleCredentials) -> Resource:
     Instantiates a Google IAM resource object to call the IAM API.
     """
     return googleapiclient.discovery.build(
-        "iam",
+        "iam",
+        "v1",
+        http=_authorized_http_with_timeout(credentials),
+        cache_discovery=False,
     )

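
The gcp/__init__.py change routes every discovery client through an AuthorizedHttp built on an httplib2.Http with an explicit timeout. A hedged usage sketch follows; it assumes Application Default Credentials are available in the environment, and the 120-second value mirrors _GCP_HTTP_TIMEOUT above.

import googleapiclient.discovery
import httplib2
from google.auth import default
from google_auth_httplib2 import AuthorizedHttp

# Pick up Application Default Credentials from the environment.
credentials, _ = default()
authed_http = AuthorizedHttp(credentials, http=httplib2.Http(timeout=120))

compute = googleapiclient.discovery.build(
    "compute",
    "v1",
    http=authed_http,  # per-client timeout instead of a process-wide socket default
    cache_discovery=False,
)

Setting the timeout on the client's own Http object keeps the change scoped to these API calls rather than mutating the global default socket timeout.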
cartography/intel/gcp/compute.py
CHANGED
@@ -11,8 +11,8 @@ from typing import Optional
 from typing import Set

 import neo4j
-from googleapiclient.discovery import HttpError
 from googleapiclient.discovery import Resource
+from googleapiclient.errors import HttpError

 from cartography.client.core.tx import load
 from cartography.graph.job import GraphJob
@@ -24,6 +24,10 @@ logger = logging.getLogger(__name__)
 InstanceUriPrefix = namedtuple("InstanceUriPrefix", "zone_name project_id")


+# Maximum number of retries for Google API requests
+GOOGLE_API_NUM_RETRIES = 5
+
+
 def _get_error_reason(http_error: HttpError) -> str:
     """
     Helper function to get an error reason out of the googleapiclient's HttpError object
@@ -66,7 +70,7 @@ def get_zones_in_project(
     """
     try:
         req = compute.zones().list(project=project_id, maxResults=max_results)
-        res = req.execute()
+        res = req.execute(num_retries=GOOGLE_API_NUM_RETRIES)
         return res["items"]
     except HttpError as e:
         reason = _get_error_reason(e)
@@ -120,22 +124,53 @@ def get_gcp_instance_responses(
     response_objects: List[Resource] = []
     for zone in zones:
         req = compute.instances().list(project=project_id, zone=zone["name"])
-
-
+        try:
+            res = req.execute(num_retries=GOOGLE_API_NUM_RETRIES)
+            response_objects.append(res)
+        except HttpError as e:
+            reason = _get_error_reason(e)
+            if reason in {"backendError", "rateLimitExceeded", "internalError"}:
+                logger.warning(
+                    "Transient error listing instances for project %s zone %s: %s; skipping this zone.",
+                    project_id,
+                    zone.get("name"),
+                    e,
+                )
+                continue
+            raise
     return response_objects


 @timeit
-def get_gcp_subnets(projectid: str, region: str, compute: Resource) ->
+def get_gcp_subnets(projectid: str, region: str, compute: Resource) -> Dict:
     """
-    Return list of all subnets in the given projectid and region
-
+    Return list of all subnets in the given projectid and region. If the API
+    call times out mid-pagination, return any subnets gathered so far rather than
+    bubbling the error up to the caller.
+    :param projectid: The project ID
     :param region: The region to pull subnets from
     :param compute: The compute resource object created by googleapiclient.discovery.build()
     :return: Response object containing data on all GCP subnets for a given project
     """
     req = compute.subnetworks().list(project=projectid, region=region)
-
+    items: List[Dict] = []
+    response_id = f"projects/{projectid}/regions/{region}/subnetworks"
+    while req is not None:
+        try:
+            res = req.execute(num_retries=GOOGLE_API_NUM_RETRIES)
+        except TimeoutError:
+            logger.warning(
+                "GCP: subnetworks.list for project %s region %s timed out; continuing with partial data.",
+                projectid,
+                region,
+            )
+            break
+        items.extend(res.get("items", []))
+        response_id = res.get("id", response_id)
+        req = compute.subnetworks().list_next(
+            previous_request=req, previous_response=res
+        )
+    return {"id": response_id, "items": items}


 @timeit
@@ -147,7 +182,7 @@ def get_gcp_vpcs(projectid: str, compute: Resource) -> Resource:
     :return: VPC response object
     """
     req = compute.networks().list(project=projectid)
-    return req.execute()
+    return req.execute(num_retries=GOOGLE_API_NUM_RETRIES)


 @timeit
@@ -164,7 +199,7 @@ def get_gcp_regional_forwarding_rules(
     :return: Response object containing data on all GCP forwarding rules for a given project
     """
     req = compute.forwardingRules().list(project=project_id, region=region)
-    return req.execute()
+    return req.execute(num_retries=GOOGLE_API_NUM_RETRIES)


 @timeit
@@ -176,7 +211,7 @@ def get_gcp_global_forwarding_rules(project_id: str, compute: Resource) -> Resou
     :return: Response object containing data on all GCP forwarding rules for a given project
     """
     req = compute.globalForwardingRules().list(project=project_id)
-    return req.execute()
+    return req.execute(num_retries=GOOGLE_API_NUM_RETRIES)


 @timeit
@@ -188,7 +223,7 @@ def get_gcp_firewall_ingress_rules(project_id: str, compute: Resource) -> Resour
     :return: Firewall response object
     """
     req = compute.firewalls().list(project=project_id, filter='(direction="INGRESS")')
-    return req.execute()
+    return req.execute(num_retries=GOOGLE_API_NUM_RETRIES)


 @timeit
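
The compute.py change adds num_retries to every execute() call and, for subnets, pages with list_next() while tolerating a mid-pagination timeout. The following self-contained sketch reproduces that loop against a FakeRequest stub (not a GCP API) so the partial-results behavior can be run locally.

from typing import Dict, List, Optional


class FakeRequest:
    # Mimics the shape of a googleapiclient request object for this sketch only.
    def __init__(self, pages: List[Dict], index: int = 0, timeout_at: Optional[int] = None):
        self.pages, self.index, self.timeout_at = pages, index, timeout_at

    def execute(self, num_retries: int = 0) -> Dict:
        if self.timeout_at is not None and self.index >= self.timeout_at:
            raise TimeoutError("simulated deadline exceeded")
        return self.pages[self.index]

    def list_next(self, previous_response: Dict) -> Optional["FakeRequest"]:
        nxt = self.index + 1
        return FakeRequest(self.pages, nxt, self.timeout_at) if nxt < len(self.pages) else None


def collect_subnets(req: Optional[FakeRequest]) -> Dict:
    items: List[Dict] = []
    while req is not None:
        try:
            res = req.execute(num_retries=5)
        except TimeoutError:
            break  # keep whatever was already gathered
        items.extend(res.get("items", []))
        req = req.list_next(res)
    return {"items": items}


pages = [{"items": [{"name": "subnet-a"}]}, {"items": [{"name": "subnet-b"}]}]
print(collect_subnets(FakeRequest(pages, timeout_at=1)))  # only subnet-a survives the timeout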
cartography/intel/kubernetes/__init__.py
CHANGED
@@ -1,9 +1,11 @@
 import logging

+import boto3
 from neo4j import Session

 from cartography.config import Config
 from cartography.intel.kubernetes.clusters import sync_kubernetes_cluster
+from cartography.intel.kubernetes.eks import sync as sync_eks
 from cartography.intel.kubernetes.namespaces import sync_namespaces
 from cartography.intel.kubernetes.pods import sync_pods
 from cartography.intel.kubernetes.rbac import sync_kubernetes_rbac
@@ -15,6 +17,17 @@ from cartography.util import timeit
 logger = logging.getLogger(__name__)


+def get_region_from_arn(arn: str) -> str:
+    """
+    Extract AWS region from EKS cluster ARN.
+    Example: arn:aws:eks:us-east-1:205930638578:cluster/infra-test-eks → us-east-1
+    """
+    parts = arn.split(":")
+    if len(parts) < 6 or parts[2] != "eks":
+        raise ValueError(f"Invalid EKS cluster ARN: {arn}")
+    return parts[3]
+
+
 @timeit
 def start_k8s_ingestion(session: Session, config: Config) -> None:
     if not config.update_tag:
@@ -42,6 +55,19 @@ def start_k8s_ingestion(session: Session, config: Config) -> None:
         sync_kubernetes_rbac(
             session, client, config.update_tag, common_job_parameters
         )
+        if config.managed_kubernetes == "eks":
+            # EKS identity provider sync
+            boto3_session = boto3.Session()
+            region = get_region_from_arn(cluster_info.get("id", ""))
+            sync_eks(
+                session,
+                client,
+                boto3_session,
+                region,
+                config.update_tag,
+                cluster_info.get("id", ""),
+                cluster_info.get("name", ""),
+            )
         all_pods = sync_pods(
             session,
             client,