cartography 0.95.0rc1__py3-none-any.whl → 0.96.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cartography/cli.py +15 -0
- cartography/config.py +4 -0
- cartography/data/indexes.cypher +1 -2
- cartography/data/jobs/cleanup/aws_import_identity_center_cleanup.json +16 -0
- cartography/data/jobs/cleanup/{github_users_cleanup.json → github_org_and_users_cleanup.json} +5 -0
- cartography/graph/querybuilder.py +4 -0
- cartography/intel/aws/ec2/network_acls.py +208 -0
- cartography/intel/aws/identitycenter.py +307 -0
- cartography/intel/aws/resources.py +4 -0
- cartography/intel/github/users.py +156 -39
- cartography/intel/okta/users.py +2 -1
- cartography/intel/semgrep/__init__.py +9 -2
- cartography/intel/semgrep/dependencies.py +233 -0
- cartography/intel/semgrep/deployment.py +67 -0
- cartography/intel/semgrep/findings.py +22 -53
- cartography/models/aws/ec2/network_acl_rules.py +97 -0
- cartography/models/aws/ec2/network_acls.py +86 -0
- cartography/models/core/common.py +18 -1
- cartography/models/github/orgs.py +26 -0
- cartography/models/github/users.py +119 -0
- cartography/models/semgrep/dependencies.py +90 -0
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/METADATA +1 -1
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/RECORD +27 -17
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/WHEEL +1 -1
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/LICENSE +0 -0
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/entry_points.txt +0 -0
- {cartography-0.95.0rc1.dist-info → cartography-0.96.0rc2.dist-info}/top_level.txt +0 -0
cartography/intel/github/users.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from typing import Any
 from typing import Dict
 from typing import List
@@ -6,7 +7,11 @@ from typing import Tuple
 
 import neo4j
 
+from cartography.client.core.tx import load
 from cartography.intel.github.util import fetch_all
+from cartography.models.github.orgs import GitHubOrganizationSchema
+from cartography.models.github.users import GitHubOrganizationUserSchema
+from cartography.models.github.users import GitHubUnaffiliatedUserSchema
 from cartography.stats import get_stats_client
 from cartography.util import merge_module_sync_metadata
 from cartography.util import run_cleanup_job
@@ -44,17 +49,46 @@ GITHUB_ORG_USERS_PAGINATED_GRAPHQL = """
     }
 """
 
+GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL = """
+    query($login: String!, $cursor: String) {
+    organization(login: $login)
+        {
+            url
+            login
+            enterpriseOwners(first:100, after: $cursor){
+                edges {
+                    node {
+                        url
+                        login
+                        name
+                        isSiteAdmin
+                        email
+                        company
+                    }
+                    organizationRole
+                }
+                pageInfo{
+                    endCursor
+                    hasNextPage
+                }
+            }
+        }
+    }
+"""
+
 
 @timeit
-def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
+def get_users(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
     """
     Retrieve a list of users from the given GitHub organization as described in
     https://docs.github.com/en/graphql/reference/objects#organizationmemberedge.
     :param token: The Github API token as string.
     :param api_url: The Github v4 API endpoint as string.
     :param organization: The name of the target Github organization as string.
-    :return: A 2-tuple containing
-
+    :return: A 2-tuple containing
+        1. a list of dicts representing users and
+        2. data on the owning GitHub organization
+        see tests.data.github.users.GITHUB_USER_DATA for shape of both
     """
     users, org = fetch_all(
         token,
@@ -66,56 +100,139 @@ def get(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
     return users.edges, org
 
 
+def get_enterprise_owners(token: str, api_url: str, organization: str) -> Tuple[List[Dict], Dict]:
+    """
+    Retrieve a list of enterprise owners from the given GitHub organization as described in
+    https://docs.github.com/en/graphql/reference/objects#organizationenterpriseowneredge.
+    :param token: The Github API token as string.
+    :param api_url: The Github v4 API endpoint as string.
+    :param organization: The name of the target Github organization as string.
+    :return: A 2-tuple containing
+        1. a list of dicts representing users who are enterprise owners
+        3. data on the owning GitHub organization
+        see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
+    """
+    owners, org = fetch_all(
+        token,
+        api_url,
+        organization,
+        GITHUB_ENTERPRISE_OWNER_USERS_PAGINATED_GRAPHQL,
+        'enterpriseOwners',
+    )
+    return owners.edges, org
+
+
 @timeit
-def
-
+def transform_users(user_data: List[Dict], owners_data: List[Dict], org_data: Dict) -> Tuple[List[Dict], List[Dict]]:
+    """
+    Taking raw user and owner data, return two lists of processed user data:
+     * organization users aka affiliated users (users directly affiliated with an organization)
+     * unaffiliated users (user who, for example, are enterprise owners but not members of the target organization).
+
+    :param token: The Github API token as string.
+    :param api_url: The Github v4 API endpoint as string.
+    :param organization: The name of the target Github organization as string.
+    :return: A 2-tuple containing
+        1. a list of dicts representing users who are affiliated with the target org
+           see tests.data.github.users.GITHUB_USER_DATA for shape
+        2. a list of dicts representing users who are not affiliated (e.g. enterprise owners who are not also in
+           the target org) — see tests.data.github.users.GITHUB_ENTERPRISE_OWNER_DATA for shape
+        3. data on the owning GitHub organization
+    """
+
+    users_dict = {}
+    for user in user_data:
+        processed_user = deepcopy(user['node'])
+        processed_user['role'] = user['role']
+        processed_user['hasTwoFactorEnabled'] = user['hasTwoFactorEnabled']
+        processed_user['MEMBER_OF'] = org_data['url']
+        users_dict[processed_user['url']] = processed_user
+
+    owners_dict = {}
+    for owner in owners_data:
+        processed_owner = deepcopy(owner['node'])
+        processed_owner['isEnterpriseOwner'] = True
+        if owner['organizationRole'] == 'UNAFFILIATED':
+            processed_owner['UNAFFILIATED'] = org_data['url']
+        else:
+            processed_owner['MEMBER_OF'] = org_data['url']
+        owners_dict[processed_owner['url']] = processed_owner
+
+    affiliated_users = []  # users affiliated with the target org
+    for url, user in users_dict.items():
+        user['isEnterpriseOwner'] = url in owners_dict
+        affiliated_users.append(user)
+
+    unaffiliated_users = []  # users not affiliated with the target org
+    for url, owner in owners_dict.items():
+        if url not in users_dict:
+            unaffiliated_users.append(owner)
+
+    return affiliated_users, unaffiliated_users
+
+
+@timeit
+def load_users(
+    neo4j_session: neo4j.Session,
+    node_schema: GitHubOrganizationUserSchema | GitHubUnaffiliatedUserSchema,
+    user_data: List[Dict],
+    org_data: Dict,
     update_tag: int,
 ) -> None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    """
-    neo4j_session.run(
-        query,
-        OrgUrl=org_data['url'],
-        OrgLogin=org_data['login'],
-        UserData=user_data,
-        UpdateTag=update_tag,
+    logger.info(f"Loading {len(user_data)} GitHub users to the graph")
+    load(
+        neo4j_session,
+        node_schema,
+        user_data,
+        lastupdated=update_tag,
+        org_url=org_data['url'],
+    )
+
+
+@timeit
+def load_organization(
+    neo4j_session: neo4j.Session,
+    node_schema: GitHubOrganizationSchema,
+    org_data: List[Dict[str, Any]],
+    update_tag: int,
+) -> None:
+    logger.info(f"Loading {len(org_data)} GitHub organization to the graph")
+    load(
+        neo4j_session,
+        node_schema,
+        org_data,
+        lastupdated=update_tag,
     )
 
 
 @timeit
 def sync(
     neo4j_session: neo4j.Session,
-    common_job_parameters: Dict
+    common_job_parameters: Dict,
     github_api_key: str,
     github_url: str,
     organization: str,
 ) -> None:
     logger.info("Syncing GitHub users")
-    user_data, org_data =
-
-
+    user_data, org_data = get_users(github_api_key, github_url, organization)
+    owners_data, org_data = get_enterprise_owners(github_api_key, github_url, organization)
+    processed_affiliated_user_data, processed_unaffiliated_user_data = (
+        transform_users(user_data, owners_data, org_data)
+    )
+    load_organization(
+        neo4j_session, GitHubOrganizationSchema(), [org_data],
+        common_job_parameters['UPDATE_TAG'],
+    )
+    load_users(
+        neo4j_session, GitHubOrganizationUserSchema(), processed_affiliated_user_data, org_data,
+        common_job_parameters['UPDATE_TAG'],
+    )
+    load_users(
+        neo4j_session, GitHubUnaffiliatedUserSchema(), processed_unaffiliated_user_data, org_data,
+        common_job_parameters['UPDATE_TAG'],
+    )
+    # no automated cleanup job for users because user node has no sub_resource_relationship
+    run_cleanup_job('github_org_and_users_cleanup.json', neo4j_session, common_job_parameters)
     merge_module_sync_metadata(
         neo4j_session,
         group_type='GitHubOrganization',
cartography/intel/okta/users.py
CHANGED
@@ -150,7 +150,8 @@ def _load_okta_users(
     new_user.okta_last_updated = user_data.okta_last_updated,
     new_user.password_changed = user_data.password_changed,
     new_user.transition_to_status = user_data.transition_to_status,
-    new_user.lastupdated = $okta_update_tag
+    new_user.lastupdated = $okta_update_tag,
+    new_user :UserAccount
     WITH new_user, org
     MERGE (org)-[org_r:RESOURCE]->(new_user)
     ON CREATE SET org_r.firstseen = timestamp()
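The added SET clause gives each OktaUser node an extra generic UserAccount label. A hedged sketch of checking the effect from Python with the neo4j driver; the connection URI and credentials below are placeholders.

import neo4j

# Placeholder URI and credentials; point these at the graph cartography writes to.
driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    # After an Okta sync with 0.96.0rc2, OktaUser nodes also match on :UserAccount.
    result = session.run("MATCH (u:UserAccount) RETURN labels(u) AS labels, count(*) AS n")
    for record in result:
        print(record["labels"], record["n"])
driver.close()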
cartography/intel/semgrep/__init__.py
CHANGED

@@ -3,7 +3,9 @@ import logging
 import neo4j
 
 from cartography.config import Config
-from cartography.intel.semgrep.
+from cartography.intel.semgrep.dependencies import sync_dependencies
+from cartography.intel.semgrep.deployment import sync_deployment
+from cartography.intel.semgrep.findings import sync_findings
 from cartography.util import timeit
 
 
@@ -20,4 +22,9 @@ def start_semgrep_ingestion(
     if not config.semgrep_app_token:
         logger.info('Semgrep import is not configured - skipping this module. See docs to configure.')
         return
-
+
+    # sync_deployment must be called first since it populates common_job_parameters
+    # with the deployment ID and slug, which are required by the other sync functions
+    sync_deployment(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)
+    sync_dependencies(neo4j_session, config.semgrep_app_token, config.semgrep_dependency_ecosystems, config.update_tag, common_job_parameters)  # noqa: E501
+    sync_findings(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters)
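A minimal sketch of the new call order for anyone driving these syncs directly, assuming an existing Neo4j instance and a Semgrep app token; the connection details, token, and update tag below are placeholders.

import neo4j

from cartography.intel.semgrep.dependencies import sync_dependencies
from cartography.intel.semgrep.deployment import sync_deployment
from cartography.intel.semgrep.findings import sync_findings

driver = neo4j.GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
semgrep_app_token = "YOUR_SEMGREP_APP_TOKEN"  # placeholder
update_tag = 1700000000  # placeholder update tag

with driver.session() as neo4j_session:
    common_job_parameters = {"UPDATE_TAG": update_tag}
    # Must run first: stores DEPLOYMENT_ID and DEPLOYMENT_SLUG in common_job_parameters.
    sync_deployment(neo4j_session, semgrep_app_token, update_tag, common_job_parameters)
    # The remaining syncs rely on those keys (see the comment added to start_semgrep_ingestion).
    sync_dependencies(neo4j_session, semgrep_app_token, "gomod,npm", update_tag, common_job_parameters)
    sync_findings(neo4j_session, semgrep_app_token, update_tag, common_job_parameters)
driver.close()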
cartography/intel/semgrep/dependencies.py

@@ -0,0 +1,233 @@
+import logging
+from typing import Any
+from typing import Callable
+from typing import Dict
+from typing import List
+
+import neo4j
+import requests
+from requests.exceptions import HTTPError
+from requests.exceptions import ReadTimeout
+
+from cartography.client.core.tx import load
+from cartography.graph.job import GraphJob
+from cartography.models.semgrep.dependencies import SemgrepGoLibrarySchema
+from cartography.models.semgrep.dependencies import SemgrepNpmLibrarySchema
+from cartography.stats import get_stats_client
+from cartography.util import merge_module_sync_metadata
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+stat_handler = get_stats_client(__name__)
+_PAGE_SIZE = 10000
+_TIMEOUT = (60, 60)
+_MAX_RETRIES = 3
+
+# The keys in this dictionary must be in Semgrep's list of supported ecosystems, defined here:
+# https://semgrep.dev/api/v1/docs/#tag/SupplyChainService/operation/semgrep_app.products.sca.handlers.dependency.list_dependencies_conexxion
+ECOSYSTEM_TO_SCHEMA: Dict = {
+    'gomod': SemgrepGoLibrarySchema,
+    'npm': SemgrepNpmLibrarySchema,
+}
+
+
+def parse_and_validate_semgrep_ecosystems(ecosystems: str) -> List[str]:
+    validated_ecosystems: List[str] = []
+    for ecosystem in ecosystems.split(','):
+        ecosystem = ecosystem.strip().lower()
+
+        if ecosystem in ECOSYSTEM_TO_SCHEMA:
+            validated_ecosystems.append(ecosystem)
+        else:
+            valid_ecosystems: str = ','.join(ECOSYSTEM_TO_SCHEMA.keys())
+            raise ValueError(
+                f'Error parsing `semgrep-dependency-ecosystems`. You specified "{ecosystems}". '
+                f'Please check that your input is formatted as comma-separated values, e.g. "gomod,npm". '
+                f'Full list of supported ecosystems: {valid_ecosystems}.',
+            )
+    return validated_ecosystems
+
+
+@timeit
+def get_dependencies(semgrep_app_token: str, deployment_id: str, ecosystem: str) -> List[Dict[str, Any]]:
+    """
+    Gets all dependencies for the given ecosystem within the given Semgrep deployment ID.
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    param: deployment_id: The Semgrep deployment ID to use for retrieving dependencies.
+    param: ecosystem: The ecosystem to import dependencies from, e.g. "gomod" or "npm".
+    """
+    all_deps = []
+    deps_url = f"https://semgrep.dev/api/v1/deployments/{deployment_id}/dependencies"
+    has_more = True
+    page = 0
+    retries = 0
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {semgrep_app_token}",
+    }
+
+    request_data: dict[str, Any] = {
+        "pageSize": _PAGE_SIZE,
+        "dependencyFilter": {
+            "ecosystem": [ecosystem],
+        },
+    }
+
+    logger.info(f"Retrieving Semgrep {ecosystem} dependencies for deployment '{deployment_id}'.")
+    while has_more:
+        try:
+            response = requests.post(deps_url, json=request_data, headers=headers, timeout=_TIMEOUT)
+            response.raise_for_status()
+            data = response.json()
+        except (ReadTimeout, HTTPError):
+            logger.warning(f"Failed to retrieve Semgrep {ecosystem} dependencies for page {page}. Retrying...")
+            retries += 1
+            if retries >= _MAX_RETRIES:
+                raise
+            continue
+        deps = data.get("dependencies", [])
+        has_more = data.get("hasMore", False)
+        logger.info(f"Processed page {page} of Semgrep {ecosystem} dependencies.")
+        all_deps.extend(deps)
+        retries = 0
+        page += 1
+        request_data["cursor"] = data.get("cursor")
+
+    logger.info(f"Retrieved {len(all_deps)} Semgrep {ecosystem} dependencies in {page} pages.")
+    return all_deps
+
+
+def transform_dependencies(raw_deps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Transforms the raw dependencies response from Semgrep API into a list of dicts
+    that can be used to create the Dependency nodes.
+    """
+
+    """
+    sample raw_dep as of November 2024:
+    {
+        "repositoryId": "123456",
+        "definedAt": {
+            "path": "go.mod",
+            "startLine": "6",
+            "endLine": "6",
+            "url": "https://github.com/org/repo-name/blob/00000000000000000000000000000000/go.mod#L6",
+            "committedAt": "1970-01-01T00:00:00Z",
+            "startCol": "0",
+            "endCol": "0"
+        },
+        "transitivity": "DIRECT",
+        "package": {
+            "name": "github.com/foo/bar",
+            "versionSpecifier": "1.2.3"
+        },
+        "ecosystem": "gomod",
+        "licenses": [],
+        "pathToTransitivity": []
+    },
+    """
+    deps = []
+    for raw_dep in raw_deps:
+
+        # We could call a different endpoint to get all repo IDs and store a mapping of repo ID to URL,
+        # but it's much simpler to just extract the URL from the definedAt field.
+        repo_url = raw_dep["definedAt"]["url"].split("/blob/", 1)[0]
+
+        name = raw_dep["package"]["name"]
+        version = raw_dep["package"]["versionSpecifier"]
+        id = f"{name}|{version}"
+
+        # As of November 2024, Semgrep does not import dependencies with version specifiers such as >, <, etc.
+        # For now, hardcode the specifier to ==<version> to align with GitHub-sourced Python dependencies.
+        # If Semgrep eventually supports version specifiers, update this line accordingly.
+        specifier = f"=={version}"
+
+        deps.append({
+            # existing dependency properties:
+            "id": id,
+            "name": name,
+            "specifier": specifier,
+            "version": version,
+            "repo_url": repo_url,
+
+            # Semgrep-specific properties:
+            "ecosystem": raw_dep["ecosystem"],
+            "transitivity": raw_dep["transitivity"].lower(),
+            "url": raw_dep["definedAt"]["url"],
+        })
+
+    return deps
+
+
+@timeit
+def load_dependencies(
+    neo4j_session: neo4j.Session,
+    dependency_schema: Callable,
+    dependencies: List[Dict],
+    deployment_id: str,
+    update_tag: int,
+) -> None:
+    logger.info(f"Loading {len(dependencies)} {dependency_schema().label} objects into the graph.")
+    load(
+        neo4j_session,
+        dependency_schema(),
+        dependencies,
+        lastupdated=update_tag,
+        DEPLOYMENT_ID=deployment_id,
+    )
+
+
+@timeit
+def cleanup(
+    neo4j_session: neo4j.Session,
+    dependency_schema: Callable,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+    logger.info(f"Running Semgrep Dependencies cleanup job for {dependency_schema().label}.")
+    GraphJob.from_node_schema(dependency_schema(), common_job_parameters).run(neo4j_session)
+
+
+@timeit
+def sync_dependencies(
+    neo4j_session: neo4j.Session,
+    semgrep_app_token: str,
+    ecosystems_str: str,
+    update_tag: int,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+
+    deployment_id = common_job_parameters.get("DEPLOYMENT_ID")
+    if not deployment_id:
+        logger.warning(
+            "Missing Semgrep deployment ID, ensure that sync_deployment() has been called. "
+            "Skipping Semgrep dependencies sync job.",
+        )
+        return
+
+    if not ecosystems_str:
+        logger.warning(
+            "Semgrep is not configured to import dependencies for any ecosystems, see docs to configure. "
+            "Skipping Semgrep dependencies sync job.",
+        )
+        return
+
+    # We don't expect an error here since we've already validated the input in cli.py
+    ecosystems = parse_and_validate_semgrep_ecosystems(ecosystems_str)
+
+    logger.info("Running Semgrep dependencies sync job.")
+
+    for ecosystem in ecosystems:
+        schema = ECOSYSTEM_TO_SCHEMA[ecosystem]
+        raw_deps = get_dependencies(semgrep_app_token, deployment_id, ecosystem)
+        deps = transform_dependencies(raw_deps)
+        load_dependencies(neo4j_session, schema, deps, deployment_id, update_tag)
+        cleanup(neo4j_session, schema, common_job_parameters)
+
+    merge_module_sync_metadata(
+        neo4j_session=neo4j_session,
+        group_type='Semgrep',
+        group_id=deployment_id,
+        synced_type='SemgrepDependency',
+        update_tag=update_tag,
+        stat_handler=stat_handler,
+    )
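To make the transform output concrete, a small sketch that validates an ecosystems string and feeds transform_dependencies() the sample payload documented in its docstring, trimmed to the fields the function actually reads.

from cartography.intel.semgrep.dependencies import parse_and_validate_semgrep_ecosystems
from cartography.intel.semgrep.dependencies import transform_dependencies

print(parse_and_validate_semgrep_ecosystems("gomod, NPM"))  # -> ['gomod', 'npm']

raw_deps = [{
    "definedAt": {"url": "https://github.com/org/repo-name/blob/00000000000000000000000000000000/go.mod#L6"},
    "transitivity": "DIRECT",
    "package": {"name": "github.com/foo/bar", "versionSpecifier": "1.2.3"},
    "ecosystem": "gomod",
}]
print(transform_dependencies(raw_deps))
# [{'id': 'github.com/foo/bar|1.2.3', 'name': 'github.com/foo/bar', 'specifier': '==1.2.3',
#   'version': '1.2.3', 'repo_url': 'https://github.com/org/repo-name', 'ecosystem': 'gomod',
#   'transitivity': 'direct', 'url': <the same definedAt url as above>}]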
cartography/intel/semgrep/deployment.py

@@ -0,0 +1,67 @@
+import logging
+from typing import Any
+from typing import Dict
+
+import neo4j
+import requests
+
+from cartography.client.core.tx import load
+from cartography.models.semgrep.deployment import SemgrepDeploymentSchema
+from cartography.stats import get_stats_client
+from cartography.util import timeit
+
+logger = logging.getLogger(__name__)
+stat_handler = get_stats_client(__name__)
+_TIMEOUT = (60, 60)
+
+
+@timeit
+def get_deployment(semgrep_app_token: str) -> Dict[str, Any]:
+    """
+    Gets the deployment associated with the passed Semgrep App token.
+    param: semgrep_app_token: The Semgrep App token to use for authentication.
+    """
+    deployment = {}
+    deployment_url = "https://semgrep.dev/api/v1/deployments"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {semgrep_app_token}",
+    }
+    response = requests.get(deployment_url, headers=headers, timeout=_TIMEOUT)
+    response.raise_for_status()
+
+    data = response.json()
+    deployment["id"] = data["deployments"][0]["id"]
+    deployment["name"] = data["deployments"][0]["name"]
+    deployment["slug"] = data["deployments"][0]["slug"]
+
+    return deployment
+
+
+@timeit
+def load_semgrep_deployment(
+    neo4j_session: neo4j.Session, deployment: Dict[str, Any], update_tag: int,
+) -> None:
+    logger.info(f"Loading SemgrepDeployment {deployment} into the graph.")
+    load(
+        neo4j_session,
+        SemgrepDeploymentSchema(),
+        [deployment],
+        lastupdated=update_tag,
+    )
+
+
+@timeit
+def sync_deployment(
+    neo4j_session: neo4j.Session,
+    semgrep_app_token: str,
+    update_tag: int,
+    common_job_parameters: Dict[str, Any],
+) -> None:
+
+    semgrep_deployment = get_deployment(semgrep_app_token)
+    deployment_id = semgrep_deployment["id"]
+    deployment_slug = semgrep_deployment["slug"]
+    load_semgrep_deployment(neo4j_session, semgrep_deployment, update_tag)
+    common_job_parameters["DEPLOYMENT_ID"] = deployment_id
+    common_job_parameters["DEPLOYMENT_SLUG"] = deployment_slug
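Finally, a hedged sketch of the deployment helper on its own. The token is a placeholder, and the call hits the https://semgrep.dev/api/v1/deployments endpoint shown above.

from cartography.intel.semgrep.deployment import get_deployment

semgrep_app_token = "YOUR_SEMGREP_APP_TOKEN"  # placeholder

# Returns {"id": ..., "name": ..., "slug": ...} for the first deployment tied to the token,
# which sync_deployment() then copies into common_job_parameters as DEPLOYMENT_ID / DEPLOYMENT_SLUG.
deployment = get_deployment(semgrep_app_token)
print(deployment["id"], deployment["name"], deployment["slug"])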