cartography 0.112.0__py3-none-any.whl → 0.114.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cartography might be problematic. Click here for more details.

Files changed (82)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +8 -0
  3. cartography/config.py +4 -0
  4. cartography/data/indexes.cypher +0 -31
  5. cartography/intel/aws/apigatewayv2.py +116 -0
  6. cartography/intel/aws/iam.py +741 -492
  7. cartography/intel/aws/organizations.py +7 -8
  8. cartography/intel/aws/permission_relationships.py +4 -16
  9. cartography/intel/aws/resources.py +2 -0
  10. cartography/intel/azure/__init__.py +16 -0
  11. cartography/intel/azure/app_service.py +105 -0
  12. cartography/intel/azure/functions.py +124 -0
  13. cartography/intel/entra/__init__.py +31 -0
  14. cartography/intel/entra/app_role_assignments.py +277 -0
  15. cartography/intel/entra/applications.py +4 -238
  16. cartography/intel/entra/federation/__init__.py +0 -0
  17. cartography/intel/entra/federation/aws_identity_center.py +77 -0
  18. cartography/intel/entra/service_principals.py +217 -0
  19. cartography/intel/gcp/__init__.py +136 -436
  20. cartography/intel/gcp/clients.py +65 -0
  21. cartography/intel/gcp/compute.py +18 -44
  22. cartography/intel/gcp/crm/__init__.py +0 -0
  23. cartography/intel/gcp/crm/folders.py +108 -0
  24. cartography/intel/gcp/crm/orgs.py +65 -0
  25. cartography/intel/gcp/crm/projects.py +109 -0
  26. cartography/intel/gcp/dns.py +82 -169
  27. cartography/intel/gcp/gke.py +72 -113
  28. cartography/intel/gcp/iam.py +66 -54
  29. cartography/intel/gcp/storage.py +75 -159
  30. cartography/intel/github/__init__.py +41 -0
  31. cartography/intel/github/commits.py +423 -0
  32. cartography/intel/github/repos.py +73 -39
  33. cartography/models/aws/apigatewayv2/__init__.py +0 -0
  34. cartography/models/aws/apigatewayv2/apigatewayv2.py +53 -0
  35. cartography/models/aws/iam/access_key.py +103 -0
  36. cartography/models/aws/iam/account_role.py +24 -0
  37. cartography/models/aws/iam/federated_principal.py +60 -0
  38. cartography/models/aws/iam/group.py +60 -0
  39. cartography/models/aws/iam/group_membership.py +26 -0
  40. cartography/models/aws/iam/inline_policy.py +78 -0
  41. cartography/models/aws/iam/managed_policy.py +51 -0
  42. cartography/models/aws/iam/policy_statement.py +57 -0
  43. cartography/models/aws/iam/role.py +83 -0
  44. cartography/models/aws/iam/root_principal.py +52 -0
  45. cartography/models/aws/iam/service_principal.py +30 -0
  46. cartography/models/aws/iam/sts_assumerole_allow.py +38 -0
  47. cartography/models/aws/iam/user.py +54 -0
  48. cartography/models/azure/__init__.py +0 -0
  49. cartography/models/azure/app_service.py +59 -0
  50. cartography/models/azure/function_app.py +59 -0
  51. cartography/models/entra/entra_user_to_aws_sso.py +41 -0
  52. cartography/models/entra/service_principal.py +104 -0
  53. cartography/models/gcp/compute/subnet.py +74 -0
  54. cartography/models/gcp/crm/__init__.py +0 -0
  55. cartography/models/gcp/crm/folders.py +98 -0
  56. cartography/models/gcp/crm/organizations.py +21 -0
  57. cartography/models/gcp/crm/projects.py +100 -0
  58. cartography/models/gcp/dns.py +109 -0
  59. cartography/models/gcp/gke.py +69 -0
  60. cartography/models/gcp/iam.py +3 -0
  61. cartography/models/gcp/storage/__init__.py +0 -0
  62. cartography/models/gcp/storage/bucket.py +119 -0
  63. cartography/models/github/commits.py +63 -0
  64. {cartography-0.112.0.dist-info → cartography-0.114.0.dist-info}/METADATA +7 -5
  65. {cartography-0.112.0.dist-info → cartography-0.114.0.dist-info}/RECORD +69 -39
  66. cartography/data/jobs/cleanup/aws_import_account_access_key_cleanup.json +0 -17
  67. cartography/data/jobs/cleanup/aws_import_groups_cleanup.json +0 -13
  68. cartography/data/jobs/cleanup/aws_import_principals_cleanup.json +0 -30
  69. cartography/data/jobs/cleanup/aws_import_roles_cleanup.json +0 -13
  70. cartography/data/jobs/cleanup/aws_import_users_cleanup.json +0 -8
  71. cartography/data/jobs/cleanup/gcp_compute_vpc_subnet_cleanup.json +0 -35
  72. cartography/data/jobs/cleanup/gcp_crm_folder_cleanup.json +0 -23
  73. cartography/data/jobs/cleanup/gcp_crm_organization_cleanup.json +0 -17
  74. cartography/data/jobs/cleanup/gcp_crm_project_cleanup.json +0 -23
  75. cartography/data/jobs/cleanup/gcp_dns_cleanup.json +0 -29
  76. cartography/data/jobs/cleanup/gcp_gke_cluster_cleanup.json +0 -17
  77. cartography/data/jobs/cleanup/gcp_storage_bucket_cleanup.json +0 -29
  78. cartography/intel/gcp/crm.py +0 -355
  79. {cartography-0.112.0.dist-info → cartography-0.114.0.dist-info}/WHEEL +0 -0
  80. {cartography-0.112.0.dist-info → cartography-0.114.0.dist-info}/entry_points.txt +0 -0
  81. {cartography-0.112.0.dist-info → cartography-0.114.0.dist-info}/licenses/LICENSE +0 -0
  82. {cartography-0.112.0.dist-info → cartography-0.114.0.dist-info}/top_level.txt +0 -0
@@ -3,32 +3,30 @@ import logging
3
3
  from collections import namedtuple
4
4
  from typing import Dict
5
5
  from typing import List
6
- from typing import Optional
7
6
  from typing import Set
8
7
 
9
- import googleapiclient.discovery
10
- import httplib2
11
8
  import neo4j
12
- from google.auth import default
13
- from google.auth.credentials import Credentials as GoogleCredentials
14
- from google.auth.exceptions import DefaultCredentialsError
15
- from google_auth_httplib2 import AuthorizedHttp
9
+ from googleapiclient.discovery import HttpError
16
10
  from googleapiclient.discovery import Resource
17
11
 
18
12
  from cartography.config import Config
13
+ from cartography.graph.job import GraphJob
19
14
  from cartography.intel.gcp import compute
20
- from cartography.intel.gcp import crm
21
15
  from cartography.intel.gcp import dns
22
16
  from cartography.intel.gcp import gke
23
17
  from cartography.intel.gcp import iam
24
18
  from cartography.intel.gcp import storage
19
+ from cartography.intel.gcp.clients import build_client
20
+ from cartography.intel.gcp.crm.folders import sync_gcp_folders
21
+ from cartography.intel.gcp.crm.orgs import sync_gcp_organizations
22
+ from cartography.intel.gcp.crm.projects import sync_gcp_projects
23
+ from cartography.models.gcp.crm.folders import GCPFolderSchema
24
+ from cartography.models.gcp.crm.organizations import GCPOrganizationSchema
25
+ from cartography.models.gcp.crm.projects import GCPProjectSchema
25
26
  from cartography.util import run_analysis_job
26
27
  from cartography.util import timeit
27
28
 
28
29
  logger = logging.getLogger(__name__)
29
- Resources = namedtuple(
30
- "Resources", "compute container crm_v1 crm_v2 dns storage serviceusage iam"
31
- )
32
30
 
33
31
  # Mapping of service short names to their full names as in docs. See https://developers.google.com/apis-explorer,
34
32
  # and https://cloud.google.com/service-usage/docs/reference/rest/v1/services#ServiceConfig
@@ -41,160 +39,6 @@ service_names = Services(
41
39
  iam="iam.googleapis.com",
42
40
  )
43
41
 
44
- # Default HTTP timeout (seconds) for Google API clients built via discovery.build
45
- _GCP_HTTP_TIMEOUT = 120
46
-
47
-
48
- def _authorized_http_with_timeout(
49
- credentials: GoogleCredentials, timeout: int = _GCP_HTTP_TIMEOUT
50
- ) -> AuthorizedHttp:
51
- """
52
- Build an AuthorizedHttp with a per-request timeout, avoiding global socket timeouts.
53
- """
54
- return AuthorizedHttp(credentials, http=httplib2.Http(timeout=timeout))
55
-
56
-
57
- def _get_crm_resource_v1(credentials: GoogleCredentials) -> Resource:
58
- """
59
- Instantiates a Google Compute Resource Manager v1 resource object to call the Resource Manager API.
60
- See https://cloud.google.com/resource-manager/reference/rest/.
61
- :param credentials: The GoogleCredentials object
62
- :return: A CRM v1 resource object
63
- """
64
- # cache_discovery=False to suppress extra warnings.
65
- # See https://github.com/googleapis/google-api-python-client/issues/299#issuecomment-268915510 and related issues
66
- return googleapiclient.discovery.build(
67
- "cloudresourcemanager",
68
- "v1",
69
- http=_authorized_http_with_timeout(credentials),
70
- cache_discovery=False,
71
- )
72
-
73
-
74
- def _get_crm_resource_v2(credentials: GoogleCredentials) -> Resource:
75
- """
76
- Instantiates a Google Compute Resource Manager v2 resource object to call the Resource Manager API.
77
- We need a v2 resource object to query for GCP folders.
78
- :param credentials: The GoogleCredentials object
79
- :return: A CRM v2 resource object
80
- """
81
- return googleapiclient.discovery.build(
82
- "cloudresourcemanager",
83
- "v2",
84
- http=_authorized_http_with_timeout(credentials),
85
- cache_discovery=False,
86
- )
87
-
88
-
89
- def _get_compute_resource(credentials: GoogleCredentials) -> Resource:
90
- """
91
- Instantiates a Google Compute resource object to call the Compute API. This is used to pull zone, instance, and
92
- networking data. See https://cloud.google.com/compute/docs/reference/rest/v1/.
93
- :param credentials: The GoogleCredentials object
94
- :return: A Compute resource object
95
- """
96
- return googleapiclient.discovery.build(
97
- "compute",
98
- "v1",
99
- http=_authorized_http_with_timeout(credentials),
100
- cache_discovery=False,
101
- )
102
-
103
-
104
- def _get_storage_resource(credentials: GoogleCredentials) -> Resource:
105
- """
106
- Instantiates a Google Cloud Storage resource object to call the Storage API.
107
- This is used to pull bucket metadata and IAM Policies
108
- as well as list buckets in a specified project.
109
- See https://cloud.google.com/storage/docs/json_api/.
110
- :param credentials: The GoogleCredentials object
111
- :return: A Storage resource object
112
- """
113
- return googleapiclient.discovery.build(
114
- "storage",
115
- "v1",
116
- http=_authorized_http_with_timeout(credentials),
117
- cache_discovery=False,
118
- )
119
-
120
-
121
- def _get_container_resource(credentials: GoogleCredentials) -> Resource:
122
- """
123
- Instantiates a Google Cloud Container resource object to call the
124
- Container API. See: https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/.
125
-
126
- :param credentials: The GoogleCredentials object
127
- :return: A Container resource object
128
- """
129
- return googleapiclient.discovery.build(
130
- "container",
131
- "v1",
132
- http=_authorized_http_with_timeout(credentials),
133
- cache_discovery=False,
134
- )
135
-
136
-
137
- def _get_dns_resource(credentials: GoogleCredentials) -> Resource:
138
- """
139
- Instantiates a Google Cloud DNS resource object to call the
140
- Container API. See: https://cloud.google.com/dns/docs/reference/v1/.
141
-
142
- :param credentials: The GoogleCredentials object
143
- :return: A DNS resource object
144
- """
145
- return googleapiclient.discovery.build(
146
- "dns",
147
- "v1",
148
- http=_authorized_http_with_timeout(credentials),
149
- cache_discovery=False,
150
- )
151
-
152
-
153
- def _get_serviceusage_resource(credentials: GoogleCredentials) -> Resource:
154
- """
155
- Instantiates a serviceusage resource object.
156
- See: https://cloud.google.com/service-usage/docs/reference/rest/v1/operations/list.
157
-
158
- :param credentials: The GoogleCredentials object
159
- :return: A serviceusage resource object
160
- """
161
- return googleapiclient.discovery.build(
162
- "serviceusage",
163
- "v1",
164
- http=_authorized_http_with_timeout(credentials),
165
- cache_discovery=False,
166
- )
167
-
168
-
169
- def _get_iam_resource(credentials: GoogleCredentials) -> Resource:
170
- """
171
- Instantiates a Google IAM resource object to call the IAM API.
172
- """
173
- return googleapiclient.discovery.build(
174
- "iam",
175
- "v1",
176
- http=_authorized_http_with_timeout(credentials),
177
- cache_discovery=False,
178
- )
179
-
180
-
181
- def _initialize_resources(credentials: GoogleCredentials) -> Resource:
182
- """
183
- Create namedtuple of all resource objects necessary for GCP data gathering.
184
- :param credentials: The GoogleCredentials object
185
- :return: namedtuple of all resource objects
186
- """
187
- return Resources(
188
- crm_v1=_get_crm_resource_v1(credentials),
189
- crm_v2=_get_crm_resource_v2(credentials),
190
- serviceusage=_get_serviceusage_resource(credentials),
191
- compute=None,
192
- container=None,
193
- dns=None,
194
- storage=None,
195
- iam=_get_iam_resource(credentials),
196
- )
197
-
198
42
 
199
43
  def _services_enabled_on_project(serviceusage: Resource, project_id: str) -> Set:
200
44
  """
@@ -220,7 +64,7 @@ def _services_enabled_on_project(serviceusage: Resource, project_id: str) -> Set
220
64
  previous_response=res,
221
65
  )
222
66
  return services
223
- except googleapiclient.discovery.HttpError as http_error:
67
+ except HttpError as http_error:
224
68
  http_error = json.loads(http_error.content.decode("utf-8"))
225
69
  # This is set to log-level `info` because Google creates many projects under the hood that cartography cannot
226
70
  # audit (e.g. adding a script to a Google spreadsheet causes a project to get created) and we don't need to emit
@@ -233,318 +77,174 @@ def _services_enabled_on_project(serviceusage: Resource, project_id: str) -> Set
233
77
  return set()
234
78
 
235
79
 
236
- def _sync_single_project_compute(
80
+ def _sync_project_resources(
237
81
  neo4j_session: neo4j.Session,
238
- resources: Resource,
239
- project_id: str,
82
+ projects: List[Dict],
240
83
  gcp_update_tag: int,
241
84
  common_job_parameters: Dict,
242
85
  ) -> None:
243
86
  """
244
- Handles graph sync for a single GCP project on Compute resources.
87
+ Syncs GCP service-specific resources (Compute, Storage, GKE, DNS, IAM) for each project.
245
88
  :param neo4j_session: The Neo4j session
246
- :param resources: namedtuple of the GCP resource objects
247
- :param project_id: The project ID number to sync. See the `projectId` field in
248
- https://cloud.google.com/resource-manager/reference/rest/v1/projects
89
+ :param projects: A list of projects containing at minimum a "projectId" field.
249
90
  :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
250
91
  :param common_job_parameters: Other parameters sent to Neo4j
251
92
  :return: Nothing
252
93
  """
253
- # Determine the resources available on the project.
254
- enabled_services = _services_enabled_on_project(resources.serviceusage, project_id)
255
- compute_cred = _get_compute_resource(get_gcp_credentials())
256
- if service_names.compute in enabled_services:
257
- compute.sync(
258
- neo4j_session,
259
- compute_cred,
260
- project_id,
261
- gcp_update_tag,
262
- common_job_parameters,
94
+ logger.info("Syncing resources for %d GCP projects.", len(projects))
95
+ # Per-project sync across services
96
+ for project in projects:
97
+ project_id = project["projectId"]
98
+ common_job_parameters["PROJECT_ID"] = project_id
99
+ enabled_services = _services_enabled_on_project(
100
+ build_client("serviceusage", "v1"), project_id
263
101
  )
264
102
 
103
+ if service_names.compute in enabled_services:
104
+ logger.info("Syncing GCP project %s for Compute.", project_id)
105
+ compute_cred = build_client("compute", "v1")
106
+ compute.sync(
107
+ neo4j_session,
108
+ compute_cred,
109
+ project_id,
110
+ gcp_update_tag,
111
+ common_job_parameters,
112
+ )
265
113
 
266
- def _sync_single_project_storage(
267
- neo4j_session: neo4j.Session,
268
- resources: Resource,
269
- project_id: str,
270
- gcp_update_tag: int,
271
- common_job_parameters: Dict,
272
- ) -> None:
273
- """
274
- Handles graph sync for a single GCP project on Storage resources.
275
- :param neo4j_session: The Neo4j session
276
- :param resources: namedtuple of the GCP resource objects
277
- :param project_id: The project ID number to sync. See the `projectId` field in
278
- https://cloud.google.com/resource-manager/reference/rest/v1/projects
279
- :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
280
- :param common_job_parameters: Other parameters sent to Neo4j
281
- :return: Nothing
282
- """
283
- # Determine the resources available on the project.
284
- enabled_services = _services_enabled_on_project(resources.serviceusage, project_id)
285
- storage_cred = _get_storage_resource(get_gcp_credentials())
286
- if service_names.storage in enabled_services:
287
- storage.sync_gcp_buckets(
288
- neo4j_session,
289
- storage_cred,
290
- project_id,
291
- gcp_update_tag,
292
- common_job_parameters,
293
- )
114
+ if service_names.storage in enabled_services:
115
+ logger.info("Syncing GCP project %s for Storage.", project_id)
116
+ storage_cred = build_client("storage", "v1")
117
+ storage.sync_gcp_buckets(
118
+ neo4j_session,
119
+ storage_cred,
120
+ project_id,
121
+ gcp_update_tag,
122
+ common_job_parameters,
123
+ )
294
124
 
125
+ if service_names.gke in enabled_services:
126
+ logger.info("Syncing GCP project %s for GKE.", project_id)
127
+ container_cred = build_client("container", "v1")
128
+ gke.sync_gke_clusters(
129
+ neo4j_session,
130
+ container_cred,
131
+ project_id,
132
+ gcp_update_tag,
133
+ common_job_parameters,
134
+ )
295
135
 
296
- def _sync_single_project_gke(
297
- neo4j_session: neo4j.Session,
298
- resources: Resource,
299
- project_id: str,
300
- gcp_update_tag: int,
301
- common_job_parameters: Dict,
302
- ) -> None:
303
- """
304
- Handles graph sync for a single GCP project GKE resources.
305
- :param neo4j_session: The Neo4j session
306
- :param resources: namedtuple of the GCP resource objects
307
- :param project_id: The project ID number to sync. See the `projectId` field in
308
- https://cloud.google.com/resource-manager/reference/rest/v1/projects
309
- :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
310
- :param common_job_parameters: Other parameters sent to Neo4j
311
- :return: Nothing
312
- """
313
- # Determine the resources available on the project.
314
- enabled_services = _services_enabled_on_project(resources.serviceusage, project_id)
315
- container_cred = _get_container_resource(get_gcp_credentials())
316
- if service_names.gke in enabled_services:
317
- gke.sync_gke_clusters(
318
- neo4j_session,
319
- container_cred,
320
- project_id,
321
- gcp_update_tag,
322
- common_job_parameters,
323
- )
136
+ if service_names.dns in enabled_services:
137
+ logger.info("Syncing GCP project %s for DNS.", project_id)
138
+ dns_cred = build_client("dns", "v1")
139
+ dns.sync(
140
+ neo4j_session,
141
+ dns_cred,
142
+ project_id,
143
+ gcp_update_tag,
144
+ common_job_parameters,
145
+ )
324
146
 
147
+ if service_names.iam in enabled_services:
148
+ logger.info("Syncing GCP project %s for IAM.", project_id)
149
+ iam_cred = build_client("iam", "v1")
150
+ iam.sync(
151
+ neo4j_session,
152
+ iam_cred,
153
+ project_id,
154
+ gcp_update_tag,
155
+ common_job_parameters,
156
+ )
325
157
 
326
- def _sync_single_project_dns(
327
- neo4j_session: neo4j.Session,
328
- resources: Resource,
329
- project_id: str,
330
- gcp_update_tag: int,
331
- common_job_parameters: Dict,
332
- ) -> None:
333
- """
334
- Handles graph sync for a single GCP project DNS resources.
335
- :param neo4j_session: The Neo4j session
336
- :param resources: namedtuple of the GCP resource objects
337
- :param project_id: The project ID number to sync. See the `projectId` field in
338
- https://cloud.google.com/resource-manager/reference/rest/v1/projects
339
- :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
340
- :param common_job_parameters: Other parameters sent to Neo4j
341
- :return: Nothing
342
- """
343
- # Determine the resources available on the project.
344
- enabled_services = _services_enabled_on_project(resources.serviceusage, project_id)
345
- dns_cred = _get_dns_resource(get_gcp_credentials())
346
- if service_names.dns in enabled_services:
347
- dns.sync(
348
- neo4j_session,
349
- dns_cred,
350
- project_id,
351
- gcp_update_tag,
352
- common_job_parameters,
353
- )
158
+ del common_job_parameters["PROJECT_ID"]
354
159
 
355
160
 
356
- def _sync_single_project_iam(
357
- neo4j_session: neo4j.Session,
358
- resources: Resource,
359
- project_id: str,
360
- gcp_update_tag: int,
361
- common_job_parameters: Dict,
362
- ) -> None:
161
+ @timeit
162
+ def start_gcp_ingestion(neo4j_session: neo4j.Session, config: Config) -> None:
363
163
  """
364
- Handles graph sync for a single GCP project's IAM resources.
164
+ Starts the GCP ingestion process by initializing Google Application Default Credentials, creating the necessary
165
+ resource objects, listing all GCP organizations and projects available to the GCP identity, and supplying that
166
+ context to all intel modules.
365
167
  :param neo4j_session: The Neo4j session
366
- :param resources: namedtuple of the GCP resource objects
367
- :param project_id: The project ID number to sync. See the `projectId` field in
368
- https://cloud.google.com/resource-manager/reference/rest/v1/projects
369
- :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
370
- :param common_job_parameters: Other parameters sent to Neo4j
168
+ :param config: A `cartography.config` object
371
169
  :return: Nothing
372
170
  """
373
- # Determine if IAM service is enabled
374
- enabled_services = _services_enabled_on_project(resources.serviceusage, project_id)
375
- iam_cred = _get_iam_resource(get_gcp_credentials())
376
- if service_names.iam in enabled_services:
377
- iam.sync(
378
- neo4j_session, iam_cred, project_id, gcp_update_tag, common_job_parameters
379
- )
380
-
171
+ common_job_parameters = {
172
+ "UPDATE_TAG": config.update_tag,
173
+ }
381
174
 
382
- def _sync_multiple_projects(
383
- neo4j_session: neo4j.Session,
384
- resources: Resource,
385
- projects: List[Dict],
386
- gcp_update_tag: int,
387
- common_job_parameters: Dict,
388
- ) -> None:
389
- """
390
- Handles graph sync for multiple GCP projects.
391
- :param neo4j_session: The Neo4j session
392
- :param resources: namedtuple of the GCP resource objects
393
- :param: projects: A list of projects. At minimum, this list should contain a list of dicts with the key "projectId"
394
- defined; so it would look like this: [{"projectId": "my-project-id-12345"}].
395
- This is the returned data from `crm.get_gcp_projects()`.
396
- See https://cloud.google.com/resource-manager/reference/rest/v1/projects.
397
- :param gcp_update_tag: The timestamp value to set our new Neo4j nodes with
398
- :param common_job_parameters: Other parameters sent to Neo4j
399
- :return: Nothing
400
- """
401
- logger.info("Syncing %d GCP projects.", len(projects))
402
- crm.sync_gcp_projects(
403
- neo4j_session,
404
- projects,
405
- gcp_update_tag,
406
- common_job_parameters,
175
+ # IMPORTANT: We defer cleanup for hierarchical resources (orgs, folders, projects) and run them
176
+ # in reverse order. This prevents orphaned nodes when a parent is deleted.
177
+ # Without this, deleting an org would break its relationships to projects/folders, leaving them
178
+ # disconnected and unable to be cleaned up by their own cleanup jobs.
179
+ #
180
+ # Order of operations:
181
+ # 1. Sync all orgs
182
+ # 2. For each org:
183
+ # a. Sync folders and projects
184
+ # b. Sync project resources (with immediate cleanup)
185
+ # c. Clean up projects and folders for this org
186
+ # 3. Clean up all orgs at the end
187
+ #
188
+ # This ensures children are cleaned up before their parents.
189
+
190
+ orgs = sync_gcp_organizations(
191
+ neo4j_session, config.update_tag, common_job_parameters
407
192
  )
408
- # Compute data sync
409
- for project in projects:
410
- project_id = project["projectId"]
411
- common_job_parameters["PROJECT_ID"] = project_id
412
- logger.info("Syncing GCP project %s for Compute.", project_id)
413
- _sync_single_project_compute(
414
- neo4j_session,
415
- resources,
416
- project_id,
417
- gcp_update_tag,
418
- common_job_parameters,
419
- )
420
- del common_job_parameters["PROJECT_ID"]
421
193
 
422
- # Storage data sync
423
- for project in projects:
424
- project_id = project["projectId"]
425
- common_job_parameters["PROJECT_ID"] = project_id
426
- logger.info("Syncing GCP project %s for Storage", project_id)
427
- _sync_single_project_storage(
428
- neo4j_session,
429
- resources,
430
- project_id,
431
- gcp_update_tag,
432
- common_job_parameters,
433
- )
434
- del common_job_parameters["PROJECT_ID"]
194
+ # Track org cleanup jobs to run at the very end
195
+ org_cleanup_jobs = []
435
196
 
436
- # GKE data sync
437
- for project in projects:
438
- project_id = project["projectId"]
439
- common_job_parameters["PROJECT_ID"] = project_id
440
- logger.info("Syncing GCP project %s for GKE", project_id)
441
- _sync_single_project_gke(
197
+ # For each org, sync its folders and projects (as sub-resources), then ingest per-project services
198
+ for org in orgs:
199
+ org_resource_name = org.get("name", "") # e.g., organizations/123456789012
200
+ if not org_resource_name or "/" not in org_resource_name:
201
+ logger.error(f"Invalid org resource name: {org_resource_name}")
202
+ continue
203
+
204
+ # Store the full resource name for cleanup operations
205
+ common_job_parameters["ORG_RESOURCE_NAME"] = org_resource_name
206
+
207
+ # Sync folders under org
208
+ folders = sync_gcp_folders(
442
209
  neo4j_session,
443
- resources,
444
- project_id,
445
- gcp_update_tag,
210
+ config.update_tag,
446
211
  common_job_parameters,
212
+ org_resource_name,
447
213
  )
448
- del common_job_parameters["PROJECT_ID"]
449
214
 
450
- # DNS data sync
451
- for project in projects:
452
- project_id = project["projectId"]
453
- common_job_parameters["PROJECT_ID"] = project_id
454
- logger.info("Syncing GCP project %s for DNS", project_id)
455
- _sync_single_project_dns(
215
+ # Sync projects under org and each folder
216
+ projects = sync_gcp_projects(
456
217
  neo4j_session,
457
- resources,
458
- project_id,
459
- gcp_update_tag,
218
+ org_resource_name,
219
+ folders,
220
+ config.update_tag,
460
221
  common_job_parameters,
461
222
  )
462
- del common_job_parameters["PROJECT_ID"]
463
223
 
464
- # IAM data sync
465
- for project in projects:
466
- project_id = project["projectId"]
467
- common_job_parameters["PROJECT_ID"] = project_id
468
- logger.info("Syncing GCP project %s for IAM", project_id)
469
- _sync_single_project_iam(
470
- neo4j_session, resources, project_id, gcp_update_tag, common_job_parameters
224
+ # Ingest per-project resources (these run their own cleanup immediately since they're leaf nodes)
225
+ _sync_project_resources(
226
+ neo4j_session, projects, config.update_tag, common_job_parameters
471
227
  )
472
- del common_job_parameters["PROJECT_ID"]
473
228
 
474
-
475
- @timeit
476
- def get_gcp_credentials() -> Optional[GoogleCredentials]:
477
- """
478
- Gets access tokens for GCP API access.
479
- :param: None
480
- :return: GoogleCredentials
481
- """
482
- try:
483
- # Explicitly use Application Default Credentials.
484
- # See https://google-auth.readthedocs.io/en/master/user-guide.html#application-default-credentials
485
- credentials, project_id = default()
486
- return credentials
487
- except DefaultCredentialsError as e:
488
- logger.debug(
489
- "Error occurred calling GoogleCredentials.get_application_default().",
490
- exc_info=True,
229
+ # Clean up projects and folders for this org (children before parents)
230
+ logger.debug(f"Running cleanup for projects and folders in {org_resource_name}")
231
+ GraphJob.from_node_schema(GCPProjectSchema(), common_job_parameters).run(
232
+ neo4j_session
491
233
  )
492
- logger.error(
493
- (
494
- "Unable to initialize Google Compute Platform creds. If you don't have GCP data or don't want to load "
495
- "GCP data then you can ignore this message. Otherwise, the error code is: %s "
496
- "Make sure your GCP credentials are configured correctly, your credentials file (if any) is valid, and "
497
- "that the identity you are authenticating to has the securityReviewer role attached."
498
- ),
499
- e,
234
+ GraphJob.from_node_schema(GCPFolderSchema(), common_job_parameters).run(
235
+ neo4j_session
500
236
  )
501
- return None
502
237
 
238
+ # Save org cleanup job for later
239
+ org_cleanup_jobs.append((GCPOrganizationSchema, dict(common_job_parameters)))
503
240
 
504
- @timeit
505
- def start_gcp_ingestion(neo4j_session: neo4j.Session, config: Config) -> None:
506
- """
507
- Starts the GCP ingestion process by initializing Google Application Default Credentials, creating the necessary
508
- resource objects, listing all GCP organizations and projects available to the GCP identity, and supplying that
509
- context to all intel modules.
510
- :param neo4j_session: The Neo4j session
511
- :param config: A `cartography.config` object
512
- :return: Nothing
513
- """
514
- common_job_parameters = {
515
- "UPDATE_TAG": config.update_tag,
516
- }
517
-
518
- credentials = get_gcp_credentials()
519
- if credentials is None:
520
- logger.warning("Unable to initialize GCP credentials. Skipping module.")
521
- return
241
+ # Remove org ID from common job parameters after processing
242
+ del common_job_parameters["ORG_RESOURCE_NAME"]
522
243
 
523
- resources = _initialize_resources(credentials)
524
-
525
- # If we don't have perms to pull Orgs or Folders from GCP, we will skip safely
526
- crm.sync_gcp_organizations(
527
- neo4j_session,
528
- resources.crm_v1,
529
- config.update_tag,
530
- common_job_parameters,
531
- )
532
- crm.sync_gcp_folders(
533
- neo4j_session,
534
- resources.crm_v2,
535
- config.update_tag,
536
- common_job_parameters,
537
- )
538
-
539
- projects = crm.get_gcp_projects(resources.crm_v1)
540
-
541
- _sync_multiple_projects(
542
- neo4j_session,
543
- resources,
544
- projects,
545
- config.update_tag,
546
- common_job_parameters,
547
- )
244
+ # Run all org cleanup jobs at the very end, after all children have been cleaned up
245
+ logger.info("Running cleanup for GCP organizations")
246
+ for schema_class, params in org_cleanup_jobs:
247
+ GraphJob.from_node_schema(schema_class(), params).run(neo4j_session)
548
248
 
549
249
  run_analysis_job(
550
250
  "gcp_compute_asset_inet_exposure.json",