cartography-0.111.0-py3-none-any.whl → cartography-0.112.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cartography might be problematic.

Files changed (40)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +11 -0
  3. cartography/config.py +8 -0
  4. cartography/data/indexes.cypher +0 -2
  5. cartography/intel/aws/apigateway.py +126 -17
  6. cartography/intel/aws/ec2/instances.py +3 -1
  7. cartography/intel/aws/ec2/network_interfaces.py +1 -1
  8. cartography/intel/aws/ec2/vpc_peerings.py +262 -125
  9. cartography/intel/azure/__init__.py +35 -32
  10. cartography/intel/azure/subscription.py +2 -2
  11. cartography/intel/azure/tenant.py +39 -30
  12. cartography/intel/azure/util/credentials.py +49 -174
  13. cartography/intel/entra/__init__.py +47 -1
  14. cartography/intel/entra/applications.py +220 -170
  15. cartography/intel/entra/groups.py +41 -22
  16. cartography/intel/entra/ou.py +28 -20
  17. cartography/intel/entra/users.py +24 -18
  18. cartography/intel/gcp/__init__.py +25 -8
  19. cartography/intel/gcp/compute.py +47 -12
  20. cartography/intel/kubernetes/__init__.py +26 -0
  21. cartography/intel/kubernetes/eks.py +402 -0
  22. cartography/intel/kubernetes/rbac.py +133 -0
  23. cartography/models/aws/apigateway/apigatewayintegration.py +79 -0
  24. cartography/models/aws/apigateway/apigatewaymethod.py +74 -0
  25. cartography/models/aws/ec2/vpc_peering.py +157 -0
  26. cartography/models/azure/principal.py +44 -0
  27. cartography/models/azure/tenant.py +20 -0
  28. cartography/models/kubernetes/clusterrolebindings.py +40 -0
  29. cartography/models/kubernetes/groups.py +107 -0
  30. cartography/models/kubernetes/oidc.py +51 -0
  31. cartography/models/kubernetes/rolebindings.py +40 -0
  32. cartography/models/kubernetes/users.py +105 -0
  33. cartography/util.py +2 -0
  34. {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/METADATA +8 -5
  35. {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/RECORD +39 -31
  36. cartography/data/jobs/cleanup/aws_import_vpc_peering_cleanup.json +0 -45
  37. {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/WHEEL +0 -0
  38. {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/entry_points.txt +0 -0
  39. {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/licenses/LICENSE +0 -0
  40. {cartography-0.111.0.dist-info → cartography-0.112.0.dist-info}/top_level.txt +0 -0
cartography/intel/entra/applications.py
@@ -1,17 +1,20 @@
1
+ import gc
1
2
  import logging
2
3
  from typing import Any
3
- from typing import Dict
4
- from typing import List
4
+ from typing import AsyncGenerator
5
+ from typing import Generator
5
6
 
6
- import httpx
7
7
  import neo4j
8
8
  from azure.identity import ClientSecretCredential
9
- from kiota_abstractions.api_error import APIError
9
+ from msgraph.generated.models.app_role_assignment_collection_response import (
10
+ AppRoleAssignmentCollectionResponse,
11
+ )
12
+ from msgraph.generated.models.application import Application
13
+ from msgraph.generated.models.service_principal import ServicePrincipal
10
14
  from msgraph.graph_service_client import GraphServiceClient
11
15
 
12
16
  from cartography.client.core.tx import load
13
17
  from cartography.graph.job import GraphJob
14
- from cartography.intel.entra.users import load_tenant
15
18
  from cartography.models.entra.app_role_assignment import EntraAppRoleAssignmentSchema
16
19
  from cartography.models.entra.application import EntraApplicationSchema
17
20
  from cartography.util import timeit
@@ -27,25 +30,20 @@ logger = logging.getLogger(__name__)
27
30
  # - You want to minimize API calls (increase values up to 999)
28
31
  # - You're hitting rate limits (decrease values)
29
32
  APPLICATIONS_PAGE_SIZE = 999
30
- APP_ROLE_ASSIGNMENTS_PAGE_SIZE = (
31
- 999 # Currently not used, but reserved for future pagination improvements
32
- )
33
-
34
- # Warning thresholds for potential data completeness issues
35
- # Log warnings when individual users/groups have more assignments than this threshold
36
- HIGH_ASSIGNMENT_COUNT_THRESHOLD = 100
33
+ APP_ROLE_ASSIGNMENTS_PAGE_SIZE = 999
37
34
 
38
35
 
39
36
  @timeit
40
- async def get_entra_applications(client: GraphServiceClient) -> List[Any]:
37
+ async def get_entra_applications(
38
+ client: GraphServiceClient,
39
+ ) -> AsyncGenerator[Application, None]:
41
40
  """
42
- Gets Entra applications using the Microsoft Graph API.
41
+ Gets Entra applications using the Microsoft Graph API with a generator.
43
42
 
44
43
  :param client: GraphServiceClient
45
- :return: List of raw Application objects from Microsoft Graph
44
+ :return: Generator of raw Application objects from Microsoft Graph
46
45
  """
47
- applications = []
48
-
46
+ count = 0
49
47
  # Get all applications with pagination
50
48
  request_configuration = client.applications.ApplicationsRequestBuilderGetRequestConfiguration(
51
49
  query_parameters=client.applications.ApplicationsRequestBuilderGetQueryParameters(
@@ -56,189 +54,192 @@ async def get_entra_applications(client: GraphServiceClient) -> List[Any]:
56
54
 
57
55
  while page:
58
56
  if page.value:
59
- applications.extend(page.value)
57
+ for app in page.value:
58
+ count += 1
59
+ yield app
60
60
 
61
61
  if not page.odata_next_link:
62
62
  break
63
63
  page = await client.applications.with_url(page.odata_next_link).get()
64
64
 
65
- logger.info(f"Retrieved {len(applications)} Entra applications total")
66
- return applications
65
+ logger.info(f"Retrieved {count} Entra applications total")
67
66
 
68
67
 
69
68
  @timeit
70
- async def get_app_role_assignments(
71
- client: GraphServiceClient, applications: List[Any]
72
- ) -> List[Any]:
69
+ async def get_app_role_assignments_for_app(
70
+ client: GraphServiceClient, app: Application
71
+ ) -> AsyncGenerator[dict[str, Any], None]:
73
72
  """
74
- Gets app role assignments efficiently by querying each application's service principal.
73
+ Gets app role assignments for a single application with safety limits.
75
74
 
76
75
  :param client: GraphServiceClient
77
- :param applications: List of Application objects (from get_entra_applications)
78
- :return: List of raw app role assignment objects from Microsoft Graph
76
+ :param app: Application object
77
+ :return: Generator of app role assignment data as dicts
79
78
  """
80
- assignments = []
79
+ if not app.app_id:
80
+ logger.warning(f"Application {app.id} has no app_id, skipping")
81
+ return
81
82
 
82
- for app in applications:
83
- if not app.app_id:
84
- logger.warning(f"Application {app.id} has no app_id, skipping")
85
- continue
86
-
87
- try:
88
- # First, get the service principal for this application
89
- # The service principal represents the app in the directory
90
- service_principals_page = await client.service_principals.get(
91
- request_configuration=client.service_principals.ServicePrincipalsRequestBuilderGetRequestConfiguration(
92
- query_parameters=client.service_principals.ServicePrincipalsRequestBuilderGetQueryParameters(
93
- filter=f"appId eq '{app.app_id}'"
94
- )
95
- )
83
+ logger.info(
84
+ f"Fetching role assignments for application: {app.display_name} ({app.app_id})"
85
+ )
86
+
87
+ # First, get the service principal for this application
88
+ service_principals_page = await client.service_principals.get(
89
+ request_configuration=client.service_principals.ServicePrincipalsRequestBuilderGetRequestConfiguration(
90
+ query_parameters=client.service_principals.ServicePrincipalsRequestBuilderGetQueryParameters(
91
+ filter=f"appId eq '{app.app_id}'"
96
92
  )
93
+ )
94
+ )
97
95
 
98
- if not service_principals_page or not service_principals_page.value:
99
- logger.debug(
100
- f"No service principal found for application {app.app_id} ({app.display_name})"
101
- )
102
- continue
96
+ if not service_principals_page or not service_principals_page.value:
97
+ logger.warning(
98
+ f"No service principal found for application {app.app_id} ({app.display_name}). Continuing."
99
+ )
100
+ return
101
+
102
+ service_principal: ServicePrincipal = service_principals_page.value[0]
103
+
104
+ # Get assignments for this service principal with pagination and limits
105
+ # Use maximum page size (999) to get more data per request
106
+ # Memory is managed through streaming and batching, not page size
107
+ request_config = client.service_principals.by_service_principal_id(
108
+ service_principal.id
109
+ ).app_role_assigned_to.AppRoleAssignedToRequestBuilderGetRequestConfiguration(
110
+ query_parameters=client.service_principals.by_service_principal_id(
111
+ service_principal.id
112
+ ).app_role_assigned_to.AppRoleAssignedToRequestBuilderGetQueryParameters(
113
+ top=APP_ROLE_ASSIGNMENTS_PAGE_SIZE # Maximum allowed by Microsoft Graph API
114
+ )
115
+ )
103
116
 
104
- service_principal = service_principals_page.value[0]
117
+ assignments_page: AppRoleAssignmentCollectionResponse | None = (
118
+ await client.service_principals.by_service_principal_id(
119
+ service_principal.id
120
+ ).app_role_assigned_to.get(request_configuration=request_config)
121
+ )
105
122
 
106
- # Ensure service principal has an ID
107
- if not service_principal.id:
123
+ assignment_count = 0
124
+ page_count = 0
125
+
126
+ while assignments_page:
127
+ page_count += 1
128
+
129
+ if assignments_page.value:
130
+ page_valid_count = 0
131
+ page_skipped_count = 0
132
+
133
+ # Process assignments and immediately yield to avoid accumulation
134
+ for assignment in assignments_page.value:
135
+ # Only yield if we have valid data since it's possible (but unlikely) for assignment.id to be None
136
+ if assignment.principal_id:
137
+ assignment_count += 1
138
+ page_valid_count += 1
139
+ yield {
140
+ "id": assignment.id,
141
+ "app_role_id": assignment.app_role_id,
142
+ "created_date_time": assignment.created_date_time,
143
+ "principal_id": assignment.principal_id,
144
+ "principal_display_name": assignment.principal_display_name,
145
+ "principal_type": assignment.principal_type,
146
+ "resource_display_name": assignment.resource_display_name,
147
+ "resource_id": assignment.resource_id,
148
+ "application_app_id": app.app_id,
149
+ }
150
+ else:
151
+ page_skipped_count += 1
152
+
153
+ # Log page results with details about skipped objects
154
+ if page_skipped_count > 0:
108
155
  logger.warning(
109
- f"Service principal for application {app.app_id} ({app.display_name}) has no ID, skipping"
156
+ f"Page {page_count} for {app.display_name}: {page_valid_count} valid assignments, "
157
+ f"{page_skipped_count} skipped objects. Total valid: {assignment_count}"
110
158
  )
111
- continue
112
-
113
- # Get all assignments for this service principal (users, groups, service principals)
114
- assignments_page = await client.service_principals.by_service_principal_id(
115
- service_principal.id
116
- ).app_role_assigned_to.get()
117
-
118
- app_assignments = []
119
- while assignments_page:
120
- if assignments_page.value:
121
- # Add application context to each assignment
122
- for assignment in assignments_page.value:
123
- # Add the application app_id to the assignment for relationship matching
124
- assignment.application_app_id = app.app_id
125
- app_assignments.extend(assignments_page.value)
126
-
127
- if not assignments_page.odata_next_link:
128
- break
129
- assignments_page = await client.service_principals.with_url(
130
- assignments_page.odata_next_link
131
- ).get()
132
-
133
- # Log warning if a single application has many assignments (potential pagination issues)
134
- if len(app_assignments) >= HIGH_ASSIGNMENT_COUNT_THRESHOLD:
135
- logger.warning(
136
- f"Application {app.display_name} ({app.app_id}) has {len(app_assignments)} role assignments. "
137
- f"If this seems unexpectedly high, there may be pagination limits affecting data completeness."
159
+ else:
160
+ logger.debug(
161
+ f"Page {page_count} for {app.display_name}: {page_valid_count} assignments. "
162
+ f"Total: {assignment_count}"
138
163
  )
139
164
 
140
- assignments.extend(app_assignments)
141
- logger.debug(
142
- f"Retrieved {len(app_assignments)} assignments for application {app.display_name}"
143
- )
165
+ # Force garbage collection after each page
166
+ gc.collect()
144
167
 
145
- except APIError as e:
146
- # Handle Microsoft Graph API errors (403 Forbidden, 404 Not Found, etc.)
147
- if e.response_status_code == 403:
148
- logger.warning(
149
- f"Access denied when fetching app role assignments for application {app.app_id} ({app.display_name}). "
150
- f"This application may not have sufficient permissions or may not exist."
151
- )
152
- elif e.response_status_code == 404:
153
- logger.warning(
154
- f"Application {app.app_id} ({app.display_name}) not found when fetching app role assignments. "
155
- f"Application may have been deleted or does not exist."
156
- )
157
- elif e.response_status_code == 429:
158
- logger.warning(
159
- f"Rate limit hit when fetching app role assignments for application {app.app_id} ({app.display_name}). "
160
- f"Consider reducing APPLICATIONS_PAGE_SIZE or implementing retry logic."
161
- )
162
- else:
163
- logger.warning(
164
- f"Microsoft Graph API error when fetching app role assignments for application {app.app_id} ({app.display_name}): "
165
- f"Status {e.response_status_code}, Error: {str(e)}"
166
- )
167
- continue
168
- except (httpx.TimeoutException, httpx.ConnectError, httpx.NetworkError) as e:
169
- # Handle network-related errors
170
- logger.warning(
171
- f"Network error when fetching app role assignments for application {app.app_id} ({app.display_name}): {e}"
172
- )
173
- continue
174
- except Exception as e:
175
- # Only catch truly unexpected errors - these should be rare
176
- logger.error(
177
- f"Unexpected error when fetching app role assignments for application {app.app_id} ({app.display_name}): {e}",
178
- exc_info=True,
179
- )
180
- continue
168
+ # Check if we have more pages to fetch
169
+ if not assignments_page.odata_next_link:
170
+ break
171
+
172
+ # Clear previous page before fetching next
173
+ assignments_page.value = None
174
+
175
+ # Fetch next page
176
+ logger.debug(
177
+ f"Fetching page {page_count + 1} of assignments for {app.display_name}"
178
+ )
179
+ next_page_url = assignments_page.odata_next_link
180
+ assignments_page = await client.service_principals.with_url(next_page_url).get()
181
181
 
182
- logger.info(f"Retrieved {len(assignments)} app role assignments total")
183
- return assignments
182
+ logger.info(
183
+ f"Successfully retrieved {assignment_count} assignments for application {app.display_name} (pages: {page_count})"
184
+ )
184
185
 
185
186
 
186
- def transform_applications(applications: List[Any]) -> List[Dict[str, Any]]:
187
+ def transform_applications(
188
+ applications: list[Application],
189
+ ) -> Generator[dict[str, Any], None, None]:
187
190
  """
188
- Transform application data for graph loading.
191
+ Transform application data for graph loading using a generator.
189
192
 
190
193
  :param applications: Raw Application objects from Microsoft Graph API
191
- :return: Transformed application data for graph loading
194
+ :return: Generator of transformed application data for graph loading
192
195
  """
193
- result = []
194
196
  for app in applications:
195
- transformed = {
197
+ yield {
196
198
  "id": app.id,
197
199
  "app_id": app.app_id,
198
200
  "display_name": app.display_name,
199
- "publisher_domain": getattr(app, "publisher_domain", None),
201
+ "publisher_domain": app.publisher_domain,
200
202
  "sign_in_audience": app.sign_in_audience,
201
203
  }
202
- result.append(transformed)
203
- return result
204
204
 
205
205
 
206
206
  def transform_app_role_assignments(
207
- assignments: List[Any],
208
- ) -> List[Dict[str, Any]]:
207
+ assignments: list[dict[str, Any]],
208
+ ) -> list[dict[str, Any]]:
209
209
  """
210
210
  Transform app role assignment data for graph loading.
211
211
 
212
- :param assignments: Raw app role assignment objects from Microsoft Graph API
212
+ :param assignments: Raw app role assignment data as dicts
213
213
  :return: Transformed assignment data for graph loading
214
214
  """
215
- result = []
216
- for assignment in assignments:
217
- transformed = {
218
- "id": assignment.id,
219
- "app_role_id": (
220
- str(assignment.app_role_id) if assignment.app_role_id else None
221
- ),
222
- "created_date_time": assignment.created_date_time,
223
- "principal_id": (
224
- str(assignment.principal_id) if assignment.principal_id else None
225
- ),
226
- "principal_display_name": assignment.principal_display_name,
227
- "principal_type": assignment.principal_type,
228
- "resource_display_name": assignment.resource_display_name,
229
- "resource_id": (
230
- str(assignment.resource_id) if assignment.resource_id else None
231
- ),
232
- "application_app_id": getattr(assignment, "application_app_id", None),
233
- }
234
- result.append(transformed)
235
- return result
215
+ transformed = []
216
+ for assign in assignments:
217
+ transformed.append(
218
+ {
219
+ "id": assign["id"],
220
+ "app_role_id": (
221
+ str(assign["app_role_id"]) if assign["app_role_id"] else None
222
+ ),
223
+ "created_date_time": assign["created_date_time"],
224
+ "principal_id": (
225
+ str(assign["principal_id"]) if assign["principal_id"] else None
226
+ ),
227
+ "principal_display_name": assign["principal_display_name"],
228
+ "principal_type": assign["principal_type"],
229
+ "resource_display_name": assign["resource_display_name"],
230
+ "resource_id": (
231
+ str(assign["resource_id"]) if assign["resource_id"] else None
232
+ ),
233
+ "application_app_id": assign["application_app_id"],
234
+ }
235
+ )
236
+ return transformed
236
237
 
237
238
 
238
239
  @timeit
239
240
  def load_applications(
240
241
  neo4j_session: neo4j.Session,
241
- applications_data: List[Dict[str, Any]],
242
+ applications_data: list[dict[str, Any]],
242
243
  update_tag: int,
243
244
  tenant_id: str,
244
245
  ) -> None:
@@ -262,7 +263,7 @@ def load_applications(
262
263
  @timeit
263
264
  def load_app_role_assignments(
264
265
  neo4j_session: neo4j.Session,
265
- assignments_data: List[Dict[str, Any]],
266
+ assignments_data: list[dict[str, Any]],
266
267
  update_tag: int,
267
268
  tenant_id: str,
268
269
  ) -> None:
@@ -285,7 +286,7 @@ def load_app_role_assignments(
285
286
 
286
287
  @timeit
287
288
  def cleanup_applications(
288
- neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]
289
+ neo4j_session: neo4j.Session, common_job_parameters: dict[str, Any]
289
290
  ) -> None:
290
291
  """
291
292
  Delete Entra applications and their relationships from the graph if they were not updated in the last sync.
@@ -300,7 +301,7 @@ def cleanup_applications(
300
301
 
301
302
  @timeit
302
303
  def cleanup_app_role_assignments(
303
- neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]
304
+ neo4j_session: neo4j.Session, common_job_parameters: dict[str, Any]
304
305
  ) -> None:
305
306
  """
306
307
  Delete Entra app role assignments and their relationships from the graph if they were not updated in the last sync.
@@ -320,7 +321,7 @@ async def sync_entra_applications(
320
321
  client_id: str,
321
322
  client_secret: str,
322
323
  update_tag: int,
323
- common_job_parameters: Dict[str, Any],
324
+ common_job_parameters: dict[str, Any],
324
325
  ) -> None:
325
326
  """
326
327
  Sync Entra applications and their app role assignments to the graph.
@@ -344,22 +345,71 @@ async def sync_entra_applications(
344
345
  scopes=["https://graph.microsoft.com/.default"],
345
346
  )
346
347
 
347
- # Load tenant (prerequisite)
348
- load_tenant(neo4j_session, {"id": tenant_id}, update_tag)
349
-
350
- # Get and transform applications data
351
- applications_data = await get_entra_applications(client)
352
- transformed_applications = transform_applications(applications_data)
348
+ # Process applications and their assignments in batches
349
+ app_batch_size = 10 # Batch size for applications
350
+ assignment_batch_size = (
351
+ 200 # Batch size for assignments (increased since we handle memory better now)
352
+ )
353
353
 
354
- # Get and transform app role assignments data
355
- assignments_data = await get_app_role_assignments(client, applications_data)
356
- transformed_assignments = transform_app_role_assignments(assignments_data)
354
+ apps_batch = []
355
+ assignments_batch = []
356
+ total_assignment_count = 0
357
+ total_app_count = 0
358
+
359
+ # Stream apps
360
+ async for app in get_entra_applications(client):
361
+ total_app_count += 1
362
+ apps_batch.append(app)
363
+
364
+ # Transform and load applications in batches
365
+ if len(apps_batch) >= app_batch_size:
366
+ transformed_apps = list(transform_applications(apps_batch))
367
+ load_applications(neo4j_session, transformed_apps, update_tag, tenant_id)
368
+ logger.info(
369
+ f"Loaded batch of {len(apps_batch)} applications (total: {total_app_count})"
370
+ )
371
+ apps_batch.clear()
372
+ transformed_apps.clear()
373
+ gc.collect() # Force garbage collection
374
+
375
+ # Stream app role assignments
376
+ async for assignment in get_app_role_assignments_for_app(client, app):
377
+ assignments_batch.append(assignment)
378
+ total_assignment_count += 1
379
+
380
+ # Transform and load assignments in batches
381
+ if len(assignments_batch) >= assignment_batch_size:
382
+ transformed_assignments = transform_app_role_assignments(
383
+ assignments_batch
384
+ )
385
+ load_app_role_assignments(
386
+ neo4j_session, transformed_assignments, update_tag, tenant_id
387
+ )
388
+ logger.debug(f"Loaded batch of {len(assignments_batch)} assignments")
389
+ assignments_batch.clear()
390
+ transformed_assignments.clear()
391
+
392
+ # Force garbage collection after batch load
393
+ gc.collect()
394
+
395
+ # Process remaining applications
396
+ if apps_batch:
397
+ transformed_apps = list(transform_applications(apps_batch))
398
+ load_applications(neo4j_session, transformed_apps, update_tag, tenant_id)
399
+ apps_batch.clear()
400
+ transformed_apps.clear()
401
+
402
+ # Process remaining assignments
403
+ if assignments_batch:
404
+ transformed_assignments = transform_app_role_assignments(assignments_batch)
405
+ load_app_role_assignments(
406
+ neo4j_session, transformed_assignments, update_tag, tenant_id
407
+ )
408
+ assignments_batch.clear()
409
+ transformed_assignments.clear()
357
410
 
358
- # Load applications and assignments
359
- load_applications(neo4j_session, transformed_applications, update_tag, tenant_id)
360
- load_app_role_assignments(
361
- neo4j_session, transformed_assignments, update_tag, tenant_id
362
- )
411
+ # Final garbage collection
412
+ gc.collect()
363
413
 
364
414
  # Cleanup stale data
365
415
  cleanup_applications(neo4j_session, common_job_parameters)
cartography/intel/entra/groups.py
@@ -1,5 +1,7 @@
1
1
  import logging
2
2
  from typing import Any
3
+ from typing import AsyncGenerator
4
+ from typing import Generator
3
5
 
4
6
  import neo4j
5
7
  from azure.identity import ClientSecretCredential
@@ -9,7 +11,6 @@ from msgraph.generated.models.group import Group
9
11
 
10
12
  from cartography.client.core.tx import load
11
13
  from cartography.graph.job import GraphJob
12
- from cartography.intel.entra.users import load_tenant
13
14
  from cartography.models.entra.group import EntraGroupSchema
14
15
  from cartography.util import timeit
15
16
 
@@ -17,23 +18,20 @@ logger = logging.getLogger(__name__)
17
18
 
18
19
 
19
20
  @timeit
20
- async def get_entra_groups(client: GraphServiceClient) -> list[Group]:
21
- """Get all groups from Microsoft Graph API with pagination."""
22
- all_groups: list[Group] = []
23
-
21
+ async def get_entra_groups(client: GraphServiceClient) -> AsyncGenerator[Group, None]:
22
+ """Get all groups from Microsoft Graph API with pagination using a generator."""
24
23
  request_configuration = client.groups.GroupsRequestBuilderGetRequestConfiguration(
25
24
  query_parameters=client.groups.GroupsRequestBuilderGetQueryParameters(top=999)
26
25
  )
27
26
  page = await client.groups.get(request_configuration=request_configuration)
28
27
  while page:
29
28
  if page.value:
30
- all_groups.extend(page.value)
29
+ for group in page.value:
30
+ yield group
31
31
  if not page.odata_next_link:
32
32
  break
33
33
  page = await client.groups.with_url(page.odata_next_link).get()
34
34
 
35
- return all_groups
36
-
37
35
 
38
36
  @timeit
39
37
  async def get_group_members(
@@ -82,11 +80,10 @@ def transform_groups(
82
80
  user_member_map: dict[str, list[str]],
83
81
  group_member_map: dict[str, list[str]],
84
82
  group_owner_map: dict[str, list[str]],
85
- ) -> list[dict[str, Any]]:
86
- """Transform API responses into dictionaries for ingestion."""
87
- result: list[dict[str, Any]] = []
83
+ ) -> Generator[dict[str, Any], None, None]:
84
+ """Transform API responses into dictionaries for ingestion using a generator."""
88
85
  for g in groups:
89
- transformed = {
86
+ yield {
90
87
  "id": g.id,
91
88
  "display_name": g.display_name,
92
89
  "description": g.description,
@@ -103,8 +100,6 @@ def transform_groups(
103
100
  "member_group_ids": group_member_map.get(g.id, []),
104
101
  "owner_ids": group_owner_map.get(g.id, []),
105
102
  }
106
- result.append(transformed)
107
- return result
108
103
 
109
104
 
110
105
  @timeit
@@ -150,17 +145,22 @@ async def sync_entra_groups(
150
145
  credential, scopes=["https://graph.microsoft.com/.default"]
151
146
  )
152
147
 
153
- groups = await get_entra_groups(client)
148
+ # Collect groups in batches to avoid loading all at once
149
+ groups_batch = []
150
+ batch_size = 100 # Process groups in batches
154
151
 
155
152
  user_member_map: dict[str, list[str]] = {}
156
153
  group_member_map: dict[str, list[str]] = {}
157
154
  group_owner_map: dict[str, list[str]] = {}
158
155
 
159
- for group in groups:
156
+ # First pass: collect groups and their owners/members
157
+ async for group in get_entra_groups(client):
158
+ groups_batch.append(group)
159
+
160
+ # Fetch owners and members for this group
160
161
  owners = await get_group_owners(client, group.id)
161
162
  group_owner_map[group.id] = owners
162
163
 
163
- for group in groups:
164
164
  try:
165
165
  users, subgroups = await get_group_members(client, group.id)
166
166
  user_member_map[group.id] = users
@@ -170,10 +170,29 @@ async def sync_entra_groups(
170
170
  user_member_map[group.id] = []
171
171
  group_member_map[group.id] = []
172
172
 
173
- transformed_groups = transform_groups(
174
- groups, user_member_map, group_member_map, group_owner_map
175
- )
173
+ # Process batch when it reaches the size limit
174
+ if len(groups_batch) >= batch_size:
175
+ transformed_groups = list(
176
+ transform_groups(
177
+ groups_batch, user_member_map, group_member_map, group_owner_map
178
+ )
179
+ )
180
+ load_groups(neo4j_session, transformed_groups, update_tag, tenant_id)
181
+
182
+ # Clear the batch and maps for processed groups
183
+ for g in groups_batch:
184
+ user_member_map.pop(g.id, None)
185
+ group_member_map.pop(g.id, None)
186
+ group_owner_map.pop(g.id, None)
187
+ groups_batch.clear()
188
+
189
+ # Process any remaining groups
190
+ if groups_batch:
191
+ transformed_groups = list(
192
+ transform_groups(
193
+ groups_batch, user_member_map, group_member_map, group_owner_map
194
+ )
195
+ )
196
+ load_groups(neo4j_session, transformed_groups, update_tag, tenant_id)
176
197
 
177
- load_tenant(neo4j_session, {"id": tenant_id}, update_tag)
178
- load_groups(neo4j_session, transformed_groups, update_tag, tenant_id)
179
198
  cleanup_groups(neo4j_session, common_job_parameters)