cartography 0.107.0rc2__py3-none-any.whl → 0.108.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cartography might be problematic. Click here for more details.

Files changed (40)
  1. cartography/_version.py +2 -2
  2. cartography/cli.py +10 -0
  3. cartography/config.py +5 -0
  4. cartography/data/indexes.cypher +0 -8
  5. cartography/data/jobs/cleanup/github_repos_cleanup.json +2 -0
  6. cartography/intel/aws/__init__.py +1 -0
  7. cartography/intel/aws/cloudwatch.py +77 -0
  8. cartography/intel/aws/ec2/security_groups.py +140 -122
  9. cartography/intel/aws/ec2/snapshots.py +47 -84
  10. cartography/intel/aws/ec2/subnets.py +1 -1
  11. cartography/intel/aws/ecs.py +17 -0
  12. cartography/intel/aws/guardduty.py +275 -0
  13. cartography/intel/aws/resources.py +2 -0
  14. cartography/intel/github/repos.py +370 -28
  15. cartography/intel/sentinelone/__init__.py +8 -2
  16. cartography/intel/sentinelone/application.py +248 -0
  17. cartography/intel/sentinelone/utils.py +20 -1
  18. cartography/models/aws/cloudwatch/log_metric_filter.py +79 -0
  19. cartography/models/aws/ec2/networkinterfaces.py +2 -0
  20. cartography/models/aws/ec2/security_group_rules.py +109 -0
  21. cartography/models/aws/ec2/security_groups.py +90 -0
  22. cartography/models/aws/ec2/snapshots.py +58 -0
  23. cartography/models/aws/ec2/subnet_instance.py +2 -0
  24. cartography/models/aws/ec2/subnet_networkinterface.py +2 -0
  25. cartography/models/aws/ec2/volumes.py +20 -0
  26. cartography/models/aws/ecs/tasks.py +24 -1
  27. cartography/models/aws/guardduty/__init__.py +1 -0
  28. cartography/models/aws/guardduty/findings.py +102 -0
  29. cartography/models/github/dependencies.py +74 -0
  30. cartography/models/github/manifests.py +49 -0
  31. cartography/models/sentinelone/application.py +44 -0
  32. cartography/models/sentinelone/application_version.py +96 -0
  33. {cartography-0.107.0rc2.dist-info → cartography-0.108.0rc1.dist-info}/METADATA +3 -3
  34. {cartography-0.107.0rc2.dist-info → cartography-0.108.0rc1.dist-info}/RECORD +38 -28
  35. cartography/data/jobs/cleanup/aws_import_ec2_security_groupinfo_cleanup.json +0 -24
  36. cartography/data/jobs/cleanup/aws_import_snapshots_cleanup.json +0 -30
  37. {cartography-0.107.0rc2.dist-info → cartography-0.108.0rc1.dist-info}/WHEEL +0 -0
  38. {cartography-0.107.0rc2.dist-info → cartography-0.108.0rc1.dist-info}/entry_points.txt +0 -0
  39. {cartography-0.107.0rc2.dist-info → cartography-0.108.0rc1.dist-info}/licenses/LICENSE +0 -0
  40. {cartography-0.107.0rc2.dist-info → cartography-0.108.0rc1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import configparser
2
2
  import logging
3
+ from collections import defaultdict
3
4
  from collections import namedtuple
4
5
  from string import Template
5
6
  from typing import Any
@@ -12,8 +13,12 @@ from packaging.requirements import InvalidRequirement
12
13
  from packaging.requirements import Requirement
13
14
  from packaging.utils import canonicalize_name
14
15
 
16
+ from cartography.client.core.tx import load as load_data
17
+ from cartography.graph.job import GraphJob
15
18
  from cartography.intel.github.util import fetch_all
16
19
  from cartography.intel.github.util import PaginatedGraphqlData
20
+ from cartography.models.github.dependencies import GitHubDependencySchema
21
+ from cartography.models.github.manifests import DependencyGraphManifestSchema
17
22
  from cartography.util import backoff_handler
18
23
  from cartography.util import retries_with_backoff
19
24
  from cartography.util import run_cleanup_job
@@ -93,6 +98,18 @@ GITHUB_ORG_REPOS_PAGINATED_GRAPHQL = """
93
98
  text
94
99
  }
95
100
  }
101
+ dependencyGraphManifests(first: 20) {
102
+ nodes {
103
+ blobPath
104
+ dependencies(first: 100) {
105
+ nodes {
106
+ packageName
107
+ requirements
108
+ packageManager
109
+ }
110
+ }
111
+ }
112
+ }
96
113
  }
97
114
  }
98
115
  }
@@ -291,8 +308,10 @@ def transform(
291
308
  :param outside_collaborators: dict of repo URL to list of outside collaborators.
292
309
  See tests.data.github.repos.OUTSIDE_COLLABORATORS for data shape.
293
310
  :return: Dict containing the repos, repo->language mapping, owners->repo mapping, outside collaborators->repo
294
- mapping, and Python requirements files (if any) in a repo.
311
+ mapping, Python requirements files (if any) in a repo, manifests from GitHub's dependency graph, and all
312
+ dependencies from GitHub's dependency graph.
295
313
  """
314
+ logger.info(f"Processing {len(repos_json)} GitHub repositories")
296
315
  transformed_repo_list: List[Dict] = []
297
316
  transformed_repo_languages: List[Dict] = []
298
317
  transformed_repo_owners: List[Dict] = []
@@ -312,6 +331,8 @@ def transform(
312
331
  "WRITE": [],
313
332
  }
314
333
  transformed_requirements_files: List[Dict] = []
334
+ transformed_dependencies: List[Dict] = []
335
+ transformed_manifests: List[Dict] = []
315
336
  for repo_object in repos_json:
316
337
  _transform_repo_languages(
317
338
  repo_object["url"],
@@ -350,6 +371,16 @@ def transform(
350
371
  repo_url,
351
372
  transformed_requirements_files,
352
373
  )
374
+ _transform_dependency_manifests(
375
+ repo_object.get("dependencyGraphManifests"),
376
+ repo_url,
377
+ transformed_manifests,
378
+ )
379
+ _transform_dependency_graph(
380
+ repo_object.get("dependencyGraphManifests"),
381
+ repo_url,
382
+ transformed_dependencies,
383
+ )
353
384
  results = {
354
385
  "repos": transformed_repo_list,
355
386
  "repo_languages": transformed_repo_languages,
@@ -357,7 +388,10 @@ def transform(
357
388
  "repo_outside_collaborators": transformed_outside_collaborators,
358
389
  "repo_direct_collaborators": transformed_direct_collaborators,
359
390
  "python_requirements": transformed_requirements_files,
391
+ "dependencies": transformed_dependencies,
392
+ "manifests": transformed_manifests,
360
393
  }
394
+
361
395
  return results
362
396
 
363
397
 
@@ -533,6 +567,185 @@ def _transform_setup_cfg_requirements(
533
567
  _transform_python_requirements(requirements_list, repo_url, out_requirements_files)
534
568
 
535
569
 
570
+ def _transform_dependency_manifests(
571
+ dependency_manifests: Optional[Dict],
572
+ repo_url: str,
573
+ out_manifests_list: List[Dict],
574
+ ) -> None:
575
+ """
576
+ Transform GitHub dependency graph manifests into cartography manifest format.
577
+ :param dependency_manifests: dependencyGraphManifests from GitHub GraphQL API
578
+ :param repo_url: The URL of the GitHub repo
579
+ :param out_manifests_list: Output array to append transformed results to
580
+ :return: Nothing
581
+ """
582
+ if not dependency_manifests or not dependency_manifests.get("nodes"):
583
+ return
584
+
585
+ manifests_added = 0
586
+
587
+ for manifest in dependency_manifests["nodes"]:
588
+ blob_path = manifest.get("blobPath", "")
589
+ if not blob_path:
590
+ continue
591
+
592
+ # Count dependencies in this manifest
593
+ dependencies = manifest.get("dependencies", {})
594
+ dependencies_count = len(dependencies.get("nodes", []) if dependencies else [])
595
+
596
+ # Create unique manifest ID by combining repo URL and blob path
597
+ manifest_id = f"{repo_url}#{blob_path}"
598
+
599
+ # Extract filename from blob path
600
+ filename = blob_path.split("/")[-1] if blob_path else "None"
601
+
602
+ out_manifests_list.append(
603
+ {
604
+ "id": manifest_id,
605
+ "blob_path": blob_path,
606
+ "filename": filename,
607
+ "dependencies_count": dependencies_count,
608
+ "repo_url": repo_url,
609
+ }
610
+ )
611
+ manifests_added += 1
612
+
613
+ if manifests_added > 0:
614
+ repo_name = repo_url.split("/")[-1] if repo_url else "repository"
615
+ logger.info(f"Found {manifests_added} dependency manifests in {repo_name}")
616
+
617
+
618
def _transform_dependency_graph(
    dependency_manifests: Optional[Dict],
    repo_url: str,
    out_dependencies_list: List[Dict],
) -> None:
    """
    Transform GitHub dependency graph manifests into cartography dependency format.

    :param dependency_manifests: dependencyGraphManifests object from the GitHub
        GraphQL API
    :param repo_url: The URL of the GitHub repo
    :param out_dependencies_list: Output list to append transformed results to
    :return: Nothing
    """
    if not dependency_manifests or not dependency_manifests.get("nodes"):
        return

    dependencies_added = 0

    for manifest in dependency_manifests["nodes"]:
        dependencies = manifest.get("dependencies", {})
        if not dependencies or not dependencies.get("nodes"):
            continue

        manifest_path = manifest.get("blobPath", "")

        for dep in dependencies["nodes"]:
            package_name = dep.get("packageName")
            if not package_name:
                continue

            # The GraphQL API can return explicit nulls for these fields; a
            # dict.get default does NOT apply then, so coalesce with `or` to
            # avoid AttributeError on None.upper().
            requirements = dep.get("requirements") or ""
            package_manager = (dep.get("packageManager") or "").upper()

            # Extract version from requirements string if available
            pinned_version = _extract_version_from_requirements(requirements)

            # Create ecosystem-specific canonical name
            canonical_name = _canonicalize_dependency_name(
                package_name, package_manager
            )

            # Create ecosystem identifier
            ecosystem = package_manager.lower() if package_manager else "unknown"

            # Create simple dependency ID using canonical name and version.
            # This allows the same dependency to be shared across multiple repos.
            dependency_id = (
                f"{canonical_name}|{pinned_version}"
                if pinned_version
                else canonical_name
            )

            # Normalize requirements field (prefer None over empty string)
            normalized_requirements = requirements if requirements else None

            # Create manifest ID for the HAS_DEP relationship
            manifest_id = f"{repo_url}#{manifest_path}"

            out_dependencies_list.append(
                {
                    "id": dependency_id,
                    "name": canonical_name,
                    "original_name": package_name,  # Keep original for reference
                    "version": pinned_version,
                    "requirements": normalized_requirements,
                    "ecosystem": ecosystem,
                    "package_manager": package_manager,
                    "manifest_path": manifest_path,
                    "manifest_id": manifest_id,
                    "repo_url": repo_url,
                    # Add separate fields for easier querying
                    "repo_name": repo_url.split("/")[-1] if repo_url else "",
                    "manifest_file": (
                        manifest_path.split("/")[-1] if manifest_path else ""
                    ),
                }
            )
            dependencies_added += 1

    if dependencies_added > 0:
        repo_name = repo_url.split("/")[-1] if repo_url else "repository"
        logger.info(f"Found {dependencies_added} dependencies in {repo_name}")
699
+
700
+
701
+ def _extract_version_from_requirements(requirements: Optional[str]) -> Optional[str]:
702
+ """
703
+ Extract a pinned version from a requirements string if it exists.
704
+ Examples: "1.2.3" -> "1.2.3", "^1.2.3" -> None, ">=1.0,<2.0" -> None
705
+ """
706
+ if not requirements or not requirements.strip():
707
+ return None
708
+
709
+ # Handle exact version specifications (no operators)
710
+ if requirements and not any(
711
+ op in requirements for op in ["^", "~", ">", "<", "=", "*"]
712
+ ):
713
+ stripped = requirements.strip()
714
+ return stripped if stripped else None
715
+
716
+ # Handle == specifications
717
+ if "==" in requirements:
718
+ parts = requirements.split("==")
719
+ if len(parts) == 2:
720
+ version = parts[1].strip()
721
+ # Remove any trailing constraints
722
+ version = version.split(",")[0].split(" ")[0]
723
+ return version if version else None
724
+
725
+ return None
726
+
727
+
728
+ def _canonicalize_dependency_name(name: str, package_manager: Optional[str]) -> str:
729
+ """
730
+ Canonicalize dependency names based on ecosystem conventions.
731
+ """
732
+ if not name:
733
+ return name
734
+
735
+ # For Python packages, use existing canonicalization
736
+ if package_manager in ["PIP", "CONDA"]:
737
+ try:
738
+ from packaging.utils import canonicalize_name
739
+
740
+ return str(canonicalize_name(name))
741
+ except ImportError:
742
+ # Fallback if packaging not available
743
+ return name.lower().replace("_", "-")
744
+
745
+ # For other ecosystems, use lowercase
746
+ return name.lower()
747
+
748
+
536
749
  def _transform_python_requirements(
537
750
  requirements_list: List[str],
538
751
  repo_url: str,
@@ -785,6 +998,136 @@ def load_collaborators(
785
998
  )
786
999
 
787
1000
 
1001
@timeit
def load_python_requirements(
    neo4j_session: neo4j.Session,
    update_tag: int,
    requirements_objects: List[Dict],
) -> None:
    """
    Ingest Python requirements into Neo4j as PythonLibrary/Dependency nodes and
    attach them to their GitHubRepository via REQUIRES relationships.

    :param neo4j_session: Neo4j session object for server communication
    :param update_tag: Timestamp used to determine data freshness
    :param requirements_objects: Transformed requirement dicts (id, name,
        version, specifier, repo_url)
    :return: Nothing
    """
    ingest_query = """
    UNWIND $Requirements AS req
    MERGE (lib:PythonLibrary:Dependency{id: req.id})
    ON CREATE SET lib.firstseen = timestamp(),
    lib.name = req.name
    SET lib.lastupdated = $UpdateTag,
    lib.version = req.version

    WITH lib, req
    MATCH (repo:GitHubRepository{id: req.repo_url})
    MERGE (repo)-[r:REQUIRES]->(lib)
    ON CREATE SET r.firstseen = timestamp()
    SET r.lastupdated = $UpdateTag,
    r.specifier = req.specifier
    """
    neo4j_session.run(
        ingest_query,
        Requirements=requirements_objects,
        UpdateTag=update_tag,
    )
1027
+
1028
+
1029
@timeit
def load_github_dependencies(
    neo4j_session: neo4j.Session,
    update_tag: int,
    dependencies: List[Dict],
) -> None:
    """
    Ingest GitHub dependency data into Neo4j using the new data model.

    :param neo4j_session: Neo4J session object for server communication
    :param update_tag: Timestamp used to determine data freshness
    :param dependencies: List of dependency objects from GitHub's dependency graph
    :return: Nothing
    """
    # The schema loader scopes each load to a (repo_url, manifest_id) pair,
    # so bucket the dependency records by that pair first.
    grouped = defaultdict(list)
    for record in dependencies:
        bucket_key = (record["repo_url"], record["manifest_id"])
        # repo_url / manifest_id are supplied as load kwargs, so strip them
        # from the per-record payload.
        trimmed = {
            field: value
            for field, value in record.items()
            if field not in ("repo_url", "manifest_id")
        }
        grouped[bucket_key].append(trimmed)

    # One load_data call per repository/manifest combination.
    for (repo_url, manifest_id), batch in grouped.items():
        load_data(
            neo4j_session,
            GitHubDependencySchema(),
            batch,
            lastupdated=update_tag,
            repo_url=repo_url,
            manifest_id=manifest_id,
        )
1069
+
1070
+
1071
@timeit
def load_github_dependency_manifests(
    neo4j_session: neo4j.Session,
    update_tag: int,
    manifests: List[Dict],
) -> None:
    """
    Ingest GitHub dependency manifests into Neo4j, batching by repository.

    :param neo4j_session: Neo4j session object for server communication
    :param update_tag: Timestamp used to determine data freshness
    :param manifests: Transformed manifest dicts, each carrying a repo_url
    :return: Nothing
    """
    # Bucket manifests by owning repository; the schema loader is scoped per repo.
    by_repo = defaultdict(list)
    for record in manifests:
        by_repo[record["repo_url"]].append(record)

    # One load_data call per repository.
    for repo_url, repo_manifests in by_repo.items():
        load_data(
            neo4j_session,
            DependencyGraphManifestSchema(),
            repo_manifests,
            lastupdated=update_tag,
            repo_url=repo_url,
        )
1095
+
1096
+
1097
@timeit
def cleanup_github_dependencies(
    neo4j_session: neo4j.Session,
    common_job_parameters: Dict[str, Any],
    repo_urls: List[str],
) -> None:
    """
    Delete GitHub dependencies and their relationships from the graph if they
    were not updated in the last sync.
    :param neo4j_session: Neo4j session
    :param common_job_parameters: Common job parameters containing UPDATE_TAG
    :param repo_urls: List of repository URLs to clean up dependencies for
    """
    # Run cleanup for each repository separately
    for repo_url in repo_urls:
        cleanup_params = {**common_job_parameters, "repo_url": repo_url}
        GraphJob.from_node_schema(GitHubDependencySchema(), cleanup_params).run(
            neo4j_session
        )
1109
+
1110
+
1111
@timeit
def cleanup_github_manifests(
    neo4j_session: neo4j.Session,
    common_job_parameters: Dict[str, Any],
    repo_urls: List[str],
) -> None:
    """
    Delete GitHub dependency manifests and their relationships from the graph if they were not updated in the last sync.
    :param neo4j_session: Neo4j session
    :param common_job_parameters: Common job parameters containing UPDATE_TAG
    :param repo_urls: List of repository URLs to clean up manifests for
    """
    # Scope the cleanup to one repository at a time so stale manifests in one
    # repo cannot affect another's.
    for url in repo_urls:
        scoped_params = dict(common_job_parameters, repo_url=url)
        job = GraphJob.from_node_schema(DependencyGraphManifestSchema(), scoped_params)
        job.run(neo4j_session)
1129
+
1130
+
788
1131
  @timeit
789
1132
  def load(
790
1133
  neo4j_session: neo4j.Session,
@@ -823,33 +1166,15 @@ def load(
823
1166
  common_job_parameters["UPDATE_TAG"],
824
1167
  repo_data["python_requirements"],
825
1168
  )
826
-
827
-
828
- @timeit
829
- def load_python_requirements(
830
- neo4j_session: neo4j.Session,
831
- update_tag: int,
832
- requirements_objects: List[Dict],
833
- ) -> None:
834
- query = """
835
- UNWIND $Requirements AS req
836
- MERGE (lib:PythonLibrary:Dependency{id: req.id})
837
- ON CREATE SET lib.firstseen = timestamp(),
838
- lib.name = req.name
839
- SET lib.lastupdated = $UpdateTag,
840
- lib.version = req.version
841
-
842
- WITH lib, req
843
- MATCH (repo:GitHubRepository{id: req.repo_url})
844
- MERGE (repo)-[r:REQUIRES]->(lib)
845
- ON CREATE SET r.firstseen = timestamp()
846
- SET r.lastupdated = $UpdateTag,
847
- r.specifier = req.specifier
848
- """
849
- neo4j_session.run(
850
- query,
851
- Requirements=requirements_objects,
852
- UpdateTag=update_tag,
1169
+ load_github_dependency_manifests(
1170
+ neo4j_session,
1171
+ common_job_parameters["UPDATE_TAG"],
1172
+ repo_data["manifests"],
1173
+ )
1174
+ load_github_dependencies(
1175
+ neo4j_session,
1176
+ common_job_parameters["UPDATE_TAG"],
1177
+ repo_data["dependencies"],
853
1178
  )
854
1179
 
855
1180
 
@@ -896,4 +1221,21 @@ def sync(
896
1221
  )
897
1222
  repo_data = transform(repos_json, direct_collabs, outside_collabs)
898
1223
  load(neo4j_session, common_job_parameters, repo_data)
1224
+
1225
+ # Collect repository URLs that have dependencies for cleanup
1226
+ repo_urls_with_dependencies = list(
1227
+ {dep["repo_url"] for dep in repo_data["dependencies"]}
1228
+ )
1229
+ cleanup_github_dependencies(
1230
+ neo4j_session, common_job_parameters, repo_urls_with_dependencies
1231
+ )
1232
+
1233
+ # Collect repository URLs that have manifests for cleanup
1234
+ repo_urls_with_manifests = list(
1235
+ {manifest["repo_url"] for manifest in repo_data["manifests"]}
1236
+ )
1237
+ cleanup_github_manifests(
1238
+ neo4j_session, common_job_parameters, repo_urls_with_manifests
1239
+ )
1240
+
899
1241
  run_cleanup_job("github_repos_cleanup.json", neo4j_session, common_job_parameters)
@@ -3,6 +3,7 @@ import logging
3
3
  import neo4j
4
4
 
5
5
  import cartography.intel.sentinelone.agent
6
+ import cartography.intel.sentinelone.application
6
7
  from cartography.config import Config
7
8
  from cartography.intel.sentinelone.account import sync_accounts
8
9
  from cartography.stats import get_stats_client
@@ -39,7 +40,7 @@ def start_sentinelone_ingestion(neo4j_session: neo4j.Session, config: Config) ->
39
40
  config.sentinelone_account_ids,
40
41
  )
41
42
 
42
- # Sync agents for each account
43
+ # Sync agents and applications for each account
43
44
  for account_id in synced_account_ids:
44
45
  # Add account-specific parameter
45
46
  common_job_parameters["S1_ACCOUNT_ID"] = account_id
@@ -49,7 +50,12 @@ def start_sentinelone_ingestion(neo4j_session: neo4j.Session, config: Config) ->
49
50
  common_job_parameters,
50
51
  )
51
52
 
52
- # Clean up account-specific parameter
53
+ cartography.intel.sentinelone.application.sync(
54
+ neo4j_session,
55
+ common_job_parameters,
56
+ )
57
+
58
+ # Clean up account-specific parameters
53
59
  del common_job_parameters["S1_ACCOUNT_ID"]
54
60
 
55
61
  # Record that the sync is complete