guarddog 2.7.1__py3-none-any.whl → 2.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. guarddog/analyzer/metadata/__init__.py +3 -0
  2. guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py +53 -164
  3. guarddog/analyzer/metadata/repository_integrity_mismatch.py +202 -2
  4. guarddog/analyzer/metadata/resources/top_rubygems_packages.json +976 -0
  5. guarddog/analyzer/metadata/rubygems/__init__.py +26 -0
  6. guarddog/analyzer/metadata/rubygems/bundled_binary.py +13 -0
  7. guarddog/analyzer/metadata/rubygems/empty_information.py +24 -0
  8. guarddog/analyzer/metadata/rubygems/release_zero.py +22 -0
  9. guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py +49 -0
  10. guarddog/analyzer/metadata/rubygems/typosquatting.py +140 -0
  11. guarddog/analyzer/metadata/utils.py +23 -0
  12. guarddog/analyzer/sourcecode/__init__.py +2 -0
  13. guarddog/analyzer/sourcecode/api-obfuscation.yml +35 -40
  14. guarddog/analyzer/sourcecode/code-execution.yml +20 -0
  15. guarddog/analyzer/sourcecode/exec-base64.yml +19 -0
  16. guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml +31 -5
  17. guarddog/analyzer/sourcecode/npm-api-obfuscation.yml +51 -0
  18. guarddog/analyzer/sourcecode/rubygems-code-execution.yml +67 -0
  19. guarddog/analyzer/sourcecode/rubygems-exec-base64.yml +26 -0
  20. guarddog/analyzer/sourcecode/rubygems-exfiltrate-sensitive-data.yml +70 -0
  21. guarddog/analyzer/sourcecode/rubygems-install-hook.yml +45 -0
  22. guarddog/analyzer/sourcecode/rubygems-network-on-require.yml +78 -0
  23. guarddog/analyzer/sourcecode/rubygems-serialize-environment.yml +38 -0
  24. guarddog/ecosystems.py +3 -0
  25. guarddog/scanners/__init__.py +6 -0
  26. guarddog/scanners/rubygems_package_scanner.py +112 -0
  27. guarddog/scanners/rubygems_project_scanner.py +75 -0
  28. guarddog/scanners/scanner.py +34 -8
  29. guarddog-2.8.4.dist-info/METADATA +471 -0
  30. {guarddog-2.7.1.dist-info → guarddog-2.8.4.dist-info}/RECORD +35 -19
  31. {guarddog-2.7.1.dist-info → guarddog-2.8.4.dist-info}/WHEEL +1 -1
  32. guarddog-2.7.1.dist-info/METADATA +0 -40
  33. {guarddog-2.7.1.dist-info → guarddog-2.8.4.dist-info}/entry_points.txt +0 -0
  34. {guarddog-2.7.1.dist-info → guarddog-2.8.4.dist-info}/licenses/LICENSE +0 -0
  35. {guarddog-2.7.1.dist-info → guarddog-2.8.4.dist-info}/licenses/LICENSE-3rdparty.csv +0 -0
  36. {guarddog-2.7.1.dist-info → guarddog-2.8.4.dist-info}/licenses/NOTICE +0 -0
guarddog/analyzer/metadata/__init__.py
@@ -3,6 +3,7 @@ from guarddog.analyzer.metadata.npm import NPM_METADATA_RULES
  from guarddog.analyzer.metadata.pypi import PYPI_METADATA_RULES
  from guarddog.analyzer.metadata.go import GO_METADATA_RULES
  from guarddog.analyzer.metadata.github_action import GITHUB_ACTION_METADATA_RULES
+ from guarddog.analyzer.metadata.rubygems import RUBYGEMS_METADATA_RULES
  from guarddog.ecosystems import ECOSYSTEM


@@ -18,3 +19,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
              return GITHUB_ACTION_METADATA_RULES
          case ECOSYSTEM.EXTENSION:
              return {}  # No metadata detectors for extensions currently
+         case ECOSYSTEM.RUBYGEMS:
+             return RUBYGEMS_METADATA_RULES
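The hunk above wires the new RubyGems rules into the detector registry. As a minimal illustrative sketch (not part of the diff itself, and assuming only the imports already shown above), the new branch could be exercised like this:

from guarddog.analyzer.metadata import get_metadata_detectors
from guarddog.ecosystems import ECOSYSTEM

# Returns the RUBYGEMS_METADATA_RULES mapping added in this release.
rubygems_detectors = get_metadata_detectors(ECOSYSTEM.RUBYGEMS)
for rule_name, detector in rubygems_detectors.items():
    print(rule_name, type(detector).__name__)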
guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py
@@ -4,14 +4,12 @@ Detects if a package contains an empty description
  """

  import configparser
- import hashlib
  import logging
  import os
  import re
  import requests
  from typing import Optional, Tuple

- import pygit2  # type: ignore
  import urllib3.util

  from guarddog.analyzer.metadata.repository_integrity_mismatch import IntegrityMismatch
@@ -90,18 +88,6 @@ def dict_generator(indict, pre=None):
          yield pre + [indict]


- def get_file_hash(path):
-     with open(path, "rb") as f:
-         # Read the contents of the file
-         file_contents = f.read()
-         # Create a hash object
-         hash_object = hashlib.sha256()
-         # Feed the file contents to the hash object
-         hash_object.update(file_contents)
-         # Get the hexadecimal hash value
-         return hash_object.hexdigest(), str(file_contents).strip().splitlines()
-
-
  def _ensure_proper_url(url):
      parsed = urllib3.util.parse_url(url)
      if parsed.scheme is None:
@@ -140,80 +126,6 @@ def find_github_candidates(package_info) -> Tuple[set[str], Optional[str]]:
      return github_urls, best


- EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]
-
-
- def exclude_result(file_name, repo_root, pkg_root):
-     """
-     This method filters out some results that are known false positives:
-     * if the file is a documentation file (based on its extension)
-     * if the file is a setup.cfg file with the egg_info claim present on Pypi and not on GitHub
-     """
-     for extension in EXCLUDED_EXTENSIONS:
-         if file_name.endswith(extension):
-             return True
-     if file_name.endswith("setup.cfg"):
-         repo_cfg = configparser.ConfigParser()
-         repo_cfg.read(os.path.join(repo_root, file_name))
-         pkg_cfg = configparser.ConfigParser()
-         pkg_cfg.read(os.path.join(pkg_root, file_name))
-         repo_sections = list(repo_cfg.keys())
-         pkg_sections = list(pkg_cfg.keys())
-         if "egg_info" in pkg_sections and "egg_info" not in repo_sections:
-             return True
-     return False
-
-
- def find_mismatch_for_tag(repo, tag, base_path, repo_path):
-     repo.checkout(tag)
-     mismatch = []
-     for root, dirs, files in os.walk(base_path):
-         relative_path = os.path.relpath(root, base_path)
-         repo_root = os.path.join(repo_path, relative_path)
-         if not os.path.exists(repo_root):
-             continue
-         repo_files = list(
-             filter(
-                 lambda x: os.path.isfile(os.path.join(repo_root, x)),
-                 os.listdir(repo_root),
-             )
-         )
-         for file_name in repo_files:
-             if file_name not in files:  # ignore files we don't have in the distribution
-                 continue
-             repo_hash, repo_content = get_file_hash(os.path.join(repo_root, file_name))
-             pkg_hash, pkg_content = get_file_hash(os.path.join(root, file_name))
-             if repo_hash != pkg_hash:
-                 if exclude_result(file_name, repo_root, root):
-                     continue
-                 res = {
-                     "file": os.path.join(relative_path, file_name),
-                     "repo_sha256": repo_hash,
-                     "pkg_sha256": pkg_hash,
-                 }
-                 mismatch.append(res)
-     return mismatch
-
-
- def find_suitable_tags_in_list(tags, version):
-     tag_candidates = []
-     for tag_name in tags:
-         if tag_name.endswith(version):
-             tag_candidates.append(tag_name)
-     return tag_candidates
-
-
- def find_suitable_tags(repo, version):
-     tags_regex = re.compile("^refs/tags/(.*)")
-     tags = []
-     for ref in repo.references:
-         match = tags_regex.match(ref)
-         if match is not None:
-             tags.append(match.group(0))
-
-     return find_suitable_tags_in_list(tags, version)
-
-
  # Note: we should have the GitHub related logic factored out as we will need it when we check for signed commits
  class PypiIntegrityMismatchDetector(IntegrityMismatch):
      """
@@ -228,94 +140,71 @@ class PypiIntegrityMismatchDetector(IntegrityMismatch):
      """

      RULE_NAME = "repository_integrity_mismatch"
+     EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]

-     def detect(
-         self,
-         package_info,
-         path: Optional[str] = None,
-         name: Optional[str] = None,
-         version: Optional[str] = None,
-     ) -> tuple[bool, str]:
-         if name is None:
-             raise Exception("Detector needs the name of the package")
-         if path is None:
-             raise Exception("Detector needs the path of the package")
-
-         log.debug(
-             f"Running repository integrity mismatch heuristic on PyPI package {name} version {version}"
-         )
-         # let's extract a source repository (GitHub only for now) if we can
+     def extract_github_url(self, package_info, name: str) -> Optional[str]:
+         """Extract GitHub URL from PyPI metadata."""
          github_urls, best_github_candidate = find_github_candidates(package_info)
          if len(github_urls) == 0:
-             return False, "Could not find any GitHub url in the project's description"
-         # now, let's find the right url
+             return None

          github_url = find_best_github_candidate(
              (github_urls, best_github_candidate), name
          )
+         return github_url

-         if github_url is None:
-             return (
-                 False,
-                 "Could not find a good GitHub url in the project's description",
-             )
-
-         log.debug(f"Using GitHub URL {github_url}")
-         # ok, now let's try to find the version! (I need to know which version we are scanning)
-         if version is None:
-             version = package_info["info"]["version"]
-         if version is None:
-             raise Exception("Could not find suitable version to scan")
-         tmp_dir = os.path.dirname(path)
-         if tmp_dir is None:
-             raise Exception("no current scanning directory")
-
-         repo_path = os.path.join(tmp_dir, "sources", name)
-         try:
-             repo = pygit2.clone_repository(url=github_url, path=repo_path)
-         except pygit2.GitError as git_error:
-             # Handle generic Git-related errors
-             raise Exception(
-                 f"Error while cloning repository {str(git_error)} with github url {github_url}"
-             )
-         except Exception as e:
-             # Catch any other unexpected exceptions
-             raise Exception(
-                 f"An unexpected error occurred: {str(e)}. github url {github_url}"
-             )
-
-         tag_candidates = find_suitable_tags(repo, version)
-
-         if len(tag_candidates) == 0:
-             return False, "Could not find any suitable tag in repository"
-
-         target_tag = None
-         # TODO: this one is a bit weak. let's find something stronger - maybe use the closest string?
-         for tag in tag_candidates:
-             target_tag = tag
-
-         # Idea: parse the code of the package to find the real version - we can grep the project files for
-         # the version, git bisect until we have a file with the same version? will not work if main has not
-         # been bumped yet in version so tags and releases are out only solutions here print(tag_candidates)
-         # Well, that works if we run integrity check for multiple commits
-
-         # should be good, let's open the sources
+     def get_base_path(self, path: str, name: str) -> str:
+         """
+         PyPI: find the subdirectory containing the package files.
+         The extracted archive typically has a subdirectory with the package name.
+         """
          base_dir_name = None
          for entry in os.listdir(path):
              if entry.lower().startswith(
                  name.lower().replace("-", "_")
              ) or entry.lower().startswith(name.lower()):
                  base_dir_name = entry
+
+         if base_dir_name is None or base_dir_name == "sources":
+             raise Exception("Could not find package directory in extracted files")
+
+         return os.path.join(path, base_dir_name)
+
+     def get_version(self, package_info, version: Optional[str]) -> Optional[str]:
+         """Get version from PyPI metadata or use provided version."""
+         if version is None:
+             version = package_info["info"]["version"]
+         return version
+
+     def exclude_result(
+         self,
+         file_name: str,
+         repo_root: Optional[str] = None,
+         pkg_root: Optional[str] = None,
+     ) -> bool:
+         """
+         Override base class method to add PyPI-specific exclusion logic.
+
+         This method filters out some results that are known false positives:
+         * if the file is a documentation file (based on its extension)
+         * if the file is a setup.cfg file with the egg_info claim present on PyPI and not on GitHub
+         """
+         # First check standard extensions using base class logic
+         if super().exclude_result(file_name, repo_root, pkg_root):
+             return True
+
+         # PyPI-specific: check for setup.cfg with egg_info differences
          if (
-             base_dir_name is None or base_dir_name == "sources"
-         ): # I am not sure how we can get there
-             raise Exception("something went wrong when opening the package")
-         base_path = os.path.join(path, base_dir_name)
-
-         mismatch = find_mismatch_for_tag(repo, target_tag, base_path, repo_path)
-         message = "\n".join(map(lambda x: "* " + x["file"], mismatch))
-         return (
-             len(mismatch) > 0,
-             f"Some files present in the package are different from the ones on GitHub for "
-             f"the same version of the package: \n{message}",
-         )
+             file_name.endswith("setup.cfg")
+             and repo_root is not None
+             and pkg_root is not None
+         ):
+             repo_cfg = configparser.ConfigParser()
+             repo_cfg.read(os.path.join(repo_root, file_name))
+             pkg_cfg = configparser.ConfigParser()
+             pkg_cfg.read(os.path.join(pkg_root, file_name))
+             repo_sections = list(repo_cfg.keys())
+             pkg_sections = list(pkg_cfg.keys())
+             if "egg_info" in pkg_sections and "egg_info" not in repo_sections:
+                 return True
+         return False
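Note that get_file_hash, removed from this file earlier in the diff, is now consumed by the shared base class via guarddog.analyzer.metadata.utils (file 11 in the list, whose contents are not shown here). Judging from the removed PyPI copy, the relocated helper presumably looks roughly like this sketch:

import hashlib


def get_file_hash(path):
    """Return (sha256 hex digest, file lines) for the file at `path`."""
    with open(path, "rb") as f:
        file_contents = f.read()
    digest = hashlib.sha256(file_contents).hexdigest()
    # Callers in the base class unpack a 2-tuple and ignore the second element:
    # `repo_hash, _ = get_file_hash(...)`.
    return digest, str(file_contents).strip().splitlines()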
guarddog/analyzer/metadata/repository_integrity_mismatch.py
@@ -1,13 +1,22 @@
+ import logging
+ import os
+ import re
  from abc import abstractmethod
- from typing import Optional
+ from typing import List, Optional
+
+ import pygit2

  from guarddog.analyzer.metadata.detector import Detector
+ from guarddog.analyzer.metadata.utils import get_file_hash
+
+ log = logging.getLogger("guarddog")


  class IntegrityMismatch(Detector):
      """This package contains files that have been tampered with between the source repository and the package CDN"""

      RULE_NAME = "repository_integrity_mismatch"
+     EXCLUDED_EXTENSIONS: List[str] = []

      def __init__(self):
          super().__init__(
@@ -17,6 +26,47 @@ class IntegrityMismatch(Detector):
          )

      @abstractmethod
+     def extract_github_url(self, package_info, name: str) -> Optional[str]:
+         """
+         Extract GitHub URL from package metadata.
+
+         Args:
+             package_info: Package metadata dictionary
+             name: Package name
+
+         Returns:
+             GitHub URL if found, None otherwise
+         """
+         pass
+
+     @abstractmethod
+     def get_base_path(self, path: str, name: str) -> str:
+         """
+         Get the base path where package files are located.
+
+         Args:
+             path: Root extraction path
+             name: Package name
+
+         Returns:
+             Path to the package source files
+         """
+         pass
+
+     @abstractmethod
+     def get_version(self, package_info, version: Optional[str]) -> Optional[str]:
+         """
+         Extract version from package info or use provided version.
+
+         Args:
+             package_info: Package metadata dictionary
+             version: Optional version string
+
+         Returns:
+             Version string
+         """
+         pass
+
      def detect(
          self,
          package_info,
@@ -24,4 +74,154 @@ class IntegrityMismatch(Detector):
          name: Optional[str] = None,
          version: Optional[str] = None,
      ) -> tuple[bool, str]:
-         pass
+         """
+         Template method for detecting repository integrity mismatches.
+
+         This method implements the common algorithm for comparing package files
+         with their source repository. Subclasses customize behavior by implementing
+         the abstract methods for URL extraction, path resolution, and version handling.
+         """
+         if name is None:
+             return False, "Detector needs the name of the package"
+         if path is None:
+             return False, "Detector needs the path of the package"
+
+         log.debug(f"Running repository integrity mismatch heuristic on package {name}")
+
+         # Step 1: Extract GitHub URL (ecosystem-specific)
+         github_url = self.extract_github_url(package_info, name)
+         if github_url is None:
+             return False, "Could not find a GitHub URL in the package metadata"
+
+         log.debug(f"Using GitHub URL {github_url}")
+
+         # Step 2: Get version (ecosystem-specific)
+         version = self.get_version(package_info, version)
+         if version is None:
+             return False, "Could not determine version to scan"
+
+         # Step 3: Clone repository
+         tmp_dir = os.path.dirname(path)
+         repo_path = os.path.join(tmp_dir, "sources", name)
+
+         try:
+             repo = pygit2.clone_repository(url=github_url, path=repo_path)
+         except Exception as e:
+             return False, f"Could not clone repository: {str(e)}"
+
+         # Step 4: Find matching git tag
+         tag_candidates = self.find_suitable_tags(repo, version)
+         if len(tag_candidates) == 0:
+             return False, f"Could not find a tag matching version {version}"
+
+         target_tag = tag_candidates[-1]
+
+         # Step 5: Get base path where files are located (ecosystem-specific)
+         try:
+             base_path = self.get_base_path(path, name)
+         except Exception as e:
+             return False, f"Could not locate package files: {str(e)}"
+
+         # Step 6: Compare files
+         mismatch = self.find_mismatch_for_tag(repo, target_tag, base_path, repo_path)
+
+         if len(mismatch) == 0:
+             return False, ""
+
+         # Step 7: Format result message
+         message = "\n".join(map(lambda x: "* " + x["file"], mismatch))
+         return (
+             True,
+             f"Files in package differ from GitHub repository for version {version}:\n{message}",
+         )
+
+     def find_suitable_tags(self, repo, version: str) -> List[str]:
+         """
+         Find git tags that match the given version.
+
+         Args:
+             repo: pygit2.Repository object
+             version: version string to match
+
+         Returns:
+             List of tag references that match the version
+         """
+         tags_regex = re.compile("^refs/tags/(.*)")
+         tags = []
+         for ref in repo.references:
+             match = tags_regex.match(ref)
+             if match is not None:
+                 tags.append(match.group(0))
+
+         tag_candidates = []
+         for tag_name in tags:
+             tag_ref = tag_name.rsplit("/", 1)[-1]
+             if tag_ref == version or tag_ref == f"v{version}":
+                 tag_candidates.append(tag_name)
+         return tag_candidates
+
+     def exclude_result(
+         self,
+         file_name: str,
+         repo_root: Optional[str] = None,
+         pkg_root: Optional[str] = None,
+     ) -> bool:
+         """
+         Check if a file should be excluded from integrity checking.
+
+         Args:
+             file_name: name of the file to check
+             repo_root: path to the repository directory (optional, for subclass-specific logic)
+             pkg_root: path to the package directory (optional, for subclass-specific logic)
+
+         Returns:
+             True if the file should be excluded, False otherwise
+         """
+         for extension in self.EXCLUDED_EXTENSIONS:
+             if file_name.endswith(extension):
+                 return True
+         return False
+
+     def find_mismatch_for_tag(
+         self, repo, tag: str, base_path: str, repo_path: str
+     ) -> list[dict]:
+         """
+         Find files that differ between the repository and the package.
+
+         Args:
+             repo: pygit2.Repository object
+             tag: git tag reference to checkout
+             base_path: path to the extracted package
+             repo_path: path to the cloned repository
+
+         Returns:
+             List of dictionaries describing mismatched files
+         """
+         repo.checkout(tag)
+         mismatch = []
+         for root, dirs, files in os.walk(base_path):
+             relative_path = os.path.relpath(root, base_path)
+             repo_root = os.path.join(repo_path, relative_path)
+             if not os.path.exists(repo_root):
+                 continue
+             repo_files = list(
+                 filter(
+                     lambda x: os.path.isfile(os.path.join(repo_root, x)),
+                     os.listdir(repo_root),
+                 )
+             )
+             for file_name in repo_files:
+                 if file_name not in files:
+                     continue
+                 if self.exclude_result(file_name, repo_root, root):
+                     continue
+                 repo_hash, _ = get_file_hash(os.path.join(repo_root, file_name))
+                 pkg_hash, _ = get_file_hash(os.path.join(root, file_name))
+                 if repo_hash != pkg_hash:
+                     res = {
+                         "file": os.path.join(relative_path, file_name),
+                         "repo_sha256": repo_hash,
+                         "pkg_sha256": pkg_hash,
+                     }
+                     mismatch.append(res)
+         return mismatch
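With this refactor, an ecosystem-specific detector only needs to supply the three abstract hooks; cloning, tag matching, and file comparison come from the template method above. The sketch below is purely illustrative and is not the RubyGems implementation shipped in this release (guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py is listed in the files changed but not shown in this diff); the metadata field names it reads are assumptions.

from typing import Optional

from guarddog.analyzer.metadata.repository_integrity_mismatch import IntegrityMismatch


class ExampleIntegrityMismatchDetector(IntegrityMismatch):
    """Hypothetical subclass showing the three hooks the base class requires."""

    EXCLUDED_EXTENSIONS = [".md", ".txt"]

    def extract_github_url(self, package_info, name: str) -> Optional[str]:
        # Assumption: the registry metadata exposes a homepage-style URL field.
        url = (package_info or {}).get("homepage_uri")
        if url is not None and "github.com" in url:
            return url
        return None

    def get_base_path(self, path: str, name: str) -> str:
        # Assumption: the extracted archive root already contains the sources.
        return path

    def get_version(self, package_info, version: Optional[str]) -> Optional[str]:
        return version if version is not None else (package_info or {}).get("version")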