guarddog 2.7.0__py3-none-any.whl → 2.8.4__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their respective public registries.
Files changed (43)
  1. guarddog/analyzer/metadata/__init__.py +3 -0
  2. guarddog/analyzer/metadata/go/typosquatting.py +13 -6
  3. guarddog/analyzer/metadata/npm/typosquatting.py +43 -12
  4. guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py +53 -164
  5. guarddog/analyzer/metadata/pypi/typosquatting.py +51 -17
  6. guarddog/analyzer/metadata/repository_integrity_mismatch.py +202 -2
  7. guarddog/analyzer/metadata/resources/top_pypi_packages.json +29998 -29986
  8. guarddog/analyzer/metadata/resources/top_rubygems_packages.json +976 -0
  9. guarddog/analyzer/metadata/rubygems/__init__.py +26 -0
  10. guarddog/analyzer/metadata/rubygems/bundled_binary.py +13 -0
  11. guarddog/analyzer/metadata/rubygems/empty_information.py +24 -0
  12. guarddog/analyzer/metadata/rubygems/release_zero.py +22 -0
  13. guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py +49 -0
  14. guarddog/analyzer/metadata/rubygems/typosquatting.py +140 -0
  15. guarddog/analyzer/metadata/utils.py +24 -1
  16. guarddog/analyzer/sourcecode/__init__.py +2 -0
  17. guarddog/analyzer/sourcecode/api-obfuscation.yml +35 -40
  18. guarddog/analyzer/sourcecode/code-execution.yml +20 -0
  19. guarddog/analyzer/sourcecode/exec-base64.yml +19 -0
  20. guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml +31 -5
  21. guarddog/analyzer/sourcecode/npm-api-obfuscation.yml +51 -0
  22. guarddog/analyzer/sourcecode/rubygems-code-execution.yml +67 -0
  23. guarddog/analyzer/sourcecode/rubygems-exec-base64.yml +26 -0
  24. guarddog/analyzer/sourcecode/rubygems-exfiltrate-sensitive-data.yml +70 -0
  25. guarddog/analyzer/sourcecode/rubygems-install-hook.yml +45 -0
  26. guarddog/analyzer/sourcecode/rubygems-network-on-require.yml +78 -0
  27. guarddog/analyzer/sourcecode/rubygems-serialize-environment.yml +38 -0
  28. guarddog/analyzer/sourcecode/shady-links.yml +1 -1
  29. guarddog/ecosystems.py +3 -0
  30. guarddog/scanners/__init__.py +6 -0
  31. guarddog/scanners/rubygems_package_scanner.py +112 -0
  32. guarddog/scanners/rubygems_project_scanner.py +75 -0
  33. guarddog/scanners/scanner.py +34 -8
  34. guarddog/utils/archives.py +133 -9
  35. guarddog/utils/config.py +24 -2
  36. guarddog-2.8.4.dist-info/METADATA +471 -0
  37. {guarddog-2.7.0.dist-info → guarddog-2.8.4.dist-info}/RECORD +42 -26
  38. {guarddog-2.7.0.dist-info → guarddog-2.8.4.dist-info}/WHEEL +1 -1
  39. guarddog-2.7.0.dist-info/METADATA +0 -40
  40. {guarddog-2.7.0.dist-info → guarddog-2.8.4.dist-info}/entry_points.txt +0 -0
  41. {guarddog-2.7.0.dist-info → guarddog-2.8.4.dist-info}/licenses/LICENSE +0 -0
  42. {guarddog-2.7.0.dist-info → guarddog-2.8.4.dist-info}/licenses/LICENSE-3rdparty.csv +0 -0
  43. {guarddog-2.7.0.dist-info → guarddog-2.8.4.dist-info}/licenses/NOTICE +0 -0
guarddog/analyzer/metadata/__init__.py

@@ -3,6 +3,7 @@ from guarddog.analyzer.metadata.npm import NPM_METADATA_RULES
 from guarddog.analyzer.metadata.pypi import PYPI_METADATA_RULES
 from guarddog.analyzer.metadata.go import GO_METADATA_RULES
 from guarddog.analyzer.metadata.github_action import GITHUB_ACTION_METADATA_RULES
+from guarddog.analyzer.metadata.rubygems import RUBYGEMS_METADATA_RULES
 from guarddog.ecosystems import ECOSYSTEM


@@ -18,3 +19,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
             return GITHUB_ACTION_METADATA_RULES
         case ECOSYSTEM.EXTENSION:
             return {}  # No metadata detectors for extensions currently
+        case ECOSYSTEM.RUBYGEMS:
+            return RUBYGEMS_METADATA_RULES
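
This hunk wires the new RubyGems rule set into the per-ecosystem dispatch. A minimal runnable sketch of that dispatch pattern, using stand-in enum values and rule mappings rather than the real guarddog imports:

# Sketch of the ecosystem dispatch shown above. ECOSYSTEM and the
# *_METADATA_RULES mappings are hypothetical stand-ins for the real
# guarddog objects; only the match-based dispatch is the point here.
from enum import Enum


class ECOSYSTEM(Enum):
    PYPI = "pypi"
    RUBYGEMS = "rubygems"
    EXTENSION = "extension"


PYPI_METADATA_RULES = {"typosquatting": "PypiTyposquatDetector"}
RUBYGEMS_METADATA_RULES = {"typosquatting": "RubyGemsTyposquatDetector"}


def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict:
    match ecosystem:
        case ECOSYSTEM.PYPI:
            return PYPI_METADATA_RULES
        case ECOSYSTEM.RUBYGEMS:
            return RUBYGEMS_METADATA_RULES
        case _:
            return {}  # e.g. extensions have no metadata detectors


print(list(get_metadata_detectors(ECOSYSTEM.RUBYGEMS)))  # ['typosquatting']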

guarddog/analyzer/metadata/go/typosquatting.py

@@ -1,10 +1,13 @@
 import json
+import logging
 import os
 from typing import Optional

 from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
 from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION

+log = logging.getLogger("guarddog")
+

 class GoTyposquatDetector(TyposquatDetector):
     """Detector for typosquatting attacks for go modules. Checks for distance one Levenshtein,
@@ -25,12 +28,7 @@ class GoTyposquatDetector(TyposquatDetector):
         )

         top_packages_path = os.path.join(resources_dir, top_packages_filename)
-
-        top_packages_information = None
-
-        if top_packages_filename in os.listdir(resources_dir):
-            with open(top_packages_path, "r") as top_packages_file:
-                top_packages_information = json.load(top_packages_file)
+        top_packages_information = self._get_top_packages_local(top_packages_path)

         if top_packages_information is None:
             raise Exception(
@@ -39,6 +37,15 @@ class GoTyposquatDetector(TyposquatDetector):

         return set(top_packages_information)

+    def _get_top_packages_local(self, path: str) -> list[dict] | None:
+        try:
+            with open(path, "r") as f:
+                result = json.load(f)
+                return result
+        except FileNotFoundError:
+            log.error(f"File not found: {path}")
+            return None
+
     def detect(
         self,
         package_info,
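
The go detector now delegates cache reads to `_get_top_packages_local`, trading the previous `os.listdir` membership test for EAFP-style error handling: open the file and catch `FileNotFoundError`, which also avoids the race between listing the directory and opening the file. A standalone sketch of that helper; the sample path is hypothetical:

# Runnable sketch of the refactored local-load helper. The logger name
# matches the diff; the nonexistent path is for demonstration only.
import json
import logging

logging.basicConfig(level=logging.ERROR)
log = logging.getLogger("guarddog")


def get_top_packages_local(path: str) -> list | None:
    # Opening directly and catching FileNotFoundError avoids the race
    # between an os.listdir() membership check and the subsequent open().
    try:
        with open(path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        log.error(f"File not found: {path}")
        return None


print(get_top_packages_local("/nonexistent/top_go_packages.json"))  # logs, then None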

guarddog/analyzer/metadata/npm/typosquatting.py

@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 from datetime import datetime, timedelta
 from typing import Optional
@@ -7,6 +8,8 @@ from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
 from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION
 import requests

+log = logging.getLogger("guarddog")
+

 class NPMTyposquatDetector(TyposquatDetector):
     """Detector for typosquatting attacks. Detects if a package name is a typosquat of one of the top 5000 packages.
@@ -32,24 +35,52 @@
         )

         top_packages_path = os.path.join(resources_dir, top_packages_filename)
+        top_packages_information = self._get_top_packages_local(top_packages_path)

-        top_packages_information = None
-
-        if top_packages_filename in os.listdir(resources_dir):
-            update_time = datetime.fromtimestamp(os.path.getmtime(top_packages_path))
+        if self._file_is_expired(top_packages_path, days=30):
+            new_information = self._get_top_packages_network(popular_packages_url)
+            if new_information is not None:
+                top_packages_information = new_information

-            if datetime.now() - update_time <= timedelta(days=30):
-                with open(top_packages_path, "r") as top_packages_file:
-                    top_packages_information = json.load(top_packages_file)
+                with open(top_packages_path, "w+") as f:
+                    json.dump(new_information, f, ensure_ascii=False, indent=4)

         if top_packages_information is None:
-            response = requests.get(popular_packages_url).json()
-            top_packages_information = list([i["name"] for i in response[0:8000]])
-            with open(top_packages_path, "w+") as f:
-                json.dump(top_packages_information, f, ensure_ascii=False, indent=4)
-
+            return set()
         return set(top_packages_information)

+    def _file_is_expired(self, path: str, days: int) -> bool:
+        try:
+            update_time = datetime.fromtimestamp(os.path.getmtime(path))
+            return datetime.now() - update_time > timedelta(days=days)
+        except FileNotFoundError:
+            return True
+
+    def _get_top_packages_local(self, path: str) -> list[dict] | None:
+        try:
+            with open(path, "r") as f:
+                result = json.load(f)
+                return result
+        except FileNotFoundError:
+            log.error(f"File not found: {path}")
+            return None
+
+    def _get_top_packages_network(self, url: str) -> list[dict] | None:
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+
+            response_data = response.json()
+            result = list([i["name"] for i in response_data[0:8000]])
+
+            return result
+        except json.JSONDecodeError:
+            log.error(f'Couldn`t convert to json: "{response.text}"')
+            return None
+        except requests.exceptions.RequestException as e:
+            log.error(f"Network error: {e}")
+            return None
+
     def detect(
         self,
         package_info,
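
The npm detector's cache flow becomes: load the local copy, refresh over the network once the file is older than 30 days, write the refreshed list back, and return an empty set if both sources fail, so a network outage degrades the typosquat check instead of aborting the scan. A standalone sketch of the expiry check; the temp file and paths are for demonstration only:

# Runnable sketch of the 30-day cache-expiry check from the hunk above.
import os
import tempfile
from datetime import datetime, timedelta


def file_is_expired(path: str, days: int) -> bool:
    try:
        update_time = datetime.fromtimestamp(os.path.getmtime(path))
        return datetime.now() - update_time > timedelta(days=days)
    except FileNotFoundError:
        # A missing cache file counts as expired, forcing a refresh.
        return True


with tempfile.NamedTemporaryFile() as f:
    print(file_is_expired(f.name, days=30))  # False: file was just created
print(file_is_expired("/no/such/cache.json", days=30))  # True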

guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py

@@ -4,14 +4,12 @@ Detects if a package contains an empty description
 """

 import configparser
-import hashlib
 import logging
 import os
 import re
 import requests
 from typing import Optional, Tuple

-import pygit2  # type: ignore
 import urllib3.util

 from guarddog.analyzer.metadata.repository_integrity_mismatch import IntegrityMismatch
@@ -90,18 +88,6 @@ def dict_generator(indict, pre=None):
         yield pre + [indict]


-def get_file_hash(path):
-    with open(path, "rb") as f:
-        # Read the contents of the file
-        file_contents = f.read()
-        # Create a hash object
-        hash_object = hashlib.sha256()
-        # Feed the file contents to the hash object
-        hash_object.update(file_contents)
-        # Get the hexadecimal hash value
-        return hash_object.hexdigest(), str(file_contents).strip().splitlines()
-
-
 def _ensure_proper_url(url):
     parsed = urllib3.util.parse_url(url)
     if parsed.scheme is None:
@@ -140,80 +126,6 @@ def find_github_candidates(package_info) -> Tuple[set[str], Optional[str]]:
     return github_urls, best


-EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]
-
-
-def exclude_result(file_name, repo_root, pkg_root):
-    """
-    This method filters out some results that are known false positives:
-    * if the file is a documentation file (based on its extension)
-    * if the file is a setup.cfg file with the egg_info claim present on Pypi and not on GitHub
-    """
-    for extension in EXCLUDED_EXTENSIONS:
-        if file_name.endswith(extension):
-            return True
-    if file_name.endswith("setup.cfg"):
-        repo_cfg = configparser.ConfigParser()
-        repo_cfg.read(os.path.join(repo_root, file_name))
-        pkg_cfg = configparser.ConfigParser()
-        pkg_cfg.read(os.path.join(pkg_root, file_name))
-        repo_sections = list(repo_cfg.keys())
-        pkg_sections = list(pkg_cfg.keys())
-        if "egg_info" in pkg_sections and "egg_info" not in repo_sections:
-            return True
-    return False
-
-
-def find_mismatch_for_tag(repo, tag, base_path, repo_path):
-    repo.checkout(tag)
-    mismatch = []
-    for root, dirs, files in os.walk(base_path):
-        relative_path = os.path.relpath(root, base_path)
-        repo_root = os.path.join(repo_path, relative_path)
-        if not os.path.exists(repo_root):
-            continue
-        repo_files = list(
-            filter(
-                lambda x: os.path.isfile(os.path.join(repo_root, x)),
-                os.listdir(repo_root),
-            )
-        )
-        for file_name in repo_files:
-            if file_name not in files:  # ignore files we don't have in the distribution
-                continue
-            repo_hash, repo_content = get_file_hash(os.path.join(repo_root, file_name))
-            pkg_hash, pkg_content = get_file_hash(os.path.join(root, file_name))
-            if repo_hash != pkg_hash:
-                if exclude_result(file_name, repo_root, root):
-                    continue
-                res = {
-                    "file": os.path.join(relative_path, file_name),
-                    "repo_sha256": repo_hash,
-                    "pkg_sha256": pkg_hash,
-                }
-                mismatch.append(res)
-    return mismatch
-
-
-def find_suitable_tags_in_list(tags, version):
-    tag_candidates = []
-    for tag_name in tags:
-        if tag_name.endswith(version):
-            tag_candidates.append(tag_name)
-    return tag_candidates
-
-
-def find_suitable_tags(repo, version):
-    tags_regex = re.compile("^refs/tags/(.*)")
-    tags = []
-    for ref in repo.references:
-        match = tags_regex.match(ref)
-        if match is not None:
-            tags.append(match.group(0))
-
-    return find_suitable_tags_in_list(tags, version)
-
-
 # Note: we should have the GitHub related logic factored out as we will need it when we check for signed commits
 class PypiIntegrityMismatchDetector(IntegrityMismatch):
     """
@@ -228,94 +140,71 @@ class PypiIntegrityMismatchDetector(IntegrityMismatch):
     """

     RULE_NAME = "repository_integrity_mismatch"
+    EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]

-    def detect(
-        self,
-        package_info,
-        path: Optional[str] = None,
-        name: Optional[str] = None,
-        version: Optional[str] = None,
-    ) -> tuple[bool, str]:
-        if name is None:
-            raise Exception("Detector needs the name of the package")
-        if path is None:
-            raise Exception("Detector needs the path of the package")
-
-        log.debug(
-            f"Running repository integrity mismatch heuristic on PyPI package {name} version {version}"
-        )
-        # let's extract a source repository (GitHub only for now) if we can
+    def extract_github_url(self, package_info, name: str) -> Optional[str]:
+        """Extract GitHub URL from PyPI metadata."""
         github_urls, best_github_candidate = find_github_candidates(package_info)
         if len(github_urls) == 0:
-            return False, "Could not find any GitHub url in the project's description"
-        # now, let's find the right url
+            return None

         github_url = find_best_github_candidate(
             (github_urls, best_github_candidate), name
         )
+        return github_url

-        if github_url is None:
-            return (
-                False,
-                "Could not find a good GitHub url in the project's description",
-            )
-
-        log.debug(f"Using GitHub URL {github_url}")
-        # ok, now let's try to find the version! (I need to know which version we are scanning)
-        if version is None:
-            version = package_info["info"]["version"]
-        if version is None:
-            raise Exception("Could not find suitable version to scan")
-        tmp_dir = os.path.dirname(path)
-        if tmp_dir is None:
-            raise Exception("no current scanning directory")
-
-        repo_path = os.path.join(tmp_dir, "sources", name)
-        try:
-            repo = pygit2.clone_repository(url=github_url, path=repo_path)
-        except pygit2.GitError as git_error:
-            # Handle generic Git-related errors
-            raise Exception(
-                f"Error while cloning repository {str(git_error)} with github url {github_url}"
-            )
-        except Exception as e:
-            # Catch any other unexpected exceptions
-            raise Exception(
-                f"An unexpected error occurred: {str(e)}. github url {github_url}"
-            )
-
-        tag_candidates = find_suitable_tags(repo, version)
-
-        if len(tag_candidates) == 0:
-            return False, "Could not find any suitable tag in repository"
-
-        target_tag = None
-        # TODO: this one is a bit weak. let's find something stronger - maybe use the closest string?
-        for tag in tag_candidates:
-            target_tag = tag
-
-        # Idea: parse the code of the package to find the real version - we can grep the project files for
-        # the version, git bisect until we have a file with the same version? will not work if main has not
-        # been bumped yet in version so tags and releases are out only solutions here print(tag_candidates)
-        # Well, that works if we run integrity check for multiple commits
-
-        # should be good, let's open the sources
+    def get_base_path(self, path: str, name: str) -> str:
+        """
+        PyPI: find the subdirectory containing the package files.
+        The extracted archive typically has a subdirectory with the package name.
+        """
         base_dir_name = None
         for entry in os.listdir(path):
             if entry.lower().startswith(
                 name.lower().replace("-", "_")
             ) or entry.lower().startswith(name.lower()):
                 base_dir_name = entry
+
+        if base_dir_name is None or base_dir_name == "sources":
+            raise Exception("Could not find package directory in extracted files")
+
+        return os.path.join(path, base_dir_name)
+
+    def get_version(self, package_info, version: Optional[str]) -> Optional[str]:
+        """Get version from PyPI metadata or use provided version."""
+        if version is None:
+            version = package_info["info"]["version"]
+        return version
+
+    def exclude_result(
+        self,
+        file_name: str,
+        repo_root: Optional[str] = None,
+        pkg_root: Optional[str] = None,
+    ) -> bool:
+        """
+        Override base class method to add PyPI-specific exclusion logic.
+
+        This method filters out some results that are known false positives:
+        * if the file is a documentation file (based on its extension)
+        * if the file is a setup.cfg file with the egg_info claim present on PyPI and not on GitHub
+        """
+        # First check standard extensions using base class logic
+        if super().exclude_result(file_name, repo_root, pkg_root):
+            return True
+
+        # PyPI-specific: check for setup.cfg with egg_info differences
         if (
-            base_dir_name is None or base_dir_name == "sources"
-        ):  # I am not sure how we can get there
-            raise Exception("something went wrong when opening the package")
-        base_path = os.path.join(path, base_dir_name)
-
-        mismatch = find_mismatch_for_tag(repo, target_tag, base_path, repo_path)
-        message = "\n".join(map(lambda x: "* " + x["file"], mismatch))
-        return (
-            len(mismatch) > 0,
-            f"Some files present in the package are different from the ones on GitHub for "
-            f"the same version of the package: \n{message}",
-        )
+            file_name.endswith("setup.cfg")
+            and repo_root is not None
+            and pkg_root is not None
+        ):
+            repo_cfg = configparser.ConfigParser()
+            repo_cfg.read(os.path.join(repo_root, file_name))
+            pkg_cfg = configparser.ConfigParser()
+            pkg_cfg.read(os.path.join(pkg_root, file_name))
+            repo_sections = list(repo_cfg.keys())
+            pkg_sections = list(pkg_cfg.keys())
+            if "egg_info" in pkg_sections and "egg_info" not in repo_sections:
+                return True
+        return False
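
The orchestration (cloning the repository, picking a tag, hashing and comparing files) moves out of this file and into the shared `IntegrityMismatch` base class in guarddog/analyzer/metadata/repository_integrity_mismatch.py (+202 lines in this release, not shown in this diff); the PyPI detector keeps only ecosystem-specific hooks such as `extract_github_url`, `get_base_path`, `get_version`, and the `exclude_result` override. A sketch of that template-method split; the base class body here is an illustrative stand-in, not the real guarddog implementation:

# Illustrative sketch of the base/subclass split implied above. Only the
# exclude_result override and EXCLUDED_EXTENSIONS attribute are taken from
# the diff; the base class internals are hypothetical.
from typing import Optional


class IntegrityMismatch:
    EXCLUDED_EXTENSIONS: list[str] = []

    def exclude_result(
        self,
        file_name: str,
        repo_root: Optional[str] = None,
        pkg_root: Optional[str] = None,
    ) -> bool:
        # Generic rule: skip documentation files by extension.
        return any(file_name.endswith(ext) for ext in self.EXCLUDED_EXTENSIONS)


class PypiIntegrityMismatchDetector(IntegrityMismatch):
    EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]

    def exclude_result(
        self,
        file_name: str,
        repo_root: Optional[str] = None,
        pkg_root: Optional[str] = None,
    ) -> bool:
        if super().exclude_result(file_name, repo_root, pkg_root):
            return True
        # PyPI-specific setup.cfg/egg_info handling would go here.
        return False


detector = PypiIntegrityMismatchDetector()
print(detector.exclude_result("README.md"))  # True: excluded by extension
print(detector.exclude_result("setup.py"))   # False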

guarddog/analyzer/metadata/pypi/typosquatting.py

@@ -51,27 +51,61 @@ class PypiTyposquatDetector(TyposquatDetector):
         )

         top_packages_path = os.path.join(resources_dir, top_packages_filename)
+        top_packages_information = self._get_top_packages_local(top_packages_path)

-        top_packages_information = None
+        if self._file_is_expired(top_packages_path, days=30):
+            new_information = self._get_top_packages_network(popular_packages_url)
+            if new_information is not None:
+                top_packages_information = new_information

-        if top_packages_filename in os.listdir(resources_dir):
-            update_time = datetime.fromtimestamp(os.path.getmtime(top_packages_path))
-
-            if datetime.now() - update_time <= timedelta(days=30):
-                with open(top_packages_path, "r") as top_packages_file:
-                    top_packages_information = json.load(top_packages_file)["rows"]
+                with open(top_packages_path, "w+") as f:
+                    json.dump(new_information, f, ensure_ascii=False, indent=4)

         if top_packages_information is None:
-            response = requests.get(popular_packages_url).json()
-            with open(top_packages_path, "w+") as f:
-                json.dump(response, f, ensure_ascii=False, indent=4)
-
-            top_packages_information = response["rows"]
-
-        def get_safe_name(package):
-            return packaging.utils.canonicalize_name(package["project"])
-
-        return set(map(get_safe_name, top_packages_information))
+            return set()
+        return set(map(self.get_safe_name, top_packages_information))
+
+    @staticmethod
+    def get_safe_name(package):
+        return packaging.utils.canonicalize_name(package["project"])
+
+    def _file_is_expired(self, path: str, days: int) -> bool:
+        try:
+            update_time = datetime.fromtimestamp(os.path.getmtime(path))
+            return datetime.now() - update_time > timedelta(days=days)
+        except FileNotFoundError:
+            return True
+
+    def _get_top_packages_local(self, path: str) -> list[dict] | None:
+        try:
+            with open(path, "r") as f:
+                result = json.load(f)
+                return self.extract_information(result)
+        except FileNotFoundError:
+            log.error(f"File not found: {path}")
+            return None
+
+    def _get_top_packages_network(self, url: str) -> list[dict] | None:
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+
+            response_data = response.json()
+            result = response_data
+
+            return self.extract_information(result)
+        except json.JSONDecodeError:
+            log.error(f'Couldn`t convert to json: "{response.text}"')
+            return None
+        except requests.exceptions.RequestException as e:
+            log.error(f"Network error: {e}")
+            return None
+
+    @staticmethod
+    def extract_information(data: dict | None) -> list[dict] | None:
+        if data is not None:
+            return data.get("rows")
+        return None

     def detect(
         self,
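
The PyPI detector's helpers mirror the npm ones, with one extra step: the downloaded JSON is a document of the form {"rows": [{"project": ...}, ...]}, so `extract_information` pulls out the rows list before package names are canonicalized. A small demonstration with a made-up payload:

# Runnable sketch of the "rows" extraction added above. The payload mirrors
# the shape the detector expects; the project names are made up.
import packaging.utils


def extract_information(data: dict | None) -> list[dict] | None:
    if data is not None:
        return data.get("rows")
    return None


payload = {"rows": [{"project": "Requests"}, {"project": "numpy"}]}
rows = extract_information(payload)
names = {packaging.utils.canonicalize_name(r["project"]) for r in rows}
print(names)  # {'requests', 'numpy'} (set order may vary)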