guarddog 2.7.1__py3-none-any.whl → 2.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. guarddog/analyzer/metadata/__init__.py +3 -0
  2. guarddog/analyzer/metadata/go/typosquatting.py +11 -28
  3. guarddog/analyzer/metadata/npm/direct_url_dependency.py +0 -1
  4. guarddog/analyzer/metadata/npm/typosquatting.py +24 -59
  5. guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py +53 -164
  6. guarddog/analyzer/metadata/pypi/typosquatting.py +20 -77
  7. guarddog/analyzer/metadata/repository_integrity_mismatch.py +202 -2
  8. guarddog/analyzer/metadata/resources/top_go_packages.json +2926 -2923
  9. guarddog/analyzer/metadata/resources/top_npm_packages.json +8005 -8002
  10. guarddog/analyzer/metadata/resources/top_pypi_packages.json +15003 -60021
  11. guarddog/analyzer/metadata/resources/top_rubygems_packages.json +979 -0
  12. guarddog/analyzer/metadata/rubygems/__init__.py +26 -0
  13. guarddog/analyzer/metadata/rubygems/bundled_binary.py +13 -0
  14. guarddog/analyzer/metadata/rubygems/empty_information.py +24 -0
  15. guarddog/analyzer/metadata/rubygems/release_zero.py +22 -0
  16. guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py +49 -0
  17. guarddog/analyzer/metadata/rubygems/typosquatting.py +91 -0
  18. guarddog/analyzer/metadata/typosquatting.py +218 -0
  19. guarddog/analyzer/metadata/utils.py +23 -0
  20. guarddog/analyzer/sourcecode/__init__.py +2 -0
  21. guarddog/analyzer/sourcecode/api-obfuscation.yml +35 -40
  22. guarddog/analyzer/sourcecode/code-execution.yml +20 -0
  23. guarddog/analyzer/sourcecode/exec-base64.yml +19 -0
  24. guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml +31 -5
  25. guarddog/analyzer/sourcecode/npm-api-obfuscation.yml +51 -0
  26. guarddog/analyzer/sourcecode/rubygems-code-execution.yml +67 -0
  27. guarddog/analyzer/sourcecode/rubygems-exec-base64.yml +26 -0
  28. guarddog/analyzer/sourcecode/rubygems-exfiltrate-sensitive-data.yml +70 -0
  29. guarddog/analyzer/sourcecode/rubygems-install-hook.yml +45 -0
  30. guarddog/analyzer/sourcecode/rubygems-network-on-require.yml +78 -0
  31. guarddog/analyzer/sourcecode/rubygems-serialize-environment.yml +38 -0
  32. guarddog/analyzer/sourcecode/screenshot.yml +38 -0
  33. guarddog/ecosystems.py +3 -0
  34. guarddog/scanners/__init__.py +6 -0
  35. guarddog/scanners/npm_project_scanner.py +1 -1
  36. guarddog/scanners/rubygems_package_scanner.py +112 -0
  37. guarddog/scanners/rubygems_project_scanner.py +75 -0
  38. guarddog/scanners/scanner.py +36 -12
  39. guarddog/utils/archives.py +1 -1
  40. guarddog-2.9.0.dist-info/METADATA +471 -0
  41. {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/RECORD +46 -29
  42. {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/WHEEL +1 -1
  43. guarddog-2.7.1.dist-info/METADATA +0 -40
  44. {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/entry_points.txt +0 -0
  45. {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE +0 -0
  46. {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE-3rdparty.csv +0 -0
  47. {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/NOTICE +0 -0
@@ -0,0 +1,75 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ from typing import List
5
+
6
+ from guarddog.scanners.rubygems_package_scanner import RubyGemsPackageScanner
7
+ from guarddog.scanners.scanner import ProjectScanner, Dependency, DependencyVersion
8
+
9
+ log = logging.getLogger("guarddog")
10
+
11
+
12
+ class RubyGemsRequirementsScanner(ProjectScanner):
13
+ """
14
+ Scans all gems in the Gemfile.lock file of a project
15
+ """
16
+
17
+ def __init__(self) -> None:
18
+ super().__init__(RubyGemsPackageScanner())
19
+
20
+ def parse_requirements(self, raw_requirements: str) -> List[Dependency]:
21
+ """
22
+ Parses Gemfile.lock and extracts gem names and versions.
23
+
24
+ Gemfile.lock format:
25
+ GEM
26
+ remote: https://rubygems.org/
27
+ specs:
28
+ actioncable (7.0.4)
29
+ actionpack (= 7.0.4)
30
+ rails (7.0.4)
31
+ ...
32
+ """
33
+ dependencies: List[Dependency] = []
34
+ lines = raw_requirements.splitlines()
35
+
36
+ in_gem_specs = False
37
+ gem_pattern = re.compile(r"^ (\S+) \(([^)]+)\)$")
38
+
39
+ for idx, line in enumerate(lines):
40
+ if line.strip() == "GEM":
41
+ continue
42
+ elif line.strip() == "specs:":
43
+ in_gem_specs = True
44
+ continue
45
+ elif line and not line.startswith(" "):
46
+ in_gem_specs = False
47
+ continue
48
+
49
+ if not in_gem_specs:
50
+ continue
51
+
52
+ match = gem_pattern.match(line)
53
+ if match:
54
+ name = match.group(1)
55
+ version = match.group(2)
56
+
57
+ dep = next(
58
+ filter(lambda d: d.name == name, dependencies),
59
+ None,
60
+ )
61
+ if not dep:
62
+ dep = Dependency(name=name, versions=set())
63
+ dependencies.append(dep)
64
+
65
+ dep.versions.add(DependencyVersion(version=version, location=idx + 1))
66
+
67
+ return dependencies
68
+
69
+ def find_requirements(self, directory: str) -> list[str]:
70
+ requirement_files = []
71
+ for root, dirs, files in os.walk(directory):
72
+ for name in files:
73
+ if name == "Gemfile.lock":
74
+ requirement_files.append(os.path.join(root, name))
75
+ return requirement_files
@@ -187,24 +187,50 @@ class PackageScanner:
187
187
  name, tmpdirname, version, rules, write_package_info
188
188
  )
189
189
 
190
- def download_compressed(self, url, archive_path, target_path):
191
- """Downloads a compressed file and extracts it
190
+ def _fetch_archive(self, url: str, archive_path: str) -> None:
191
+ """Downloads an archive file from a URL.
192
+
193
+ This method can be overridden by subclasses if custom download logic is needed.
192
194
 
193
195
  Args:
194
196
  url (str): download link
195
- archive_path (str): path to download compressed file
196
- target_path (str): path to unzip compressed file
197
+ archive_path (str): path to save the downloaded file
197
198
  """
198
-
199
- log.debug(f"Downloading package archive from {url} into {target_path}")
199
+ log.debug(f"Downloading package archive from {url}")
200
200
  response = requests.get(url, stream=True)
201
201
 
202
202
  with open(archive_path, "wb") as f:
203
203
  f.write(response.raw.read())
204
204
 
205
+ def _extract_archive(self, archive_path: str, target_path: str) -> None:
206
+ """Extracts an archive file to a target directory.
207
+
208
+ This method can be overridden by subclasses to handle special archive formats
209
+ (e.g., nested archives like .gem files).
210
+
211
+ Args:
212
+ archive_path (str): path to the archive file
213
+ target_path (str): directory to extract files into
214
+ """
215
+ safe_extract(archive_path, target_path)
216
+ log.debug(f"Successfully extracted files to {target_path}")
217
+
218
+ def download_compressed(self, url, archive_path, target_path):
219
+ """Downloads a compressed file and extracts it.
220
+
221
+ This is a template method that orchestrates the download, extraction, and cleanup
222
+ process. Subclasses can override individual steps (_fetch_archive, _extract_archive)
223
+ to customize behavior.
224
+
225
+ Args:
226
+ url (str): download link
227
+ archive_path (str): path to download compressed file
228
+ target_path (str): path to unzip compressed file
229
+ """
230
+ self._fetch_archive(url, archive_path)
231
+
205
232
  try:
206
- safe_extract(archive_path, target_path)
207
- log.debug(f"Successfully extracted files to {target_path}")
233
+ self._extract_archive(archive_path, target_path)
208
234
  finally:
209
235
  log.debug(f"Removing temporary archive file {archive_path}")
210
236
  os.remove(archive_path)
@@ -226,11 +252,9 @@ class ProjectScanner:
226
252
  user = os.getenv("GIT_USERNAME")
227
253
  personal_access_token = os.getenv("GH_TOKEN")
228
254
  if not user or not personal_access_token:
229
- log.error(
230
- """WARNING: Please set GIT_USERNAME (Github handle) and GH_TOKEN
255
+ log.error("""WARNING: Please set GIT_USERNAME (Github handle) and GH_TOKEN
231
256
  (generate a personal access token in Github settings > developer)
232
- as environment variables before proceeding."""
233
- )
257
+ as environment variables before proceeding.""")
234
258
  exit(1)
235
259
  return (user, personal_access_token)
236
260
 
@@ -4,7 +4,7 @@ import pathlib
4
4
  import stat
5
5
  import zipfile
6
6
 
7
- import tarsafe # type:ignore
7
+ import tarsafe # type: ignore
8
8
 
9
9
  from guarddog.utils.config import (
10
10
  MAX_UNCOMPRESSED_SIZE,
@@ -0,0 +1,471 @@
1
+ Metadata-Version: 2.4
2
+ Name: guarddog
3
+ Version: 2.9.0
4
+ Summary: GuardDog is a CLI tool for identifying malicious open source packages
5
+ License: Apache-2.0
6
+ License-File: LICENSE
7
+ License-File: LICENSE-3rdparty.csv
8
+ License-File: NOTICE
9
+ Author: Ellen Wang
10
+ Requires-Python: >=3.10,<4
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Requires-Dist: click (>=8.1.3,<9.0.0)
19
+ Requires-Dist: configparser (>=5.3,<8.0)
20
+ Requires-Dist: disposable-email-domains (>=0.0.103,<0.0.160)
21
+ Requires-Dist: prettytable (>=3.6.0,<4.0.0)
22
+ Requires-Dist: pygit2 (>=1.11,<1.19)
23
+ Requires-Dist: python-dateutil (>=2.8.2,<3.0.0)
24
+ Requires-Dist: python-whois (>=0.8,<0.10)
25
+ Requires-Dist: pyyaml (>=6.0,<7.0)
26
+ Requires-Dist: requests (>=2.29.0,<3.0.0)
27
+ Requires-Dist: semantic-version (>=2.10.0,<3.0.0)
28
+ Requires-Dist: semgrep (>=1.147.0,<2.0.0)
29
+ Requires-Dist: tarsafe (>=0.0.5,<0.0.6)
30
+ Requires-Dist: termcolor (>=3.3.0,<4.0.0)
31
+ Requires-Dist: urllib3 (>=2.5.0,<3.0.0)
32
+ Requires-Dist: yara-python (>=4.5.1,<5.0.0)
33
+ Project-URL: Repository, https://github.com/DataDog/guarddog
34
+ Description-Content-Type: text/markdown
35
+
36
+ # GuardDog
37
+
38
+ [![Test](https://github.com/DataDog/guarddog/actions/workflows/checks.yml/badge.svg)](https://github.com/DataDog/guarddog/actions/workflows/checks.yml)
39
+
40
+ <p align="center">
41
+ <img src="https://github.com/DataDog/guarddog/blob/main/docs/images/logo.png?raw=true" alt="GuardDog" width="300" />
42
+ </p>
43
+
44
+ GuardDog is a CLI tool that allows you to identify malicious PyPI and npm packages, Go modules, RubyGems, GitHub Actions, or VSCode extensions. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
45
+
46
+ GuardDog can be used to scan local or remote PyPI and npm packages, Go modules, RubyGems, GitHub actions, or VSCode extensions using any of the available [heuristics](#heuristics).
47
+
48
+ It downloads and scans code from:
49
+
50
+ * NPM: Packages hosted in [npmjs.org](https://www.npmjs.com/)
51
+ * PyPI: Source files (tar.gz) packages hosted in [PyPI.org](https://pypi.org/)
52
+ * Go: GoLang source files of repositories hosted in [GitHub.com](https://github.com)
53
+ * RubyGems: Gem packages hosted in [rubygems.org](https://rubygems.org/)
54
+ * GitHub Actions: Javascript source files of repositories hosted in [GitHub.com](https://github.com)
55
+ * VSCode Extensions: Extensions (.vsix) packages hosted in [marketplace.visualstudio.com](https://marketplace.visualstudio.com/)
56
+
57
+ ![GuardDog demo usage](https://github.com/DataDog/guarddog/blob/main/docs/images/demo.png?raw=true)
58
+
59
+ ## Getting started
60
+
61
+ ### Installation
62
+
63
+ ```sh
64
+ pip install guarddog
65
+ ```
66
+
67
+ Or use the Docker image:
68
+
69
+ ```sh
70
+ docker pull ghcr.io/datadog/guarddog
71
+ alias guarddog='docker run --rm ghcr.io/datadog/guarddog'
72
+ ```
73
+
74
+ *Note: On Windows, the only supported installation method is Docker.*
75
+
76
+ ### Sample usage
77
+
78
+ ```sh
79
+ # Scan the most recent version of the 'requests' package
80
+ guarddog pypi scan requests
81
+
82
+ # Scan a specific version of the 'requests' package
83
+ guarddog pypi scan requests --version 2.28.1
84
+
85
+ # Scan the 'requests' package using 2 specific heuristics
86
+ guarddog pypi scan requests --rules exec-base64 --rules code-execution
87
+
88
+ # Scan the 'requests' package using all rules but one
89
+ guarddog pypi scan requests --exclude-rules exec-base64
90
+
91
+ # Scan a local package archive
92
+ guarddog pypi scan /tmp/triage.tar.gz
93
+
94
+ # Scan a local package directory
95
+ guarddog pypi scan /tmp/triage/
96
+
97
+ # Scan every package referenced in a requirements.txt file of a local folder
98
+ guarddog pypi verify workspace/guarddog/requirements.txt
99
+
100
+ # Scan every package referenced in a requirements.txt file and output a sarif file - works only for verify
101
+ guarddog pypi verify --output-format=sarif workspace/guarddog/requirements.txt
102
+
103
+ # Output JSON to standard output - works for every command
104
+ guarddog pypi scan requests --output-format=json
105
+
106
+ # All the commands also work on npm, go, rubygems
107
+ guarddog npm scan express
108
+
109
+ guarddog go scan github.com/DataDog/dd-trace-go
110
+
111
+ guarddog go verify /tmp/repo/go.mod
112
+
113
+ # Scan RubyGems packages
114
+ guarddog rubygems scan rails
115
+
116
+ guarddog rubygems verify /tmp/repo/Gemfile.lock
117
+
118
+ # GuardDog can also scan GitHub Actions that are implemented in JavaScript
119
+ guarddog github_action scan DataDog/synthetics-ci-github-action
120
+
121
+ guarddog github_action verify /tmp/repo/.github/workflows/main.yml
122
+
123
+ # Scan VSCode extensions from the marketplace
124
+ guarddog extension scan ms-python.python
125
+
126
+ # Scan a specific version of a VSCode extension
127
+ guarddog extension scan ms-python.python --version 2023.20.0
128
+
129
+ # Scan a local VSCode extension directory or VSIX archive
130
+ guarddog extension scan /tmp/my-extension/
131
+
132
+ # Run in debug mode
133
+ guarddog --log-level debug npm scan express
134
+ ```
135
+
136
+
137
+ ## Heuristics
138
+
139
+ GuardDog comes with 2 types of heuristics:
140
+
141
+ * [**Source code heuristics**](https://github.com/DataDog/guarddog/tree/main/guarddog/analyzer/sourcecode): Semgrep rules running against the package source code.
142
+
143
+ * [**Package metadata heuristics**](https://github.com/DataDog/guarddog/tree/main/guarddog/analyzer/metadata): Python or Javascript heuristics running against the package metadata on PyPI or npm.
144
+
145
+ <!-- BEGIN_RULE_LIST -->
146
+ ### PyPI
147
+
148
+ Source code heuristics:
149
+
150
+ | **Heuristic** | **Description** |
151
+ |:-------------:|:---------------:|
152
+ | api-obfuscation | Identify obfuscated API calls using alternative Python syntax patterns |
153
+ | shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
154
+ | obfuscation | Identify when a package uses a common obfuscation method often used by malware |
155
+ | clipboard-access | Identify when a package reads or writes data from the clipboard |
156
+ | exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
157
+ | download-executable | Identify when a package downloads and makes executable a remote binary |
158
+ | exec-base64 | Identify when a package dynamically executes base64-encoded code |
159
+ | silent-process-execution | Identify when a package silently executes an executable |
160
+ | dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
161
+ | steganography | Identify when a package retrieves hidden data from an image and executes it |
162
+ | code-execution | Identify when an OS command is executed in the setup.py file |
163
+ | unicode | Identify suspicious unicode characters |
164
+ | cmd-overwrite | Identify when the 'install' command is overwritten in setup.py, indicating a piece of code automatically running when the package is installed |
165
+ | suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
166
+
167
+ Metadata heuristics:
168
+
169
+ | **Heuristic** | **Description** |
170
+ |:-------------:|:---------------:|
171
+ | empty_information | Identify packages with an empty description field |
172
+ | release_zero | Identify packages with a release version that's 0.0 or 0.0.0 |
173
+ | typosquatting | Identify packages that are named closely to a highly popular package |
174
+ | potentially_compromised_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) might have been compromised |
175
+ | unclaimed_maintainer_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) is unclaimed and can be registered by an attacker |
176
+ | repository_integrity_mismatch | Identify packages with a linked GitHub repository where the package has extra unexpected files |
177
+ | single_python_file | Identify packages that have only a single Python file |
178
+ | bundled_binary | Identify packages bundling binaries |
179
+ | deceptive_author | This heuristic detects when an author is using a disposable email |
180
+
181
+
182
+ ### npm
183
+
184
+ Source code heuristics:
185
+
186
+ | **Heuristic** | **Description** |
187
+ |:-------------:|:---------------:|
188
+ | npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
189
+ | npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware |
190
+ | npm-silent-process-execution | Identify when a package silently executes an executable |
191
+ | shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
192
+ | npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
193
+ | npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
194
+ | npm-steganography | Identify when a package retrieves hidden data from an image and executes it |
195
+ | npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
196
+ | npm-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
197
+ | suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
198
+
199
+ Metadata heuristics:
200
+
201
+ | **Heuristic** | **Description** |
202
+ |:-------------:|:---------------:|
203
+ | empty_information | Identify packages with an empty description field |
204
+ | release_zero | Identify packages with a release version that's 0.0 or 0.0.0 |
205
+ | potentially_compromised_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) might have been compromised; note that NPM's API may not provide accurate information regarding the maintainer's email, so this detector may cause false positives for NPM packages. see https://www.theregister.com/2022/05/10/security_npm_email/ |
206
+ | unclaimed_maintainer_email_domain | Identify when a package maintainer e-mail domain (and therefore npm account) is unclaimed and can be registered by an attacker; note that NPM's API may not provide accurate information regarding the maintainer's email, so this detector may cause false positives for NPM packages. see https://www.theregister.com/2022/05/10/security_npm_email/ |
207
+ | typosquatting | Identify packages that are named closely to a highly popular package |
208
+ | direct_url_dependency | Identify packages with direct URL dependencies. Dependencies fetched this way are not immutable and can be used to inject untrusted code or reduce the likelihood of a reproducible install. |
209
+ | npm_metadata_mismatch | Identify packages which have mismatches between the npm package manifest and the package info for some critical fields |
210
+ | bundled_binary | Identify packages bundling binaries |
211
+ | deceptive_author | This heuristic detects when an author is using a disposable email |
212
+
213
+
214
+ ### go
215
+
216
+ Source code heuristics:
217
+
218
+ | **Heuristic** | **Description** |
219
+ |:-------------:|:---------------:|
220
+ | shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
221
+ | go-exec-base64 | Identify Base64-decoded content being passed to execution functions in Go |
222
+ | go-exfiltrate-sensitive-data | This rule identifies when a package reads and exfiltrates sensitive data from the local system. |
223
+ | go-exec-download | Identify when a package downloads and executes a remote binary after setting executable permissions. |
224
+ | suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
225
+
226
+ Metadata heuristics:
227
+
228
+ | **Heuristic** | **Description** |
229
+ |:-------------:|:---------------:|
230
+ | typosquatting | Identify packages that are named closely to a highly popular package |
231
+
232
+
233
+ ### GitHub Action
234
+
235
+ Source code heuristics:
236
+
237
+ | **Heuristic** | **Description** |
238
+ |:-------------:|:---------------:|
239
+ | npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
240
+ | npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware |
241
+ | npm-silent-process-execution | Identify when a package silently executes an executable |
242
+ | shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
243
+ | npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
244
+ | npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
245
+ | npm-steganography | Identify when a package retrieves hidden data from an image and executes it |
246
+ | npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
247
+ | npm-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
248
+ | suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
249
+ ### Extension
250
+
251
+ Source code heuristics:
252
+
253
+ | **Heuristic** | **Description** |
254
+ |:-------------:|:---------------:|
255
+ | npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
256
+ | npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware |
257
+ | npm-silent-process-execution | Identify when a package silently executes an executable |
258
+ | shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
259
+ | npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
260
+ | npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
261
+ | npm-steganography | Identify when a package retrieves hidden data from an image and executes it |
262
+ | npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
263
+ | npm-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
264
+ | suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
265
+ ### RubyGems
266
+
267
+ Source code heuristics:
268
+
269
+ | **Heuristic** | **Description** |
270
+ |:-------------:|:---------------:|
271
+ | rubygems-code-execution | Identify when a gem executes OS commands |
272
+ | rubygems-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
273
+ | rubygems-serialize-environment | Identify when a package serializes ENV to exfiltrate environment variables |
274
+ | rubygems-network-on-require | Identify when a gem makes network requests when required |
275
+ | rubygems-install-hook | Identify when a gem registers installation hooks |
276
+ | rubygems-exec-base64 | Identify when a package dynamically executes base64-encoded code |
277
+ | suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
278
+
279
+ Metadata heuristics:
280
+
281
+ | **Heuristic** | **Description** |
282
+ |:-------------:|:---------------:|
283
+ | typosquatting | Identify packages that are named closely to a highly popular package |
284
+ | empty_information | Identify packages with an empty description field |
285
+ | release_zero | Identify packages with a release version that's 0.0 or 0.0.0 |
286
+ | bundled_binary | Identify packages bundling binaries |
287
+ | repository_integrity_mismatch | Identify packages with a linked GitHub repository where the package has extra unexpected files |
288
+
289
+
290
+ <!-- END_RULE_LIST -->
291
+
292
+ ## Custom Rules
293
+
294
+ GuardDog allows you to implement custom source code rules.
295
+ Sourcecode rules live under the [guarddog/analyzer/sourcecode](guarddog/analyzer/sourcecode) directory, and supported formats are [Semgrep](https://github.com/semgrep/semgrep) or [Yara](https://github.com/VirusTotal/yara).
296
+
297
+ * Semgrep rules are language-dependent, and Guarddog will import all `.yml` rules where the language matches the ecosystem selected by the user in CLI.
298
+ * Yara rules on the other hand are language agnostic, therefore all matching `.yar` rules present will be imported.
299
+
300
+ You can then write your own rule and drop it into that directory; GuardDog will let you select or exclude it like any built-in rule, and will append its findings to the output.
301
+
302
+ For example, you can create the following semgrep rule:
303
+ ```yaml
304
+ rules:
305
+ - id: sample-rule
306
+ languages:
307
+ - python
308
+ message: Output message when rule matches
309
+ metadata:
310
+ description: Description used in the CLI help
311
+ patterns:
312
+ YOUR RULE HEURISTICS GO HERE
313
+ severity: WARNING
314
+ ```
315
+
316
+ Then you'll need to save it as `sample-rule.yml`. Note that the rule id must match the filename.
317
+
318
+ In the case of Yara, you can create the following rule:
319
+ ```
320
+ rule sample-rule
321
+ {
322
+ meta:
323
+ description = "Description used in the output message"
324
+ target_entity = "file"
325
+ strings:
326
+ $exec = "exec"
327
+ condition:
328
+ 1 of them
329
+ }
330
+ ```
331
+ Then you'll need to save it as `sample-rule.yar`.
332
+
333
+ Note that in both cases, the rule id must match the filename
334
+
335
+ ## Running GuardDog in a GitHub Action
336
+
337
+ The easiest way to integrate GuardDog in your CI pipeline is to leverage the SARIF output format, and upload it to GitHub's [code scanning](https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/about-code-scanning) feature.
338
+
339
+ Using this, you get:
340
+ * Automated comments to your pull requests based on the GuardDog scan output
341
+ * Built-in false positive management directly in the GitHub UI
342
+
343
+
344
+ Sample GitHub Action using GuardDog:
345
+
346
+ ```yaml
347
+ name: GuardDog
348
+
349
+ on:
350
+ push:
351
+ branches:
352
+ - main
353
+ pull_request:
354
+ branches:
355
+ - main
356
+
357
+ permissions:
358
+ contents: read
359
+
360
+ jobs:
361
+ guarddog:
362
+ permissions:
363
+ contents: read # for actions/checkout to fetch code
364
+ security-events: write # for github/codeql-action/upload-sarif to upload SARIF results
365
+ name: Scan dependencies
366
+ runs-on: ubuntu-latest
367
+
368
+ steps:
369
+ - uses: actions/checkout@v4
370
+
371
+ - name: Set up Python
372
+ uses: actions/setup-python@v5
373
+ with:
374
+ python-version: "3.10"
375
+
376
+ - name: Install GuardDog
377
+ run: pip install guarddog
378
+
379
+ - run: guarddog pypi verify requirements.txt --output-format sarif --exclude-rules repository_integrity_mismatch > guarddog.sarif
380
+
381
+ - name: Upload SARIF file to GitHub
382
+ uses: github/codeql-action/upload-sarif@v3
383
+ with:
384
+ category: guarddog-builtin
385
+ sarif_file: guarddog.sarif
386
+ ```
387
+
388
+
389
+ ## Development
390
+
391
+ ### Running a local version of GuardDog
392
+
393
+ * Ensure poetry has an env with `python >=3.10`: `poetry env use 3.10.0`
394
+ * Install dependencies `poetry install`
395
+ * Run guarddog `poetry run guarddog` or `poetry shell` then run `guarddog`
396
+
397
+ ### Unit tests
398
+
399
+ Running all unit tests: `make test`
400
+
401
+ Running unit tests against Semgrep rules: `make test-semgrep-rules` (tests are [here](https://github.com/DataDog/guarddog/tree/main/tests/analyzer/sourcecode)). These use the standard methodology for [testing Semgrep rules](https://semgrep.dev/docs/writing-rules/testing-rules/).
402
+
403
+ Running unit tests against package metadata heuristics: `make test-metadata-rules` (tests are [here](https://github.com/DataDog/guarddog/tree/main/tests/analyzer/metadata)).
404
+
405
+ ### Benchmarking
406
+
407
+ You can run GuardDog on legitimate and malicious packages to determine false positives and false negatives. See [./tests/samples](./tests/samples)
408
+
409
+ ### Code quality checks
410
+
411
+ Run the type checker with
412
+ ```shell
413
+ mypy --install-types --non-interactive guarddog
414
+ ```
415
+ and the linter with
416
+ ```shell
417
+ flake8 guarddog --count --select=E9,F63,F7,F82 --show-source --statistics --exclude tests/analyzer/sourcecode,tests/analyzer/metadata/resources,evaluator/data
418
+ flake8 guarddog --count --max-line-length=120 --statistics --exclude tests/analyzer/sourcecode,tests/analyzer/metadata/resources,evaluator/data --ignore=E203,W503
419
+ ```
420
+
421
+ ### Configuration via Environment Variables
422
+
423
+ GuardDog's behavior can be customized using environment variables:
424
+
425
+ #### General Configuration
426
+
427
+ | Environment Variable | Description | Default Value |
428
+ |---------------------|-------------|---------------|
429
+ | `GUARDDOG_PARALLELISM` | Number of threads to use for parallel processing | Number of CPUs available |
430
+ | `GUARDDOG_VERIFY_EXHAUSTIVE_DEPENDENCIES` | Analyze all possible versions of dependencies (`true`/`false`) | `false` |
431
+ | `GUARDDOG_TOP_PACKAGES_CACHE_LOCATION` | Location of the top packages cache directory | `guarddog/analyzer/metadata/resources` |
432
+ | `GUARDDOG_YARA_EXT_EXCLUDE` | Comma-separated list of file extensions to exclude from YARA scanning | `ini,md,rst,txt,lock,json,yaml,yml,toml,xml,html,csv,sql,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,changelog,readme,makefile,dockerfile,pkg-info,d.ts` |
433
+
434
+ #### Semgrep Configuration
435
+
436
+ GuardDog uses `Semgrep`, a powerful static analysis tool that scans code for patterns.
437
+
438
+ | Environment Variable | Description | Default Value |
439
+ |---------------------|-------------|---------------|
440
+ | `GUARDDOG_SEMGREP_MAX_TARGET_BYTES` | Maximum size of a file that Semgrep will analyze (files exceeding this will be skipped) | 10MB (10485760 bytes) |
441
+ | `GUARDDOG_SEMGREP_TIMEOUT` | Maximum time in seconds that Semgrep will spend running a rule on a single file | 10 seconds |
442
+
443
+ #### Archive Extraction Security Limits
444
+
445
+ GuardDog implements multiple security checks when extracting package archives to protect against compression bombs and file descriptor exhaustion attacks:
446
+
447
+ | Environment Variable | Description | Default Value |
448
+ |---------------------|-------------|---------------|
449
+ | `GUARDDOG_MAX_UNCOMPRESSED_SIZE` | Maximum allowed uncompressed size in bytes (prevents disk space exhaustion) | 2147483648 (2 GB) |
450
+ | `GUARDDOG_MAX_COMPRESSION_RATIO` | Maximum allowed compression ratio (detects suspicious compression patterns) | 100 (100:1) |
451
+ | `GUARDDOG_MAX_FILE_COUNT` | Maximum number of files allowed in an archive (prevents file descriptor/inode exhaustion) | 100000 |
452
+
453
+ ## Maintainers
454
+
455
+ * [Sebastian Obregoso](https://www.linkedin.com/in/sebastianobregoso/)
456
+ * [Ian Kretz](https://github.com/ikretz)
457
+ * [Tesnim Hamdouni](https://github.com/tesnim5hamdouni)
458
+
459
+ ## Authors
460
+ * [Ellen Wang](https://www.linkedin.com/in/ellen-wang-4bb5961a0/)
461
+ * [Christophe Tafani-Dereeper](https://github.com/christophetd)
462
+
463
+ ## Acknowledgments
464
+
465
+ Inspiration:
466
+ * [Backstabber’s Knife Collection: A Review of Open Source Software Supply Chain Attacks](https://arxiv.org/pdf/2005.09535)
467
+ * [What are Weak Links in the npm Supply Chain?](https://arxiv.org/pdf/2112.10165.pdf)
468
+ * [A Survey on Common Threats in npm and PyPi Registries](https://arxiv.org/pdf/2108.09576.pdf)
469
+ * [A Benchmark Comparison of Python Malware Detection Approaches](https://arxiv.org/pdf/2209.13288.pdf)
470
+ * [Towards Measuring Supply Chain Attacks on Package Managers for Interpreted Languages](https://arxiv.org/pdf/2002.01139)
471
+