guarddog 2.7.1__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guarddog/analyzer/metadata/__init__.py +3 -0
- guarddog/analyzer/metadata/go/typosquatting.py +11 -28
- guarddog/analyzer/metadata/npm/direct_url_dependency.py +0 -1
- guarddog/analyzer/metadata/npm/typosquatting.py +24 -59
- guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py +53 -164
- guarddog/analyzer/metadata/pypi/typosquatting.py +20 -77
- guarddog/analyzer/metadata/repository_integrity_mismatch.py +202 -2
- guarddog/analyzer/metadata/resources/top_go_packages.json +2926 -2923
- guarddog/analyzer/metadata/resources/top_npm_packages.json +8005 -8002
- guarddog/analyzer/metadata/resources/top_pypi_packages.json +15003 -60021
- guarddog/analyzer/metadata/resources/top_rubygems_packages.json +979 -0
- guarddog/analyzer/metadata/rubygems/__init__.py +26 -0
- guarddog/analyzer/metadata/rubygems/bundled_binary.py +13 -0
- guarddog/analyzer/metadata/rubygems/empty_information.py +24 -0
- guarddog/analyzer/metadata/rubygems/release_zero.py +22 -0
- guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py +49 -0
- guarddog/analyzer/metadata/rubygems/typosquatting.py +91 -0
- guarddog/analyzer/metadata/typosquatting.py +218 -0
- guarddog/analyzer/metadata/utils.py +23 -0
- guarddog/analyzer/sourcecode/__init__.py +2 -0
- guarddog/analyzer/sourcecode/api-obfuscation.yml +35 -40
- guarddog/analyzer/sourcecode/code-execution.yml +20 -0
- guarddog/analyzer/sourcecode/exec-base64.yml +19 -0
- guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml +31 -5
- guarddog/analyzer/sourcecode/npm-api-obfuscation.yml +51 -0
- guarddog/analyzer/sourcecode/rubygems-code-execution.yml +67 -0
- guarddog/analyzer/sourcecode/rubygems-exec-base64.yml +26 -0
- guarddog/analyzer/sourcecode/rubygems-exfiltrate-sensitive-data.yml +70 -0
- guarddog/analyzer/sourcecode/rubygems-install-hook.yml +45 -0
- guarddog/analyzer/sourcecode/rubygems-network-on-require.yml +78 -0
- guarddog/analyzer/sourcecode/rubygems-serialize-environment.yml +38 -0
- guarddog/analyzer/sourcecode/screenshot.yml +38 -0
- guarddog/ecosystems.py +3 -0
- guarddog/scanners/__init__.py +6 -0
- guarddog/scanners/npm_project_scanner.py +1 -1
- guarddog/scanners/rubygems_package_scanner.py +112 -0
- guarddog/scanners/rubygems_project_scanner.py +75 -0
- guarddog/scanners/scanner.py +36 -12
- guarddog/utils/archives.py +1 -1
- guarddog-2.9.0.dist-info/METADATA +471 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/RECORD +46 -29
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/WHEEL +1 -1
- guarddog-2.7.1.dist-info/METADATA +0 -40
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/entry_points.txt +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE-3rdparty.csv +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from guarddog.scanners.rubygems_package_scanner import RubyGemsPackageScanner
|
|
7
|
+
from guarddog.scanners.scanner import ProjectScanner, Dependency, DependencyVersion
|
|
8
|
+
|
|
9
|
+
log = logging.getLogger("guarddog")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RubyGemsRequirementsScanner(ProjectScanner):
|
|
13
|
+
"""
|
|
14
|
+
Scans all gems in the Gemfile.lock file of a project
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self) -> None:
|
|
18
|
+
super().__init__(RubyGemsPackageScanner())
|
|
19
|
+
|
|
20
|
+
def parse_requirements(self, raw_requirements: str) -> List[Dependency]:
|
|
21
|
+
"""
|
|
22
|
+
Parses Gemfile.lock and extracts gem names and versions.
|
|
23
|
+
|
|
24
|
+
Gemfile.lock format:
|
|
25
|
+
GEM
|
|
26
|
+
remote: https://rubygems.org/
|
|
27
|
+
specs:
|
|
28
|
+
actioncable (7.0.4)
|
|
29
|
+
actionpack (= 7.0.4)
|
|
30
|
+
rails (7.0.4)
|
|
31
|
+
...
|
|
32
|
+
"""
|
|
33
|
+
dependencies: List[Dependency] = []
|
|
34
|
+
lines = raw_requirements.splitlines()
|
|
35
|
+
|
|
36
|
+
in_gem_specs = False
|
|
37
|
+
gem_pattern = re.compile(r"^ (\S+) \(([^)]+)\)$")
|
|
38
|
+
|
|
39
|
+
for idx, line in enumerate(lines):
|
|
40
|
+
if line.strip() == "GEM":
|
|
41
|
+
continue
|
|
42
|
+
elif line.strip() == "specs:":
|
|
43
|
+
in_gem_specs = True
|
|
44
|
+
continue
|
|
45
|
+
elif line and not line.startswith(" "):
|
|
46
|
+
in_gem_specs = False
|
|
47
|
+
continue
|
|
48
|
+
|
|
49
|
+
if not in_gem_specs:
|
|
50
|
+
continue
|
|
51
|
+
|
|
52
|
+
match = gem_pattern.match(line)
|
|
53
|
+
if match:
|
|
54
|
+
name = match.group(1)
|
|
55
|
+
version = match.group(2)
|
|
56
|
+
|
|
57
|
+
dep = next(
|
|
58
|
+
filter(lambda d: d.name == name, dependencies),
|
|
59
|
+
None,
|
|
60
|
+
)
|
|
61
|
+
if not dep:
|
|
62
|
+
dep = Dependency(name=name, versions=set())
|
|
63
|
+
dependencies.append(dep)
|
|
64
|
+
|
|
65
|
+
dep.versions.add(DependencyVersion(version=version, location=idx + 1))
|
|
66
|
+
|
|
67
|
+
return dependencies
|
|
68
|
+
|
|
69
|
+
def find_requirements(self, directory: str) -> list[str]:
|
|
70
|
+
requirement_files = []
|
|
71
|
+
for root, dirs, files in os.walk(directory):
|
|
72
|
+
for name in files:
|
|
73
|
+
if name == "Gemfile.lock":
|
|
74
|
+
requirement_files.append(os.path.join(root, name))
|
|
75
|
+
return requirement_files
|
guarddog/scanners/scanner.py
CHANGED
|
@@ -187,24 +187,50 @@ class PackageScanner:
|
|
|
187
187
|
name, tmpdirname, version, rules, write_package_info
|
|
188
188
|
)
|
|
189
189
|
|
|
190
|
-
def
|
|
191
|
-
"""Downloads
|
|
190
|
+
def _fetch_archive(self, url: str, archive_path: str) -> None:
|
|
191
|
+
"""Downloads an archive file from a URL.
|
|
192
|
+
|
|
193
|
+
This method can be overridden by subclasses if custom download logic is needed.
|
|
192
194
|
|
|
193
195
|
Args:
|
|
194
196
|
url (str): download link
|
|
195
|
-
archive_path (str): path to
|
|
196
|
-
target_path (str): path to unzip compressed file
|
|
197
|
+
archive_path (str): path to save the downloaded file
|
|
197
198
|
"""
|
|
198
|
-
|
|
199
|
-
log.debug(f"Downloading package archive from {url} into {target_path}")
|
|
199
|
+
log.debug(f"Downloading package archive from {url}")
|
|
200
200
|
response = requests.get(url, stream=True)
|
|
201
201
|
|
|
202
202
|
with open(archive_path, "wb") as f:
|
|
203
203
|
f.write(response.raw.read())
|
|
204
204
|
|
|
205
|
+
def _extract_archive(self, archive_path: str, target_path: str) -> None:
|
|
206
|
+
"""Extracts an archive file to a target directory.
|
|
207
|
+
|
|
208
|
+
This method can be overridden by subclasses to handle special archive formats
|
|
209
|
+
(e.g., nested archives like .gem files).
|
|
210
|
+
|
|
211
|
+
Args:
|
|
212
|
+
archive_path (str): path to the archive file
|
|
213
|
+
target_path (str): directory to extract files into
|
|
214
|
+
"""
|
|
215
|
+
safe_extract(archive_path, target_path)
|
|
216
|
+
log.debug(f"Successfully extracted files to {target_path}")
|
|
217
|
+
|
|
218
|
+
def download_compressed(self, url, archive_path, target_path):
|
|
219
|
+
"""Downloads a compressed file and extracts it.
|
|
220
|
+
|
|
221
|
+
This is a template method that orchestrates the download, extraction, and cleanup
|
|
222
|
+
process. Subclasses can override individual steps (_fetch_archive, _extract_archive)
|
|
223
|
+
to customize behavior.
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
url (str): download link
|
|
227
|
+
archive_path (str): path to download compressed file
|
|
228
|
+
target_path (str): path to unzip compressed file
|
|
229
|
+
"""
|
|
230
|
+
self._fetch_archive(url, archive_path)
|
|
231
|
+
|
|
205
232
|
try:
|
|
206
|
-
|
|
207
|
-
log.debug(f"Successfully extracted files to {target_path}")
|
|
233
|
+
self._extract_archive(archive_path, target_path)
|
|
208
234
|
finally:
|
|
209
235
|
log.debug(f"Removing temporary archive file {archive_path}")
|
|
210
236
|
os.remove(archive_path)
|
|
@@ -226,11 +252,9 @@ class ProjectScanner:
|
|
|
226
252
|
user = os.getenv("GIT_USERNAME")
|
|
227
253
|
personal_access_token = os.getenv("GH_TOKEN")
|
|
228
254
|
if not user or not personal_access_token:
|
|
229
|
-
log.error(
|
|
230
|
-
"""WARNING: Please set GIT_USERNAME (Github handle) and GH_TOKEN
|
|
255
|
+
log.error("""WARNING: Please set GIT_USERNAME (Github handle) and GH_TOKEN
|
|
231
256
|
(generate a personal access token in Github settings > developer)
|
|
232
|
-
as environment variables before proceeding."""
|
|
233
|
-
)
|
|
257
|
+
as environment variables before proceeding.""")
|
|
234
258
|
exit(1)
|
|
235
259
|
return (user, personal_access_token)
|
|
236
260
|
|
guarddog/utils/archives.py
CHANGED
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: guarddog
|
|
3
|
+
Version: 2.9.0
|
|
4
|
+
Summary: GuardDog is a CLI tool for identifying malicious open source packages
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
License-File: LICENSE-3rdparty.csv
|
|
8
|
+
License-File: NOTICE
|
|
9
|
+
Author: Ellen Wang
|
|
10
|
+
Requires-Python: >=3.10,<4
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Requires-Dist: click (>=8.1.3,<9.0.0)
|
|
19
|
+
Requires-Dist: configparser (>=5.3,<8.0)
|
|
20
|
+
Requires-Dist: disposable-email-domains (>=0.0.103,<0.0.160)
|
|
21
|
+
Requires-Dist: prettytable (>=3.6.0,<4.0.0)
|
|
22
|
+
Requires-Dist: pygit2 (>=1.11,<1.19)
|
|
23
|
+
Requires-Dist: python-dateutil (>=2.8.2,<3.0.0)
|
|
24
|
+
Requires-Dist: python-whois (>=0.8,<0.10)
|
|
25
|
+
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
26
|
+
Requires-Dist: requests (>=2.29.0,<3.0.0)
|
|
27
|
+
Requires-Dist: semantic-version (>=2.10.0,<3.0.0)
|
|
28
|
+
Requires-Dist: semgrep (>=1.147.0,<2.0.0)
|
|
29
|
+
Requires-Dist: tarsafe (>=0.0.5,<0.0.6)
|
|
30
|
+
Requires-Dist: termcolor (>=3.3.0,<4.0.0)
|
|
31
|
+
Requires-Dist: urllib3 (>=2.5.0,<3.0.0)
|
|
32
|
+
Requires-Dist: yara-python (>=4.5.1,<5.0.0)
|
|
33
|
+
Project-URL: Repository, https://github.com/DataDog/guarddog
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# GuardDog
|
|
37
|
+
|
|
38
|
+
[![Checks](https://github.com/DataDog/guarddog/actions/workflows/checks.yml/badge.svg)](https://github.com/DataDog/guarddog/actions/workflows/checks.yml)
|
|
39
|
+
|
|
40
|
+
<p align="center">
|
|
41
|
+
<img src="https://github.com/DataDog/guarddog/blob/main/docs/images/logo.png?raw=true" alt="GuardDog" width="300" />
|
|
42
|
+
</p>
|
|
43
|
+
|
|
44
|
+
GuardDog is a CLI tool that identifies malicious PyPI and npm packages, Go modules, RubyGems, GitHub actions, or VSCode extensions. It runs a set of heuristics on the package source code (through Semgrep rules) and on the package metadata.
|
|
45
|
+
|
|
46
|
+
GuardDog can be used to scan local or remote PyPI and npm packages, Go modules, RubyGems, GitHub actions, or VSCode extensions using any of the available [heuristics](#heuristics).
|
|
47
|
+
|
|
48
|
+
It downloads and scans code from:
|
|
49
|
+
|
|
50
|
+
* NPM: Packages hosted in [npmjs.org](https://www.npmjs.com/)
|
|
51
|
+
* PyPI: Source files (tar.gz) packages hosted in [PyPI.org](https://pypi.org/)
|
|
52
|
+
* Go: GoLang source files of repositories hosted in [GitHub.com](https://github.com)
|
|
53
|
+
* RubyGems: Gem packages hosted in [rubygems.org](https://rubygems.org/)
|
|
54
|
+
* GitHub Actions: Javascript source files of repositories hosted in [GitHub.com](https://github.com)
|
|
55
|
+
* VSCode Extensions: Extensions (.vsix) packages hosted in [marketplace.visualstudio.com](https://marketplace.visualstudio.com/)
|
|
56
|
+
|
|
57
|
+

|
|
58
|
+
|
|
59
|
+
## Getting started
|
|
60
|
+
|
|
61
|
+
### Installation
|
|
62
|
+
|
|
63
|
+
```sh
|
|
64
|
+
pip install guarddog
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Or use the Docker image:
|
|
68
|
+
|
|
69
|
+
```sh
|
|
70
|
+
docker pull ghcr.io/datadog/guarddog
|
|
71
|
+
alias guarddog='docker run --rm ghcr.io/datadog/guarddog'
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
*Note: On Windows, the only supported installation method is Docker.*
|
|
75
|
+
|
|
76
|
+
### Sample usage
|
|
77
|
+
|
|
78
|
+
```sh
|
|
79
|
+
# Scan the most recent version of the 'requests' package
|
|
80
|
+
guarddog pypi scan requests
|
|
81
|
+
|
|
82
|
+
# Scan a specific version of the 'requests' package
|
|
83
|
+
guarddog pypi scan requests --version 2.28.1
|
|
84
|
+
|
|
85
|
+
# Scan the 'requests' package using 2 specific heuristics
|
|
86
|
+
guarddog pypi scan requests --rules exec-base64 --rules code-execution
|
|
87
|
+
|
|
88
|
+
# Scan the 'requests' package using all rules but one
|
|
89
|
+
guarddog pypi scan requests --exclude-rules exec-base64
|
|
90
|
+
|
|
91
|
+
# Scan a local package archive
|
|
92
|
+
guarddog pypi scan /tmp/triage.tar.gz
|
|
93
|
+
|
|
94
|
+
# Scan a local package directory
|
|
95
|
+
guarddog pypi scan /tmp/triage/
|
|
96
|
+
|
|
97
|
+
# Scan every package referenced in a requirements.txt file of a local folder
|
|
98
|
+
guarddog pypi verify workspace/guarddog/requirements.txt
|
|
99
|
+
|
|
100
|
+
# Scan every package referenced in a requirements.txt file and output a sarif file - works only for verify
|
|
101
|
+
guarddog pypi verify --output-format=sarif workspace/guarddog/requirements.txt
|
|
102
|
+
|
|
103
|
+
# Output JSON to standard output - works for every command
|
|
104
|
+
guarddog pypi scan requests --output-format=json
|
|
105
|
+
|
|
106
|
+
# All the commands also work on npm, go, rubygems
|
|
107
|
+
guarddog npm scan express
|
|
108
|
+
|
|
109
|
+
guarddog go scan github.com/DataDog/dd-trace-go
|
|
110
|
+
|
|
111
|
+
guarddog go verify /tmp/repo/go.mod
|
|
112
|
+
|
|
113
|
+
# Scan RubyGems packages
|
|
114
|
+
guarddog rubygems scan rails
|
|
115
|
+
|
|
116
|
+
guarddog rubygems verify /tmp/repo/Gemfile.lock
|
|
117
|
+
|
|
118
|
+
# GuardDog can also scan GitHub actions that are implemented in JavaScript
|
|
119
|
+
guarddog github_action scan DataDog/synthetics-ci-github-action
|
|
120
|
+
|
|
121
|
+
guarddog github_action verify /tmp/repo/.github/workflows/main.yml
|
|
122
|
+
|
|
123
|
+
# Scan VSCode extensions from the marketplace
|
|
124
|
+
guarddog extension scan ms-python.python
|
|
125
|
+
|
|
126
|
+
# Scan a specific version of a VSCode extension
|
|
127
|
+
guarddog extension scan ms-python.python --version 2023.20.0
|
|
128
|
+
|
|
129
|
+
# Scan a local VSCode extension directory or VSIX archive
|
|
130
|
+
guarddog extension scan /tmp/my-extension/
|
|
131
|
+
|
|
132
|
+
# Run in debug mode
|
|
133
|
+
guarddog --log-level debug npm scan express
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
## Heuristics
|
|
138
|
+
|
|
139
|
+
GuardDog comes with 2 types of heuristics:
|
|
140
|
+
|
|
141
|
+
* [**Source code heuristics**](https://github.com/DataDog/guarddog/tree/main/guarddog/analyzer/sourcecode): Semgrep rules running against the package source code.
|
|
142
|
+
|
|
143
|
+
* [**Package metadata heuristics**](https://github.com/DataDog/guarddog/tree/main/guarddog/analyzer/metadata): Python or Javascript heuristics running against the package metadata on PyPI or npm.
|
|
144
|
+
|
|
145
|
+
<!-- BEGIN_RULE_LIST -->
|
|
146
|
+
### PyPI
|
|
147
|
+
|
|
148
|
+
Source code heuristics:
|
|
149
|
+
|
|
150
|
+
| **Heuristic** | **Description** |
|
|
151
|
+
|:-------------:|:---------------:|
|
|
152
|
+
| api-obfuscation | Identify obfuscated API calls using alternative Python syntax patterns |
|
|
153
|
+
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
|
|
154
|
+
| obfuscation | Identify when a package uses a common obfuscation method often used by malware |
|
|
155
|
+
| clipboard-access | Identify when a package reads or writes data from the clipboard |
|
|
156
|
+
| exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
|
|
157
|
+
| download-executable | Identify when a package downloads and makes executable a remote binary |
|
|
158
|
+
| exec-base64 | Identify when a package dynamically executes base64-encoded code |
|
|
159
|
+
| silent-process-execution | Identify when a package silently executes an executable |
|
|
160
|
+
| dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
|
|
161
|
+
| steganography | Identify when a package retrieves hidden data from an image and executes it |
|
|
162
|
+
| code-execution | Identify when an OS command is executed in the setup.py file |
|
|
163
|
+
| unicode | Identify suspicious unicode characters |
|
|
164
|
+
| cmd-overwrite | Identify when the 'install' command is overwritten in setup.py, indicating a piece of code automatically running when the package is installed |
|
|
165
|
+
| suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
|
|
166
|
+
|
|
167
|
+
Metadata heuristics:
|
|
168
|
+
|
|
169
|
+
| **Heuristic** | **Description** |
|
|
170
|
+
|:-------------:|:---------------:|
|
|
171
|
+
| empty_information | Identify packages with an empty description field |
|
|
172
|
+
| release_zero | Identify packages with a release version that's 0.0 or 0.0.0 |
|
|
173
|
+
| typosquatting | Identify packages that are named closely to a highly popular package |
|
|
174
|
+
| potentially_compromised_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) might have been compromised |
|
|
175
|
+
| unclaimed_maintainer_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) is unclaimed and can be registered by an attacker |
|
|
176
|
+
| repository_integrity_mismatch | Identify packages with a linked GitHub repository where the package has extra unexpected files |
|
|
177
|
+
| single_python_file | Identify packages that have only a single Python file |
|
|
178
|
+
| bundled_binary | Identify packages bundling binaries |
|
|
179
|
+
| deceptive_author | This heuristic detects when an author is using a disposable email |
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
### npm
|
|
183
|
+
|
|
184
|
+
Source code heuristics:
|
|
185
|
+
|
|
186
|
+
| **Heuristic** | **Description** |
|
|
187
|
+
|:-------------:|:---------------:|
|
|
188
|
+
| npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
|
|
189
|
+
| npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware |
|
|
190
|
+
| npm-silent-process-execution | Identify when a package silently executes an executable |
|
|
191
|
+
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
|
|
192
|
+
| npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
|
|
193
|
+
| npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
|
|
194
|
+
| npm-steganography | Identify when a package retrieves hidden data from an image and executes it |
|
|
195
|
+
| npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
|
|
196
|
+
| npm-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
|
|
197
|
+
| suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
|
|
198
|
+
|
|
199
|
+
Metadata heuristics:
|
|
200
|
+
|
|
201
|
+
| **Heuristic** | **Description** |
|
|
202
|
+
|:-------------:|:---------------:|
|
|
203
|
+
| empty_information | Identify packages with an empty description field |
|
|
204
|
+
| release_zero | Identify packages with a release version that's 0.0 or 0.0.0 |
|
|
205
|
+
| potentially_compromised_email_domain | Identify when a package maintainer e-mail domain (and therefore package manager account) might have been compromised; note that NPM's API may not provide accurate information regarding the maintainer's email, so this detector may cause false positives for NPM packages. see https://www.theregister.com/2022/05/10/security_npm_email/ |
|
|
206
|
+
| unclaimed_maintainer_email_domain | Identify when a package maintainer e-mail domain (and therefore npm account) is unclaimed and can be registered by an attacker; note that NPM's API may not provide accurate information regarding the maintainer's email, so this detector may cause false positives for NPM packages. see https://www.theregister.com/2022/05/10/security_npm_email/ |
|
|
207
|
+
| typosquatting | Identify packages that are named closely to a highly popular package |
|
|
208
|
+
| direct_url_dependency | Identify packages with direct URL dependencies. Dependencies fetched this way are not immutable and can be used to inject untrusted code or reduce the likelihood of a reproducible install. |
|
|
209
|
+
| npm_metadata_mismatch | Identify packages which have mismatches between the npm package manifest and the package info for some critical fields |
|
|
210
|
+
| bundled_binary | Identify packages bundling binaries |
|
|
211
|
+
| deceptive_author | This heuristic detects when an author is using a disposable email |
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
### go
|
|
215
|
+
|
|
216
|
+
Source code heuristics:
|
|
217
|
+
|
|
218
|
+
| **Heuristic** | **Description** |
|
|
219
|
+
|:-------------:|:---------------:|
|
|
220
|
+
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
|
|
221
|
+
| go-exec-base64 | Identify Base64-decoded content being passed to execution functions in Go |
|
|
222
|
+
| go-exfiltrate-sensitive-data | This rule identifies when a package reads and exfiltrates sensitive data from the local system. |
|
|
223
|
+
| go-exec-download | Identify when a package downloads and executes a remote binary after setting executable permissions |
|
|
224
|
+
| suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
|
|
225
|
+
|
|
226
|
+
Metadata heuristics:
|
|
227
|
+
|
|
228
|
+
| **Heuristic** | **Description** |
|
|
229
|
+
|:-------------:|:---------------:|
|
|
230
|
+
| typosquatting | Identify packages that are named closely to a highly popular package |
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
### GitHub Action
|
|
234
|
+
|
|
235
|
+
Source code heuristics:
|
|
236
|
+
|
|
237
|
+
| **Heuristic** | **Description** |
|
|
238
|
+
|:-------------:|:---------------:|
|
|
239
|
+
| npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
|
|
240
|
+
| npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware |
|
|
241
|
+
| npm-silent-process-execution | Identify when a package silently executes an executable |
|
|
242
|
+
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
|
|
243
|
+
| npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
|
|
244
|
+
| npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
|
|
245
|
+
| npm-steganography | Identify when a package retrieves hidden data from an image and executes it |
|
|
246
|
+
| npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
|
|
247
|
+
| npm-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
|
|
248
|
+
| suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
|
|
249
|
+
### Extension
|
|
250
|
+
|
|
251
|
+
Source code heuristics:
|
|
252
|
+
|
|
253
|
+
| **Heuristic** | **Description** |
|
|
254
|
+
|:-------------:|:---------------:|
|
|
255
|
+
| npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables |
|
|
256
|
+
| npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware |
|
|
257
|
+
| npm-silent-process-execution | Identify when a package silently executes an executable |
|
|
258
|
+
| shady-links | Identify when a package contains a URL to a domain with a suspicious extension |
|
|
259
|
+
| npm-exec-base64 | Identify when a package dynamically executes code through 'eval' |
|
|
260
|
+
| npm-install-script | Identify when a package has a pre or post-install script automatically running commands |
|
|
261
|
+
| npm-steganography | Identify when a package retrieves hidden data from an image and executes it |
|
|
262
|
+
| npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL |
|
|
263
|
+
| npm-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
|
|
264
|
+
| suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
|
|
265
|
+
### RubyGems
|
|
266
|
+
|
|
267
|
+
Source code heuristics:
|
|
268
|
+
|
|
269
|
+
| **Heuristic** | **Description** |
|
|
270
|
+
|:-------------:|:---------------:|
|
|
271
|
+
| rubygems-code-execution | Identify when a gem executes OS commands |
|
|
272
|
+
| rubygems-exfiltrate-sensitive-data | Identify when a package reads and exfiltrates sensitive data from the local system |
|
|
273
|
+
| rubygems-serialize-environment | Identify when a package serializes ENV to exfiltrate environment variables |
|
|
274
|
+
| rubygems-network-on-require | Identify when a gem makes network requests when required |
|
|
275
|
+
| rubygems-install-hook | Identify when a gem registers installation hooks |
|
|
276
|
+
| rubygems-exec-base64 | Identify when a package dynamically executes base64-encoded code |
|
|
277
|
+
| suspicious_passwd_access_linux | Detects suspicious read access to /etc/passwd file, which is often targeted by malware for credential harvesting |
|
|
278
|
+
|
|
279
|
+
Metadata heuristics:
|
|
280
|
+
|
|
281
|
+
| **Heuristic** | **Description** |
|
|
282
|
+
|:-------------:|:---------------:|
|
|
283
|
+
| typosquatting | Identify packages that are named closely to a highly popular package |
|
|
284
|
+
| empty_information | Identify packages with an empty description field |
|
|
285
|
+
| release_zero | Identify packages with a release version that's 0.0 or 0.0.0 |
|
|
286
|
+
| bundled_binary | Identify packages bundling binaries |
|
|
287
|
+
| repository_integrity_mismatch | Identify packages with a linked GitHub repository where the package has extra unexpected files |
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
<!-- END_RULE_LIST -->
|
|
291
|
+
|
|
292
|
+
## Custom Rules
|
|
293
|
+
|
|
294
|
+
GuardDog lets you implement custom source code rules.
|
|
295
|
+
Sourcecode rules live under the [guarddog/analyzer/sourcecode](guarddog/analyzer/sourcecode) directory, and supported formats are [Semgrep](https://github.com/semgrep/semgrep) or [Yara](https://github.com/VirusTotal/yara).
|
|
296
|
+
|
|
297
|
+
* Semgrep rules are language-dependent, and Guarddog will import all `.yml` rules where the language matches the ecosystem selected by the user in CLI.
|
|
298
|
+
* Yara rules on the other hand are language agnostic, therefore all matching `.yar` rules present will be imported.
|
|
299
|
+
|
|
300
|
+
You can then write your own rule and drop it into that directory; GuardDog will let you select or exclude it like any built-in rule, and will append its findings to the output.
|
|
301
|
+
|
|
302
|
+
For example, you can create the following semgrep rule:
|
|
303
|
+
```yaml
|
|
304
|
+
rules:
|
|
305
|
+
- id: sample-rule
|
|
306
|
+
languages:
|
|
307
|
+
- python
|
|
308
|
+
message: Output message when rule matches
|
|
309
|
+
metadata:
|
|
310
|
+
description: Description used in the CLI help
|
|
311
|
+
patterns:
|
|
312
|
+
YOUR RULE HEURISTICS GO HERE
|
|
313
|
+
severity: WARNING
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
Then you'll need to save it as `sample-rule.yml`; note that the rule id must match the filename.
|
|
317
|
+
|
|
318
|
+
In the case of Yara, you can create the following rule:
|
|
319
|
+
```
|
|
320
|
+
rule sample-rule
|
|
321
|
+
{
|
|
322
|
+
meta:
|
|
323
|
+
description = "Description used in the output message"
|
|
324
|
+
target_entity = "file"
|
|
325
|
+
strings:
|
|
326
|
+
$exec = "exec"
|
|
327
|
+
condition:
|
|
328
|
+
1 of them
|
|
329
|
+
}
|
|
330
|
+
```
|
|
331
|
+
Then you'll need to save it as `sample-rule.yar`.
|
|
332
|
+
|
|
333
|
+
Note that in both cases, the rule id must match the filename
|
|
334
|
+
|
|
335
|
+
## Running GuardDog in a GitHub Action
|
|
336
|
+
|
|
337
|
+
The easiest way to integrate GuardDog in your CI pipeline is to leverage the SARIF output format, and upload it to GitHub's [code scanning](https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/about-code-scanning) feature.
|
|
338
|
+
|
|
339
|
+
Using this, you get:
|
|
340
|
+
* Automated comments to your pull requests based on the GuardDog scan output
|
|
341
|
+
* Built-in false positive management directly in the GitHub UI
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
Sample GitHub Action using GuardDog:
|
|
345
|
+
|
|
346
|
+
```yaml
|
|
347
|
+
name: GuardDog
|
|
348
|
+
|
|
349
|
+
on:
|
|
350
|
+
push:
|
|
351
|
+
branches:
|
|
352
|
+
- main
|
|
353
|
+
pull_request:
|
|
354
|
+
branches:
|
|
355
|
+
- main
|
|
356
|
+
|
|
357
|
+
permissions:
|
|
358
|
+
contents: read
|
|
359
|
+
|
|
360
|
+
jobs:
|
|
361
|
+
guarddog:
|
|
362
|
+
permissions:
|
|
363
|
+
contents: read # for actions/checkout to fetch code
|
|
364
|
+
security-events: write # for github/codeql-action/upload-sarif to upload SARIF results
|
|
365
|
+
name: Scan dependencies
|
|
366
|
+
runs-on: ubuntu-latest
|
|
367
|
+
|
|
368
|
+
steps:
|
|
369
|
+
- uses: actions/checkout@v4
|
|
370
|
+
|
|
371
|
+
- name: Set up Python
|
|
372
|
+
uses: actions/setup-python@v5
|
|
373
|
+
with:
|
|
374
|
+
python-version: "3.10"
|
|
375
|
+
|
|
376
|
+
- name: Install GuardDog
|
|
377
|
+
run: pip install guarddog
|
|
378
|
+
|
|
379
|
+
- run: guarddog pypi verify requirements.txt --output-format sarif --exclude-rules repository_integrity_mismatch > guarddog.sarif
|
|
380
|
+
|
|
381
|
+
- name: Upload SARIF file to GitHub
|
|
382
|
+
uses: github/codeql-action/upload-sarif@v3
|
|
383
|
+
with:
|
|
384
|
+
category: guarddog-builtin
|
|
385
|
+
sarif_file: guarddog.sarif
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
## Development
|
|
390
|
+
|
|
391
|
+
### Running a local version of GuardDog
|
|
392
|
+
|
|
393
|
+
* Ensure poetry has an env with `python >=3.10`, e.g. `poetry env use 3.10.0`
|
|
394
|
+
* Install dependencies `poetry install`
|
|
395
|
+
* Run guarddog `poetry run guarddog` or `poetry shell` then run `guarddog`
|
|
396
|
+
|
|
397
|
+
### Unit tests
|
|
398
|
+
|
|
399
|
+
Running all unit tests: `make test`
|
|
400
|
+
|
|
401
|
+
Running unit tests against Semgrep rules: `make test-semgrep-rules` (tests are [here](https://github.com/DataDog/guarddog/tree/main/tests/analyzer/sourcecode)). These use the standard methodology for [testing Semgrep rules](https://semgrep.dev/docs/writing-rules/testing-rules/).
|
|
402
|
+
|
|
403
|
+
Running unit tests against package metadata heuristics: `make test-metadata-rules` (tests are [here](https://github.com/DataDog/guarddog/tree/main/tests/analyzer/metadata)).
|
|
404
|
+
|
|
405
|
+
### Benchmarking
|
|
406
|
+
|
|
407
|
+
You can run GuardDog on legitimate and malicious packages to determine false positives and false negatives. See [./tests/samples](./tests/samples).
|
|
408
|
+
|
|
409
|
+
### Code quality checks
|
|
410
|
+
|
|
411
|
+
Run the type checker with:
|
|
412
|
+
```shell
|
|
413
|
+
mypy --install-types --non-interactive guarddog
|
|
414
|
+
```
|
|
415
|
+
and the linter with:
|
|
416
|
+
```shell
|
|
417
|
+
flake8 guarddog --count --select=E9,F63,F7,F82 --show-source --statistics --exclude tests/analyzer/sourcecode,tests/analyzer/metadata/resources,evaluator/data
|
|
418
|
+
flake8 guarddog --count --max-line-length=120 --statistics --exclude tests/analyzer/sourcecode,tests/analyzer/metadata/resources,evaluator/data --ignore=E203,W503
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
### Configuration via Environment Variables
|
|
422
|
+
|
|
423
|
+
GuardDog's behavior can be customized using environment variables:
|
|
424
|
+
|
|
425
|
+
#### General Configuration
|
|
426
|
+
|
|
427
|
+
| Environment Variable | Description | Default Value |
|
|
428
|
+
|---------------------|-------------|---------------|
|
|
429
|
+
| `GUARDDOG_PARALLELISM` | Number of threads to use for parallel processing | Number of CPUs available |
|
|
430
|
+
| `GUARDDOG_VERIFY_EXHAUSTIVE_DEPENDENCIES` | Analyze all possible versions of dependencies (`true`/`false`) | `false` |
|
|
431
|
+
| `GUARDDOG_TOP_PACKAGES_CACHE_LOCATION` | Location of the top packages cache directory | `guarddog/analyzer/metadata/resources` |
|
|
432
|
+
| `GUARDDOG_YARA_EXT_EXCLUDE` | Comma-separated list of file extensions to exclude from YARA scanning | `ini,md,rst,txt,lock,json,yaml,yml,toml,xml,html,csv,sql,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,changelog,readme,makefile,dockerfile,pkg-info,d.ts` |
|
|
433
|
+
|
|
434
|
+
#### Semgrep Configuration
|
|
435
|
+
|
|
436
|
+
GuardDog uses `Semgrep`, a powerful static analysis tool that scans code for patterns.
|
|
437
|
+
|
|
438
|
+
| Environment Variable | Description | Default Value |
|
|
439
|
+
|---------------------|-------------|---------------|
|
|
440
|
+
| `GUARDDOG_SEMGREP_MAX_TARGET_BYTES` | Maximum size of a file that Semgrep will analyze (files exceeding this will be skipped) | 10MB (10485760 bytes) |
|
|
441
|
+
| `GUARDDOG_SEMGREP_TIMEOUT` | Maximum time in seconds that Semgrep will spend running a rule on a single file | 10 seconds |
|
|
442
|
+
|
|
443
|
+
#### Archive Extraction Security Limits
|
|
444
|
+
|
|
445
|
+
GuardDog implements multiple security checks when extracting package archives to protect against compression bombs and file descriptor exhaustion attacks:
|
|
446
|
+
|
|
447
|
+
| Environment Variable | Description | Default Value |
|
|
448
|
+
|---------------------|-------------|---------------|
|
|
449
|
+
| `GUARDDOG_MAX_UNCOMPRESSED_SIZE` | Maximum allowed uncompressed size in bytes (prevents disk space exhaustion) | 2147483648 (2 GB) |
|
|
450
|
+
| `GUARDDOG_MAX_COMPRESSION_RATIO` | Maximum allowed compression ratio (detects suspicious compression patterns) | 100 (100:1) |
|
|
451
|
+
| `GUARDDOG_MAX_FILE_COUNT` | Maximum number of files allowed in an archive (prevents file descriptor/inode exhaustion) | 100000 |
|
|
452
|
+
|
|
453
|
+
## Maintainers
|
|
454
|
+
|
|
455
|
+
* [Sebastian Obregoso](https://www.linkedin.com/in/sebastianobregoso/)
|
|
456
|
+
* [Ian Kretz](https://github.com/ikretz)
|
|
457
|
+
* [Tesnim Hamdouni](https://github.com/tesnim5hamdouni)
|
|
458
|
+
|
|
459
|
+
## Authors
|
|
460
|
+
* [Ellen Wang](https://www.linkedin.com/in/ellen-wang-4bb5961a0/)
|
|
461
|
+
* [Christophe Tafani-Dereeper](https://github.com/christophetd)
|
|
462
|
+
|
|
463
|
+
## Acknowledgments
|
|
464
|
+
|
|
465
|
+
Inspiration:
|
|
466
|
+
* [Backstabber’s Knife Collection: A Review of Open Source Software Supply Chain Attacks](https://arxiv.org/pdf/2005.09535)
|
|
467
|
+
* [What are Weak Links in the npm Supply Chain?](https://arxiv.org/pdf/2112.10165.pdf)
|
|
468
|
+
* [A Survey on Common Threats in npm and PyPI Registries](https://arxiv.org/pdf/2108.09576.pdf)
|
|
469
|
+
* [A Benchmark Comparison of Python Malware Detection Approaches](https://arxiv.org/pdf/2209.13288.pdf)
|
|
470
|
+
* [Towards Measuring Supply Chain Attacks on Package Managers for Interpreted Languages](https://arxiv.org/pdf/2002.01139)
|
|
471
|
+
|