guarddog 2.7.1__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guarddog/analyzer/metadata/__init__.py +3 -0
- guarddog/analyzer/metadata/go/typosquatting.py +11 -28
- guarddog/analyzer/metadata/npm/direct_url_dependency.py +0 -1
- guarddog/analyzer/metadata/npm/typosquatting.py +24 -59
- guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py +53 -164
- guarddog/analyzer/metadata/pypi/typosquatting.py +20 -77
- guarddog/analyzer/metadata/repository_integrity_mismatch.py +202 -2
- guarddog/analyzer/metadata/resources/top_go_packages.json +2926 -2923
- guarddog/analyzer/metadata/resources/top_npm_packages.json +8005 -8002
- guarddog/analyzer/metadata/resources/top_pypi_packages.json +15003 -60021
- guarddog/analyzer/metadata/resources/top_rubygems_packages.json +979 -0
- guarddog/analyzer/metadata/rubygems/__init__.py +26 -0
- guarddog/analyzer/metadata/rubygems/bundled_binary.py +13 -0
- guarddog/analyzer/metadata/rubygems/empty_information.py +24 -0
- guarddog/analyzer/metadata/rubygems/release_zero.py +22 -0
- guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py +49 -0
- guarddog/analyzer/metadata/rubygems/typosquatting.py +91 -0
- guarddog/analyzer/metadata/typosquatting.py +218 -0
- guarddog/analyzer/metadata/utils.py +23 -0
- guarddog/analyzer/sourcecode/__init__.py +2 -0
- guarddog/analyzer/sourcecode/api-obfuscation.yml +35 -40
- guarddog/analyzer/sourcecode/code-execution.yml +20 -0
- guarddog/analyzer/sourcecode/exec-base64.yml +19 -0
- guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml +31 -5
- guarddog/analyzer/sourcecode/npm-api-obfuscation.yml +51 -0
- guarddog/analyzer/sourcecode/rubygems-code-execution.yml +67 -0
- guarddog/analyzer/sourcecode/rubygems-exec-base64.yml +26 -0
- guarddog/analyzer/sourcecode/rubygems-exfiltrate-sensitive-data.yml +70 -0
- guarddog/analyzer/sourcecode/rubygems-install-hook.yml +45 -0
- guarddog/analyzer/sourcecode/rubygems-network-on-require.yml +78 -0
- guarddog/analyzer/sourcecode/rubygems-serialize-environment.yml +38 -0
- guarddog/analyzer/sourcecode/screenshot.yml +38 -0
- guarddog/ecosystems.py +3 -0
- guarddog/scanners/__init__.py +6 -0
- guarddog/scanners/npm_project_scanner.py +1 -1
- guarddog/scanners/rubygems_package_scanner.py +112 -0
- guarddog/scanners/rubygems_project_scanner.py +75 -0
- guarddog/scanners/scanner.py +36 -12
- guarddog/utils/archives.py +1 -1
- guarddog-2.9.0.dist-info/METADATA +471 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/RECORD +46 -29
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/WHEEL +1 -1
- guarddog-2.7.1.dist-info/METADATA +0 -40
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/entry_points.txt +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE-3rdparty.csv +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -3,6 +3,7 @@ from guarddog.analyzer.metadata.npm import NPM_METADATA_RULES
|
|
|
3
3
|
from guarddog.analyzer.metadata.pypi import PYPI_METADATA_RULES
|
|
4
4
|
from guarddog.analyzer.metadata.go import GO_METADATA_RULES
|
|
5
5
|
from guarddog.analyzer.metadata.github_action import GITHUB_ACTION_METADATA_RULES
|
|
6
|
+
from guarddog.analyzer.metadata.rubygems import RUBYGEMS_METADATA_RULES
|
|
6
7
|
from guarddog.ecosystems import ECOSYSTEM
|
|
7
8
|
|
|
8
9
|
|
|
@@ -18,3 +19,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
|
|
|
18
19
|
return GITHUB_ACTION_METADATA_RULES
|
|
19
20
|
case ECOSYSTEM.EXTENSION:
|
|
20
21
|
return {} # No metadata detectors for extensions currently
|
|
22
|
+
case ECOSYSTEM.RUBYGEMS:
|
|
23
|
+
return RUBYGEMS_METADATA_RULES
|
|
@@ -1,12 +1,6 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
import os
|
|
4
1
|
from typing import Optional
|
|
5
2
|
|
|
6
3
|
from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
|
|
7
|
-
from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION
|
|
8
|
-
|
|
9
|
-
log = logging.getLogger("guarddog")
|
|
10
4
|
|
|
11
5
|
|
|
12
6
|
class GoTyposquatDetector(TyposquatDetector):
|
|
@@ -19,32 +13,21 @@ class GoTyposquatDetector(TyposquatDetector):
|
|
|
19
13
|
"""
|
|
20
14
|
|
|
21
15
|
def _get_top_packages(self) -> set:
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
top_packages_path = os.path.join(resources_dir, top_packages_filename)
|
|
31
|
-
top_packages_information = self._get_top_packages_local(top_packages_path)
|
|
16
|
+
"""
|
|
17
|
+
Gets the top Go packages from local cache.
|
|
18
|
+
Uses the base class implementation without network refresh.
|
|
19
|
+
"""
|
|
20
|
+
packages = self._get_top_packages_with_refresh(
|
|
21
|
+
packages_filename="top_go_packages.json",
|
|
22
|
+
popular_packages_url=None, # No URL = no auto-refresh
|
|
23
|
+
)
|
|
32
24
|
|
|
33
|
-
if
|
|
25
|
+
if not packages:
|
|
34
26
|
raise Exception(
|
|
35
|
-
|
|
27
|
+
"Could not retrieve top Go packages from top_go_packages.json"
|
|
36
28
|
)
|
|
37
29
|
|
|
38
|
-
return
|
|
39
|
-
|
|
40
|
-
def _get_top_packages_local(self, path: str) -> list[dict] | None:
|
|
41
|
-
try:
|
|
42
|
-
with open(path, "r") as f:
|
|
43
|
-
result = json.load(f)
|
|
44
|
-
return result
|
|
45
|
-
except FileNotFoundError:
|
|
46
|
-
log.error(f"File not found: {path}")
|
|
47
|
-
return None
|
|
30
|
+
return packages
|
|
48
31
|
|
|
49
32
|
def detect(
|
|
50
33
|
self,
|
|
@@ -1,14 +1,6 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
import os
|
|
4
|
-
from datetime import datetime, timedelta
|
|
5
1
|
from typing import Optional
|
|
6
2
|
|
|
7
3
|
from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
|
|
8
|
-
from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION
|
|
9
|
-
import requests
|
|
10
|
-
|
|
11
|
-
log = logging.getLogger("guarddog")
|
|
12
4
|
|
|
13
5
|
|
|
14
6
|
class NPMTyposquatDetector(TyposquatDetector):
|
|
@@ -21,65 +13,38 @@ class NPMTyposquatDetector(TyposquatDetector):
|
|
|
21
13
|
"""
|
|
22
14
|
|
|
23
15
|
def _get_top_packages(self) -> set:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
16
|
+
"""
|
|
17
|
+
Gets the top 8000 most popular NPM packages.
|
|
18
|
+
Uses the base class implementation with NPM-specific parameters.
|
|
19
|
+
"""
|
|
20
|
+
return self._get_top_packages_with_refresh(
|
|
21
|
+
packages_filename="top_npm_packages.json",
|
|
22
|
+
popular_packages_url="https://github.com/LeoDog896/npm-rank/releases/download/latest/raw.json",
|
|
23
|
+
refresh_days=30,
|
|
27
24
|
)
|
|
28
25
|
|
|
29
|
-
|
|
26
|
+
def _extract_package_names(self, data: dict | list | None) -> list | None:
|
|
27
|
+
"""
|
|
28
|
+
Extract package names from NPM data structure.
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
resources_dir = os.path.abspath(
|
|
34
|
-
os.path.join(os.path.dirname(__file__), "..", "resources")
|
|
35
|
-
)
|
|
30
|
+
Network response format: [{"name": "package-name", ...}, ...]
|
|
31
|
+
Local file format: ["package-name", "package-name", ...]
|
|
36
32
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
if self._file_is_expired(top_packages_path, days=30):
|
|
41
|
-
new_information = self._get_top_packages_network(popular_packages_url)
|
|
42
|
-
if new_information is not None:
|
|
43
|
-
top_packages_information = new_information
|
|
44
|
-
|
|
45
|
-
with open(top_packages_path, "w+") as f:
|
|
46
|
-
json.dump(new_information, f, ensure_ascii=False, indent=4)
|
|
47
|
-
|
|
48
|
-
if top_packages_information is None:
|
|
49
|
-
return set()
|
|
50
|
-
return set(top_packages_information)
|
|
51
|
-
|
|
52
|
-
def _file_is_expired(self, path: str, days: int) -> bool:
|
|
53
|
-
try:
|
|
54
|
-
update_time = datetime.fromtimestamp(os.path.getmtime(path))
|
|
55
|
-
return datetime.now() - update_time > timedelta(days=days)
|
|
56
|
-
except FileNotFoundError:
|
|
57
|
-
return True
|
|
58
|
-
|
|
59
|
-
def _get_top_packages_local(self, path: str) -> list[dict] | None:
|
|
60
|
-
try:
|
|
61
|
-
with open(path, "r") as f:
|
|
62
|
-
result = json.load(f)
|
|
63
|
-
return result
|
|
64
|
-
except FileNotFoundError:
|
|
65
|
-
log.error(f"File not found: {path}")
|
|
33
|
+
This method handles both formats and limits to top 8000 packages.
|
|
34
|
+
"""
|
|
35
|
+
if data is None:
|
|
66
36
|
return None
|
|
67
37
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
38
|
+
# If data is already a list of strings (local file format)
|
|
39
|
+
if isinstance(data, list) and len(data) > 0:
|
|
40
|
+
if isinstance(data[0], str):
|
|
41
|
+
return data
|
|
72
42
|
|
|
73
|
-
|
|
74
|
-
|
|
43
|
+
# If data is list of dicts (network response format)
|
|
44
|
+
if isinstance(data[0], dict) and "name" in data[0]:
|
|
45
|
+
return [item["name"] for item in data[0:8000]]
|
|
75
46
|
|
|
76
|
-
|
|
77
|
-
except json.JSONDecodeError:
|
|
78
|
-
log.error(f'Couldn`t convert to json: "{response.text}"')
|
|
79
|
-
return None
|
|
80
|
-
except requests.exceptions.RequestException as e:
|
|
81
|
-
log.error(f"Network error: {e}")
|
|
82
|
-
return None
|
|
47
|
+
return None
|
|
83
48
|
|
|
84
49
|
def detect(
|
|
85
50
|
self,
|
|
@@ -4,14 +4,12 @@ Detects if a package contains an empty description
|
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
import configparser
|
|
7
|
-
import hashlib
|
|
8
7
|
import logging
|
|
9
8
|
import os
|
|
10
9
|
import re
|
|
11
10
|
import requests
|
|
12
11
|
from typing import Optional, Tuple
|
|
13
12
|
|
|
14
|
-
import pygit2 # type: ignore
|
|
15
13
|
import urllib3.util
|
|
16
14
|
|
|
17
15
|
from guarddog.analyzer.metadata.repository_integrity_mismatch import IntegrityMismatch
|
|
@@ -90,18 +88,6 @@ def dict_generator(indict, pre=None):
|
|
|
90
88
|
yield pre + [indict]
|
|
91
89
|
|
|
92
90
|
|
|
93
|
-
def get_file_hash(path):
|
|
94
|
-
with open(path, "rb") as f:
|
|
95
|
-
# Read the contents of the file
|
|
96
|
-
file_contents = f.read()
|
|
97
|
-
# Create a hash object
|
|
98
|
-
hash_object = hashlib.sha256()
|
|
99
|
-
# Feed the file contents to the hash object
|
|
100
|
-
hash_object.update(file_contents)
|
|
101
|
-
# Get the hexadecimal hash value
|
|
102
|
-
return hash_object.hexdigest(), str(file_contents).strip().splitlines()
|
|
103
|
-
|
|
104
|
-
|
|
105
91
|
def _ensure_proper_url(url):
|
|
106
92
|
parsed = urllib3.util.parse_url(url)
|
|
107
93
|
if parsed.scheme is None:
|
|
@@ -140,80 +126,6 @@ def find_github_candidates(package_info) -> Tuple[set[str], Optional[str]]:
|
|
|
140
126
|
return github_urls, best
|
|
141
127
|
|
|
142
128
|
|
|
143
|
-
EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def exclude_result(file_name, repo_root, pkg_root):
|
|
147
|
-
"""
|
|
148
|
-
This method filters out some results that are known false positives:
|
|
149
|
-
* if the file is a documentation file (based on its extension)
|
|
150
|
-
* if the file is a setup.cfg file with the egg_info claim present on Pypi and not on GitHub
|
|
151
|
-
"""
|
|
152
|
-
for extension in EXCLUDED_EXTENSIONS:
|
|
153
|
-
if file_name.endswith(extension):
|
|
154
|
-
return True
|
|
155
|
-
if file_name.endswith("setup.cfg"):
|
|
156
|
-
repo_cfg = configparser.ConfigParser()
|
|
157
|
-
repo_cfg.read(os.path.join(repo_root, file_name))
|
|
158
|
-
pkg_cfg = configparser.ConfigParser()
|
|
159
|
-
pkg_cfg.read(os.path.join(pkg_root, file_name))
|
|
160
|
-
repo_sections = list(repo_cfg.keys())
|
|
161
|
-
pkg_sections = list(pkg_cfg.keys())
|
|
162
|
-
if "egg_info" in pkg_sections and "egg_info" not in repo_sections:
|
|
163
|
-
return True
|
|
164
|
-
return False
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
def find_mismatch_for_tag(repo, tag, base_path, repo_path):
|
|
168
|
-
repo.checkout(tag)
|
|
169
|
-
mismatch = []
|
|
170
|
-
for root, dirs, files in os.walk(base_path):
|
|
171
|
-
relative_path = os.path.relpath(root, base_path)
|
|
172
|
-
repo_root = os.path.join(repo_path, relative_path)
|
|
173
|
-
if not os.path.exists(repo_root):
|
|
174
|
-
continue
|
|
175
|
-
repo_files = list(
|
|
176
|
-
filter(
|
|
177
|
-
lambda x: os.path.isfile(os.path.join(repo_root, x)),
|
|
178
|
-
os.listdir(repo_root),
|
|
179
|
-
)
|
|
180
|
-
)
|
|
181
|
-
for file_name in repo_files:
|
|
182
|
-
if file_name not in files: # ignore files we don't have in the distribution
|
|
183
|
-
continue
|
|
184
|
-
repo_hash, repo_content = get_file_hash(os.path.join(repo_root, file_name))
|
|
185
|
-
pkg_hash, pkg_content = get_file_hash(os.path.join(root, file_name))
|
|
186
|
-
if repo_hash != pkg_hash:
|
|
187
|
-
if exclude_result(file_name, repo_root, root):
|
|
188
|
-
continue
|
|
189
|
-
res = {
|
|
190
|
-
"file": os.path.join(relative_path, file_name),
|
|
191
|
-
"repo_sha256": repo_hash,
|
|
192
|
-
"pkg_sha256": pkg_hash,
|
|
193
|
-
}
|
|
194
|
-
mismatch.append(res)
|
|
195
|
-
return mismatch
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
def find_suitable_tags_in_list(tags, version):
|
|
199
|
-
tag_candidates = []
|
|
200
|
-
for tag_name in tags:
|
|
201
|
-
if tag_name.endswith(version):
|
|
202
|
-
tag_candidates.append(tag_name)
|
|
203
|
-
return tag_candidates
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
def find_suitable_tags(repo, version):
|
|
207
|
-
tags_regex = re.compile("^refs/tags/(.*)")
|
|
208
|
-
tags = []
|
|
209
|
-
for ref in repo.references:
|
|
210
|
-
match = tags_regex.match(ref)
|
|
211
|
-
if match is not None:
|
|
212
|
-
tags.append(match.group(0))
|
|
213
|
-
|
|
214
|
-
return find_suitable_tags_in_list(tags, version)
|
|
215
|
-
|
|
216
|
-
|
|
217
129
|
# Note: we should have the GitHub related logic factored out as we will need it when we check for signed commits
|
|
218
130
|
class PypiIntegrityMismatchDetector(IntegrityMismatch):
|
|
219
131
|
"""
|
|
@@ -228,94 +140,71 @@ class PypiIntegrityMismatchDetector(IntegrityMismatch):
|
|
|
228
140
|
"""
|
|
229
141
|
|
|
230
142
|
RULE_NAME = "repository_integrity_mismatch"
|
|
143
|
+
EXCLUDED_EXTENSIONS = [".rst", ".md", ".txt"]
|
|
231
144
|
|
|
232
|
-
def
|
|
233
|
-
|
|
234
|
-
package_info,
|
|
235
|
-
path: Optional[str] = None,
|
|
236
|
-
name: Optional[str] = None,
|
|
237
|
-
version: Optional[str] = None,
|
|
238
|
-
) -> tuple[bool, str]:
|
|
239
|
-
if name is None:
|
|
240
|
-
raise Exception("Detector needs the name of the package")
|
|
241
|
-
if path is None:
|
|
242
|
-
raise Exception("Detector needs the path of the package")
|
|
243
|
-
|
|
244
|
-
log.debug(
|
|
245
|
-
f"Running repository integrity mismatch heuristic on PyPI package {name} version {version}"
|
|
246
|
-
)
|
|
247
|
-
# let's extract a source repository (GitHub only for now) if we can
|
|
145
|
+
def extract_github_url(self, package_info, name: str) -> Optional[str]:
|
|
146
|
+
"""Extract GitHub URL from PyPI metadata."""
|
|
248
147
|
github_urls, best_github_candidate = find_github_candidates(package_info)
|
|
249
148
|
if len(github_urls) == 0:
|
|
250
|
-
return
|
|
251
|
-
# now, let's find the right url
|
|
149
|
+
return None
|
|
252
150
|
|
|
253
151
|
github_url = find_best_github_candidate(
|
|
254
152
|
(github_urls, best_github_candidate), name
|
|
255
153
|
)
|
|
154
|
+
return github_url
|
|
256
155
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
log.debug(f"Using GitHub URL {github_url}")
|
|
264
|
-
# ok, now let's try to find the version! (I need to know which version we are scanning)
|
|
265
|
-
if version is None:
|
|
266
|
-
version = package_info["info"]["version"]
|
|
267
|
-
if version is None:
|
|
268
|
-
raise Exception("Could not find suitable version to scan")
|
|
269
|
-
tmp_dir = os.path.dirname(path)
|
|
270
|
-
if tmp_dir is None:
|
|
271
|
-
raise Exception("no current scanning directory")
|
|
272
|
-
|
|
273
|
-
repo_path = os.path.join(tmp_dir, "sources", name)
|
|
274
|
-
try:
|
|
275
|
-
repo = pygit2.clone_repository(url=github_url, path=repo_path)
|
|
276
|
-
except pygit2.GitError as git_error:
|
|
277
|
-
# Handle generic Git-related errors
|
|
278
|
-
raise Exception(
|
|
279
|
-
f"Error while cloning repository {str(git_error)} with github url {github_url}"
|
|
280
|
-
)
|
|
281
|
-
except Exception as e:
|
|
282
|
-
# Catch any other unexpected exceptions
|
|
283
|
-
raise Exception(
|
|
284
|
-
f"An unexpected error occurred: {str(e)}. github url {github_url}"
|
|
285
|
-
)
|
|
286
|
-
|
|
287
|
-
tag_candidates = find_suitable_tags(repo, version)
|
|
288
|
-
|
|
289
|
-
if len(tag_candidates) == 0:
|
|
290
|
-
return False, "Could not find any suitable tag in repository"
|
|
291
|
-
|
|
292
|
-
target_tag = None
|
|
293
|
-
# TODO: this one is a bit weak. let's find something stronger - maybe use the closest string?
|
|
294
|
-
for tag in tag_candidates:
|
|
295
|
-
target_tag = tag
|
|
296
|
-
|
|
297
|
-
# Idea: parse the code of the package to find the real version - we can grep the project files for
|
|
298
|
-
# the version, git bisect until we have a file with the same version? will not work if main has not
|
|
299
|
-
# been bumped yet in version so tags and releases are out only solutions here print(tag_candidates)
|
|
300
|
-
# Well, that works if we run integrity check for multiple commits
|
|
301
|
-
|
|
302
|
-
# should be good, let's open the sources
|
|
156
|
+
def get_base_path(self, path: str, name: str) -> str:
|
|
157
|
+
"""
|
|
158
|
+
PyPI: find the subdirectory containing the package files.
|
|
159
|
+
The extracted archive typically has a subdirectory with the package name.
|
|
160
|
+
"""
|
|
303
161
|
base_dir_name = None
|
|
304
162
|
for entry in os.listdir(path):
|
|
305
163
|
if entry.lower().startswith(
|
|
306
164
|
name.lower().replace("-", "_")
|
|
307
165
|
) or entry.lower().startswith(name.lower()):
|
|
308
166
|
base_dir_name = entry
|
|
167
|
+
|
|
168
|
+
if base_dir_name is None or base_dir_name == "sources":
|
|
169
|
+
raise Exception("Could not find package directory in extracted files")
|
|
170
|
+
|
|
171
|
+
return os.path.join(path, base_dir_name)
|
|
172
|
+
|
|
173
|
+
def get_version(self, package_info, version: Optional[str]) -> Optional[str]:
|
|
174
|
+
"""Get version from PyPI metadata or use provided version."""
|
|
175
|
+
if version is None:
|
|
176
|
+
version = package_info["info"]["version"]
|
|
177
|
+
return version
|
|
178
|
+
|
|
179
|
+
def exclude_result(
|
|
180
|
+
self,
|
|
181
|
+
file_name: str,
|
|
182
|
+
repo_root: Optional[str] = None,
|
|
183
|
+
pkg_root: Optional[str] = None,
|
|
184
|
+
) -> bool:
|
|
185
|
+
"""
|
|
186
|
+
Override base class method to add PyPI-specific exclusion logic.
|
|
187
|
+
|
|
188
|
+
This method filters out some results that are known false positives:
|
|
189
|
+
* if the file is a documentation file (based on its extension)
|
|
190
|
+
* if the file is a setup.cfg file with the egg_info claim present on PyPI and not on GitHub
|
|
191
|
+
"""
|
|
192
|
+
# First check standard extensions using base class logic
|
|
193
|
+
if super().exclude_result(file_name, repo_root, pkg_root):
|
|
194
|
+
return True
|
|
195
|
+
|
|
196
|
+
# PyPI-specific: check for setup.cfg with egg_info differences
|
|
309
197
|
if (
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
198
|
+
file_name.endswith("setup.cfg")
|
|
199
|
+
and repo_root is not None
|
|
200
|
+
and pkg_root is not None
|
|
201
|
+
):
|
|
202
|
+
repo_cfg = configparser.ConfigParser()
|
|
203
|
+
repo_cfg.read(os.path.join(repo_root, file_name))
|
|
204
|
+
pkg_cfg = configparser.ConfigParser()
|
|
205
|
+
pkg_cfg.read(os.path.join(pkg_root, file_name))
|
|
206
|
+
repo_sections = list(repo_cfg.keys())
|
|
207
|
+
pkg_sections = list(pkg_cfg.keys())
|
|
208
|
+
if "egg_info" in pkg_sections and "egg_info" not in repo_sections:
|
|
209
|
+
return True
|
|
210
|
+
return False
|
|
@@ -1,14 +1,9 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import logging
|
|
3
|
-
import os
|
|
4
|
-
from datetime import datetime, timedelta
|
|
5
2
|
from typing import Optional
|
|
6
3
|
|
|
7
|
-
import requests
|
|
8
4
|
import packaging.utils
|
|
9
5
|
|
|
10
6
|
from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
|
|
11
|
-
from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION
|
|
12
7
|
|
|
13
8
|
log = logging.getLogger("guarddog")
|
|
14
9
|
|
|
@@ -25,87 +20,35 @@ class PypiTyposquatDetector(TyposquatDetector):
|
|
|
25
20
|
|
|
26
21
|
def _get_top_packages(self) -> set:
|
|
27
22
|
"""
|
|
28
|
-
Gets the package information of the top 5000 most downloaded PyPI packages
|
|
29
|
-
|
|
30
|
-
Returns:
|
|
31
|
-
set: set of package data in the format:
|
|
32
|
-
{
|
|
33
|
-
...
|
|
34
|
-
{
|
|
35
|
-
download_count: ...
|
|
36
|
-
project: <package-name>
|
|
37
|
-
}
|
|
38
|
-
...
|
|
39
|
-
}
|
|
23
|
+
Gets the package information of the top 5000 most downloaded PyPI packages.
|
|
24
|
+
Uses the base class implementation with PyPI-specific parameters.
|
|
40
25
|
"""
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
"https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
|
|
26
|
+
packages = self._get_top_packages_with_refresh(
|
|
27
|
+
packages_filename="top_pypi_packages.json",
|
|
28
|
+
popular_packages_url="https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json",
|
|
29
|
+
refresh_days=30,
|
|
44
30
|
)
|
|
45
31
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
if resources_dir is None:
|
|
49
|
-
resources_dir = os.path.abspath(
|
|
50
|
-
os.path.join(os.path.dirname(__file__), "..", "resources")
|
|
51
|
-
)
|
|
52
|
-
|
|
53
|
-
top_packages_path = os.path.join(resources_dir, top_packages_filename)
|
|
54
|
-
top_packages_information = self._get_top_packages_local(top_packages_path)
|
|
55
|
-
|
|
56
|
-
if self._file_is_expired(top_packages_path, days=30):
|
|
57
|
-
new_information = self._get_top_packages_network(popular_packages_url)
|
|
58
|
-
if new_information is not None:
|
|
59
|
-
top_packages_information = new_information
|
|
60
|
-
|
|
61
|
-
with open(top_packages_path, "w+") as f:
|
|
62
|
-
json.dump(new_information, f, ensure_ascii=False, indent=4)
|
|
63
|
-
|
|
64
|
-
if top_packages_information is None:
|
|
65
|
-
return set()
|
|
66
|
-
return set(map(self.get_safe_name, top_packages_information))
|
|
32
|
+
# Apply canonicalization to PyPI package names
|
|
33
|
+
return set(map(self._canonicalize_name, packages))
|
|
67
34
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
update_time = datetime.fromtimestamp(os.path.getmtime(path))
|
|
75
|
-
return datetime.now() - update_time > timedelta(days=days)
|
|
76
|
-
except FileNotFoundError:
|
|
77
|
-
return True
|
|
78
|
-
|
|
79
|
-
def _get_top_packages_local(self, path: str) -> list[dict] | None:
|
|
80
|
-
try:
|
|
81
|
-
with open(path, "r") as f:
|
|
82
|
-
result = json.load(f)
|
|
83
|
-
return self.extract_information(result)
|
|
84
|
-
except FileNotFoundError:
|
|
85
|
-
log.error(f"File not found: {path}")
|
|
35
|
+
def _extract_package_names(self, data: dict | list | None) -> list | None:
|
|
36
|
+
"""
|
|
37
|
+
Extract package names from PyPI data structure.
|
|
38
|
+
PyPI data has format: {"rows": [{"project": "name", "download_count": ...}, ...]}
|
|
39
|
+
"""
|
|
40
|
+
if data is None:
|
|
86
41
|
return None
|
|
87
42
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
response = requests.get(url)
|
|
91
|
-
response.raise_for_status()
|
|
92
|
-
|
|
93
|
-
response_data = response.json()
|
|
94
|
-
result = response_data
|
|
43
|
+
if isinstance(data, dict) and "rows" in data:
|
|
44
|
+
return [row["project"] for row in data["rows"]]
|
|
95
45
|
|
|
96
|
-
|
|
97
|
-
except json.JSONDecodeError:
|
|
98
|
-
log.error(f'Couldn`t convert to json: "{response.text}"')
|
|
99
|
-
return None
|
|
100
|
-
except requests.exceptions.RequestException as e:
|
|
101
|
-
log.error(f"Network error: {e}")
|
|
102
|
-
return None
|
|
46
|
+
return None
|
|
103
47
|
|
|
104
48
|
@staticmethod
|
|
105
|
-
def
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
return None
|
|
49
|
+
def _canonicalize_name(package_name: str) -> str:
|
|
50
|
+
"""Canonicalize PyPI package names according to PEP 503."""
|
|
51
|
+
return packaging.utils.canonicalize_name(package_name)
|
|
109
52
|
|
|
110
53
|
def detect(
|
|
111
54
|
self,
|