guarddog 2.7.1__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guarddog/analyzer/metadata/__init__.py +3 -0
- guarddog/analyzer/metadata/go/typosquatting.py +11 -28
- guarddog/analyzer/metadata/npm/direct_url_dependency.py +0 -1
- guarddog/analyzer/metadata/npm/typosquatting.py +24 -59
- guarddog/analyzer/metadata/pypi/repository_integrity_mismatch.py +53 -164
- guarddog/analyzer/metadata/pypi/typosquatting.py +20 -77
- guarddog/analyzer/metadata/repository_integrity_mismatch.py +202 -2
- guarddog/analyzer/metadata/resources/top_go_packages.json +2926 -2923
- guarddog/analyzer/metadata/resources/top_npm_packages.json +8005 -8002
- guarddog/analyzer/metadata/resources/top_pypi_packages.json +15003 -60021
- guarddog/analyzer/metadata/resources/top_rubygems_packages.json +979 -0
- guarddog/analyzer/metadata/rubygems/__init__.py +26 -0
- guarddog/analyzer/metadata/rubygems/bundled_binary.py +13 -0
- guarddog/analyzer/metadata/rubygems/empty_information.py +24 -0
- guarddog/analyzer/metadata/rubygems/release_zero.py +22 -0
- guarddog/analyzer/metadata/rubygems/repository_integrity_mismatch.py +49 -0
- guarddog/analyzer/metadata/rubygems/typosquatting.py +91 -0
- guarddog/analyzer/metadata/typosquatting.py +218 -0
- guarddog/analyzer/metadata/utils.py +23 -0
- guarddog/analyzer/sourcecode/__init__.py +2 -0
- guarddog/analyzer/sourcecode/api-obfuscation.yml +35 -40
- guarddog/analyzer/sourcecode/code-execution.yml +20 -0
- guarddog/analyzer/sourcecode/exec-base64.yml +19 -0
- guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml +31 -5
- guarddog/analyzer/sourcecode/npm-api-obfuscation.yml +51 -0
- guarddog/analyzer/sourcecode/rubygems-code-execution.yml +67 -0
- guarddog/analyzer/sourcecode/rubygems-exec-base64.yml +26 -0
- guarddog/analyzer/sourcecode/rubygems-exfiltrate-sensitive-data.yml +70 -0
- guarddog/analyzer/sourcecode/rubygems-install-hook.yml +45 -0
- guarddog/analyzer/sourcecode/rubygems-network-on-require.yml +78 -0
- guarddog/analyzer/sourcecode/rubygems-serialize-environment.yml +38 -0
- guarddog/analyzer/sourcecode/screenshot.yml +38 -0
- guarddog/ecosystems.py +3 -0
- guarddog/scanners/__init__.py +6 -0
- guarddog/scanners/npm_project_scanner.py +1 -1
- guarddog/scanners/rubygems_package_scanner.py +112 -0
- guarddog/scanners/rubygems_project_scanner.py +75 -0
- guarddog/scanners/scanner.py +36 -12
- guarddog/utils/archives.py +1 -1
- guarddog-2.9.0.dist-info/METADATA +471 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/RECORD +46 -29
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/WHEEL +1 -1
- guarddog-2.7.1.dist-info/METADATA +0 -40
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/entry_points.txt +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/LICENSE-3rdparty.csv +0 -0
- {guarddog-2.7.1.dist-info → guarddog-2.9.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import Type

from guarddog.analyzer.metadata import Detector
from guarddog.analyzer.metadata.rubygems.typosquatting import RubyGemsTyposquatDetector
from guarddog.analyzer.metadata.rubygems.empty_information import (
    RubyGemsEmptyInfoDetector,
)
from guarddog.analyzer.metadata.rubygems.release_zero import RubyGemsReleaseZeroDetector
from guarddog.analyzer.metadata.rubygems.bundled_binary import RubyGemsBundledBinary
from guarddog.analyzer.metadata.rubygems.repository_integrity_mismatch import (
    RubyGemsIntegrityMismatchDetector,
)

# Every RubyGems metadata detector class shipped with guarddog.
classes: list[Type[Detector]] = [
    RubyGemsTyposquatDetector,
    RubyGemsEmptyInfoDetector,
    RubyGemsReleaseZeroDetector,
    RubyGemsBundledBinary,
    RubyGemsIntegrityMismatchDetector,
]

# Rule registry: maps each heuristic's name to a ready-to-use detector instance.
RUBYGEMS_METADATA_RULES = {
    instance.get_name(): instance
    for instance in (detector_class() for detector_class in classes)  # type: ignore
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from guarddog.analyzer.metadata.bundled_binary import BundledBinary
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RubyGemsBundledBinary(BundledBinary):
    """RubyGems variant of the bundled-binary heuristic.

    The ecosystem-agnostic scan of the extracted package contents is
    sufficient for gems, so this subclass simply delegates to it.
    """

    def detect(
        self,
        package_info,
        path: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[str] = None,
    ) -> tuple[bool, str]:
        # No gem-specific logic needed; reuse the shared implementation.
        return super().detect(package_info, path, name, version)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from guarddog.analyzer.metadata.empty_information import EmptyInfoDetector
|
|
5
|
+
|
|
6
|
+
log = logging.getLogger("guarddog")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RubyGemsEmptyInfoDetector(EmptyInfoDetector):
    """Flags gems whose RubyGems "info" description is blank or missing."""

    def detect(
        self,
        package_info,
        path: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[str] = None,
    ) -> tuple[bool, str]:
        log.debug(f"Running RubyGems empty description heuristic on package {name}")
        # The API may return null for "info"; treat that the same as "".
        description = package_info.get("info", "")
        stripped = description.strip() if description is not None else ""
        return stripped == "", EmptyInfoDetector.MESSAGE_TEMPLATE % "RubyGems"
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from guarddog.analyzer.metadata.release_zero import ReleaseZeroDetector
|
|
5
|
+
|
|
6
|
+
log = logging.getLogger("guarddog")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RubyGemsReleaseZeroDetector(ReleaseZeroDetector):
    """Flags gems published with a placeholder version such as 0.0.0."""

    def detect(
        self,
        package_info,
        path: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[str] = None,
    ) -> tuple[bool, str]:
        log.debug(f"Running zero version heuristic on RubyGems package {name}")
        gem_version = package_info.get("version", "")
        flagged = gem_version in ("0.0.0", "0.0")
        return flagged, ReleaseZeroDetector.MESSAGE_TEMPLATE % gem_version
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import urllib3.util
|
|
5
|
+
|
|
6
|
+
from guarddog.analyzer.metadata.repository_integrity_mismatch import IntegrityMismatch
|
|
7
|
+
|
|
8
|
+
log = logging.getLogger("guarddog")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def normalize_github_url(url):
    """
    Normalize a repository URL to a canonical https GitHub URL.

    Strips whitespace and a trailing ".git", upgrades git:// and http://
    schemes to https://, and returns None when url is None or does not
    point at github.com / www.github.com.

    Args:
        url: Candidate repository URL (may be None)

    Returns:
        str | None: Normalized https GitHub URL, or None if not GitHub
    """
    # stdlib parser; avoids depending on urllib3 internals here
    from urllib.parse import urlparse

    if url is None:
        return None
    url = url.strip()
    if url.endswith(".git"):
        url = url[: -len(".git")]
    # Only rewrite the scheme *prefix* — str.replace() would also rewrite
    # later occurrences of "git://" / "http://" inside the URL.
    for insecure_scheme in ("git://", "http://"):
        if url.startswith(insecure_scheme):
            url = "https://" + url[len(insecure_scheme):]
            break
    # urlparse only populates the host when a scheme (or leading //) exists.
    host = urlparse(url if "://" in url else "//" + url).hostname
    if host not in ("github.com", "www.github.com"):
        return None
    return url
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class RubyGemsIntegrityMismatchDetector(IntegrityMismatch):
    """RubyGems flavour of the repository integrity mismatch heuristic."""

    # Documentation-style files are noisy and excluded from the comparison.
    EXCLUDED_EXTENSIONS = [".md", ".txt", ".rdoc"]

    def extract_github_url(self, package_info, name: str) -> Optional[str]:
        """Extract GitHub URL from RubyGems metadata."""
        # Prefer the explicit source repository link over the homepage.
        for candidate in (
            package_info.get("source_code_uri"),
            package_info.get("homepage_uri"),
        ):
            normalized = normalize_github_url(candidate)
            if normalized is not None:
                return normalized
        return None

    def get_base_path(self, path: str, name: str) -> str:
        """RubyGems: files are extracted directly to the path."""
        return path

    def get_version(self, package_info, version: Optional[str]) -> Optional[str]:
        """Get version from RubyGems metadata or use provided version."""
        return version if version is not None else package_info.get("version")
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from guarddog.analyzer.metadata.typosquatting import TyposquatDetector
|
|
5
|
+
|
|
6
|
+
log = logging.getLogger("guarddog")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RubyGemsTyposquatDetector(TyposquatDetector):
    """
    Detector for typosquatting attacks on RubyGems.

    Checks for distance one Levenshtein, one-off character swaps,
    permutations around hyphens, and substrings.

    Attributes:
        popular_packages (set): set of critical/popular gems from ecosyste.ms
    """

    def _get_top_packages(self) -> set:
        """
        Gets the top 1000 critical RubyGems packages, delegating caching and
        periodic refresh to the shared base-class helper.
        """
        url = (
            "https://packages.ecosyste.ms/api/v1/registries/rubygems.org/"
            "package_names?critical=true&per_page=1000"
        )
        return self._get_top_packages_with_refresh(
            packages_filename="top_rubygems_packages.json",
            popular_packages_url=url,
            refresh_days=30,
        )

    def detect(
        self,
        package_info,
        path: Optional[str] = None,
        name: Optional[str] = None,
        version: Optional[str] = None,
    ) -> tuple[bool, Optional[str]]:
        """
        Uses a gem's information to determine if it's attempting
        a typosquatting attack.
        """
        gem_name = package_info.get("name", name)
        log.debug(f"Running typosquatting heuristic on RubyGems package {gem_name}")

        matches = self.get_typosquatted_package(gem_name)
        if matches:
            return True, TyposquatDetector.MESSAGE_TEMPLATE % ", ".join(matches)
        return False, None

    def _get_confused_forms(self, package_name) -> list:
        """
        Gets confused terms for Ruby gems.
        Confused terms are:
            - ruby to rb swaps (or vice versa)
            - the removal of ruby/rb terms
            - rails to ruby-on-rails swaps

        Args:
            package_name (str): name of the package

        Returns:
            list: list of confused terms
        """
        confused_forms = []
        terms = package_name.split("-")

        for index, term in enumerate(terms):
            # Swap ruby <-> rb within the hyphen-separated term, if present.
            if "ruby" in term:
                swapped = term.replace("ruby", "rb")
            elif "rb" in term:
                swapped = term.replace("rb", "ruby")
            else:
                continue

            prefix, suffix = terms[:index], terms[index + 1:]
            # Variant with the term swapped, and variant with it removed.
            confused_forms.append("-".join(prefix + [swapped] + suffix))
            confused_forms.append("-".join(prefix + suffix))

        if package_name == "rails":
            confused_forms.append("ruby-on-rails")
        elif package_name == "ruby-on-rails":
            confused_forms.append("rails")

        return confused_forms
|
|
@@ -1,7 +1,18 @@
|
|
|
1
1
|
import abc
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime, timedelta
|
|
2
7
|
from itertools import permutations
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
import requests
|
|
3
11
|
|
|
4
12
|
from guarddog.analyzer.metadata.detector import Detector
|
|
13
|
+
from guarddog.utils.config import TOP_PACKAGES_CACHE_LOCATION
|
|
14
|
+
|
|
15
|
+
log = logging.getLogger("guarddog")
|
|
5
16
|
|
|
6
17
|
|
|
7
18
|
class TyposquatDetector(Detector):
|
|
@@ -19,8 +30,215 @@ class TyposquatDetector(Detector):
|
|
|
19
30
|
|
|
20
31
|
@abc.abstractmethod
|
|
21
32
|
def _get_top_packages(self) -> set:
|
|
33
|
+
"""
|
|
34
|
+
Subclasses should implement this to return a set of top package names.
|
|
35
|
+
|
|
36
|
+
For simple implementations without network refresh, override this directly.
|
|
37
|
+
For implementations with network refresh, use _get_top_packages_with_refresh().
|
|
38
|
+
"""
|
|
22
39
|
pass
|
|
23
40
|
|
|
41
|
+
def _get_top_packages_with_refresh(
|
|
42
|
+
self,
|
|
43
|
+
packages_filename: str,
|
|
44
|
+
popular_packages_url: Optional[str] = None,
|
|
45
|
+
refresh_days: int = 30,
|
|
46
|
+
) -> set:
|
|
47
|
+
"""
|
|
48
|
+
Common implementation for getting top packages with optional network refresh.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
packages_filename: Name of the JSON file (e.g., "top_pypi_packages.json")
|
|
52
|
+
popular_packages_url: URL to fetch fresh package data. If None, refresh is disabled.
|
|
53
|
+
refresh_days: Number of days before file is considered expired
|
|
54
|
+
|
|
55
|
+
Returns:
|
|
56
|
+
set: Set of package names
|
|
57
|
+
"""
|
|
58
|
+
resources_dir = TOP_PACKAGES_CACHE_LOCATION
|
|
59
|
+
if resources_dir is None:
|
|
60
|
+
resources_dir = os.path.abspath(
|
|
61
|
+
os.path.join(os.path.dirname(__file__), "resources")
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
top_packages_path = os.path.join(resources_dir, packages_filename)
|
|
65
|
+
log.debug(f"Loading cache from: {top_packages_path}")
|
|
66
|
+
|
|
67
|
+
cache_data = self._load_cache_file(top_packages_path)
|
|
68
|
+
|
|
69
|
+
if cache_data:
|
|
70
|
+
log.debug(f"Cache loaded successfully with keys: {list(cache_data.keys())}")
|
|
71
|
+
else:
|
|
72
|
+
log.debug("Cache is empty or invalid")
|
|
73
|
+
|
|
74
|
+
top_packages_information = cache_data.get("packages") if cache_data else None
|
|
75
|
+
|
|
76
|
+
# Enable refresh if URL is provided
|
|
77
|
+
enable_refresh = popular_packages_url is not None
|
|
78
|
+
is_expired = self._cache_is_expired(cache_data, days=refresh_days)
|
|
79
|
+
log.debug(
|
|
80
|
+
f"Cache expired check: {is_expired} (refresh enabled: {enable_refresh})"
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
if enable_refresh and is_expired and popular_packages_url is not None:
|
|
84
|
+
log.info(
|
|
85
|
+
f"Cache is expired, attempting to refresh from: {popular_packages_url}"
|
|
86
|
+
)
|
|
87
|
+
new_response_data = self._get_top_packages_network_raw(popular_packages_url)
|
|
88
|
+
if new_response_data is not None:
|
|
89
|
+
log.debug("Downloaded new data, extracting package names")
|
|
90
|
+
top_packages_information = self._extract_package_names(
|
|
91
|
+
new_response_data
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
# Save with new standardized format
|
|
95
|
+
cache_data = {
|
|
96
|
+
"downloaded_timestamp": int(time.time()),
|
|
97
|
+
"packages": top_packages_information,
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
if top_packages_information is not None:
|
|
101
|
+
log.info(
|
|
102
|
+
f"Saving refreshed cache with {len(top_packages_information)} packages to {top_packages_path}"
|
|
103
|
+
)
|
|
104
|
+
with open(top_packages_path, "w+") as f:
|
|
105
|
+
json.dump(cache_data, f, ensure_ascii=False, indent=4)
|
|
106
|
+
else:
|
|
107
|
+
log.warning(
|
|
108
|
+
f"Failed to download new cache data from {popular_packages_url}"
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
if top_packages_information is None:
|
|
112
|
+
return set()
|
|
113
|
+
|
|
114
|
+
return set(top_packages_information)
|
|
115
|
+
|
|
116
|
+
def _cache_is_expired(self, cache_data: dict | None, days: int) -> bool:
|
|
117
|
+
"""
|
|
118
|
+
Check if cache data is expired based on downloaded_timestamp.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
cache_data: Cache dictionary with 'downloaded_timestamp' key
|
|
122
|
+
days: Number of days before cache is considered expired
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
bool: True if expired or timestamp missing, False otherwise
|
|
126
|
+
"""
|
|
127
|
+
if cache_data is None:
|
|
128
|
+
log.debug("Cache is expired: cache_data is None")
|
|
129
|
+
return True
|
|
130
|
+
|
|
131
|
+
timestamp = cache_data.get("downloaded_timestamp")
|
|
132
|
+
if timestamp is None:
|
|
133
|
+
# Missing timestamp, consider expired
|
|
134
|
+
log.debug("Cache is expired: missing 'downloaded_timestamp' field")
|
|
135
|
+
return True
|
|
136
|
+
|
|
137
|
+
try:
|
|
138
|
+
download_time = datetime.fromtimestamp(timestamp)
|
|
139
|
+
age = datetime.now() - download_time
|
|
140
|
+
is_expired = age > timedelta(days=days)
|
|
141
|
+
log.debug(
|
|
142
|
+
f"Cache age: {age.days} days, threshold: {days} days, expired: {is_expired}"
|
|
143
|
+
)
|
|
144
|
+
return is_expired
|
|
145
|
+
except (ValueError, OSError) as e:
|
|
146
|
+
# Invalid timestamp
|
|
147
|
+
log.debug(f"Cache is expired: invalid timestamp {timestamp} - {e}")
|
|
148
|
+
return True
|
|
149
|
+
|
|
150
|
+
def _load_cache_file(self, path: str) -> dict | None:
|
|
151
|
+
"""
|
|
152
|
+
Load cache data from local JSON file.
|
|
153
|
+
|
|
154
|
+
Expected format: {"downloaded_timestamp": epoch, "packages": [...]}
|
|
155
|
+
|
|
156
|
+
If the file doesn't match this format, it will be considered invalid
|
|
157
|
+
and trigger a refresh to download data in the correct format.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
path: Path to the JSON file
|
|
161
|
+
|
|
162
|
+
Returns:
|
|
163
|
+
dict: Cache data with 'packages' and 'downloaded_timestamp', or None if invalid
|
|
164
|
+
"""
|
|
165
|
+
try:
|
|
166
|
+
with open(path, "r") as f:
|
|
167
|
+
result = json.load(f)
|
|
168
|
+
|
|
169
|
+
# Validate new format structure
|
|
170
|
+
if (
|
|
171
|
+
isinstance(result, dict)
|
|
172
|
+
and "packages" in result
|
|
173
|
+
and "downloaded_timestamp" in result
|
|
174
|
+
):
|
|
175
|
+
# Validate that packages is a list
|
|
176
|
+
if isinstance(result["packages"], list):
|
|
177
|
+
return result
|
|
178
|
+
else:
|
|
179
|
+
log.warning(
|
|
180
|
+
f"Invalid cache format in {path}: 'packages' must be a list. Will trigger refresh."
|
|
181
|
+
)
|
|
182
|
+
return None
|
|
183
|
+
|
|
184
|
+
# File doesn't have the correct format - invalidate it
|
|
185
|
+
log.info(
|
|
186
|
+
f"Cache file {path} has old or invalid format. Will trigger refresh to new format."
|
|
187
|
+
)
|
|
188
|
+
return None
|
|
189
|
+
|
|
190
|
+
except FileNotFoundError:
|
|
191
|
+
log.debug(f"Cache file not found: {path}")
|
|
192
|
+
return None
|
|
193
|
+
except json.JSONDecodeError:
|
|
194
|
+
log.error(f"Invalid JSON in file: {path}")
|
|
195
|
+
return None
|
|
196
|
+
|
|
197
|
+
def _get_top_packages_network_raw(self, url: str) -> dict | list | None:
|
|
198
|
+
"""
|
|
199
|
+
Fetch the complete response data from the network.
|
|
200
|
+
Returns the full JSON structure to preserve format when saving.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
url: URL to fetch package data from
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
dict | list: Full response data or None on error
|
|
207
|
+
"""
|
|
208
|
+
try:
|
|
209
|
+
response = requests.get(url)
|
|
210
|
+
response.raise_for_status()
|
|
211
|
+
return response.json()
|
|
212
|
+
except json.JSONDecodeError:
|
|
213
|
+
log.error(f'Couldn\'t convert to json: "{response.text}"')
|
|
214
|
+
return None
|
|
215
|
+
except requests.exceptions.RequestException as e:
|
|
216
|
+
log.error(f"Network error: {e}")
|
|
217
|
+
return None
|
|
218
|
+
|
|
219
|
+
def _extract_package_names(self, data: dict | list | None) -> list | None:
|
|
220
|
+
"""
|
|
221
|
+
Extract package names from the raw data structure.
|
|
222
|
+
|
|
223
|
+
Override this method in subclasses if the data format is specific to the ecosystem.
|
|
224
|
+
Default implementation assumes data is already a list of package names.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
data: Raw data from JSON file or network response
|
|
228
|
+
|
|
229
|
+
Returns:
|
|
230
|
+
list: List of package names or None
|
|
231
|
+
"""
|
|
232
|
+
if data is None:
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
# Default: assume data is already a list
|
|
236
|
+
if isinstance(data, list):
|
|
237
|
+
return data
|
|
238
|
+
|
|
239
|
+
# If it's a dict, subclasses should override this method
|
|
240
|
+
return None
|
|
241
|
+
|
|
24
242
|
def _is_distance_one_Levenshtein(self, name1, name2) -> bool:
|
|
25
243
|
"""
|
|
26
244
|
Returns True if two names have a Levenshtein distance of one
|
|
@@ -2,6 +2,7 @@ from datetime import datetime, timezone
|
|
|
2
2
|
from functools import cache
|
|
3
3
|
from typing import Optional
|
|
4
4
|
|
|
5
|
+
import hashlib
|
|
5
6
|
import whois # type: ignore
|
|
6
7
|
|
|
7
8
|
NPM_MAINTAINER_EMAIL_WARNING = (
|
|
@@ -53,3 +54,25 @@ def extract_email_address_domain(email_address: str):
|
|
|
53
54
|
|
|
54
55
|
except IndexError:
|
|
55
56
|
raise ValueError(f"Invalid email address: {email_address}")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_file_hash(path: str) -> tuple[str, list[str]]:
    """
    Gets the sha256 of the file

    Args:
        path (str): Full file path

    Returns:
        str: The SHA256 hash of the file as a hexadecimal string
        list: The file contents as a list of lines
    """
    with open(path, "rb") as f:
        file_contents = f.read()

    digest = hashlib.sha256(file_contents).hexdigest()

    # Bug fix: str(bytes) yields the repr ("b'...'"), whose escaped "\n"
    # sequences never split, so the old code always returned a single
    # pseudo-line. Decode instead (replacing undecodable bytes so binary
    # files still never raise) to return the actual lines.
    text = file_contents.decode("utf-8", errors="replace")
    return digest, text.strip().splitlines()
|
|
@@ -1,42 +1,37 @@
|
|
|
1
1
|
rules:
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
metavariable: $METHOD
|
|
29
|
-
regex: "^[\"'][A-Za-z_][A-Za-z0-9_]*[\"']$"
|
|
2
|
+
- id: api-obfuscation
|
|
3
|
+
languages:
|
|
4
|
+
- python
|
|
5
|
+
message: This package uses obfuscated API calls that may evade static analysis detection
|
|
6
|
+
metadata:
|
|
7
|
+
description: Identify obfuscated API calls using alternative Python syntax patterns
|
|
8
|
+
severity: WARNING
|
|
9
|
+
patterns:
|
|
10
|
+
- pattern-either:
|
|
11
|
+
# Covered cases:
|
|
12
|
+
# 1) __dict__ access patterns: $MODULE.__dict__[$METHOD](...) / .__call__(...)
|
|
13
|
+
# 2) __getattribute__ patterns: $MODULE.__getattribute__($METHOD)(...) / .__call__(...)
|
|
14
|
+
# 3) getattr patterns: getattr($MODULE, $METHOD)(...) / .__call__(...)
|
|
15
|
+
# It also covers the case where $MODULE is imported as __import__($mod),
|
|
16
|
+
# where $mod is a generic expression (e.g., string literal, variable, etc.)
|
|
17
|
+
- patterns:
|
|
18
|
+
- pattern-either:
|
|
19
|
+
- pattern: $MODULE.__dict__[$METHOD]($...ARGS)
|
|
20
|
+
- pattern: $MODULE.__dict__[$METHOD].__call__($...ARGS)
|
|
21
|
+
- pattern: $MODULE.__getattribute__($METHOD)($...ARGS)
|
|
22
|
+
- pattern: $MODULE.__getattribute__($METHOD).__call__($...ARGS)
|
|
23
|
+
- pattern: getattr($MODULE, $METHOD)($...ARGS)
|
|
24
|
+
- pattern: getattr($MODULE, $METHOD).__call__($...ARGS)
|
|
25
|
+
- metavariable-regex:
|
|
26
|
+
metavariable: $MODULE
|
|
27
|
+
regex: "^[A-Za-z_][A-Za-z0-9_\\.]*$|^__import__\\(.*\\)$"
|
|
30
28
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
metavariable: $METHOD
|
|
41
|
-
# avoid matching __getattribute__
|
|
42
|
-
regex: "[^(__getattribute__)][A-Za-z_][A-Za-z0-9_]*"
|
|
29
|
+
# --- Additional Cases: __import__('mod').method(...) / .__call__(...)
|
|
30
|
+
- patterns:
|
|
31
|
+
- pattern-either:
|
|
32
|
+
- pattern: __import__($MODULE).$METHOD($...ARGS)
|
|
33
|
+
- pattern: __import__($MODULE).$METHOD.__call__($...ARGS)
|
|
34
|
+
- metavariable-regex:
|
|
35
|
+
metavariable: $METHOD
|
|
36
|
+
# avoid matching __getattribute__
|
|
37
|
+
regex: "[^(__getattribute__)][A-Za-z_][A-Za-z0-9_]*"
|
|
@@ -114,6 +114,26 @@ rules:
|
|
|
114
114
|
- pattern-either:
|
|
115
115
|
- pattern: globals()['eval']($ARG1)
|
|
116
116
|
- pattern: globals()['\x65\x76\x61\x6c']($ARG1) # that's "eval" in hexadecimal
|
|
117
|
+
|
|
118
|
+
# vars() indirection to access builtins
|
|
119
|
+
- pattern: vars(__builtins__)['compile']($ARG1, ...)
|
|
120
|
+
- pattern: vars(__builtins__)['exec']($ARG1)
|
|
121
|
+
- pattern: vars(__builtins__)['eval']($ARG1)
|
|
122
|
+
|
|
123
|
+
# vars().get() variant
|
|
124
|
+
- pattern: vars(__builtins__).get('compile')($ARG1, ...)
|
|
125
|
+
- pattern: vars(__builtins__).get('exec')($ARG1)
|
|
126
|
+
- pattern: vars(__builtins__).get('eval')($ARG1)
|
|
127
|
+
|
|
128
|
+
# vars/globals combinations
|
|
129
|
+
- pattern: vars(globals()['__builtins__'])['exec']($ARG1)
|
|
130
|
+
- pattern: vars(globals()['__builtins__'])['eval']($ARG1)
|
|
131
|
+
- pattern: vars(locals()['__builtins__'])['exec']($ARG1)
|
|
132
|
+
- pattern: vars(locals()['__builtins__'])['eval']($ARG1)
|
|
133
|
+
|
|
134
|
+
# Direct compile() calls
|
|
135
|
+
- pattern: compile($ARG1, '<string>', 'exec')
|
|
136
|
+
- pattern: compile($ARG1, '<string>', 'eval')
|
|
117
137
|
|
|
118
138
|
- metavariable-pattern:
|
|
119
139
|
metavariable: $ARG1
|
|
@@ -56,4 +56,23 @@ rules:
|
|
|
56
56
|
- pattern: __import__("base64").b64decode(...)
|
|
57
57
|
- pattern: marshal.loads(zlib.decompress(...))
|
|
58
58
|
- pattern: $FUNC("...").decrypt(...)
|
|
59
|
+
|
|
60
|
+
# codecs.decode with base64 (all valid aliases)
|
|
61
|
+
- pattern: codecs.decode(..., 'base64')
|
|
62
|
+
- pattern: codecs.decode(..., 'base_64')
|
|
63
|
+
- pattern: codecs.decode(..., 'base-64')
|
|
64
|
+
- pattern: codecs.decode(..., 'BASE64')
|
|
65
|
+
- pattern: codecs.decode(..., 'BASE_64')
|
|
66
|
+
- pattern: codecs.decode(..., 'BASE-64')
|
|
67
|
+
|
|
68
|
+
# importlib + base64 module
|
|
69
|
+
- pattern: importlib.import_module('base64').b64decode(...)
|
|
70
|
+
|
|
71
|
+
# importlib + codecs module (all base64 aliases)
|
|
72
|
+
- pattern: importlib.import_module('codecs').decode(..., 'base64')
|
|
73
|
+
- pattern: importlib.import_module('codecs').decode(..., 'base_64')
|
|
74
|
+
- pattern: importlib.import_module('codecs').decode(..., 'base-64')
|
|
75
|
+
- pattern: importlib.import_module('codecs').decode(..., 'BASE64')
|
|
76
|
+
- pattern: importlib.import_module('codecs').decode(..., 'BASE_64')
|
|
77
|
+
- pattern: importlib.import_module('codecs').decode(..., 'BASE-64')
|
|
59
78
|
severity: WARNING
|