PyPI - guarddog - Versions diffs - 1.11.2__tar.gz → 2.0.1__tar.gz - Mend

guarddog 1.11.2.tar.gz → 2.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

{guarddog-1.11.2 → guarddog-2.0.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: guarddog
-Version: 1.11.2
+Version: 2.0.1
 Summary: GuardDog is a CLI tool to Identify malicious PyPI packages
 Home-page: https://github.com/DataDog/guarddog
 License: Apache-2.0
@@ -15,7 +15,7 @@ Requires-Dist: click (>=8.1.3,<9.0.0)
 Requires-Dist: click-option-group (>=0.5.5,<0.6.0)
 Requires-Dist: colorama (>=0.4.6,<0.5.0)
 Requires-Dist: configparser (>=5.3,<8.0)
-Requires-Dist: disposable-email-domains (>=0.0.103,<0.0.104)
+Requires-Dist: disposable-email-domains (>=0.0.103,<0.0.105)
 Requires-Dist: prettytable (>=3.6.0,<4.0.0)
 Requires-Dist: pygit2 (>=1.11,<1.16)
 Requires-Dist: python-dateutil (>=2.8.2,<3.0.0)
@@ -24,9 +24,11 @@ Requires-Dist: pyyaml (>=6.0,<7.0)
 Requires-Dist: requests (>=2.29.0,<3.0.0)
 Requires-Dist: semantic-version (>=2.10.0,<3.0.0)
 Requires-Dist: semgrep (==1.67.0)
+Requires-Dist: setuptools (>=70.3.0,<71.0.0)
 Requires-Dist: tarsafe (>=0.0.5,<0.0.6)
 Requires-Dist: termcolor (>=2.1.0,<3.0.0)
 Requires-Dist: urllib3 (==2.2.2)
+Requires-Dist: yara-python (>=4.5.1,<5.0.0)
 Project-URL: Repository, https://github.com/DataDog/guarddog
 Description-Content-Type: text/x-rst

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/__init__.py RENAMED Viewed

@@ -1,2 +1,3 @@
 from guarddog.scanners.npm_package_scanner import NPMPackageScanner  # NOQA
 from guarddog.scanners.pypi_package_scanner import PypiPackageScanner  # NOQA
+from guarddog.scanners.go_package_scanner import GoModuleScanner  # NOQA

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/analyzer/analyzer.py RENAMED Viewed

@@ -2,12 +2,13 @@ import json
 import logging
 import os
 import subprocess
+import yara  # type: ignore
 from collections import defaultdict
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Iterable, Optional, Dict
 from guarddog.analyzer.metadata import get_metadata_detectors
-from guarddog.analyzer.sourcecode import SOURCECODE_RULES
+from guarddog.analyzer.sourcecode import get_sourcecode_rules, SempgrepRule, YaraRule
 from guarddog.ecosystems import ECOSYSTEM
 SEMGREP_MAX_TARGET_BYTES = 10_000_000
@@ -24,6 +25,7 @@ class Analyzer:
         ecosystem (str): name of the current ecosystem
         metadata_ruleset (list): list of metadata rule names
         sourcecode_ruleset (list): list of source code rule names
+        ioc_ruleset (list): list of ioc rule names
         exclude (list): list of directories to exclude from source code search
@@ -32,14 +34,18 @@ class Analyzer:
     def __init__(self, ecosystem=ECOSYSTEM.PYPI) -> None:
         self.sourcecode_rules_path = os.path.join(os.path.dirname(__file__), "sourcecode")
         self.ecosystem = ecosystem
         # Rules and associated detectors
         self.metadata_detectors = get_metadata_detectors(ecosystem)
         self.metadata_ruleset: set[str] = set(self.metadata_detectors.keys())
-        self.sourcecode_ruleset: set[str] = set(rule["id"] for rule in SOURCECODE_RULES[ecosystem])
+        self.semgrep_ruleset: set[str] = set(
+            r.id for r in get_sourcecode_rules(ecosystem, SempgrepRule)
+        )
+        self.yara_ruleset: set[str] = set(
+            r.id for r in get_sourcecode_rules(ecosystem, YaraRule)
+        )
         # Define paths to exclude from sourcecode analysis
         self.exclude = [
@@ -77,10 +83,7 @@ class Analyzer:
         sourcecode_results = None
         # populate results, errors, and number of issues
-        log.debug(f"Running metadata rules against package '{name}'")
         metadata_results = self.analyze_metadata(path, info, rules, name, version)
-        log.debug(f"Running source code rules against directory '{path}'")
         sourcecode_results = self.analyze_sourcecode(path, rules)
         # Concatenate dictionaries together
@@ -104,6 +107,8 @@ class Analyzer:
             dict[str]: map from each metadata rule and their corresponding output
         """
+        log.debug(f"Running metadata rules against package '{name}'")
         all_rules = self.metadata_ruleset
         if rules is not None:
             # filtering the full ruleset with the user's input
@@ -139,11 +144,87 @@ class Analyzer:
         Returns:
             dict[str]: map from each source code rule and their corresponding output
         """
+        semgrepscan_results = self.analyze_semgrep(path, rules)
+        yarascan_results = self.analyze_yara(path, rules)
+        # Concatenate dictionaries together
+        issues = semgrepscan_results["issues"] + yarascan_results["issues"]
+        results = semgrepscan_results["results"] | yarascan_results["results"]
+        errors = semgrepscan_results["errors"] | yarascan_results["errors"]
+        return {"issues": issues, "errors": errors, "results": results, "path": path}
+    def analyze_yara(self, path: str, rules: Optional[set] = None) -> dict:
+        """
+        Scans every file under a package directory with the selected YARA rules
+        Each selected rule ``<name>`` is compiled from ``<name>.yar`` in the
+        sourcecode rules directory and every matched string instance is
+        reported as one finding.
+        Args:
+            path (str): path to package
+            rules (set, optional): Set of rule names to run. Defaults to all YARA rules.
+        Returns:
+            dict: "results" maps each selected rule to ``{}`` when it did not
+                match, or to its list of findings (location, code, message);
+                "errors" maps an error key to its description; "issues" is the
+                number of findings recorded
+        """
+        log.debug(f"Running yara rules against directory '{path}'")
+        all_rules = self.yara_ruleset
+        if rules is not None:
+            # filtering the full ruleset with the user's input
+            all_rules = self.yara_ruleset & rules
+        results = {rule: {} for rule in all_rules}  # type: dict
+        errors: Dict[str, str] = {}
+        issues = 0
+        rules_path = {
+            rule_name: os.path.join(self.sourcecode_rules_path, f"{rule_name}.yar")
+            for rule_name in all_rules
+        }
+        if len(rules_path) == 0:
+            log.debug("No yara rules to run")
+            return {"results": results, "errors": errors, "issues": issues}
+        # Findings are accumulated per rule as a list (the same shape the semgrep
+        # findings use) so that a later match no longer overwrites an earlier one.
+        # Keying by m.rule on demand also avoids a KeyError when the rule name
+        # declared inside a .yar file differs from the file stem.
+        findings: defaultdict = defaultdict(list)
+        try:
+            scan_rules = yara.compile(filepaths=rules_path)
+            for root, _, files in os.walk(path):
+                for f in files:
+                    matches = scan_rules.match(os.path.join(root, f))
+                    for m in matches:
+                        message = m.meta.get("description", f"{m.rule} rule matched")
+                        for s in m.strings:
+                            for i in s.instances:
+                                findings[m.rule].append({
+                                    "location": f"{f}:{i.offset}",
+                                    "code": self.trim_code_snippet(str(i.matched_data)),
+                                    'message': message
+                                })
+                                # one issue per recorded finding
+                                issues += 1
+        except Exception as e:
+            # best effort: a failure is reported, findings gathered so far are kept
+            errors["rules-all"] = f"failed to run rule: {str(e)}"
+        # update() keeps `results` a plain dict (no default factory leaks out)
+        results.update(findings)
+        return {"results": results, "errors": errors, "issues": issues}
+    def analyze_semgrep(self, path, rules=None) -> dict:
+        """
+        Analyzes the source code of a given package
+        Args:
+            path (str): path to directory of package
+            rules (set, optional): Set of source code rules to analyze. Defaults to all rules.
+        Returns:
+            dict[str]: map from each source code rule and their corresponding output
+        """
+        log.debug(f"Running semgrep rules against directory '{path}'")
         targetpath = Path(path)
-        all_rules = self.sourcecode_ruleset
+        all_rules = self.semgrep_ruleset
         if rules is not None:
             # filtering the full ruleset with the user's input
-            all_rules = self.sourcecode_ruleset & rules
+            all_rules = self.semgrep_ruleset & rules
         results = {rule: {} for rule in all_rules}  # type: dict
         errors = {}
@@ -155,11 +236,11 @@ class Analyzer:
         ))
         if len(rules_path) == 0:
-            log.debug("No source code rules to run")
+            log.debug("No semgrep code rules to run")
             return {"results": {}, "errors": {}, "issues": 0}
         try:
-            log.debug(f"Running source code rules against {path}")
+            log.debug(f"Running semgrep code rules against {path}")
             response = self._invoke_semgrep(target=path, rules=rules_path)
             rule_results = self._format_semgrep_response(response, targetpath=targetpath)
             issues += sum(len(res) for res in rule_results.values())
@@ -240,11 +321,16 @@ output: {e.output}
             location = file_path + ":" + str(line)
             code = self.trim_code_snippet(code_snippet)
-            results[rule_name].append({
+            finding = {
                 'location': location,
                 'code': code,
                 'message': result["extra"]["message"]
-            })
+            }
+            rule_results = results[rule_name]
+            if finding in rule_results:
+                continue
+            results[rule_name].append(finding)
         return results

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/analyzer/metadata/__init__.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from guarddog.analyzer.metadata.detector import Detector
 from guarddog.analyzer.metadata.npm import NPM_METADATA_RULES
 from guarddog.analyzer.metadata.pypi import PYPI_METADATA_RULES
+from guarddog.analyzer.metadata.go import GO_METADATA_RULES
 from guarddog.ecosystems import ECOSYSTEM
@@ -10,3 +11,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]:
             return PYPI_METADATA_RULES
         case ECOSYSTEM.NPM:
             return NPM_METADATA_RULES
+        case ECOSYSTEM.GO:
+            return GO_METADATA_RULES

guarddog-2.0.1/guarddog/analyzer/metadata/go/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from guarddog.analyzer.metadata import Detector
+# Maps each Go metadata detector name to an instance of that detector
+GO_METADATA_RULES = {}
+# Detector classes to register; the list is empty, so the loop below
+# currently registers nothing and GO_METADATA_RULES stays empty
+classes: list[Detector] = []
+for detectorClass in classes:
+    # each entry is called to build the detector, then keyed by its get_name()
+    detectorInstance = detectorClass()  # type: ignore
+    GO_METADATA_RULES[detectorInstance.get_name()] = detectorInstance

guarddog-2.0.1/guarddog/analyzer/sourcecode/__init__.py ADDED Viewed

@@ -0,0 +1,108 @@
+import os
+import pathlib
+from dataclasses import dataclass
+from typing import Optional, Iterable
+import yaml
+from yaml.loader import SafeLoader
+from guarddog.ecosystems import ECOSYSTEM
+current_dir = pathlib.Path(__file__).parent.resolve()
+# These data classes aim to reduce the spreading of the logic,
+# instead of using a dict as a structure and parsing it differently depending on the type
+@dataclass
+class SourceCodeRule:
+    """
+    Base class for source code rules
+    Holds what every rule kind shares: its identifier and the file it was loaded from
+    """
+    # rule identifier
+    id: str
+    # name of the rule file inside the sourcecode directory
+    file: str
+@dataclass
+class YaraRule(SourceCodeRule):
+    """
+    YARA rule: adds no field to the base class
+    The distinct type lets get_sourcecode_rules filter rules by kind
+    """
+    pass
+@dataclass
+class SempgrepRule(SourceCodeRule):
+    """
+    Semgrep rules are language specific, so each one is tied to an ecosystem
+    Content of rule in yaml format is accessible through rule_content
+    NOTE: the class name is spelled "Sempgrep"; the analyzer imports it under this name
+    """
+    # text of the rule's metadata.description entry ("" when the rule has none)
+    description: str
+    # ecosystem derived from one of the rule's languages
+    ecosystem: ECOSYSTEM
+    # the rule entry as parsed from its yml file
+    rule_content: dict
+def get_sourcecode_rules(
+    ecosystem: ECOSYSTEM, kind: Optional[type] = None
+) -> Iterable[SourceCodeRule]:
+    """
+    Lazily yields the registered source code rules that apply to an ecosystem.
+    Args:
+        ecosystem: ecosystem the rule must target; a rule that carries no
+            ``ecosystem`` attribute is treated as applying to every ecosystem
+        kind: when given, only rules that are instances of this class are yielded
+    """
+    for candidate in SOURCECODE_RULES:
+        matches_kind = not kind or isinstance(candidate, kind)
+        # fall back to the requested ecosystem so attribute-less rules always pass
+        if matches_kind and getattr(candidate, "ecosystem", ecosystem) == ecosystem:
+            yield candidate
+# Registry of every rule found in this directory. It is filled by the two
+# loading loops below and read by get_sourcecode_rules.
+SOURCECODE_RULES: list[SourceCodeRule] = list()
+# NOTE: the filter keeps any file name ending in "yml", with or without a dot before it
+semgrep_rule_file_names = list(
+    filter(lambda x: x.endswith("yml"), os.listdir(current_dir))
+)
+# all yml files placed in the sourcecode directory are loaded as semgrep rules
+# refer to README.md for more information
+for file_name in semgrep_rule_file_names:
+    with open(os.path.join(current_dir, file_name), "r") as fd:
+        data = yaml.load(fd, Loader=SafeLoader)
+        for rule in data["rules"]:
+            # one SempgrepRule is registered per ecosystem the rule's languages map to
+            for lang in rule["languages"]:
+                ecosystem = None
+                match lang:
+                    case "python":
+                        ecosystem = ECOSYSTEM.PYPI
+                    case "javascript" | "typescript" | "json":
+                        ecosystem = ECOSYSTEM.NPM
+                    case "go":
+                        ecosystem = ECOSYSTEM.GO
+                    case _:
+                        # languages that map to no ecosystem are skipped
+                        continue
+                # avoids duplicates when multiple languages are supported by a rule
+                # (the lookup sees SOURCECODE_RULES as populated so far, so the
+                # order of these statements matters)
+                if not next(
+                    filter(
+                        lambda r: r.id == rule["id"],
+                        get_sourcecode_rules(ecosystem, SempgrepRule),
+                    ),
+                    None,
+                ):
+                    SOURCECODE_RULES.append(
+                        SempgrepRule(
+                            id=rule["id"],
+                            ecosystem=ecosystem,
+                            description=rule.get("metadata", {}).get("description", ""),
+                            file=file_name,
+                            rule_content=rule,
+                        )
+                    )
+# NOTE: the filter keeps any file name ending in "yar", with or without a dot before it
+yara_rule_file_names = list(
+    filter(lambda x: x.endswith("yar"), os.listdir(current_dir))
+)
+# all yar files placed in the sourcecode directory are loaded as YARA rules
+# refer to README.md for more information
+for file_name in yara_rule_file_names:
+    # the rule id is the file name without its extension
+    SOURCECODE_RULES.append(YaraRule(id=pathlib.Path(file_name).stem, file=file_name))

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/analyzer/sourcecode/dll-hijacking.yml RENAMED Viewed

@@ -11,6 +11,8 @@ rules:
       - pattern-either:
         - patterns:
           - pattern: "$DLL_LOAD"
+          # Ignore docstrings
+          - pattern-not-regex: ^\s*"""(.|\n)*?"""\s*$
           - metavariable-pattern:
               metavariable: $DLL_LOAD
               pattern-either:
@@ -20,6 +22,21 @@ rules:
                 - pattern-regex: (?i).*?\/bin/.+\s+.*?\.so
                 # environment preload
                 - pattern-regex: LD_PRELOAD
+                # MITRE ATT&CK "System Binary Proxy Execution" techniques
+                # https://attack.mitre.org/techniques/T1218/
+                - pattern-regex: (?i)control(.exe)?\s+\S+.cpl
+                - pattern-regex: (?i)cmstp(.exe)?\s+\S+
+                - pattern-regex: (?i)InstallUtil(.exe)?\s+\S+
+                - pattern-regex: (?i)mshta(.exe)?\s+\S+
+                - pattern-regex: (?i)msiexec(.exe)?\s+\S+
+                - pattern-regex: (?i)odbcconf(.exe)?\s+.*{\s*REGSVR\s+\S+\s*}
+                - pattern-regex: (?i)regsvcs(.exe)?\s+\S+
+                - pattern-regex: (?i)regasm(.exe)?\s+\S+
+                - pattern-regex: (?i)regsvr32(.exe)?\s+\S+
+                - pattern-regex: (?i)rundll32(.exe)?\s+\S+
+                - pattern-regex: (?i)verclsid(.exe)?\s+.*{\s*\S+\s*}
+                - pattern-regex: (?i)mavinject(.exe)?\s+\d+\s+/INJECTRUNNING\s+\S+
+                - pattern-regex: (?i)mmc(.exe)?\s+-Embedding\s+\S+.ms
         - patterns:
           - pattern: $FN($EXE,...,$DLL)
           - metavariable-pattern:

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/analyzer/sourcecode/exfiltrate-sensitive-data.yml RENAMED Viewed

@@ -31,6 +31,18 @@ rules:
           - metavariable-regex:
               metavariable: $ENVVAR
               regex: ([\"\'](AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|AWS_SESSION_TOKEN)[\"\'])
+      - patterns:
+          - pattern-inside: |
+              $CONNECT = sqlite3.connect(...)
+              ...
+              $CURSOR = $CONNECT.cursor(...)
+              ...
+          - pattern: $CURSOR.execute($QUERY, ...)
+          - metavariable-pattern:
+              metavariable: $QUERY
+              patterns:
+                - pattern: "..."
+                - pattern-regex: (?i)(cookies|credit_cards|logins|moz_cookies|moz_formhistory|moz_logins)
     pattern-sinks:
       - pattern-either:
           - pattern-inside: requests.$METHOD(...)

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/analyzer/sourcecode/npm-dll-hijacking.yml RENAMED Viewed

@@ -2,7 +2,7 @@ rules:
   - id: npm-dll-hijacking
     languages:
       - javascript
-    message: This package manipulates a trusted application into loading a malicious dll
+    message: This package manipulates a trusted application into loading a malicious DLL
     metadata:
       description: Identifies when a malicious package manipulates a trusted application into loading a malicious DLL
     pattern-either:
@@ -20,6 +20,21 @@ rules:
                 - pattern-regex: (?i).*?\/bin/.+\s+.*?\.so
                 # environment preload
                 - pattern-regex: LD_PRELOAD
+                # MITRE ATT&CK "System Binary Proxy Execution" techniques
+                # https://attack.mitre.org/techniques/T1218/
+                - pattern-regex: (?i)control(.exe)?\s+\S+.cpl
+                - pattern-regex: (?i)cmstp(.exe)?\s+\S+
+                - pattern-regex: (?i)InstallUtil(.exe)?\s+\S+
+                - pattern-regex: (?i)mshta(.exe)?\s+\S+
+                - pattern-regex: (?i)msiexec(.exe)?\s+\S+
+                - pattern-regex: (?i)odbcconf(.exe)?\s+.*{\s*REGSVR\s+\S+\s*}
+                - pattern-regex: (?i)regsvcs(.exe)?\s+\S+
+                - pattern-regex: (?i)regasm(.exe)?\s+\S+
+                - pattern-regex: (?i)regsvr32(.exe)?\s+\S+
+                - pattern-regex: (?i)rundll32(.exe)?\s+\S+
+                - pattern-regex: (?i)verclsid(.exe)?\s+.*{\s*\S+\s*}
+                - pattern-regex: (?i)mavinject(.exe)?\s+\d+\s+/INJECTRUNNING\s+\S+
+                - pattern-regex: (?i)mmc(.exe)?\s+-Embedding\s+\S+.ms
         - patterns:
           - pattern: $FN($EXE,...,$DLL)
           - metavariable-pattern:
@@ -58,7 +73,7 @@ rules:
                   - pattern: ....appendFile
             - metavariable-pattern:
                 metavariable: $EXE
-                patterns:
+                patterns:
                   # a string with .exe or /bin/[whatever] in it
                   - pattern: "..."
                   - pattern-regex: (?i).*?(\.exe|\/bin/.+)

{guarddog-1.11.2 → guarddog-2.0.1}/guarddog/analyzer/sourcecode/npm-install-script.yml RENAMED Viewed

@@ -10,6 +10,17 @@ rules:
       # (typically when a dependency is a git repository, see https://github.com/npm/cli/issues/6031#issuecomment-1449119423)
       # however this happens pretty rarely so reporting every package with a "prepare" script would be too noisy;
       # see https://github.com/DataDog/guarddog/issues/308
+      - pattern-not: |
+            "...": "npx only-allow pnpm"
+      - pattern-not: |
+          "...": ""
+      - pattern-not: |
+          "...": "patch-package"
+      - pattern-not: |
+          "...": "husky"
+      - pattern-not: |
+          "preinstall": "echo \"preinstall script\""
       - pattern-either:
           - pattern: |
               "preinstall": "..."

guarddog-2.0.1/guarddog/analyzer/sourcecode/shady-links.yml ADDED Viewed

@@ -0,0 +1,44 @@
+# TODO: Detects these links well, but lots of legitimate packages seem to use these domain extensions
+# Flags string literals holding an http(s) URL whose host is one of the listed
+# domains, ends in one of the listed top-level domains, or is a raw IP address.
+rules:
+  - id: shady-links
+    message: This package contains an URL to a domain with a suspicious extension
+    metadata:
+      description: Identify when a package contains an URL to a domain with a suspicious extension
+    patterns:
+      # ignore comments
+      - pattern-not-regex: ^\s*\# .*
+      - pattern-not-regex: ^\s*\/\*(.|\n)*?\*\/\s*$
+      - pattern-not-regex: ^\s*\/\/.*$
+      # ignore docstring
+      - pattern-not-regex: ^\s*"""(.|\n)*?"""\s*$
+      # Exclude local IPv4 sometimes used in tests
+      # NOTE(review): `0\.0\.0\.0` and `localhost` are top-level alternatives of the
+      # group, so they match without the URL prefix; confirm this is intended
+      - pattern-not-regex: (http[s]?:\/\/[^\n\[\/\?#"']*?(?:192\.168|10\.\d{1,3}|172\.(?:1[6-9]|2\d|3[0-1])|127\.\d{1,3})\.\d{1,3}\.\d{1,3}|0\.0\.0\.0|localhost)
+      # Exclude public IPv4 sometimes used in tests
+      - pattern-not-regex: (http[s]?:\/\/[^\n\[\/\?#"']*?(?:1\.1\.1\.1|8\.8\.8\.8))
+      - patterns:
+        - pattern: ("...")
+        - pattern-either:
+            # complete domains
+            # (the separator between backblazeb2.com and paste.ee must be an unescaped
+            # `|`; an escaped one is a literal pipe and neither domain matches alone)
+            - pattern-regex: (http[s]?:\/\/[^\n\[\/\?#"']*?(bit\.ly|discord\.com|workers\.dev|transfer\.sh|filetransfer\.io|sendspace\.com|appdomain\.cloud|backblazeb2\.com|paste\.ee|ngrok\.io|termbin\.com|localhost\.run|webhook\.site|oastify\.com|burpcollaborator\.me)\/)
+            # top-level domains
+            - pattern-regex: (http[s]?:\/\/[^\n\[\/\?#"']*?\.(link|xyz|tk|ml|ga|cf|gq|pw|top|club|mw|bd|ke|am|sbs|date|quest|cd|bid|ws|icu|cam|uno|email|stream)\/)
+            # IPv4
+            - pattern-regex: (http[s]?:\/\/[^\n\[\/\?#"']*?(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}))
+            # IPv6
+            - pattern-regex: (http[s]?:\/\/[^\n\[\/\?#"']*?(?:\[(([A-Fa-f0-9]{1,4}:){0,7}|:):?[A-Fa-f0-9]{1,4}(:[A-Fa-f0-9]{1,4}){0,7})\])
+    paths:
+      exclude:
+        - "*/test/*"
+        - "*/tests/*"
+        - "*/test_*"
+    languages:
+      - javascript
+      - python
+      - typescript
+      - go
+    severity: WARNING

guarddog 1.11.2__tar.gz → 2.0.1__tar.gz

guarddog 1.11.2.tar.gz → 2.0.1.tar.gz