PyPI - credsweeper - Versions diffs - 1.12.2__py3-none-any.whl → 1.13.0__py3-none-any.whl - Mend

credsweeper 1.12.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of credsweeper might be problematic. Click here for more details.

Files changed (46)

credsweeper/__init__.py +1 -1
credsweeper/__main__.py +15 -8
credsweeper/app.py +7 -2
credsweeper/common/keyword_pattern.py +6 -3
credsweeper/common/morpheme_checklist.txt +11 -1
credsweeper/config/config.py +1 -0
credsweeper/credentials/line_data.py +16 -0
credsweeper/deep_scanner/deep_scanner.py +12 -6
credsweeper/deep_scanner/rtf_scanner.py +41 -0
credsweeper/deep_scanner/strings_scanner.py +52 -0
credsweeper/file_handler/byte_content_provider.py +10 -1
credsweeper/file_handler/file_path_extractor.py +2 -0
credsweeper/file_handler/text_content_provider.py +7 -1
credsweeper/filters/__init__.py +1 -1
credsweeper/filters/group/token_pattern.py +2 -2
credsweeper/filters/group/weird_base36_token.py +2 -2
credsweeper/filters/group/weird_base64_token.py +2 -2
credsweeper/filters/value_file_path_check.py +5 -3
credsweeper/filters/value_github_check.py +3 -2
credsweeper/filters/value_morphemes_check.py +43 -0
credsweeper/filters/value_string_type_check.py +1 -0
credsweeper/ml_model/features/feature.py +1 -18
credsweeper/ml_model/features/file_extension.py +1 -1
credsweeper/ml_model/features/has_html_tag.py +10 -8
credsweeper/ml_model/features/is_secret_numeric.py +4 -3
credsweeper/ml_model/features/rule_name.py +1 -1
credsweeper/ml_model/features/word_in.py +9 -32
credsweeper/ml_model/features/word_in_path.py +2 -3
credsweeper/ml_model/features/word_in_postamble.py +1 -4
credsweeper/ml_model/features/word_in_preamble.py +1 -4
credsweeper/ml_model/features/word_in_transition.py +1 -4
credsweeper/ml_model/features/word_in_value.py +2 -3
credsweeper/ml_model/features/word_in_variable.py +2 -3
credsweeper/ml_model/ml_config.json +15 -8
credsweeper/ml_model/ml_model.onnx +0 -0
credsweeper/ml_model/ml_validator.py +1 -1
credsweeper/rules/config.yaml +129 -128
credsweeper/scanner/scanner.py +12 -7
credsweeper/secret/config.json +18 -5
credsweeper/utils/util.py +19 -16
{credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/METADATA +7 -7
{credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/RECORD +45 -43
credsweeper/filters/value_couple_keyword_check.py +0 -28
{credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/WHEEL +0 -0
{credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/entry_points.txt +0 -0
{credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/licenses/LICENSE +0 -0

credsweeper/ml_model/features/feature.py CHANGED Viewed

@@ -10,7 +10,7 @@ class Feature(ABC):
     """Base class for features."""
     def __init__(self):
-        self.words = []
+        pass
     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
         """Call base class for features.
@@ -25,20 +25,3 @@ class Feature(ABC):
     def extract(self, candidate: Candidate) -> Any:
         """Abstract method of base class"""
         raise NotImplementedError
-    @property
-    def words(self) -> List[str]:
-        """getter"""
-        return self.__words
-    @words.setter
-    def words(self, words: List[str]) -> None:
-        """setter"""
-        self.__words = words
-    def any_word_in_(self, a_string: str) -> bool:
-        """Returns true if any words in a string"""
-        for i in self.words:
-            if i in a_string:
-                return True
-        return False

credsweeper/ml_model/features/file_extension.py CHANGED Viewed

@@ -19,7 +19,7 @@ class FileExtension(WordIn):
     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
         extension_set = set(candidate.line_data_list[0].file_type.lower() for candidate in candidates)
-        return self.word_in_set(extension_set)
+        return self.word_in_(extension_set)
     def extract(self, candidate: Candidate) -> Any:
         raise NotImplementedError

credsweeper/ml_model/features/has_html_tag.py CHANGED Viewed

@@ -1,17 +1,18 @@
 from credsweeper.common.constants import CHUNK_SIZE
 from credsweeper.credentials.candidate import Candidate
-from credsweeper.ml_model.features.feature import Feature
+from credsweeper.ml_model.features.word_in import WordIn
 from credsweeper.utils.util import Util
-class HasHtmlTag(Feature):
+class HasHtmlTag(WordIn):
     """Feature is true if line has HTML tags (HTML file)."""
+    HTML_WORDS = [
+        '< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a', '<a'
+    ]
     def __init__(self) -> None:
-        super().__init__()
-        self.words = [
-            '< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a', '<a'
-        ]
+        super().__init__(HasHtmlTag.HTML_WORDS)
     def extract(self, candidate: Candidate) -> bool:
         subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
@@ -19,8 +20,9 @@ class HasHtmlTag(Feature):
         if '<' not in candidate_line_data_list_0_line_lower:
             # early check
             return False
-        if self.any_word_in_(candidate_line_data_list_0_line_lower):
-            return True
+        for i in self.words:
+            if i in candidate_line_data_list_0_line_lower:
+                return True
         if "/>" in candidate_line_data_list_0_line_lower or "</" in candidate_line_data_list_0_line_lower:
             # possible closed tag
             return True

credsweeper/ml_model/features/is_secret_numeric.py CHANGED Viewed

@@ -1,3 +1,5 @@
+import contextlib
 from credsweeper.credentials.candidate import Candidate
 from credsweeper.ml_model.features.feature import Feature
@@ -6,8 +8,7 @@ class IsSecretNumeric(Feature):
     """Feature is true if candidate value is a numerical value."""
     def extract(self, candidate: Candidate) -> bool:
-        try:
+        with contextlib.suppress(ValueError):
             float(candidate.line_data_list[0].value)
             return True
-        except ValueError:
-            return False
+        return False

credsweeper/ml_model/features/rule_name.py CHANGED Viewed

@@ -19,7 +19,7 @@ class RuleName(WordIn):
     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
         candidate_rule_set = set(x.rule_name for x in candidates)
-        return self.word_in_set(candidate_rule_set)
+        return self.word_in_(candidate_rule_set)
     def extract(self, candidate: Candidate) -> Any:
         raise NotImplementedError

credsweeper/ml_model/features/word_in.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import List, Any, Tuple, Set
+from typing import List, Any, Set, Union
 import numpy as np
@@ -18,42 +18,19 @@ class WordIn(Feature):
         if len(self.enumerated_words) != self.dimension:
             raise RuntimeError(f"Check duplicates:{words}")
-    @property
-    def enumerated_words(self) -> List[Tuple[int, str]]:
-        """getter for speedup"""
-        return self.__enumerated_words
-    @enumerated_words.setter
-    def enumerated_words(self, enumerated_words: List[Tuple[int, str]]) -> None:
-        """setter for speedup"""
-        self.__enumerated_words = enumerated_words
-    @property
-    def dimension(self) -> int:
-        """getter"""
-        return self.__dimension
-    @dimension.setter
-    def dimension(self, dimension: int) -> None:
-        """setter"""
-        self.__dimension = dimension
     @abstractmethod
     def extract(self, candidate: Candidate) -> Any:
         raise NotImplementedError
-    def word_in_str(self, a_string: str) -> np.ndarray:
-        """Returns array with words included in a string"""
-        result: np.ndarray = np.zeros(shape=[self.dimension], dtype=np.int8)
-        for i, word in self.enumerated_words:
-            if word in a_string:
-                result[i] = 1
-        return np.array([result])
+    @property
+    def zero(self) -> np.ndarray:
+        """Returns zero filled array for case of empty input"""
+        return np.zeros(shape=[self.dimension], dtype=np.int8)
-    def word_in_set(self, a_strings_set: Set[str]) -> np.ndarray:
-        """Returns array with words matches in a_strings_set"""
-        result: np.ndarray = np.zeros(shape=[self.dimension], dtype=np.int8)
+    def word_in_(self, iterable_data: Union[str, List[str], Set[str]]) -> np.ndarray:
+        """Returns array with words included in a string"""
+        result: np.ndarray = self.zero
         for i, word in self.enumerated_words:
-            if word in a_strings_set:
+            if word in iterable_data:
                 result[i] = 1
         return np.array([result])

credsweeper/ml_model/features/word_in_path.py CHANGED Viewed

@@ -19,9 +19,8 @@ class WordInPath(WordIn):
             posix_lower_path = path.as_posix().lower() if path.is_absolute() else f"./{path.as_posix().lower()}"
             # prevent extra confusion from the same word in extension
             path_without_extension, _ = os.path.splitext(posix_lower_path)
-            return self.word_in_str(path_without_extension)
-        else:
-            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
+            return self.word_in_(path_without_extension)
+        return np.array([self.zero])
     def extract(self, candidate: Candidate) -> Any:
         raise NotImplementedError

credsweeper/ml_model/features/word_in_postamble.py CHANGED Viewed

@@ -15,7 +15,4 @@ class WordInPostamble(WordIn):
             else candidate.line_data_list[0].value_end + ML_HUNK
         postamble = candidate.line_data_list[0].line[candidate.line_data_list[0].value_end:postamble_end].strip()
-        if postamble:
-            return self.word_in_str(postamble.lower())
-        else:
-            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
+        return self.word_in_(postamble.lower()) if postamble else np.array([self.zero])

credsweeper/ml_model/features/word_in_preamble.py CHANGED Viewed

@@ -20,7 +20,4 @@ class WordInPreamble(WordIn):
                 else candidate.line_data_list[0].value_start - ML_HUNK
             preamble = candidate.line_data_list[0].line[preamble_start:candidate.line_data_list[0].value_start].strip()
-        if preamble:
-            return self.word_in_str(preamble.lower())
-        else:
-            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
+        return self.word_in_(preamble.lower()) if preamble else np.array([self.zero])

credsweeper/ml_model/features/word_in_transition.py CHANGED Viewed

@@ -15,7 +15,4 @@ class WordInTransition(WordIn):
         else:
             transition = ''
-        if transition:
-            return self.word_in_str(transition.lower())
-        else:
-            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
+        return self.word_in_(transition.lower()) if transition else np.array([self.zero])

credsweeper/ml_model/features/word_in_value.py CHANGED Viewed

@@ -10,6 +10,5 @@ class WordInValue(WordIn):
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns array of matching words for first line"""
         if value := candidate.line_data_list[0].value:
-            return self.word_in_str(value.lower())
-        else:
-            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
+            return self.word_in_(value.lower())
+        return np.array([self.zero])

credsweeper/ml_model/features/word_in_variable.py CHANGED Viewed

@@ -10,6 +10,5 @@ class WordInVariable(WordIn):
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns array of matching words for first line"""
         if variable := candidate.line_data_list[0].variable:
-            return self.word_in_str(variable.lower())
-        else:
-            return np.zeros(shape=[self.dimension], dtype=np.int8)
+            return self.word_in_(variable.lower())
+        return np.array([self.zero])

credsweeper/ml_model/ml_config.json CHANGED Viewed

@@ -10,6 +10,7 @@
     "features": [
         {
             "type": "RuleSeverity",
+            "comment": "INFO=0.0, LOW=0.25, MEDIUM=0.5, HIGH=0.75, CRITICAL=1.0",
             "kwargs": {}
         },
         {
@@ -62,7 +63,7 @@
             "type": "SearchInAttribute",
             "comment": "Repeated symbol",
             "kwargs": {
-                "pattern": ".*(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
+                "pattern": "(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
                 "attribute": "value"
             }
         },
@@ -70,7 +71,7 @@
             "type": "SearchInAttribute",
             "comment": "SHA marker",
             "kwargs": {
-                "pattern": ".*(?i:sha)[_-]?(224|256|384|512)",
+                "pattern": "(?i:sha)[_-]?(224|256|384|512)",
                 "attribute": "value"
             }
         },
@@ -126,7 +127,7 @@
             "type": "SearchInAttribute",
             "comment": "VariableNotAllowedNameCheck",
             "kwargs": {
-                "pattern": "(?i:(filters?|pub(lic)?)_?key)",
+                "pattern": "(?i:(sha[_-]?(224|256|384|512)|projects?|filters?|pub(lic)?)_?key)",
                 "attribute": "variable"
             }
         },
@@ -134,7 +135,7 @@
             "type": "SearchInAttribute",
             "comment": "VariableNotAllowedNameCheck",
             "kwargs": {
-                "pattern": "(?i:(id|size|name|type|manager|algorithm|view|error)$)",
+                "pattern": "(?i:(id|sum|size|name|type|manager|algorithm|pattern|view|error|date(time)?|time(stamp)?|tag|version|hash|rate)$)",
                 "attribute": "variable"
             }
         },
@@ -245,8 +246,10 @@
                     "crypt",
                     "crypted",
                     "decrypt",
+                    "edited",
                     "encrypt",
                     "example",
+                    "expire",
                     "fake",
                     "file",
                     "foo",
@@ -260,7 +263,8 @@
                     "pass",
                     "public",
                     "pwd",
-                    "rsa-",
+                    "redacted",
+                    "rsa",
                     "salt",
                     "secret",
                     "sha",
@@ -339,6 +343,7 @@
                     "get",
                     "e.g.",
                     "equal",
+                    "env",
                     "example",
                     "expect",
                     "line",
@@ -484,6 +489,7 @@
                     ".bat",
                     ".bats",
                     ".bazel",
+                    ".bin",
                     ".build",
                     ".bundle",
                     ".bzl",
@@ -504,7 +510,6 @@
                     ".csp",
                     ".csv",
                     ".dist",
-                    ".doc",
                     ".dockerfile",
                     ".edited",
                     ".eex",
@@ -527,6 +532,8 @@
                     ".gtpl",
                     ".h",
                     ".haml",
+                    ".har",
+                    ".hpp",
                     ".hs",
                     ".html",
                     ".idl",
@@ -657,8 +664,8 @@
                     "CMD Password",
                     "CMD Secret",
                     "CMD Token",
+                    "CURL User Password",
                     "Credential",
-                    "Github Old Token",
                     "Key",
                     "Nonce",
                     "Password",
@@ -671,4 +678,4 @@
             }
         }
     ]
-}
+}

credsweeper/ml_model/ml_model.onnx CHANGED Viewed

Binary file

credsweeper/ml_model/ml_validator.py CHANGED Viewed

@@ -272,7 +272,7 @@ class MlValidator:
         if head != tail:
             probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
                                                             features_list)
-        is_cred = probability > self.threshold
+        is_cred = self.threshold <= probability
         if logger.isEnabledFor(logging.DEBUG):
             for i, decision in enumerate(is_cred):
                 logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],

credsweeper 1.12.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

Potentially problematic release.

credsweeper 1.12.2__py3-none-any.whl → 1.13.0__py3-none-any.whl