credsweeper 1.10.7__tar.gz → 1.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- {credsweeper-1.10.7 → credsweeper-1.11.0}/PKG-INFO +4 -2
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/__init__.py +1 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/keyword_pattern.py +1 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/morpheme_checklist.txt +2 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/bzip2_scanner.py +1 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/deep_scanner.py +77 -37
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/gzip_scanner.py +1 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/__init__.py +3 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in_path.py +4 -2
- credsweeper-1.11.0/credsweeper/ml_model/features/word_in_postamble.py +32 -0
- credsweeper-1.11.0/credsweeper/ml_model/features/word_in_preamble.py +37 -0
- credsweeper-1.10.7/credsweeper/ml_model/features/word_in_line.py → credsweeper-1.11.0/credsweeper/ml_model/features/word_in_transition.py +10 -7
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/ml_config.json +214 -80
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/ml_model.onnx +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/rules/config.yaml +32 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scanner.py +6 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/secret/config.json +4 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/util.py +24 -8
- {credsweeper-1.10.7 → credsweeper-1.11.0}/pyproject.toml +3 -1
- {credsweeper-1.10.7 → credsweeper-1.11.0}/.gitignore +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/LICENSE +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/README.md +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/__main__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/app.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/constants.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/keyword_checklist.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/keyword_checklist.txt +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/config/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/config/config.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/augment_candidates.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/candidate.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/candidate_group_generator.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/candidate_key.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/credential_manager.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/line_data.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/abstract_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/byte_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/docx_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/eml_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/encoder_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/html_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/jks_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/lang_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/mxfile_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/pdf_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/pkcs12_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/pptx_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/tar_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/xlsx_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/xml_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/zip_scanner.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/abstract_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/analysis_target.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/byte_content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/data_content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/descriptor.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/diff_content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/file_path_extractor.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/files_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/patches_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/string_content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/struct_content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/text_content_provider.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/filter.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/general_keyword.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/general_pattern.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/group.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/password_keyword.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/token_pattern.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/url_credentials_group.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/weird_base36_token.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/weird_base64_token.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/line_git_binary_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/line_specific_key_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/line_uue_part_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_allowlist_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_array_dictionary_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_atlassian_token_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_azure_token_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base32_data_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_data_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_encoded_pem_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_key_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_part_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_blocklist_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_camel_case_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_couple_keyword_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_dictionary_keyword_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_dictionary_value_length_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_discord_bot_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_entropy_base32_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_entropy_base36_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_entropy_base64_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_file_path_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_first_word_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_github_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_grafana_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_grafana_service_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_hex_number_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_jfrog_token_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_json_web_token_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_last_word_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_method_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_not_allowed_pattern_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_not_part_encoded_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_number_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_pattern_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_similarity_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_split_keyword_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_string_type_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base32_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base36_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base64_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_check.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/logger/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/logger/logger.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/entropy_evaluation.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/feature.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/file_extension.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/has_html_tag.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/is_secret_numeric.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/length_of_attribute.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/morpheme_dense.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/rule_name.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/search_in_attribute.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in_value.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in_variable.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/ml_validator.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/py.typed +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/rules/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/rules/rule.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/multi_pattern.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/pem_key_pattern.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/scan_type.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/single_pattern.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/secret/log.yaml +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/__init__.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/entropy_validator.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/hop_stat.py +0 -0
- {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/pem_key_detector.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: credsweeper
|
|
3
|
-
Version: 1.10.7
|
|
3
|
+
Version: 1.11.0
|
|
4
4
|
Summary: Credential Sweeper
|
|
5
5
|
Project-URL: Homepage, https://github.com/Samsung/CredSweeper
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/Samsung/CredSweeper/issues
|
|
@@ -26,7 +26,8 @@ Requires-Dist: humanfriendly
|
|
|
26
26
|
Requires-Dist: lxml
|
|
27
27
|
Requires-Dist: numpy<2.0.0
|
|
28
28
|
Requires-Dist: odfpy
|
|
29
|
-
Requires-Dist: onnxruntime
|
|
29
|
+
Requires-Dist: onnxruntime; platform_system != 'Windows'
|
|
30
|
+
Requires-Dist: onnxruntime==1.19.2; platform_system == 'Windows'
|
|
30
31
|
Requires-Dist: openpyxl
|
|
31
32
|
Requires-Dist: pandas
|
|
32
33
|
Requires-Dist: pdfminer-six
|
|
@@ -37,6 +38,7 @@ Requires-Dist: python-docx
|
|
|
37
38
|
Requires-Dist: python-pptx
|
|
38
39
|
Requires-Dist: pyyaml
|
|
39
40
|
Requires-Dist: whatthepatch
|
|
41
|
+
Requires-Dist: xlrd
|
|
40
42
|
Description-Content-Type: text/markdown
|
|
41
43
|
|
|
42
44
|
# CredSweeper
|
|
@@ -26,7 +26,7 @@ class KeywordPattern:
|
|
|
26
26
|
string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
|
|
27
27
|
left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
|
|
28
28
|
# Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
|
|
29
|
-
auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey)\s)?"
|
|
29
|
+
auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey|ssws|ntlm)\s)?"
|
|
30
30
|
value = r"(?P<value>" \
|
|
31
31
|
r"(?(value_leftquote)" \
|
|
32
32
|
r"(" \
|
|
@@ -29,7 +29,7 @@ class Bzip2Scanner(AbstractScanner, ABC):
|
|
|
29
29
|
bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
|
|
30
30
|
file_path=new_path,
|
|
31
31
|
file_type=Util.get_extension(new_path),
|
|
32
|
-
info=f"{data_provider.info}|BZIP2:{
|
|
32
|
+
info=f"{data_provider.info}|BZIP2:{file_path}")
|
|
33
33
|
new_limit = recursive_limit_size - len(bzip2_content_provider.data)
|
|
34
34
|
bzip2_candidates = self.recursive_scan(bzip2_content_provider, depth, new_limit)
|
|
35
35
|
return bzip2_candidates
|
|
@@ -76,17 +76,32 @@ class DeepScanner(
|
|
|
76
76
|
return self.__scanner
|
|
77
77
|
|
|
78
78
|
@staticmethod
|
|
79
|
-
def get_deep_scanners(data: bytes, file_type: str, depth: int) -> List[Any]:
|
|
80
|
-
"""Returns possibly scan methods for the data depends on content"""
|
|
79
|
+
def get_deep_scanners(data: bytes, file_type: str, depth: int) -> Tuple[List[Any], List[Any]]:
|
|
80
|
+
"""Returns possibly scan methods for the data depends on content and fallback scanners"""
|
|
81
81
|
deep_scanners: List[Any] = []
|
|
82
|
+
fallback_scanners: List[Any] = []
|
|
82
83
|
if Util.is_zip(data):
|
|
83
84
|
if 0 < depth:
|
|
84
85
|
deep_scanners.append(ZipScanner)
|
|
85
|
-
# probably, there might be a docx,
|
|
86
|
+
# probably, there might be a docx, xlsx and so on.
|
|
86
87
|
# It might be scanned with text representation in third-party libraries.
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
if file_type in (".xlsx", ".ods"):
|
|
89
|
+
deep_scanners.append(XlsxScanner)
|
|
90
|
+
else:
|
|
91
|
+
fallback_scanners.append(XlsxScanner)
|
|
92
|
+
if ".docx" == file_type:
|
|
93
|
+
deep_scanners.append(DocxScanner)
|
|
94
|
+
else:
|
|
95
|
+
fallback_scanners.append(DocxScanner)
|
|
96
|
+
if ".pptx" == file_type:
|
|
97
|
+
deep_scanners.append(PptxScanner)
|
|
98
|
+
else:
|
|
99
|
+
fallback_scanners.append(PptxScanner)
|
|
100
|
+
elif Util.is_com(data):
|
|
101
|
+
if ".xls" == file_type:
|
|
102
|
+
deep_scanners.append(XlsxScanner)
|
|
103
|
+
else:
|
|
104
|
+
fallback_scanners.append(XlsxScanner)
|
|
90
105
|
elif Util.is_bzip2(data):
|
|
91
106
|
if 0 < depth:
|
|
92
107
|
deep_scanners.append(Bzip2Scanner)
|
|
@@ -102,25 +117,67 @@ class DeepScanner(
|
|
|
102
117
|
deep_scanners.append(JksScanner)
|
|
103
118
|
elif Util.is_asn1(data):
|
|
104
119
|
deep_scanners.append(Pkcs12Scanner)
|
|
105
|
-
elif file_type in [".eml", ".mht"]:
|
|
106
|
-
if Util.is_eml(data):
|
|
107
|
-
deep_scanners.append(EmlScanner)
|
|
108
|
-
elif Util.is_xml(data) and Util.is_html(data):
|
|
109
|
-
deep_scanners.append(HtmlScanner)
|
|
110
|
-
else:
|
|
111
|
-
deep_scanners.append(ByteScanner)
|
|
112
120
|
elif Util.is_xml(data):
|
|
113
121
|
if Util.is_html(data):
|
|
114
122
|
deep_scanners.append(HtmlScanner)
|
|
123
|
+
deep_scanners.append(XmlScanner)
|
|
124
|
+
fallback_scanners.append(ByteScanner)
|
|
115
125
|
elif Util.is_mxfile(data):
|
|
116
126
|
deep_scanners.append(MxfileScanner)
|
|
117
|
-
|
|
118
|
-
|
|
127
|
+
deep_scanners.append(XmlScanner)
|
|
128
|
+
fallback_scanners.append(ByteScanner)
|
|
129
|
+
else:
|
|
130
|
+
deep_scanners.append(XmlScanner)
|
|
131
|
+
fallback_scanners.append(ByteScanner)
|
|
132
|
+
elif Util.is_eml(data):
|
|
133
|
+
if ".eml" == file_type:
|
|
134
|
+
deep_scanners.append(EmlScanner)
|
|
135
|
+
else:
|
|
136
|
+
fallback_scanners.append(EmlScanner)
|
|
137
|
+
fallback_scanners.append(ByteScanner)
|
|
138
|
+
elif not Util.is_binary(data):
|
|
119
139
|
if 0 < depth:
|
|
120
140
|
deep_scanners.append(EncoderScanner)
|
|
121
141
|
deep_scanners.append(LangScanner)
|
|
122
142
|
deep_scanners.append(ByteScanner)
|
|
123
|
-
|
|
143
|
+
else:
|
|
144
|
+
logger.warning("Cannot apply a deep scanner for type %s", file_type)
|
|
145
|
+
return deep_scanners, fallback_scanners
|
|
146
|
+
|
|
147
|
+
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
148
|
+
|
|
149
|
+
def deep_scan_with_fallback(self, data_provider: DataContentProvider, depth: int,
                            recursive_limit_size: int) -> List[Candidate]:
    """Scans with deep scanners and, when none of them succeeds, with fallback scanners

    The primary scanners returned by get_deep_scanners are all applied. A scanner that returns
    None is treated as "content type not recognised". Only if every primary scanner returned None
    the fallback scanners are tried in order and the first one that returns a result is used.

    Args:
        data_provider: DataContentProvider with raw data
        depth: maximal level of recursion
        recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack

    Returns: list with candidates

    """
    candidates: List[Candidate] = []
    deep_scanners, fallback_scanners = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
    fallback = True
    for scan_class in deep_scanners:
        new_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
        if new_candidates is None:
            # scanner did not recognise the content type
            continue
        augment_candidates(candidates, new_candidates)
        # this scan is successful, so fallback is not necessary
        fallback = False
    if fallback:
        # iterate the FALLBACK list here: the primary scanners have already declined this content,
        # so running them again would only repeat the same unsuccessful work
        for scan_class in fallback_scanners:
            fallback_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
            if fallback_candidates is None:
                continue
            augment_candidates(candidates, fallback_candidates)
            # use only first successful fallback scanner
            break
    return candidates
|
|
124
181
|
|
|
125
182
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
126
183
|
|
|
@@ -160,17 +217,8 @@ class DeepScanner(
|
|
|
160
217
|
file_path=content_provider.file_path,
|
|
161
218
|
file_type=content_provider.file_type,
|
|
162
219
|
info=content_provider.info or info)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
fallback = True
|
|
166
|
-
for scan_class in scanner_classes:
|
|
167
|
-
if new_candidates := scan_class.data_scan(self, data_provider, depth, recursive_limit_size - len(data)):
|
|
168
|
-
augment_candidates(candidates, new_candidates)
|
|
169
|
-
fallback = False
|
|
170
|
-
if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data):
|
|
171
|
-
# wrong assumption case
|
|
172
|
-
fallback_candidates = ByteScanner.data_scan(self, data_provider, depth, recursive_limit_size)
|
|
173
|
-
augment_candidates(candidates, fallback_candidates)
|
|
220
|
+
new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size - len(data))
|
|
221
|
+
augment_candidates(candidates, new_candidates)
|
|
174
222
|
return candidates
|
|
175
223
|
|
|
176
224
|
def recursive_scan(
|
|
@@ -203,16 +251,8 @@ class DeepScanner(
|
|
|
203
251
|
FilePathExtractor.FIND_BY_EXT_RULE)
|
|
204
252
|
candidates.append(dummy_candidate)
|
|
205
253
|
else:
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
|
|
209
|
-
for scanner_class in scanner_classes:
|
|
210
|
-
if new_candidates := scanner_class.data_scan(self, data_provider, depth, recursive_limit_size):
|
|
211
|
-
augment_candidates(candidates, new_candidates)
|
|
212
|
-
fallback = False
|
|
213
|
-
if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data_provider.data):
|
|
214
|
-
bypass_candidates = ByteScanner.data_scan(self, data_provider, depth, recursive_limit_size)
|
|
215
|
-
augment_candidates(candidates, bypass_candidates)
|
|
254
|
+
new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size)
|
|
255
|
+
augment_candidates(candidates, new_candidates)
|
|
216
256
|
|
|
217
257
|
return candidates
|
|
218
258
|
|
|
@@ -31,7 +31,7 @@ class GzipScanner(AbstractScanner, ABC):
|
|
|
31
31
|
gzip_content_provider = DataContentProvider(data=f.read(),
|
|
32
32
|
file_path=new_path,
|
|
33
33
|
file_type=Util.get_extension(new_path),
|
|
34
|
-
info=f"{data_provider.info}|GZIP:{
|
|
34
|
+
info=f"{data_provider.info}|GZIP:{file_path}")
|
|
35
35
|
new_limit = recursive_limit_size - len(gzip_content_provider.data)
|
|
36
36
|
gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
|
|
37
37
|
return gzip_candidates
|
|
@@ -6,7 +6,9 @@ from credsweeper.ml_model.features.length_of_attribute import LengthOfAttribute
|
|
|
6
6
|
from credsweeper.ml_model.features.morpheme_dense import MorphemeDense
|
|
7
7
|
from credsweeper.ml_model.features.rule_name import RuleName
|
|
8
8
|
from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
|
|
9
|
-
from credsweeper.ml_model.features.word_in_line import WordInLine
|
|
10
9
|
from credsweeper.ml_model.features.word_in_path import WordInPath
|
|
10
|
+
from credsweeper.ml_model.features.word_in_postamble import WordInPostamble
|
|
11
|
+
from credsweeper.ml_model.features.word_in_preamble import WordInPreamble
|
|
12
|
+
from credsweeper.ml_model.features.word_in_transition import WordInTransition
|
|
11
13
|
from credsweeper.ml_model.features.word_in_value import WordInValue
|
|
12
14
|
from credsweeper.ml_model.features.word_in_variable import WordInVariable
|
|
@@ -21,8 +21,10 @@ class WordInPath(WordIn):
|
|
|
21
21
|
|
|
22
22
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
    """Applies the word search to the lower-cased POSIX form of the first candidate's path

    Args:
        candidates: candidates of one group; only the path of the first line data
            of the first candidate is read

    Returns: result of word_in_str for the path, or a single all-zero int8 row when the path is empty

    """
    # actually there must be one path because the candidates are grouped before
    file_path = candidates[0].line_data_list[0].path
    if not file_path:
        return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
    path = Path(file_path)
    posix_lower_path = path.as_posix().lower()
    if not path.is_absolute():
        # apply ./ for normalised path to detect "/src" for relative path
        posix_lower_path = f"./{posix_lower_path}"
    return self.word_in_str(posix_lower_path)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from credsweeper.common.constants import ML_HUNK
|
|
6
|
+
from credsweeper.credentials import Candidate
|
|
7
|
+
from credsweeper.ml_model.features.word_in import WordIn
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WordInPostamble(WordIn):
    """Feature searches predefined words in the part of the line that follows the value."""

    def __init__(self, words: List[str]) -> None:
        """Passes the predefined words to the base feature

        Args:
            words: list of predefined words - MUST BE IN LOWER CASE

        """
        super().__init__(words)

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns word matches for the text after the value, or a zero row if there is no such text"""
        line_data = candidate.line_data_list[0]
        # the inspected window starts at value_end and is cut at ML_HUNK or at the end of the line
        postamble_end = min(len(line_data.line), line_data.value_end + ML_HUNK)
        postamble = line_data.line[line_data.value_end:postamble_end].strip()
        if not postamble:
            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
        return self.word_in_str(postamble.lower())
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from credsweeper.common.constants import ML_HUNK
|
|
6
|
+
from credsweeper.credentials import Candidate
|
|
7
|
+
from credsweeper.ml_model.features.word_in import WordIn
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WordInPreamble(WordIn):
    """Feature searches predefined words in the part of the line before the variable or the value."""

    def __init__(self, words: List[str]) -> None:
        """Passes the predefined words to the base feature

        Args:
            words: list of predefined words - MUST BE IN LOWER CASE

        """
        super().__init__(words)

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns word matches for the text before variable or value, or a zero row if it is empty"""
        line_data = candidate.line_data_list[0]
        # a negative variable_start means the variable position is not usable, so the value start is the anchor
        if 0 <= line_data.variable_start:
            anchor = line_data.variable_start
        else:
            anchor = line_data.value_start
        # the inspected window is at most ML_HUNK symbols and never begins before the line start
        preamble_start = max(0, anchor - ML_HUNK)
        preamble = line_data.line[preamble_start:anchor].strip()
        if not preamble:
            return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
        return self.word_in_str(preamble.lower())
|
|
@@ -2,13 +2,11 @@ from typing import List
|
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
|
-
from credsweeper.common.constants import CHUNK_SIZE
|
|
6
5
|
from credsweeper.credentials import Candidate
|
|
7
6
|
from credsweeper.ml_model.features.word_in import WordIn
|
|
8
|
-
from credsweeper.utils import Util
|
|
9
7
|
|
|
10
8
|
|
|
11
|
-
class WordInLine(WordIn):
|
|
9
|
+
class WordInTransition(WordIn):
|
|
12
10
|
"""Feature is true if line contains at least one word from predefined list."""
|
|
13
11
|
|
|
14
12
|
def __init__(self, words: List[str]) -> None:
|
|
@@ -21,9 +19,14 @@ class WordInLine(WordIn):
|
|
|
21
19
|
super().__init__(words)
|
|
22
20
|
|
|
23
21
|
def extract(self, candidate: Candidate) -> np.ndarray:
    """Returns word matches for the text between variable and value, or a zero row if there is none"""
    line_data = candidate.line_data_list[0]
    # the transition exists only when the variable end is known and lies strictly before the value start
    if line_data.variable_end < 0 or line_data.value_start <= line_data.variable_end:
        return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
    transition = line_data.line[line_data.variable_end:line_data.value_start].strip()
    if not transition:
        return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
    return self.word_in_str(transition.lower())
|