credsweeper 1.10.7__tar.gz → 1.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (152) hide show
  1. {credsweeper-1.10.7 → credsweeper-1.11.0}/PKG-INFO +4 -2
  2. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/__init__.py +1 -1
  3. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/keyword_pattern.py +1 -1
  4. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/morpheme_checklist.txt +2 -0
  5. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/bzip2_scanner.py +1 -1
  6. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/deep_scanner.py +77 -37
  7. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/gzip_scanner.py +1 -1
  8. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/__init__.py +3 -1
  9. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in_path.py +4 -2
  10. credsweeper-1.11.0/credsweeper/ml_model/features/word_in_postamble.py +32 -0
  11. credsweeper-1.11.0/credsweeper/ml_model/features/word_in_preamble.py +37 -0
  12. credsweeper-1.10.7/credsweeper/ml_model/features/word_in_line.py → credsweeper-1.11.0/credsweeper/ml_model/features/word_in_transition.py +10 -7
  13. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/ml_config.json +214 -80
  14. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/ml_model.onnx +0 -0
  15. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/rules/config.yaml +32 -1
  16. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scanner.py +6 -1
  17. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/secret/config.json +4 -1
  18. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/util.py +24 -8
  19. {credsweeper-1.10.7 → credsweeper-1.11.0}/pyproject.toml +3 -1
  20. {credsweeper-1.10.7 → credsweeper-1.11.0}/.gitignore +0 -0
  21. {credsweeper-1.10.7 → credsweeper-1.11.0}/LICENSE +0 -0
  22. {credsweeper-1.10.7 → credsweeper-1.11.0}/README.md +0 -0
  23. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/__main__.py +0 -0
  24. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/app.py +0 -0
  25. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/__init__.py +0 -0
  26. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/constants.py +0 -0
  27. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/keyword_checklist.py +0 -0
  28. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/common/keyword_checklist.txt +0 -0
  29. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/config/__init__.py +0 -0
  30. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/config/config.py +0 -0
  31. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/__init__.py +0 -0
  32. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/augment_candidates.py +0 -0
  33. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/candidate.py +0 -0
  34. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/candidate_group_generator.py +0 -0
  35. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/candidate_key.py +0 -0
  36. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/credential_manager.py +0 -0
  37. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/credentials/line_data.py +0 -0
  38. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/__init__.py +0 -0
  39. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/abstract_scanner.py +0 -0
  40. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/byte_scanner.py +0 -0
  41. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/docx_scanner.py +0 -0
  42. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/eml_scanner.py +0 -0
  43. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/encoder_scanner.py +0 -0
  44. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/html_scanner.py +0 -0
  45. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/jks_scanner.py +0 -0
  46. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/lang_scanner.py +0 -0
  47. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/mxfile_scanner.py +0 -0
  48. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/pdf_scanner.py +0 -0
  49. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/pkcs12_scanner.py +0 -0
  50. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/pptx_scanner.py +0 -0
  51. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/tar_scanner.py +0 -0
  52. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/xlsx_scanner.py +0 -0
  53. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/xml_scanner.py +0 -0
  54. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/deep_scanner/zip_scanner.py +0 -0
  55. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/__init__.py +0 -0
  56. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/abstract_provider.py +0 -0
  57. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/analysis_target.py +0 -0
  58. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/byte_content_provider.py +0 -0
  59. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/content_provider.py +0 -0
  60. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/data_content_provider.py +0 -0
  61. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/descriptor.py +0 -0
  62. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/diff_content_provider.py +0 -0
  63. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/file_path_extractor.py +0 -0
  64. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/files_provider.py +0 -0
  65. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/patches_provider.py +0 -0
  66. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/string_content_provider.py +0 -0
  67. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/struct_content_provider.py +0 -0
  68. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/file_handler/text_content_provider.py +0 -0
  69. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/__init__.py +0 -0
  70. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/filter.py +0 -0
  71. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/__init__.py +0 -0
  72. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/general_keyword.py +0 -0
  73. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/general_pattern.py +0 -0
  74. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/group.py +0 -0
  75. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/password_keyword.py +0 -0
  76. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/token_pattern.py +0 -0
  77. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/url_credentials_group.py +0 -0
  78. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/weird_base36_token.py +0 -0
  79. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/group/weird_base64_token.py +0 -0
  80. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/line_git_binary_check.py +0 -0
  81. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/line_specific_key_check.py +0 -0
  82. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/line_uue_part_check.py +0 -0
  83. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_allowlist_check.py +0 -0
  84. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_array_dictionary_check.py +0 -0
  85. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_atlassian_token_check.py +0 -0
  86. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_azure_token_check.py +0 -0
  87. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base32_data_check.py +0 -0
  88. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_data_check.py +0 -0
  89. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_encoded_pem_check.py +0 -0
  90. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_key_check.py +0 -0
  91. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_base64_part_check.py +0 -0
  92. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_blocklist_check.py +0 -0
  93. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_camel_case_check.py +0 -0
  94. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_couple_keyword_check.py +0 -0
  95. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_dictionary_keyword_check.py +0 -0
  96. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_dictionary_value_length_check.py +0 -0
  97. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_discord_bot_check.py +0 -0
  98. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_entropy_base32_check.py +0 -0
  99. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_entropy_base36_check.py +0 -0
  100. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_entropy_base64_check.py +0 -0
  101. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_file_path_check.py +0 -0
  102. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_first_word_check.py +0 -0
  103. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_github_check.py +0 -0
  104. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_grafana_check.py +0 -0
  105. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_grafana_service_check.py +0 -0
  106. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_hex_number_check.py +0 -0
  107. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_jfrog_token_check.py +0 -0
  108. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_json_web_token_check.py +0 -0
  109. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_last_word_check.py +0 -0
  110. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_method_check.py +0 -0
  111. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_not_allowed_pattern_check.py +0 -0
  112. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_not_part_encoded_check.py +0 -0
  113. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_number_check.py +0 -0
  114. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_pattern_check.py +0 -0
  115. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_similarity_check.py +0 -0
  116. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_split_keyword_check.py +0 -0
  117. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_string_type_check.py +0 -0
  118. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base32_check.py +0 -0
  119. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base36_check.py +0 -0
  120. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base64_check.py +0 -0
  121. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_base_check.py +0 -0
  122. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/filters/value_token_check.py +0 -0
  123. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/logger/__init__.py +0 -0
  124. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/logger/logger.py +0 -0
  125. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/__init__.py +0 -0
  126. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/entropy_evaluation.py +0 -0
  127. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/feature.py +0 -0
  128. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/file_extension.py +0 -0
  129. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/has_html_tag.py +0 -0
  130. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/is_secret_numeric.py +0 -0
  131. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/length_of_attribute.py +0 -0
  132. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/morpheme_dense.py +0 -0
  133. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/rule_name.py +0 -0
  134. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/search_in_attribute.py +0 -0
  135. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in.py +0 -0
  136. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in_value.py +0 -0
  137. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/features/word_in_variable.py +0 -0
  138. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/ml_model/ml_validator.py +0 -0
  139. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/py.typed +0 -0
  140. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/rules/__init__.py +0 -0
  141. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/rules/rule.py +0 -0
  142. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/__init__.py +0 -0
  143. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/__init__.py +0 -0
  144. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/multi_pattern.py +0 -0
  145. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/pem_key_pattern.py +0 -0
  146. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/scan_type.py +0 -0
  147. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/scanner/scan_type/single_pattern.py +0 -0
  148. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/secret/log.yaml +0 -0
  149. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/__init__.py +0 -0
  150. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/entropy_validator.py +0 -0
  151. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/hop_stat.py +0 -0
  152. {credsweeper-1.10.7 → credsweeper-1.11.0}/credsweeper/utils/pem_key_detector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: credsweeper
3
- Version: 1.10.7
3
+ Version: 1.11.0
4
4
  Summary: Credential Sweeper
5
5
  Project-URL: Homepage, https://github.com/Samsung/CredSweeper
6
6
  Project-URL: Bug Tracker, https://github.com/Samsung/CredSweeper/issues
@@ -26,7 +26,8 @@ Requires-Dist: humanfriendly
26
26
  Requires-Dist: lxml
27
27
  Requires-Dist: numpy<2.0.0
28
28
  Requires-Dist: odfpy
29
- Requires-Dist: onnxruntime
29
+ Requires-Dist: onnxruntime; platform_system != 'Windows'
30
+ Requires-Dist: onnxruntime==1.19.2; platform_system == 'Windows'
30
31
  Requires-Dist: openpyxl
31
32
  Requires-Dist: pandas
32
33
  Requires-Dist: pdfminer-six
@@ -37,6 +38,7 @@ Requires-Dist: python-docx
37
38
  Requires-Dist: python-pptx
38
39
  Requires-Dist: pyyaml
39
40
  Requires-Dist: whatthepatch
41
+ Requires-Dist: xlrd
40
42
  Description-Content-Type: text/markdown
41
43
 
42
44
  # CredSweeper
@@ -18,4 +18,4 @@ __all__ = [
18
18
  '__version__'
19
19
  ]
20
20
 
21
- __version__ = "1.10.7"
21
+ __version__ = "1.11.0"
@@ -26,7 +26,7 @@ class KeywordPattern:
26
26
  string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
27
27
  left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
28
28
  # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
29
- auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey)\s)?"
29
+ auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey|ssws|ntlm)\s)?"
30
30
  value = r"(?P<value>" \
31
31
  r"(?(value_leftquote)" \
32
32
  r"(" \
@@ -960,6 +960,7 @@ nish
960
960
  nism
961
961
  node
962
962
  non
963
+ nope
963
964
  norm
964
965
  not
965
966
  nsive
@@ -1529,6 +1530,7 @@ warn
1529
1530
  watch
1530
1531
  wave
1531
1532
  way
1533
+ weak
1532
1534
  web
1533
1535
  week
1534
1536
  weight
@@ -29,7 +29,7 @@ class Bzip2Scanner(AbstractScanner, ABC):
29
29
  bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
30
30
  file_path=new_path,
31
31
  file_type=Util.get_extension(new_path),
32
- info=f"{data_provider.info}|BZIP2:{new_path}")
32
+ info=f"{data_provider.info}|BZIP2:{file_path}")
33
33
  new_limit = recursive_limit_size - len(bzip2_content_provider.data)
34
34
  bzip2_candidates = self.recursive_scan(bzip2_content_provider, depth, new_limit)
35
35
  return bzip2_candidates
@@ -76,17 +76,32 @@ class DeepScanner(
76
76
  return self.__scanner
77
77
 
78
78
  @staticmethod
79
- def get_deep_scanners(data: bytes, file_type: str, depth: int) -> List[Any]:
80
- """Returns possibly scan methods for the data depends on content"""
79
+ def get_deep_scanners(data: bytes, file_type: str, depth: int) -> Tuple[List[Any], List[Any]]:
80
+ """Returns possibly scan methods for the data depends on content and fallback scanners"""
81
81
  deep_scanners: List[Any] = []
82
+ fallback_scanners: List[Any] = []
82
83
  if Util.is_zip(data):
83
84
  if 0 < depth:
84
85
  deep_scanners.append(ZipScanner)
85
- # probably, there might be a docx, xlxs and so on.
86
+ # probably, there might be a docx, xlsx and so on.
86
87
  # It might be scanned with text representation in third-party libraries.
87
- deep_scanners.append(XlsxScanner)
88
- deep_scanners.append(DocxScanner)
89
- deep_scanners.append(PptxScanner)
88
+ if file_type in (".xlsx", ".ods"):
89
+ deep_scanners.append(XlsxScanner)
90
+ else:
91
+ fallback_scanners.append(XlsxScanner)
92
+ if ".docx" == file_type:
93
+ deep_scanners.append(DocxScanner)
94
+ else:
95
+ fallback_scanners.append(DocxScanner)
96
+ if ".pptx" == file_type:
97
+ deep_scanners.append(PptxScanner)
98
+ else:
99
+ fallback_scanners.append(PptxScanner)
100
+ elif Util.is_com(data):
101
+ if ".xls" == file_type:
102
+ deep_scanners.append(XlsxScanner)
103
+ else:
104
+ fallback_scanners.append(XlsxScanner)
90
105
  elif Util.is_bzip2(data):
91
106
  if 0 < depth:
92
107
  deep_scanners.append(Bzip2Scanner)
@@ -102,25 +117,67 @@ class DeepScanner(
102
117
  deep_scanners.append(JksScanner)
103
118
  elif Util.is_asn1(data):
104
119
  deep_scanners.append(Pkcs12Scanner)
105
- elif file_type in [".eml", ".mht"]:
106
- if Util.is_eml(data):
107
- deep_scanners.append(EmlScanner)
108
- elif Util.is_xml(data) and Util.is_html(data):
109
- deep_scanners.append(HtmlScanner)
110
- else:
111
- deep_scanners.append(ByteScanner)
112
120
  elif Util.is_xml(data):
113
121
  if Util.is_html(data):
114
122
  deep_scanners.append(HtmlScanner)
123
+ deep_scanners.append(XmlScanner)
124
+ fallback_scanners.append(ByteScanner)
115
125
  elif Util.is_mxfile(data):
116
126
  deep_scanners.append(MxfileScanner)
117
- deep_scanners.append(XmlScanner)
118
- else:
127
+ deep_scanners.append(XmlScanner)
128
+ fallback_scanners.append(ByteScanner)
129
+ else:
130
+ deep_scanners.append(XmlScanner)
131
+ fallback_scanners.append(ByteScanner)
132
+ elif Util.is_eml(data):
133
+ if ".eml" == file_type:
134
+ deep_scanners.append(EmlScanner)
135
+ else:
136
+ fallback_scanners.append(EmlScanner)
137
+ fallback_scanners.append(ByteScanner)
138
+ elif not Util.is_binary(data):
119
139
  if 0 < depth:
120
140
  deep_scanners.append(EncoderScanner)
121
141
  deep_scanners.append(LangScanner)
122
142
  deep_scanners.append(ByteScanner)
123
- return deep_scanners
143
+ else:
144
+ logger.warning("Cannot apply a deep scanner for type %s", file_type)
145
+ return deep_scanners, fallback_scanners
146
+
147
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
148
+
149
+ def deep_scan_with_fallback(self, data_provider: DataContentProvider, depth: int,
150
+ recursive_limit_size: int) -> List[Candidate]:
151
+ """Scans with deep scanners and fallback scanners if possible
152
+
153
+ Args:
154
+ data_provider: DataContentProvider with raw data
155
+ depth: maximal level of recursion
156
+ recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
157
+
158
+ Returns: list with candidates
159
+
160
+ """
161
+ candidates: List[Candidate] = []
162
+ deep_scanners, fallback_scanners = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
163
+ fallback = True
164
+ for scan_class in deep_scanners:
165
+ new_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
166
+ if new_candidates is None:
167
+ # scanner did not recognise the content type
168
+ continue
169
+ augment_candidates(candidates, new_candidates)
170
+ # this scan is successful, so fallback is not necessary
171
+ fallback = False
172
+ if fallback:
173
+ for scan_class in fallback_scanners:
174
+ fallback_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
175
+ if fallback_candidates is None:
176
+ continue
177
+ augment_candidates(candidates, fallback_candidates)
178
+ # use only first successful fallback scanner
179
+ break
180
+ return candidates
124
181
 
125
182
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
126
183
 
@@ -160,17 +217,8 @@ class DeepScanner(
160
217
  file_path=content_provider.file_path,
161
218
  file_type=content_provider.file_type,
162
219
  info=content_provider.info or info)
163
- # iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
164
- scanner_classes = self.get_deep_scanners(data, content_provider.file_type, depth)
165
- fallback = True
166
- for scan_class in scanner_classes:
167
- if new_candidates := scan_class.data_scan(self, data_provider, depth, recursive_limit_size - len(data)):
168
- augment_candidates(candidates, new_candidates)
169
- fallback = False
170
- if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data):
171
- # wrong assumption case
172
- fallback_candidates = ByteScanner.data_scan(self, data_provider, depth, recursive_limit_size)
173
- augment_candidates(candidates, fallback_candidates)
220
+ new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size - len(data))
221
+ augment_candidates(candidates, new_candidates)
174
222
  return candidates
175
223
 
176
224
  def recursive_scan(
@@ -203,16 +251,8 @@ class DeepScanner(
203
251
  FilePathExtractor.FIND_BY_EXT_RULE)
204
252
  candidates.append(dummy_candidate)
205
253
  else:
206
- fallback = True
207
- # iterate for all possibly scanner methods
208
- scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
209
- for scanner_class in scanner_classes:
210
- if new_candidates := scanner_class.data_scan(self, data_provider, depth, recursive_limit_size):
211
- augment_candidates(candidates, new_candidates)
212
- fallback = False
213
- if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data_provider.data):
214
- bypass_candidates = ByteScanner.data_scan(self, data_provider, depth, recursive_limit_size)
215
- augment_candidates(candidates, bypass_candidates)
254
+ new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size)
255
+ augment_candidates(candidates, new_candidates)
216
256
 
217
257
  return candidates
218
258
 
@@ -31,7 +31,7 @@ class GzipScanner(AbstractScanner, ABC):
31
31
  gzip_content_provider = DataContentProvider(data=f.read(),
32
32
  file_path=new_path,
33
33
  file_type=Util.get_extension(new_path),
34
- info=f"{data_provider.info}|GZIP:{new_path}")
34
+ info=f"{data_provider.info}|GZIP:{file_path}")
35
35
  new_limit = recursive_limit_size - len(gzip_content_provider.data)
36
36
  gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
37
37
  return gzip_candidates
@@ -6,7 +6,9 @@ from credsweeper.ml_model.features.length_of_attribute import LengthOfAttribute
6
6
  from credsweeper.ml_model.features.morpheme_dense import MorphemeDense
7
7
  from credsweeper.ml_model.features.rule_name import RuleName
8
8
  from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
9
- from credsweeper.ml_model.features.word_in_line import WordInLine
10
9
  from credsweeper.ml_model.features.word_in_path import WordInPath
10
+ from credsweeper.ml_model.features.word_in_postamble import WordInPostamble
11
+ from credsweeper.ml_model.features.word_in_preamble import WordInPreamble
12
+ from credsweeper.ml_model.features.word_in_transition import WordInTransition
11
13
  from credsweeper.ml_model.features.word_in_value import WordInValue
12
14
  from credsweeper.ml_model.features.word_in_variable import WordInVariable
@@ -21,8 +21,10 @@ class WordInPath(WordIn):
21
21
 
22
22
  def __call__(self, candidates: List[Candidate]) -> np.ndarray:
23
23
  # actually there must be one path because the candidates are grouped before
24
- if path := candidates[0].line_data_list[0].path:
25
- posix_lower_path = Path(path).as_posix().lower()
24
+ if file_path := candidates[0].line_data_list[0].path:
25
+ path = Path(file_path)
26
+ # apply ./ for normalised path to detect "/src" for relative path
27
+ posix_lower_path = path.as_posix().lower() if path.is_absolute() else f"./{path.as_posix().lower()}"
26
28
  return self.word_in_str(posix_lower_path)
27
29
  else:
28
30
  return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
@@ -0,0 +1,32 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+
5
+ from credsweeper.common.constants import ML_HUNK
6
+ from credsweeper.credentials import Candidate
7
+ from credsweeper.ml_model.features.word_in import WordIn
8
+
9
+
10
+ class WordInPostamble(WordIn):
11
+ """Feature is true if line contains at least one word from predefined list."""
12
+
13
+ def __init__(self, words: List[str]) -> None:
14
+ """Feature returns array of matching words
15
+
16
+ Args:
17
+ words: list of predefined words - MUST BE IN LOWER CASE
18
+
19
+ """
20
+ super().__init__(words)
21
+
22
+ def extract(self, candidate: Candidate) -> np.ndarray:
23
+ """Returns true if any words in a part of line after value"""
24
+ postamble_end = len(candidate.line_data_list[0].line) \
25
+ if len(candidate.line_data_list[0].line) < candidate.line_data_list[0].value_end + ML_HUNK \
26
+ else candidate.line_data_list[0].value_end + ML_HUNK
27
+ postamble = candidate.line_data_list[0].line[candidate.line_data_list[0].value_end:postamble_end].strip()
28
+
29
+ if postamble:
30
+ return self.word_in_str(postamble.lower())
31
+ else:
32
+ return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
@@ -0,0 +1,37 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+
5
+ from credsweeper.common.constants import ML_HUNK
6
+ from credsweeper.credentials import Candidate
7
+ from credsweeper.ml_model.features.word_in import WordIn
8
+
9
+
10
+ class WordInPreamble(WordIn):
11
+ """Feature is true if line contains at least one word from predefined list."""
12
+
13
+ def __init__(self, words: List[str]) -> None:
14
+ """Feature returns array of matching words
15
+
16
+ Args:
17
+ words: list of predefined words - MUST BE IN LOWER CASE
18
+
19
+ """
20
+ super().__init__(words)
21
+
22
+ def extract(self, candidate: Candidate) -> np.ndarray:
23
+ """Returns true if any words in line before variable or value"""
24
+ if 0 <= candidate.line_data_list[0].variable_start:
25
+ preamble_start = 0 if ML_HUNK >= candidate.line_data_list[0].variable_start \
26
+ else candidate.line_data_list[0].variable_start - ML_HUNK
27
+ preamble = candidate.line_data_list[0].line[preamble_start:candidate.line_data_list[0].
28
+ variable_start].strip()
29
+ else:
30
+ preamble_start = 0 if ML_HUNK >= candidate.line_data_list[0].value_start \
31
+ else candidate.line_data_list[0].value_start - ML_HUNK
32
+ preamble = candidate.line_data_list[0].line[preamble_start:candidate.line_data_list[0].value_start].strip()
33
+
34
+ if preamble:
35
+ return self.word_in_str(preamble.lower())
36
+ else:
37
+ return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
@@ -2,13 +2,11 @@ from typing import List
2
2
 
3
3
  import numpy as np
4
4
 
5
- from credsweeper.common.constants import CHUNK_SIZE
6
5
  from credsweeper.credentials import Candidate
7
6
  from credsweeper.ml_model.features.word_in import WordIn
8
- from credsweeper.utils import Util
9
7
 
10
8
 
11
- class WordInLine(WordIn):
9
+ class WordInTransition(WordIn):
12
10
  """Feature is true if line contains at least one word from predefined list."""
13
11
 
14
12
  def __init__(self, words: List[str]) -> None:
@@ -21,9 +19,14 @@ class WordInLine(WordIn):
21
19
  super().__init__(words)
22
20
 
23
21
  def extract(self, candidate: Candidate) -> np.ndarray:
24
- """Returns true if any words in first line"""
25
- subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
26
- if subtext:
27
- return self.word_in_str(subtext.lower())
22
+ """Returns true if any words between variable and value"""
23
+ if 0 <= candidate.line_data_list[0].variable_end < candidate.line_data_list[0].value_start:
24
+ transition = candidate.line_data_list[0].line[candidate.line_data_list[0].variable_end:candidate.
25
+ line_data_list[0].value_start].strip()
26
+ else:
27
+ transition = ''
28
+
29
+ if transition:
30
+ return self.word_in_str(transition.lower())
28
31
  else:
29
32
  return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])