credsweeper 1.10.6__tar.gz → 1.10.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (150)
  1. {credsweeper-1.10.6 → credsweeper-1.10.8}/PKG-INFO +5 -3
  2. {credsweeper-1.10.6 → credsweeper-1.10.8}/README.md +1 -1
  3. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/__init__.py +1 -1
  4. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/__main__.py +2 -2
  5. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/common/keyword_pattern.py +22 -15
  6. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/bzip2_scanner.py +1 -1
  7. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/deep_scanner.py +77 -37
  8. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/gzip_scanner.py +1 -1
  9. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_allowlist_check.py +24 -8
  10. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_blocklist_check.py +3 -0
  11. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/word_in_path.py +4 -2
  12. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/rules/config.yaml +48 -1
  13. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/scanner.py +6 -1
  14. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/secret/config.json +4 -1
  15. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/utils/hop_stat.py +27 -1
  16. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/utils/util.py +24 -8
  17. {credsweeper-1.10.6 → credsweeper-1.10.8}/pyproject.toml +3 -1
  18. {credsweeper-1.10.6 → credsweeper-1.10.8}/.gitignore +0 -0
  19. {credsweeper-1.10.6 → credsweeper-1.10.8}/LICENSE +0 -0
  20. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/app.py +0 -0
  21. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/common/__init__.py +0 -0
  22. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/common/constants.py +0 -0
  23. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/common/keyword_checklist.py +0 -0
  24. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/common/keyword_checklist.txt +0 -0
  25. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/common/morpheme_checklist.txt +0 -0
  26. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/config/__init__.py +0 -0
  27. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/config/config.py +0 -0
  28. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/__init__.py +0 -0
  29. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/augment_candidates.py +0 -0
  30. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/candidate.py +0 -0
  31. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/candidate_group_generator.py +0 -0
  32. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/candidate_key.py +0 -0
  33. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/credential_manager.py +0 -0
  34. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/credentials/line_data.py +0 -0
  35. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/__init__.py +0 -0
  36. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/abstract_scanner.py +0 -0
  37. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/byte_scanner.py +0 -0
  38. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/docx_scanner.py +0 -0
  39. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/eml_scanner.py +0 -0
  40. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/encoder_scanner.py +0 -0
  41. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/html_scanner.py +0 -0
  42. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/jks_scanner.py +0 -0
  43. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/lang_scanner.py +0 -0
  44. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/mxfile_scanner.py +0 -0
  45. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/pdf_scanner.py +0 -0
  46. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/pkcs12_scanner.py +0 -0
  47. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/pptx_scanner.py +0 -0
  48. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/tar_scanner.py +0 -0
  49. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/xlsx_scanner.py +0 -0
  50. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/xml_scanner.py +0 -0
  51. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/deep_scanner/zip_scanner.py +0 -0
  52. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/__init__.py +0 -0
  53. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/abstract_provider.py +0 -0
  54. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/analysis_target.py +0 -0
  55. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/byte_content_provider.py +0 -0
  56. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/content_provider.py +0 -0
  57. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/data_content_provider.py +0 -0
  58. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/descriptor.py +0 -0
  59. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/diff_content_provider.py +0 -0
  60. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/file_path_extractor.py +0 -0
  61. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/files_provider.py +0 -0
  62. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/patches_provider.py +0 -0
  63. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/string_content_provider.py +0 -0
  64. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/struct_content_provider.py +0 -0
  65. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/file_handler/text_content_provider.py +0 -0
  66. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/__init__.py +0 -0
  67. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/filter.py +0 -0
  68. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/__init__.py +0 -0
  69. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/general_keyword.py +0 -0
  70. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/general_pattern.py +0 -0
  71. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/group.py +0 -0
  72. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/password_keyword.py +0 -0
  73. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/token_pattern.py +0 -0
  74. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/url_credentials_group.py +0 -0
  75. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/weird_base36_token.py +0 -0
  76. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/group/weird_base64_token.py +0 -0
  77. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/line_git_binary_check.py +0 -0
  78. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/line_specific_key_check.py +0 -0
  79. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/line_uue_part_check.py +0 -0
  80. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_array_dictionary_check.py +0 -0
  81. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_atlassian_token_check.py +0 -0
  82. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_azure_token_check.py +0 -0
  83. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_base32_data_check.py +0 -0
  84. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_base64_data_check.py +0 -0
  85. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_base64_encoded_pem_check.py +0 -0
  86. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_base64_key_check.py +0 -0
  87. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_base64_part_check.py +0 -0
  88. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_camel_case_check.py +0 -0
  89. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_couple_keyword_check.py +0 -0
  90. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_dictionary_keyword_check.py +0 -0
  91. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_dictionary_value_length_check.py +0 -0
  92. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_discord_bot_check.py +0 -0
  93. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_entropy_base32_check.py +0 -0
  94. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_entropy_base36_check.py +0 -0
  95. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_entropy_base64_check.py +0 -0
  96. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_file_path_check.py +0 -0
  97. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_first_word_check.py +0 -0
  98. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_github_check.py +0 -0
  99. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_grafana_check.py +0 -0
  100. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_grafana_service_check.py +0 -0
  101. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_hex_number_check.py +0 -0
  102. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_jfrog_token_check.py +0 -0
  103. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_json_web_token_check.py +0 -0
  104. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_last_word_check.py +0 -0
  105. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_method_check.py +0 -0
  106. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_not_allowed_pattern_check.py +0 -0
  107. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_not_part_encoded_check.py +0 -0
  108. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_number_check.py +0 -0
  109. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_pattern_check.py +0 -0
  110. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_similarity_check.py +0 -0
  111. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_split_keyword_check.py +0 -0
  112. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_string_type_check.py +0 -0
  113. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_token_base32_check.py +0 -0
  114. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_token_base36_check.py +0 -0
  115. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_token_base64_check.py +0 -0
  116. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_token_base_check.py +0 -0
  117. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/filters/value_token_check.py +0 -0
  118. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/logger/__init__.py +0 -0
  119. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/logger/logger.py +0 -0
  120. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/__init__.py +0 -0
  121. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/__init__.py +0 -0
  122. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/entropy_evaluation.py +0 -0
  123. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/feature.py +0 -0
  124. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/file_extension.py +0 -0
  125. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/has_html_tag.py +0 -0
  126. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/is_secret_numeric.py +0 -0
  127. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/length_of_attribute.py +0 -0
  128. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/morpheme_dense.py +0 -0
  129. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/rule_name.py +0 -0
  130. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/search_in_attribute.py +0 -0
  131. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/word_in.py +0 -0
  132. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/word_in_line.py +0 -0
  133. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/word_in_value.py +0 -0
  134. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/features/word_in_variable.py +0 -0
  135. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/ml_config.json +0 -0
  136. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/ml_model.onnx +0 -0
  137. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/ml_model/ml_validator.py +0 -0
  138. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/py.typed +0 -0
  139. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/rules/__init__.py +0 -0
  140. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/rules/rule.py +0 -0
  141. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/__init__.py +0 -0
  142. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/scan_type/__init__.py +0 -0
  143. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/scan_type/multi_pattern.py +0 -0
  144. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/scan_type/pem_key_pattern.py +0 -0
  145. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/scan_type/scan_type.py +0 -0
  146. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/scanner/scan_type/single_pattern.py +0 -0
  147. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/secret/log.yaml +0 -0
  148. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/utils/__init__.py +0 -0
  149. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/utils/entropy_validator.py +0 -0
  150. {credsweeper-1.10.6 → credsweeper-1.10.8}/credsweeper/utils/pem_key_detector.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: credsweeper
3
- Version: 1.10.6
3
+ Version: 1.10.8
4
4
  Summary: Credential Sweeper
5
5
  Project-URL: Homepage, https://github.com/Samsung/CredSweeper
6
6
  Project-URL: Bug Tracker, https://github.com/Samsung/CredSweeper/issues
@@ -26,7 +26,8 @@ Requires-Dist: humanfriendly
26
26
  Requires-Dist: lxml
27
27
  Requires-Dist: numpy<2.0.0
28
28
  Requires-Dist: odfpy
29
- Requires-Dist: onnxruntime
29
+ Requires-Dist: onnxruntime; platform_system != 'Windows'
30
+ Requires-Dist: onnxruntime==1.19.2; platform_system == 'Windows'
30
31
  Requires-Dist: openpyxl
31
32
  Requires-Dist: pandas
32
33
  Requires-Dist: pdfminer-six
@@ -37,6 +38,7 @@ Requires-Dist: python-docx
37
38
  Requires-Dist: python-pptx
38
39
  Requires-Dist: pyyaml
39
40
  Requires-Dist: whatthepatch
41
+ Requires-Dist: xlrd
40
42
  Description-Content-Type: text/markdown
41
43
 
42
44
  # CredSweeper
@@ -87,7 +89,7 @@ Full documentation can be found here: <https://credsweeper.readthedocs.io/>
87
89
 
88
90
  ### Main Requirements
89
91
 
90
- - Python 3.8, 3.9, 3.10, 3.11, 3.12
92
+ - Python 3.9, 3.10, 3.11, 3.12
91
93
 
92
94
  ### Installation
93
95
 
@@ -46,7 +46,7 @@ Full documentation can be found here: <https://credsweeper.readthedocs.io/>
46
46
 
47
47
  ### Main Requirements
48
48
 
49
- - Python 3.8, 3.9, 3.10, 3.11, 3.12
49
+ - Python 3.9, 3.10, 3.11, 3.12
50
50
 
51
51
  ### Installation
52
52
 
@@ -18,4 +18,4 @@ __all__ = [
18
18
  '__version__'
19
19
  ]
20
20
 
21
- __version__ = "1.10.6"
21
+ __version__ = "1.10.8"
@@ -205,8 +205,8 @@ def get_arguments() -> Namespace:
205
205
  metavar="POSITIVE_INT")
206
206
  parser.add_argument("--thrifty",
207
207
  help="clear objects after scan to reduce memory consumption",
208
- action="store_const",
209
- const=True)
208
+ action=BooleanOptionalAction,
209
+ default=True)
210
210
  parser.add_argument("--skip_ignored",
211
211
  help="parse .gitignore files and skip credentials from ignored objects",
212
212
  dest="skip_ignored",
@@ -3,27 +3,30 @@ import re
3
3
 
4
4
  class KeywordPattern:
5
5
  """Pattern set of keyword types"""
6
- key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\
7
- r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
6
+ key_left = r"(\\[nrt]|%[0-9a-f]{2})?" \
7
+ r"(?P<variable>(([`'\"]{1,8}[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
8
8
  r"(?P<keyword>"
9
9
  # there will be inserted a keyword
10
10
  key_right = r")" \
11
- r"(&(quot|apos);|[^%:='\"`<>{?!&]*)[`'\"]*))" # <variable>
12
- separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
13
- r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
14
- r"(\s|\\+[tnr])*"
11
+ r"[^%:='\"`<>{?!&]*" \
12
+ r")" \
13
+ r"(&(quot|apos);|%[0-9a-f]{2}|[`'\"])*" \
14
+ r")" # <variable>
15
+ separator = r"(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*" \
16
+ r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:|=(>|&gt;|\\u0026gt;)|!==|!=|===|==|=|%3d)" \
17
+ r"(\s|\\{1,8}[tnr])*"
15
18
  # might be curly, square or parenthesis with words before
16
19
  wrap = r"(?P<wrap>(" \
17
- r"(new(\s|\\+[tnr])+)?" \
20
+ r"(new(\s|\\{1,8}[tnr]){1,8})?" \
18
21
  r"([0-9a-z_.]|-(>|(&|\\\\*u0026)gt;))*" \
19
- r"[\[\(\{]"\
20
- r"(\s|\\+[tnr])*" \
22
+ r"[\[\(\{]" \
23
+ r"(\s|\\{1,8}[tnr])*" \
21
24
  r"([0-9a-z_]{1,32}=)?" \
22
- r")+)?"
25
+ r"){1,8})?"
23
26
  string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
24
27
  left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
25
28
  # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
26
- auth_keywords = r"( ?(oauth|bot|basic|bearer|apikey|accesskey) )?"
29
+ auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey|ssws|ntlm)\s)?"
27
30
  value = r"(?P<value>" \
28
31
  r"(?(value_leftquote)" \
29
32
  r"(" \
@@ -31,11 +34,15 @@ class KeywordPattern:
31
34
  r"(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))" \
32
35
  r"|" \
33
36
  r"(?!&(quot|apos);)" \
34
- r"(\\+([ tnr]|[^\s`'\"])|[^\s`'\",;\\])" \
35
- r"){3,8000}" \
37
+ r"(\\{1,8}([ tnr]|[^\s`'\"])" \
38
+ r"|" \
39
+ r"(?P<url_esc>%[0-9a-f]{2})" \
40
+ r"|" \
41
+ r"(?(url_esc)[^\s`'\",;\\&]|[^\s`'\",;\\])" \
42
+ r")){3,8000}" \
36
43
  r"|(\{[^}]{3,8000}\})" \
37
44
  r"|(<[^>]{3,8000}>)" \
38
- r")"
45
+ r")" # <value>
39
46
  right_quote = r"(?(value_leftquote)" \
40
47
  r"(?P<value_rightquote>(?<!\\)(?P=value_leftquote)|\\$|(?<=[0-9a-z+_/-])$)" \
41
48
  r"|" \
@@ -44,7 +51,7 @@ class KeywordPattern:
44
51
  @classmethod
45
52
  def get_keyword_pattern(cls, keyword: str) -> re.Pattern:
46
53
  """Returns compiled regex pattern"""
47
- expression = "".join([ #
54
+ expression = ''.join([ #
48
55
  cls.key_left, #
49
56
  keyword, #
50
57
  cls.key_right, #
@@ -29,7 +29,7 @@ class Bzip2Scanner(AbstractScanner, ABC):
29
29
  bzip2_content_provider = DataContentProvider(data=bz2.decompress(data_provider.data),
30
30
  file_path=new_path,
31
31
  file_type=Util.get_extension(new_path),
32
- info=f"{data_provider.info}|BZIP2:{new_path}")
32
+ info=f"{data_provider.info}|BZIP2:{file_path}")
33
33
  new_limit = recursive_limit_size - len(bzip2_content_provider.data)
34
34
  bzip2_candidates = self.recursive_scan(bzip2_content_provider, depth, new_limit)
35
35
  return bzip2_candidates
@@ -76,17 +76,32 @@ class DeepScanner(
76
76
  return self.__scanner
77
77
 
78
78
  @staticmethod
79
- def get_deep_scanners(data: bytes, file_type: str, depth: int) -> List[Any]:
80
- """Returns possibly scan methods for the data depends on content"""
79
+ def get_deep_scanners(data: bytes, file_type: str, depth: int) -> Tuple[List[Any], List[Any]]:
80
+ """Returns possibly scan methods for the data depends on content and fallback scanners"""
81
81
  deep_scanners: List[Any] = []
82
+ fallback_scanners: List[Any] = []
82
83
  if Util.is_zip(data):
83
84
  if 0 < depth:
84
85
  deep_scanners.append(ZipScanner)
85
- # probably, there might be a docx, xlxs and so on.
86
+ # probably, there might be a docx, xlsx and so on.
86
87
  # It might be scanned with text representation in third-party libraries.
87
- deep_scanners.append(XlsxScanner)
88
- deep_scanners.append(DocxScanner)
89
- deep_scanners.append(PptxScanner)
88
+ if file_type in (".xlsx", ".ods"):
89
+ deep_scanners.append(XlsxScanner)
90
+ else:
91
+ fallback_scanners.append(XlsxScanner)
92
+ if ".docx" == file_type:
93
+ deep_scanners.append(DocxScanner)
94
+ else:
95
+ fallback_scanners.append(DocxScanner)
96
+ if ".pptx" == file_type:
97
+ deep_scanners.append(PptxScanner)
98
+ else:
99
+ fallback_scanners.append(PptxScanner)
100
+ elif Util.is_com(data):
101
+ if ".xls" == file_type:
102
+ deep_scanners.append(XlsxScanner)
103
+ else:
104
+ fallback_scanners.append(XlsxScanner)
90
105
  elif Util.is_bzip2(data):
91
106
  if 0 < depth:
92
107
  deep_scanners.append(Bzip2Scanner)
@@ -102,25 +117,67 @@ class DeepScanner(
102
117
  deep_scanners.append(JksScanner)
103
118
  elif Util.is_asn1(data):
104
119
  deep_scanners.append(Pkcs12Scanner)
105
- elif file_type in [".eml", ".mht"]:
106
- if Util.is_eml(data):
107
- deep_scanners.append(EmlScanner)
108
- elif Util.is_xml(data) and Util.is_html(data):
109
- deep_scanners.append(HtmlScanner)
110
- else:
111
- deep_scanners.append(ByteScanner)
112
120
  elif Util.is_xml(data):
113
121
  if Util.is_html(data):
114
122
  deep_scanners.append(HtmlScanner)
123
+ deep_scanners.append(XmlScanner)
124
+ fallback_scanners.append(ByteScanner)
115
125
  elif Util.is_mxfile(data):
116
126
  deep_scanners.append(MxfileScanner)
117
- deep_scanners.append(XmlScanner)
118
- else:
127
+ deep_scanners.append(XmlScanner)
128
+ fallback_scanners.append(ByteScanner)
129
+ else:
130
+ deep_scanners.append(XmlScanner)
131
+ fallback_scanners.append(ByteScanner)
132
+ elif Util.is_eml(data):
133
+ if ".eml" == file_type:
134
+ deep_scanners.append(EmlScanner)
135
+ else:
136
+ fallback_scanners.append(EmlScanner)
137
+ fallback_scanners.append(ByteScanner)
138
+ elif not Util.is_binary(data):
119
139
  if 0 < depth:
120
140
  deep_scanners.append(EncoderScanner)
121
141
  deep_scanners.append(LangScanner)
122
142
  deep_scanners.append(ByteScanner)
123
- return deep_scanners
143
+ else:
144
+ logger.warning("Cannot apply a deep scanner for type %s", file_type)
145
+ return deep_scanners, fallback_scanners
146
+
147
+ # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
148
+
149
+ def deep_scan_with_fallback(self, data_provider: DataContentProvider, depth: int,
150
+ recursive_limit_size: int) -> List[Candidate]:
151
+ """Scans with deep scanners and fallback scanners if possible
152
+
153
+ Args:
154
+ data_provider: DataContentProvider with raw data
155
+ depth: maximal level of recursion
156
+ recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
157
+
158
+ Returns: list with candidates
159
+
160
+ """
161
+ candidates: List[Candidate] = []
162
+ deep_scanners, fallback_scanners = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
163
+ fallback = True
164
+ for scan_class in deep_scanners:
165
+ new_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
166
+ if new_candidates is None:
167
+ # scanner did not recognise the content type
168
+ continue
169
+ augment_candidates(candidates, new_candidates)
170
+ # this scan is successful, so fallback is not necessary
171
+ fallback = False
172
+ if fallback:
173
+ for scan_class in fallback_scanners:
174
+ fallback_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
175
+ if fallback_candidates is None:
176
+ continue
177
+ augment_candidates(candidates, fallback_candidates)
178
+ # use only first successful fallback scanner
179
+ break
180
+ return candidates
124
181
 
125
182
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
126
183
 
@@ -160,17 +217,8 @@ class DeepScanner(
160
217
  file_path=content_provider.file_path,
161
218
  file_type=content_provider.file_type,
162
219
  info=content_provider.info or info)
163
- # iterate for all possibly scanner methods WITHOUT ByteContentProvider for TextContentProvider
164
- scanner_classes = self.get_deep_scanners(data, content_provider.file_type, depth)
165
- fallback = True
166
- for scan_class in scanner_classes:
167
- if new_candidates := scan_class.data_scan(self, data_provider, depth, recursive_limit_size - len(data)):
168
- augment_candidates(candidates, new_candidates)
169
- fallback = False
170
- if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data):
171
- # wrong assumption case
172
- fallback_candidates = ByteScanner.data_scan(self, data_provider, depth, recursive_limit_size)
173
- augment_candidates(candidates, fallback_candidates)
220
+ new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size - len(data))
221
+ augment_candidates(candidates, new_candidates)
174
222
  return candidates
175
223
 
176
224
  def recursive_scan(
@@ -203,16 +251,8 @@ class DeepScanner(
203
251
  FilePathExtractor.FIND_BY_EXT_RULE)
204
252
  candidates.append(dummy_candidate)
205
253
  else:
206
- fallback = True
207
- # iterate for all possibly scanner methods
208
- scanner_classes = self.get_deep_scanners(data_provider.data, data_provider.file_type, depth)
209
- for scanner_class in scanner_classes:
210
- if new_candidates := scanner_class.data_scan(self, data_provider, depth, recursive_limit_size):
211
- augment_candidates(candidates, new_candidates)
212
- fallback = False
213
- if fallback and ByteScanner not in scanner_classes and not Util.is_binary(data_provider.data):
214
- bypass_candidates = ByteScanner.data_scan(self, data_provider, depth, recursive_limit_size)
215
- augment_candidates(candidates, bypass_candidates)
254
+ new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size)
255
+ augment_candidates(candidates, new_candidates)
216
256
 
217
257
  return candidates
218
258
 
@@ -31,7 +31,7 @@ class GzipScanner(AbstractScanner, ABC):
31
31
  gzip_content_provider = DataContentProvider(data=f.read(),
32
32
  file_path=new_path,
33
33
  file_type=Util.get_extension(new_path),
34
- info=f"{data_provider.info}|GZIP:{new_path}")
34
+ info=f"{data_provider.info}|GZIP:{file_path}")
35
35
  new_limit = recursive_limit_size - len(gzip_content_provider.data)
36
36
  gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
37
37
  return gzip_candidates
@@ -8,22 +8,35 @@ from credsweeper.utils import Util
8
8
 
9
9
 
10
10
  class ValueAllowlistCheck(Filter):
11
- """Check that patterns from the list is not present in the candidate value."""
11
+ """Check that the patterns do not MATCH the candidate value."""
12
12
 
13
13
  ALLOWED = [
14
14
  r"ENC\(.*\)", #
15
15
  r"ENC\[.*\]", #
16
16
  r"\$\{(\*|[0-9]+|[a-z_].*)\}", #
17
- r"\$([0-9]+\b|[a-z_]+[0-9a-z_]*)", #
17
+ r"\$[0-9]+(\s|$)", #
18
18
  r"\$\$[a-z_]+(\^%[0-9a-z_]+)?", #
19
- r"#\{.*\}", #
19
+ r"#\{.+\}", # Ruby: String Interpolation
20
20
  r"\{\{.+\}\}", #
21
- r"\S{0,5}\*{5,}", #
22
21
  r".*@@@hl@@@(암호|비번|PW|PASS)@@@endhl@@@", #
23
22
  ]
24
23
 
25
24
  ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED), flags=re.IGNORECASE)
26
- ALLOWED_UNQUOTED_PATTERN = re.compile(r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", flags=re.IGNORECASE)
25
+
26
+ ALLOWED_QUOTED = [
27
+ r"\$[a-z_]+[0-9a-z_]*([$\s]|$)", #
28
+ r".*\*\*\*", #
29
+ ]
30
+
31
+ ALLOWED_QUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_QUOTED), flags=re.IGNORECASE)
32
+
33
+ ALLOWED_UNQUOTED = [
34
+ r"[~a-z0-9_]+((\.|->)[a-z0-9_]+)+\(.*$", #
35
+ r"\$[a-z_]+[0-9a-z_]*\b", #
36
+ r".*\*\*\*\*\*", #
37
+ ]
38
+
39
+ ALLOWED_UNQUOTED_PATTERN = re.compile(Util.get_regex_combine_or(ALLOWED_UNQUOTED), flags=re.IGNORECASE)
27
40
 
28
41
  def __init__(self, config: Config = None) -> None:
29
42
  pass
@@ -42,8 +55,11 @@ class ValueAllowlistCheck(Filter):
42
55
 
43
56
  if self.ALLOWED_PATTERN.match(line_data.value):
44
57
  return True
45
-
46
- if not line_data.is_well_quoted_value and self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value):
47
- return True
58
+ elif line_data.is_well_quoted_value:
59
+ if self.ALLOWED_QUOTED_PATTERN.match(line_data.value):
60
+ return True
61
+ else:
62
+ if self.ALLOWED_UNQUOTED_PATTERN.match(line_data.value):
63
+ return True
48
64
 
49
65
  return False
@@ -11,8 +11,11 @@ class ValueBlocklistCheck(Filter):
11
11
  "true",
12
12
  "false",
13
13
  "null",
14
+ "none",
14
15
  "bearer",
15
16
  "string",
17
+ "value",
18
+ "undefined",
16
19
  ]
17
20
 
18
21
  def __init__(self, config: Config = None) -> None:
@@ -21,8 +21,10 @@ class WordInPath(WordIn):
21
21
 
22
22
  def __call__(self, candidates: List[Candidate]) -> np.ndarray:
23
23
  # actually there must be one path because the candidates are grouped before
24
- if path := candidates[0].line_data_list[0].path:
25
- posix_lower_path = Path(path).as_posix().lower()
24
+ if file_path := candidates[0].line_data_list[0].path:
25
+ path = Path(file_path)
26
+ # apply ./ for normalised path to detect "/src" for relative path
27
+ posix_lower_path = path.as_posix().lower() if path.is_absolute() else f"./{path.as_posix().lower()}"
26
28
  return self.word_in_str(posix_lower_path)
27
29
  else:
28
30
  return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
@@ -576,7 +576,7 @@
576
576
  confidence: strong
577
577
  type: pattern
578
578
  values:
579
- - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>xox[aboprst]\-[0-9A-Za-z-]{10,250})(?![0-9A-Za-z_-])
579
+ - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>xox[a-z]\-[0-9A-Za-z-]{10,250})(?![0-9A-Za-z_-])
580
580
  filter_type: GeneralPattern
581
581
  required_substrings:
582
582
  - xox
@@ -1391,6 +1391,53 @@
1391
1391
  - code
1392
1392
  - doc
1393
1393
 
1394
+ - name: Tavily API Key
1395
+ severity: high
1396
+ confidence: strong
1397
+ type: pattern
1398
+ values:
1399
+ - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>tvly-[0-9A-Za-z_-]{32,40})(?![0-9A-Za-z_-])
1400
+ min_line_len: 37
1401
+ filter_type:
1402
+ - ValuePatternCheck(5)
1403
+ - ValueEntropyBase64Check
1404
+ required_substrings:
1405
+ - tvly-
1406
+ target:
1407
+ - code
1408
+ - doc
1409
+
1410
+ - name: Sentry Organization Auth Token
1411
+ severity: high
1412
+ confidence: strong
1413
+ type: pattern
1414
+ values:
1415
+ - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>sntrys_eyJ[0-9A-Za-z_-]{80,8000}=*([0-9A-Za-z_-]{32,256})?)(?![0-9A-Za-z_-])
1416
+ min_line_len: 37
1417
+ filter_type:
1418
+ - ValuePatternCheck(5)
1419
+ - ValueEntropyBase64Check
1420
+ required_substrings:
1421
+ - sntrys_eyJ
1422
+ target:
1423
+ - code
1424
+ - doc
1425
+
1426
+ - name: Sentry User Auth Token
1427
+ severity: high
1428
+ confidence: strong
1429
+ type: pattern
1430
+ values:
1431
+ - (?:(?<![0-9A-Za-z_-])|\\[0abfnrtv]|(%|\\x)[0-9A-Fa-f]{2}|\\[0-7]{3}|\\[Uu]([0-9A-Fa-f]{4}){1,2}|\x1B\[[0-9;]{0,80}m)(?P<value>sntryu_[0-9a-f]{64})(?![0-9A-Za-z_-])
1432
+ min_line_len: 37
1433
+ filter_type:
1434
+ - ValuePatternCheck(5)
1435
+ required_substrings:
1436
+ - sntryu_
1437
+ target:
1438
+ - code
1439
+ - doc
1440
+
1394
1441
  - name: Discord Bot Token
1395
1442
  severity: high
1396
1443
  confidence: strong
@@ -69,6 +69,7 @@ class Scanner:
69
69
  rule_path = APP_PATH / "rules" / "config.yaml"
70
70
  rule_templates = Util.yaml_load(rule_path)
71
71
  if rule_templates and isinstance(rule_templates, list):
72
+ rule_names = set()
72
73
  for rule_template in rule_templates:
73
74
  try:
74
75
  rule = Rule(self.config, rule_template)
@@ -77,6 +78,10 @@ class Scanner:
77
78
  raise exc
78
79
  if not self._is_available(rule):
79
80
  continue
81
+ if rule.rule_name in rule_names:
82
+ raise RuntimeError(f"Duplicated rule name {rule.rule_name}")
83
+ else:
84
+ rule_names.add(rule.rule_name)
80
85
  if 0 < rule.min_line_len:
81
86
  if rule.rule_type == RuleType.KEYWORD:
82
87
  self.min_keyword_len = min(self.min_keyword_len, rule.min_line_len)
@@ -141,7 +146,7 @@ class Scanner:
141
146
  # "cache" - YAPF and pycharm formatters ...
142
147
  matched_keyword = \
143
148
  target_line_stripped_len >= self.min_keyword_len and ( #
144
- '=' in target_line_stripped or ':' in target_line_stripped) #
149
+ '=' in target_line_stripped or ':' in target_line_stripped) #
145
150
  matched_pem_key = \
146
151
  target_line_stripped_len >= self.min_pem_key_len \
147
152
  and PEM_BEGIN_PATTERN in target_line_stripped and "PRIVATE" in target_line_stripped
@@ -12,6 +12,10 @@
12
12
  ".xlsx",
13
13
  ".docx",
14
14
  ".pptx",
15
+ ".xls",
16
+ ".odp",
17
+ ".ods",
18
+ ".odt",
15
19
  ".pdf"
16
20
  ],
17
21
  "extension": [
@@ -67,7 +71,6 @@
67
71
  ".webm",
68
72
  ".webp",
69
73
  ".woff",
70
- ".xls",
71
74
  ".yuv"
72
75
  ],
73
76
  "path": [
@@ -25,11 +25,37 @@ class HopStat:
25
25
  ')': '0',
26
26
  '_': '-',
27
27
  '+': '=',
28
+ 'Q': 'q',
29
+ 'W': 'w',
30
+ 'E': 'e',
31
+ 'R': 'r',
32
+ 'T': 't',
33
+ 'Y': 'y',
34
+ 'U': 'u',
35
+ 'I': 'i',
36
+ 'O': 'o',
37
+ 'P': 'p',
28
38
  '{': '[',
29
39
  '}': ']',
30
40
  '|': '\\',
41
+ 'A': 'a',
42
+ 'S': 's',
43
+ 'D': 'd',
44
+ 'F': 'f',
45
+ 'G': 'g',
46
+ 'H': 'h',
47
+ 'J': 'j',
48
+ 'K': 'k',
49
+ 'L': 'l',
31
50
  ':': ';',
32
51
  '"': "'",
52
+ 'Z': 'z',
53
+ 'X': 'x',
54
+ 'C': 'c',
55
+ 'V': 'v',
56
+ 'B': 'b',
57
+ 'N': 'n',
58
+ 'M': 'm',
33
59
  '<': ',',
34
60
  '>': '.',
35
61
  '?': '/',
@@ -75,7 +101,7 @@ class HopStat:
75
101
 
76
102
  """
77
103
  hops = []
78
- value = value.lower().translate(HopStat.TRANSLATION)
104
+ value = value.translate(HopStat.TRANSLATION)
79
105
  for a, b in zip(value[:-1], value[1:]):
80
106
  hop = self.__hop_dict.get((a, b))
81
107
  if hop is None:
@@ -153,19 +153,26 @@ class Util:
153
153
  return entropy < min_entropy
154
154
 
155
155
  @staticmethod
156
- def is_binary(data: bytes) -> bool:
156
+ def is_known(data: bytes) -> bool:
157
157
  """
158
158
  Returns true if any recognized binary format found
159
- or two zeroes sequence is found which never exists in text format (UTF-8, UTF-16)
160
- UTF-32 is not supported
161
159
  """
162
160
  if Util.is_zip(data) \
163
161
  or Util.is_gzip(data) \
164
162
  or Util.is_tar(data) \
165
163
  or Util.is_bzip2(data) \
164
+ or Util.is_com(data) \
166
165
  or Util.is_pdf(data) \
167
166
  or Util.is_elf(data):
168
167
  return True
168
+ return False
169
+
170
+ @staticmethod
171
+ def is_binary(data: bytes) -> bool:
172
+ """
173
+ Returns True when two zeroes sequence is found which never exists in text format (UTF-8, UTF-16)
174
+ UTF-32 is not supported
175
+ """
169
176
  if 0 <= data.find(b"\0\0", 0, MAX_LINE_LENGTH):
170
177
  return True
171
178
  non_ascii_cnt = 0
@@ -224,7 +231,7 @@ class Util:
224
231
  encodings = AVAILABLE_ENCODINGS
225
232
  for encoding in encodings:
226
233
  try:
227
- if binary_suggest and LATIN_1 == encoding and Util.is_binary(content):
234
+ if binary_suggest and LATIN_1 == encoding and (Util.is_known(content) or Util.is_binary(content)):
228
235
  # LATIN_1 may convert data (bytes in range 0x80:0xFF are transformed)
229
236
  # so skip this encoding when checking binaries
230
237
  logger.warning("Binary file detected")
@@ -390,6 +397,15 @@ class Util:
390
397
  return False
391
398
  return False
392
399
 
400
+ @staticmethod
401
+ def is_com(data: bytes) -> bool:
402
+ """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
403
+ if isinstance(data, bytes) and 8 < len(data):
404
+ if data.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"):
405
+ # Compound File Binary Format: doc, xls, ppt, msi, msg
406
+ return True
407
+ return False
408
+
393
409
  @staticmethod
394
410
  def is_tar(data: bytes) -> bool:
395
411
  """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
@@ -520,10 +536,10 @@ class Util:
520
536
  def is_eml(data: Union[bytes, bytearray]) -> bool:
521
537
  """According to https://datatracker.ietf.org/doc/html/rfc822 lookup the fields: Date, From, To or Subject"""
522
538
  if isinstance(data, (bytes, bytearray)):
523
- if ((b"\nDate:" in data or data.startswith(b"Date:")) #
524
- and (b"\nFrom:" in data or data.startswith(b"From:")) #
525
- and (b"\nTo:" in data or data.startswith(b"To:") #
526
- or b"\nSubject:" in data or data.startswith(b"Subject:"))):
539
+ if (b"\nDate:" in data or data.startswith(b"Date:")) \
540
+ and (b"\nFrom:" in data or data.startswith(b"From:")) \
541
+ and (b"\nTo:" in data or data.startswith(b"To:")) \
542
+ and (b"\nSubject:" in data or data.startswith(b"Subject:")):
527
543
  return True
528
544
  return False
529
545
 
@@ -14,7 +14,8 @@ dependencies = [
14
14
  "lxml",
15
15
  "numpy<2.0.0",
16
16
  "odfpy",
17
- "onnxruntime",
17
+ "onnxruntime==1.19.2; platform_system == 'Windows'", # Python 3.9 limitation
18
+ "onnxruntime; platform_system != 'Windows'",
18
19
  "openpyxl",
19
20
  "pandas",
20
21
  "pdfminer.six",
@@ -25,6 +26,7 @@ dependencies = [
25
26
  "python-pptx",
26
27
  "PyYAML",
27
28
  "whatthepatch",
29
+ "xlrd",
28
30
  ]
29
31
  requires-python = ">=3.9"
30
32
  readme = "README.md"
File without changes
File without changes