assemblyline-v4-service 4.4.1.dev351__py3-none-any.whl → 4.4.1.dev355__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of assemblyline-v4-service might be problematic. Click here for more details.
- assemblyline_v4_service/VERSION +1 -1
- assemblyline_v4_service/common/ocr.py +79 -35
- {assemblyline_v4_service-4.4.1.dev351.dist-info → assemblyline_v4_service-4.4.1.dev355.dist-info}/METADATA +1 -1
- {assemblyline_v4_service-4.4.1.dev351.dist-info → assemblyline_v4_service-4.4.1.dev355.dist-info}/RECORD +7 -7
- {assemblyline_v4_service-4.4.1.dev351.dist-info → assemblyline_v4_service-4.4.1.dev355.dist-info}/LICENCE.md +0 -0
- {assemblyline_v4_service-4.4.1.dev351.dist-info → assemblyline_v4_service-4.4.1.dev355.dist-info}/WHEEL +0 -0
- {assemblyline_v4_service-4.4.1.dev351.dist-info → assemblyline_v4_service-4.4.1.dev355.dist-info}/top_level.txt +0 -0
assemblyline_v4_service/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
4.4.1.dev351
|
|
1
|
+
4.4.1.dev355
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import TextIO
|
|
3
|
+
from typing import Dict, List, TextIO
|
|
4
4
|
|
|
5
|
-
import regex
|
|
6
5
|
from assemblyline_v4_service.common.helper import get_service_manifest
|
|
7
6
|
from assemblyline_v4_service.common.utils import PASSWORD_WORDS
|
|
8
7
|
|
|
9
8
|
# TODO: Would prefer this mapping to be dynamic from trusted sources (ie. import from library), but will copy-paste for now
|
|
10
|
-
|
|
9
|
+
OCR_INDICATORS_TERMS: dict[str, list[str]] = {
|
|
11
10
|
'ransomware': [
|
|
12
11
|
# https://github.com/cuckoosandbox/community/blob/master/modules/signatures/windows/ransomware_message.py
|
|
13
12
|
"your files", "your data", "your documents", "restore files",
|
|
@@ -67,16 +66,45 @@ OCR_INDICATORS_MAPPING: dict[str, list[str]] = {
|
|
|
67
66
|
]
|
|
68
67
|
}
|
|
69
68
|
|
|
69
|
+
# The minimum number of indicator hits required, per indicator, to avoid FP detections.
# These are the built-in defaults; the service manifest may override them below.
OCR_INDICATORS_THRESHOLD: Dict[str, int] = {"ransomware": 2, "macros": 2, "banned": 1, "password": 1}

try:
    # Retrieve service-configured OCR settings on module load
    ocr_config: Dict = get_service_manifest().get("config", {}).get("ocr", {})
    indicators = set(list(OCR_INDICATORS_TERMS.keys()) + list(ocr_config.keys()))
    for i in indicators:
        # Backwards compatibility: Check how the OCR configuration is formatted
        indicator_config = ocr_config.get(i)
        # Start from the built-in values so that an indicator without an override
        # (or with a legacy terms-only override) keeps its built-in hit threshold
        # instead of being silently reset to 1.
        indicator_terms = OCR_INDICATORS_TERMS.get(i, [])
        indicator_threshold = OCR_INDICATORS_THRESHOLD.get(i, 1)
        if not indicator_config:
            # Empty block/no override provided by service
            pass
        elif isinstance(indicator_config, list):
            # Legacy support (before configurable indicator thresholds): terms only
            indicator_terms = indicator_config
        elif isinstance(indicator_config, dict):
            # An empty/missing 'terms' entry falls back to the built-in terms.
            indicator_terms = indicator_config.get('terms') or indicator_terms
            # A missing threshold means 1; a falsy one (0/None) falls back to the built-in value.
            indicator_threshold = indicator_config.get('threshold', 1) or indicator_threshold
        OCR_INDICATORS_TERMS[i] = indicator_terms
        OCR_INDICATORS_THRESHOLD[i] = indicator_threshold

except Exception:
    # Deliberate best effort: when the manifest cannot be read or is malformed,
    # the module must still import and keep its built-in defaults.
    pass
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def ocr_detections(image_path: str, ocr_io: TextIO = None) -> Dict[str, List[str]]:
|
|
72
100
|
try:
|
|
73
101
|
import pytesseract
|
|
74
102
|
from PIL import Image
|
|
75
103
|
except ImportError as exc:
|
|
76
104
|
raise ImportError(
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
105
|
+
"In order to use this method to scan for OCR detections, "
|
|
106
|
+
"ensure you have the following installed in your service:\n"
|
|
107
|
+
"tesseract-ocr, pytesseract, and Pillow.\n"
|
|
80
108
|
'You can do this via "apt-get install -y tesseract-ocr" and "pip install Pillow pytesseract"'
|
|
81
109
|
) from exc
|
|
82
110
|
|
|
@@ -84,7 +112,9 @@ def ocr_detections(image_path: str, ocr_io: TextIO = None) -> dict[str, list[str
|
|
|
84
112
|
ocr_output = ""
|
|
85
113
|
|
|
86
114
|
try:
|
|
87
|
-
ocr_output = pytesseract.image_to_string(
|
|
115
|
+
ocr_output = pytesseract.image_to_string(
|
|
116
|
+
Image.open(image_path), timeout=15
|
|
117
|
+
) # Stop OCR after 15 seconds
|
|
88
118
|
except (TypeError, RuntimeError):
|
|
89
119
|
# Image given isn't supported therefore no OCR output can be given with tesseract
|
|
90
120
|
return {}
|
|
@@ -97,37 +127,51 @@ def ocr_detections(image_path: str, ocr_io: TextIO = None) -> dict[str, list[str
|
|
|
97
127
|
return detections(ocr_output)
|
|
98
128
|
|
|
99
129
|
|
|
100
|
-
def detections(ocr_output: str) ->
|
|
101
|
-
|
|
102
|
-
ocr_config
|
|
130
|
+
def detections(ocr_output: str) -> Dict[str, List[str]]:
    """Scan OCR text for indicator terms and return the matching lines per indicator.

    For every indicator (built-in ones plus any declared under ``config.ocr`` in the
    service manifest), the number of case-insensitive term occurrences is counted
    line by line. An indicator is reported only when that count reaches its hit
    threshold.

    Args:
        ocr_output: Text extracted from an image by OCR.

    Returns:
        A mapping of indicator name to the de-duplicated lines (in order of
        appearance) that contain at least one of its terms. Indicators that do not
        reach their threshold are omitted.
    """
    # Per-call view of the service's OCR overrides. Reading the manifest is best
    # effort: if it is missing or malformed we fall back to the module-level tables.
    ocr_config: Dict = {}
    try:
        manifest_ocr = get_service_manifest().get("config", {}).get("ocr", {})
        # Guard against e.g. "ocr: null", which would otherwise crash on .get() below
        if isinstance(manifest_ocr, dict):
            ocr_config = manifest_ocr
    except Exception:
        pass

    # Built-in indicators first, then service-defined ones; dict.fromkeys de-duplicates
    # while keeping a deterministic order for the returned mapping.
    indicators = list(dict.fromkeys([*OCR_INDICATORS_TERMS, *ocr_config]))

    # Case-fold the text once instead of once per term
    lowered_output = ocr_output.lower()
    lines = [(line, line.lower()) for line in ocr_output.split("\n")]

    detection_output: Dict[str, List[str]] = {}
    # Iterate over the different indicators and include lines of detection in response
    for indicator in indicators:
        # Backwards compatibility: Check how the OCR configuration is formatted
        indicator_config = ocr_config.get(indicator)
        terms = OCR_INDICATORS_TERMS.get(indicator, [])
        hit_threshold = OCR_INDICATORS_THRESHOLD.get(indicator, 1)
        if not indicator_config:
            # Empty block/no override provided by service
            pass
        elif isinstance(indicator_config, list):
            # Legacy support (before configurable indicator thresholds)
            terms = indicator_config
        elif isinstance(indicator_config, dict):
            # A threshold-only override must not wipe out the known terms
            terms = indicator_config.get('terms') or terms
            hit_threshold = indicator_config.get('threshold', 1)

        # Empty terms would match every line, so they are ignored
        lowered_terms = [t.lower() for t in terms if t]

        # Perform a pre-check to see if the terms even exist in the OCR text
        if not any(t in lowered_output for t in lowered_terms):
            continue

        # Keep a track of the hits and the lines corresponding with term hits
        indicator_hits: int = 0
        list_of_strings: List[str] = []
        seen_lines = set()
        for line, lowered_line in lines:
            line_hits = sum(lowered_line.count(t) for t in lowered_terms)
            if not line_hits:
                continue
            indicator_hits += line_hits
            if line not in seen_lines:
                seen_lines.add(line)
                list_of_strings.append(line)

        if list_of_strings and indicator_hits >= hit_threshold:
            # If we were to find hits and those hits are above the required threshold, then add them to output
            detection_output[indicator] = list_of_strings
    return detection_output
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
assemblyline_v4_service/VERSION,sha256=
|
|
1
|
+
assemblyline_v4_service/VERSION,sha256=DB7TR909uaAA41K89VQwr4iyPmWMyngEPPHlQ8Nk-E8,13
|
|
2
2
|
assemblyline_v4_service/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
assemblyline_v4_service/healthz.py,sha256=sS1cFkDLw8hUPMpj7tbHXFv8ZmHcazrwZ0l6oQDwwkQ,1575
|
|
4
4
|
assemblyline_v4_service/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -8,7 +8,7 @@ assemblyline_v4_service/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
|
8
8
|
assemblyline_v4_service/common/api.py,sha256=8s2GJI_GmY4ClnjPOn7ijYs6m50cVJfjVx1h3GBQVms,6694
|
|
9
9
|
assemblyline_v4_service/common/base.py,sha256=9xufnspN99J1EHTru1fdkflRwB6PGdfyCUDvYwUIBEk,13610
|
|
10
10
|
assemblyline_v4_service/common/helper.py,sha256=xs9quuf-M1JOdKieBqOmWaOece0CtzXFhhe85xQYmuY,3289
|
|
11
|
-
assemblyline_v4_service/common/ocr.py,sha256=
|
|
11
|
+
assemblyline_v4_service/common/ocr.py,sha256=NRK5FqTQZQdjGBj9kSOFyP1Z8gA3lNgk_MK7yeNnJsE,8466
|
|
12
12
|
assemblyline_v4_service/common/ontology_helper.py,sha256=QpwerYoS5hXjWzpx3Pmwv6j2330PQVYqxYGamjcpW3I,7890
|
|
13
13
|
assemblyline_v4_service/common/request.py,sha256=XXBafAQCV43_OBLXOSHxYoDHmqwERBkNul8fb_X6Ves,11774
|
|
14
14
|
assemblyline_v4_service/common/result.py,sha256=9AqM6qCYiia_Bpyn_fBFhzNQMcqJbtFSiGjp57fXW2E,32713
|
|
@@ -37,8 +37,8 @@ test/test_common/test_request.py,sha256=wxSwnOj-_YOv2SuZjOJsw09q8A7p8GJmJuK4vozq
|
|
|
37
37
|
test/test_common/test_result.py,sha256=Wm0Cs5kZRzlZr0jL-l8OTsYAvkoN2eaB3NkeXzvyssI,42208
|
|
38
38
|
test/test_common/test_task.py,sha256=jnfF68EgJIu30Pz_4jiJHkncfI-3XpGaut5r79KIXOA,18718
|
|
39
39
|
test/test_common/test_utils.py,sha256=TbnBxqpS_ZC5ptXR9XJX3xtbItD0mTbtiBxxdyP8J5k,5904
|
|
40
|
-
assemblyline_v4_service-4.4.1.
|
|
41
|
-
assemblyline_v4_service-4.4.1.
|
|
42
|
-
assemblyline_v4_service-4.4.1.
|
|
43
|
-
assemblyline_v4_service-4.4.1.
|
|
44
|
-
assemblyline_v4_service-4.4.1.
|
|
40
|
+
assemblyline_v4_service-4.4.1.dev355.dist-info/LICENCE.md,sha256=NSkYo9EH8h5oOkzg4VhjAHF4339MqPP2cQ8msTPgl-c,1396
|
|
41
|
+
assemblyline_v4_service-4.4.1.dev355.dist-info/METADATA,sha256=56hrcskcjpjwkV4ZvAzqILJAg6FYHzaCG-MzSI91ykk,9663
|
|
42
|
+
assemblyline_v4_service-4.4.1.dev355.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
43
|
+
assemblyline_v4_service-4.4.1.dev355.dist-info/top_level.txt,sha256=LpTOEaVCatkrvbVq3EZseMSIa2PQZU-2rhuO_FTpZgY,29
|
|
44
|
+
assemblyline_v4_service-4.4.1.dev355.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|