assemblyline-v4-service 4.4.1.dev351__py3-none-any.whl → 4.4.1.dev355__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-v4-service might be problematic. Click here for more details.

@@ -1 +1 @@
1
- 4.4.1.dev351
1
+ 4.4.1.dev355
@@ -1,13 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TextIO
3
+ from typing import Dict, List, TextIO
4
4
 
5
- import regex
6
5
  from assemblyline_v4_service.common.helper import get_service_manifest
7
6
  from assemblyline_v4_service.common.utils import PASSWORD_WORDS
8
7
 
9
8
  # TODO: Would prefer this mapping to be dynamic from trusted sources (ie. import from library), but will copy-paste for now
10
- OCR_INDICATORS_MAPPING: dict[str, list[str]] = {
9
+ OCR_INDICATORS_TERMS: dict[str, list[str]] = {
11
10
  'ransomware': [
12
11
  # https://github.com/cuckoosandbox/community/blob/master/modules/signatures/windows/ransomware_message.py
13
12
  "your files", "your data", "your documents", "restore files",
@@ -67,16 +66,45 @@ OCR_INDICATORS_MAPPING: dict[str, list[str]] = {
67
66
  ]
68
67
  }
69
68
 
69
+ # The minimum number of indicator hits to avoid FP detections
70
+ OCR_INDICATORS_THRESHOLD: Dict[str, int] = {"ransomware": 2, "macros": 2, "banned": 1, "password": 1}
70
71
 
71
- def ocr_detections(image_path: str, ocr_io: TextIO = None) -> dict[str, list[str]]:
72
+ try:
73
+ # Retrieve service-configured OCR settings on module load
74
+ ocr_config: Dict = get_service_manifest().get("config", {}).get("ocr", {})
75
+ indicators = set(list(OCR_INDICATORS_TERMS.keys()) + list(ocr_config.keys()))
76
+ for i in indicators:
77
+ # Backwards compatibility: Check how the OCR configuration is formatted
78
+ indicator_config = ocr_config.get(i)
79
+ indicator_terms = []
80
+ indicator_threshold = 1
81
+ if not indicator_config:
82
+ # Empty block/no override provided by service
83
+ pass
84
+ elif isinstance(indicator_config, list):
85
+ # Legacy support (before configurable indicator thresholds)
86
+ indicator_terms = indicator_config
87
+ pass
88
+ elif isinstance(indicator_config, dict):
89
+ # Set indicator threshold before variable overwrite with terms list
90
+ indicator_terms = indicator_config.get('terms', [])
91
+ indicator_threshold = indicator_config.get('threshold', 1)
92
+ OCR_INDICATORS_TERMS[i] = indicator_terms or OCR_INDICATORS_TERMS.get(i, [])
93
+ OCR_INDICATORS_THRESHOLD[i] = indicator_threshold or OCR_INDICATORS_THRESHOLD.get(i, 1)
94
+
95
+ except Exception:
96
+ pass
97
+
98
+
99
+ def ocr_detections(image_path: str, ocr_io: TextIO = None) -> Dict[str, List[str]]:
72
100
  try:
73
101
  import pytesseract
74
102
  from PIL import Image
75
103
  except ImportError as exc:
76
104
  raise ImportError(
77
- 'In order to use this method to scan for OCR detections, '
78
- 'ensure you have the following installed in your service:\n'
79
- 'tesseract-ocr, pytesseract, and Pillow.\n'
105
+ "In order to use this method to scan for OCR detections, "
106
+ "ensure you have the following installed in your service:\n"
107
+ "tesseract-ocr, pytesseract, and Pillow.\n"
80
108
  'You can do this via "apt-get install -y tesseract-ocr" and "pip install Pillow pytesseract"'
81
109
  ) from exc
82
110
 
@@ -84,7 +112,9 @@ def ocr_detections(image_path: str, ocr_io: TextIO = None) -> dict[str, list[str
84
112
  ocr_output = ""
85
113
 
86
114
  try:
87
- ocr_output = pytesseract.image_to_string(Image.open(image_path), timeout=15) # Stop OCR after 15 seconds
115
+ ocr_output = pytesseract.image_to_string(
116
+ Image.open(image_path), timeout=15
117
+ ) # Stop OCR after 15 seconds
88
118
  except (TypeError, RuntimeError):
89
119
  # Image given isn't supported therefore no OCR output can be given with tesseract
90
120
  return {}
@@ -97,37 +127,51 @@ def ocr_detections(image_path: str, ocr_io: TextIO = None) -> dict[str, list[str
97
127
  return detections(ocr_output)
98
128
 
99
129
 
100
- def detections(ocr_output: str) -> dict[str, list[str]]:
101
- detection_output: dict[str, list[str]] = {}
102
- ocr_config: dict[str, list[str]] = {}
130
+ def detections(ocr_output: str) -> Dict[str, List[str]]:
131
+ indicators = list(OCR_INDICATORS_TERMS.keys())
132
+ ocr_config = {}
103
133
  try:
104
- # If running an AL service, grab OCR configuration from service manifest
105
- ocr_config = get_service_manifest().get('config', {}).get('ocr', {})
134
+ # Retrieve service-configured OCR settings on module load
135
+ ocr_config: Dict = get_service_manifest().get("config", {}).get("ocr", {})
136
+ indicators = set(list(OCR_INDICATORS_TERMS.keys()) + list(ocr_config.keys()))
106
137
  except Exception:
107
138
  pass
108
- indicators = set(list(OCR_INDICATORS_MAPPING.keys()) + list(ocr_config.keys()))
139
+
140
+ detection_output: Dict[str, List[str]] = {}
109
141
  # Iterate over the different indicators and include lines of detection in response
110
142
  for indicator in indicators:
111
- list_of_terms = ocr_config.get(indicator, []) or OCR_INDICATORS_MAPPING.get(indicator, [])
112
- if not list_of_terms:
113
- # If no terms specified, move onto next indicator
143
+ # Backwards compatibility: Check how the OCR configuration is formatted
144
+ indicator_config = ocr_config.get(indicator)
145
+ terms = OCR_INDICATORS_TERMS.get(indicator, [])
146
+ hit_threshold = OCR_INDICATORS_THRESHOLD.get(indicator, 1)
147
+ if not indicator_config:
148
+ # Empty block/no override provided by service
149
+ pass
150
+ elif isinstance(indicator_config, list):
151
+ # Legacy support (before configurable indicator thresholds)
152
+ terms = indicator_config
153
+ pass
154
+ elif isinstance(indicator_config, dict):
155
+ # Set indicator threshold before variable overwrite with terms list
156
+ terms = indicator_config.get('terms', [])
157
+ hit_threshold = indicator_config.get('threshold', 1)
158
+
159
+ # Perform a pre-check to see if the terms even exist in the OCR text
160
+ if not any([t.lower() in ocr_output.lower() for t in terms]):
114
161
  continue
115
- indicator_hits: set[str | None] = set()
116
- regex_exp = regex.compile(f"({')|('.join(list_of_terms).lower()})")
117
- list_of_strings: list[str] = []
118
- for line in ocr_output.split('\n'):
119
- search = regex_exp.search(line.lower())
120
- if search:
121
- indicator_hits = indicator_hits.union(set(search.groups()))
122
- list_of_strings.append(line)
123
- if None in indicator_hits:
124
- indicator_hits.remove(None)
125
-
126
- if list_of_strings:
127
- if len(indicator_hits) >= 2:
128
- # We consider the detection to be credible if there's more than a single indicator hit
129
- detection_output[indicator] = list_of_strings
130
- if indicator in ['banned', 'password']:
131
- # Except if we're dealing with banned/password, one hit is more than enough
132
- detection_output[indicator] = list_of_strings
162
+
163
+ # Keep a track of the hits and the lines corresponding with term hits
164
+ indicator_hits: int = 0
165
+ list_of_strings: List[str] = []
166
+ for line in ocr_output.split("\n"):
167
+ for t in terms:
168
+ term_count = line.lower().count(t.lower())
169
+ if term_count:
170
+ indicator_hits += term_count
171
+ if line not in list_of_strings:
172
+ list_of_strings.append(line)
173
+
174
+ if list_of_strings and indicator_hits >= hit_threshold:
175
+ # If we were to find hits and those hits are above the required threshold, then add them to output
176
+ detection_output[indicator] = list_of_strings
133
177
  return detection_output
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-v4-service
3
- Version: 4.4.1.dev351
3
+ Version: 4.4.1.dev355
4
4
  Summary: Assemblyline 4 - Service base
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-v4-service/
6
6
  Author: CCCS Assemblyline development team
@@ -1,4 +1,4 @@
1
- assemblyline_v4_service/VERSION,sha256=S4YmcS3JWMqm8BjF0G1J4UYUF8kF9ACn4azroxWQmcc,13
1
+ assemblyline_v4_service/VERSION,sha256=DB7TR909uaAA41K89VQwr4iyPmWMyngEPPHlQ8Nk-E8,13
2
2
  assemblyline_v4_service/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  assemblyline_v4_service/healthz.py,sha256=sS1cFkDLw8hUPMpj7tbHXFv8ZmHcazrwZ0l6oQDwwkQ,1575
4
4
  assemblyline_v4_service/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -8,7 +8,7 @@ assemblyline_v4_service/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
8
8
  assemblyline_v4_service/common/api.py,sha256=8s2GJI_GmY4ClnjPOn7ijYs6m50cVJfjVx1h3GBQVms,6694
9
9
  assemblyline_v4_service/common/base.py,sha256=9xufnspN99J1EHTru1fdkflRwB6PGdfyCUDvYwUIBEk,13610
10
10
  assemblyline_v4_service/common/helper.py,sha256=xs9quuf-M1JOdKieBqOmWaOece0CtzXFhhe85xQYmuY,3289
11
- assemblyline_v4_service/common/ocr.py,sha256=ML9AwlGCeogFGoeLuFdhcu75E5t6fTE69U82fENGRkY,6528
11
+ assemblyline_v4_service/common/ocr.py,sha256=NRK5FqTQZQdjGBj9kSOFyP1Z8gA3lNgk_MK7yeNnJsE,8466
12
12
  assemblyline_v4_service/common/ontology_helper.py,sha256=QpwerYoS5hXjWzpx3Pmwv6j2330PQVYqxYGamjcpW3I,7890
13
13
  assemblyline_v4_service/common/request.py,sha256=XXBafAQCV43_OBLXOSHxYoDHmqwERBkNul8fb_X6Ves,11774
14
14
  assemblyline_v4_service/common/result.py,sha256=9AqM6qCYiia_Bpyn_fBFhzNQMcqJbtFSiGjp57fXW2E,32713
@@ -37,8 +37,8 @@ test/test_common/test_request.py,sha256=wxSwnOj-_YOv2SuZjOJsw09q8A7p8GJmJuK4vozq
37
37
  test/test_common/test_result.py,sha256=Wm0Cs5kZRzlZr0jL-l8OTsYAvkoN2eaB3NkeXzvyssI,42208
38
38
  test/test_common/test_task.py,sha256=jnfF68EgJIu30Pz_4jiJHkncfI-3XpGaut5r79KIXOA,18718
39
39
  test/test_common/test_utils.py,sha256=TbnBxqpS_ZC5ptXR9XJX3xtbItD0mTbtiBxxdyP8J5k,5904
40
- assemblyline_v4_service-4.4.1.dev351.dist-info/LICENCE.md,sha256=NSkYo9EH8h5oOkzg4VhjAHF4339MqPP2cQ8msTPgl-c,1396
41
- assemblyline_v4_service-4.4.1.dev351.dist-info/METADATA,sha256=u1l3L4q6HCN1mKbiy2vXVuRwxHQq0HL0AilqRtyh-KI,9663
42
- assemblyline_v4_service-4.4.1.dev351.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
43
- assemblyline_v4_service-4.4.1.dev351.dist-info/top_level.txt,sha256=LpTOEaVCatkrvbVq3EZseMSIa2PQZU-2rhuO_FTpZgY,29
44
- assemblyline_v4_service-4.4.1.dev351.dist-info/RECORD,,
40
+ assemblyline_v4_service-4.4.1.dev355.dist-info/LICENCE.md,sha256=NSkYo9EH8h5oOkzg4VhjAHF4339MqPP2cQ8msTPgl-c,1396
41
+ assemblyline_v4_service-4.4.1.dev355.dist-info/METADATA,sha256=56hrcskcjpjwkV4ZvAzqILJAg6FYHzaCG-MzSI91ykk,9663
42
+ assemblyline_v4_service-4.4.1.dev355.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
43
+ assemblyline_v4_service-4.4.1.dev355.dist-info/top_level.txt,sha256=LpTOEaVCatkrvbVq3EZseMSIa2PQZU-2rhuO_FTpZgY,29
44
+ assemblyline_v4_service-4.4.1.dev355.dist-info/RECORD,,