assemblyline-v4-service 4.5.1.dev71__py3-none-any.whl → 4.5.1.dev75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-v4-service might be problematic. Click here for more details.

@@ -1 +1 @@
1
- 4.5.1.dev71
1
+ 4.5.1.dev75
@@ -5,7 +5,7 @@ from typing import Dict, List, TextIO
5
5
  from assemblyline_v4_service.common.helper import get_service_manifest
6
6
  from assemblyline_v4_service.common.utils import PASSWORD_WORDS
7
7
 
8
- # TODO: Would prefer this mapping to be dynamic from trusted sources (ie. import from library), but will copy-paste for now
8
+ # The terms related to each indicator category
9
9
  OCR_INDICATORS_TERMS: dict[str, list[str]] = {
10
10
  "ransomware": [
11
11
  # https://github.com/cuckoosandbox/community/blob/master/modules/signatures/windows/ransomware_message.py
@@ -154,6 +154,45 @@ OCR_INDICATORS_TERMS: dict[str, list[str]] = {
154
154
  # The minimum number of indicator hits to avoid FP detections
155
155
  OCR_INDICATORS_THRESHOLD: Dict[str, int] = {"ransomware": 2, "macros": 2, "banned": 1, "password": 1}
156
156
 
157
+ # Pre-compute the OCR_INDICATORS_* constants on module load so the manifest is loaded only once rather than per OCR request
158
+ try:
159
+ # Retrieve service-configured OCR settings on module load
160
+ ocr_config: Dict = get_service_manifest().get("config", {}).get("ocr", {})
161
+ except Exception:
162
+ # Service manifest not found
163
+ ocr_config = {}
164
+
165
+ indicators = set(list(OCR_INDICATORS_TERMS.keys()) + list(ocr_config.keys()))
166
+ # Iterate over the different indicators and resolve the effective terms list and hit threshold for each
167
+ for indicator in indicators:
168
+ indicator_config = ocr_config.get(indicator)
169
+ terms = OCR_INDICATORS_TERMS.get(indicator, [])
170
+ hit_threshold = OCR_INDICATORS_THRESHOLD.get(indicator, 1)
171
+ # Backwards compatibility: Check how the OCR configuration is formatted
172
+ if not indicator_config:
173
+ # Empty block/no override provided by service
174
+ pass
175
+ elif isinstance(indicator_config, list):
176
+ # Legacy support (before configurable indicator thresholds)
177
+ terms = indicator_config
178
+ elif isinstance(indicator_config, dict):
179
+ # Either you're exclusively overwriting the terms list or you're selectively including/excluding terms
180
+ if indicator_config.get("terms"):
181
+ # Overwrite terms list with service configuration
182
+ terms = indicator_config["terms"]
183
+ else:
184
+ included_terms = set(indicator_config.get("include", []))
185
+ excluded_terms = set(indicator_config.get("exclude", []))
186
+ # Compute the new terms list for indicator type
187
+ terms = list(set(terms).union(included_terms) - excluded_terms)
188
+
189
+ # Set the indicator hit threshold
190
+ hit_threshold = indicator_config.get("threshold", 1)
191
+
192
+ # Overwrite key-value in respective constants
193
+ OCR_INDICATORS_TERMS[indicator] = terms
194
+ OCR_INDICATORS_THRESHOLD[indicator] = hit_threshold
195
+
157
196
 
158
197
  def ocr_detections(image_path: str, ocr_io: TextIO = None) -> Dict[str, List[str]]:
159
198
  try:
@@ -185,57 +224,25 @@ def ocr_detections(image_path: str, ocr_io: TextIO = None) -> Dict[str, List[str
185
224
 
186
225
 
187
226
  def detections(ocr_output: str) -> Dict[str, List[str]]:
188
- try:
189
- # Retrieve service-configured OCR settings on module load
190
- ocr_config: Dict = get_service_manifest().get("config", {}).get("ocr", {})
191
- except Exception:
192
- # Service manifest not found
193
- ocr_config = {}
194
-
195
- indicators = set(list(OCR_INDICATORS_TERMS.keys()) + list(ocr_config.keys()))
196
227
  detection_output: Dict[str, List[str]] = {}
197
- # Iterate over the different indicators and include lines of detection in response
198
- for indicator in indicators:
199
- indicator_config = ocr_config.get(indicator)
200
- terms = OCR_INDICATORS_TERMS.get(indicator, [])
201
- hit_threshold = OCR_INDICATORS_THRESHOLD.get(indicator, 1)
202
- # Backwards compatibility: Check how the OCR configuration is formatted
203
- if not indicator_config:
204
- # Empty block/no override provided by service
205
- pass
206
- elif isinstance(indicator_config, list):
207
- # Legacy support (before configurable indicator thresholds)
208
- terms = indicator_config
209
- elif isinstance(indicator_config, dict):
210
- # Either you're exclusively overwriting the terms list or you're selectively including/excluding terms
211
- if indicator_config.get("terms"):
212
- # Overwrite terms list with service configuration
213
- terms = indicator_config["terms"]
214
- else:
215
- included_terms = set(indicator_config.get("include", []))
216
- excluded_terms = set(indicator_config.get("exclude", []))
217
- # Compute the new terms list for indicator type
218
- terms = list(set(terms).union(included_terms) - excluded_terms)
219
-
220
- # Set the indicator hit threshold
221
- hit_threshold = indicator_config.get("threshold", 1)
222
-
228
+ for indicator, terms in OCR_INDICATORS_TERMS.items():
229
+ hit_threshold = OCR_INDICATORS_THRESHOLD[indicator]
223
230
  # Perform a pre-check to see if the terms even exist in the OCR text
224
231
  if not any([t.lower() in ocr_output.lower() for t in terms]):
225
232
  continue
226
233
 
227
234
  # Keep a track of the hits and the lines corresponding with term hits
228
- indicator_hits: int = 0
235
+ indicator_hits: set = set()
229
236
  list_of_strings: List[str] = []
230
237
  for line in ocr_output.split("\n"):
231
238
  for t in terms:
232
239
  term_count = line.lower().count(t.lower())
233
240
  if term_count:
234
- indicator_hits += term_count
241
+ indicator_hits.add(t)
235
242
  if line not in list_of_strings:
236
243
  list_of_strings.append(line)
237
244
 
238
- if list_of_strings and indicator_hits >= hit_threshold:
245
+ if list_of_strings and len(indicator_hits) >= hit_threshold:
239
246
  # If hits were found and the hit count meets or exceeds the required threshold, add them to the output
240
247
  detection_output[indicator] = list_of_strings
241
248
  return detection_output
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: assemblyline-v4-service
3
- Version: 4.5.1.dev71
3
+ Version: 4.5.1.dev75
4
4
  Summary: Assemblyline 4 - Service base
5
5
  Home-page: https://github.com/CybercentreCanada/assemblyline-v4-service/
6
6
  Author: CCCS Assemblyline development team
@@ -1,4 +1,4 @@
1
- assemblyline_v4_service/VERSION,sha256=sgnbYqhBwfHaBVEZ9RudMJZ-VblAxWq7JSyPx5vGabE,12
1
+ assemblyline_v4_service/VERSION,sha256=JZHoM4NWVTRtlPRRggVpGFHenhb-c_AXFCT7sYtWPQQ,12
2
2
  assemblyline_v4_service/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  assemblyline_v4_service/healthz.py,sha256=sS1cFkDLw8hUPMpj7tbHXFv8ZmHcazrwZ0l6oQDwwkQ,1575
4
4
  assemblyline_v4_service/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -8,7 +8,7 @@ assemblyline_v4_service/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
8
8
  assemblyline_v4_service/common/api.py,sha256=Xzp8j4HCCfjPvNSGKiZl5ttH2_Itg47cjlH0NXNtth0,6849
9
9
  assemblyline_v4_service/common/base.py,sha256=mKkkzbxVL_wVMy_VieU9mlHYLqZXndga_4dWWbnEnx8,14045
10
10
  assemblyline_v4_service/common/helper.py,sha256=xs9quuf-M1JOdKieBqOmWaOece0CtzXFhhe85xQYmuY,3289
11
- assemblyline_v4_service/common/ocr.py,sha256=A8OnjpEor-S3OUC_jZzJ-Er3KKAsMdQLEXTtMS81Xbk,8397
11
+ assemblyline_v4_service/common/ocr.py,sha256=dzx5n9fzOHCVX3NnGKONpGAF3Cq16hw_y7M7g_nsJ7A,8588
12
12
  assemblyline_v4_service/common/ontology_helper.py,sha256=QpwerYoS5hXjWzpx3Pmwv6j2330PQVYqxYGamjcpW3I,7890
13
13
  assemblyline_v4_service/common/request.py,sha256=XXBafAQCV43_OBLXOSHxYoDHmqwERBkNul8fb_X6Ves,11774
14
14
  assemblyline_v4_service/common/result.py,sha256=9AqM6qCYiia_Bpyn_fBFhzNQMcqJbtFSiGjp57fXW2E,32713
@@ -28,18 +28,18 @@ test/conftest.py,sha256=W3SieQpZsZpGEmtLqY4aIlxREDSsHceyCrFcFsWUM0U,1851
28
28
  test/test_healthz.py,sha256=DkeLUlrb7rGx3nZ04aADU9HXXu5mZTf_DBwT0xhzIv4,7
29
29
  test/test_run_privileged_service.py,sha256=DkeLUlrb7rGx3nZ04aADU9HXXu5mZTf_DBwT0xhzIv4,7
30
30
  test/test_run_service.py,sha256=DkeLUlrb7rGx3nZ04aADU9HXXu5mZTf_DBwT0xhzIv4,7
31
- test/test_common/__init__.py,sha256=v3__rzWUBW0Smc2rFwfFR9WehQHM_uBTIFCdC_We3tA,1319
31
+ test/test_common/__init__.py,sha256=RkOm3vnVp5L947mD1jTo4bdOgLTZJ24_NX-kqfMn5a8,1259
32
32
  test/test_common/test_api.py,sha256=7wlo7wgB12T23zMLbwjJ3GIomLHqE_Qvs3xkibSsR1U,4902
33
- test/test_common/test_base.py,sha256=d61an3lRaes_cx0AOPMNFMVWxTOjilrcGpuVP0dqTvM,13198
33
+ test/test_common/test_base.py,sha256=fuJSSlPxIDHq6HU1xbvaMFitw2z1spOZNHD2SJ4UUic,13346
34
34
  test/test_common/test_helper.py,sha256=sO6YAiBhKTqaxlpLhFYDuy2ZdbuF2cg07Ylzo83ZzQs,2575
35
- test/test_common/test_ocr.py,sha256=s5kL0vKjLmbyXuCg-9v6V6wQwwFRu0w2hYpjG0_BXy4,1874
36
- test/test_common/test_ontology_helper.py,sha256=TuvTeP9BTRqklOlsLu_yMdN9wdPWlVAENx2JqUe9a-A,7856
37
- test/test_common/test_request.py,sha256=wxSwnOj-_YOv2SuZjOJsw09q8A7p8GJmJuK4vozqCNg,11749
38
- test/test_common/test_result.py,sha256=Wm0Cs5kZRzlZr0jL-l8OTsYAvkoN2eaB3NkeXzvyssI,42208
35
+ test/test_common/test_ocr.py,sha256=nel1GCkieDRW2F_6kYCbkIwB9Kwj_d2rJBgb8VZWXS8,1685
36
+ test/test_common/test_ontology_helper.py,sha256=KhHEBg_ecJyQbDw79NMT4FzUyA4C1Aak3HEQCwBfM2s,7914
37
+ test/test_common/test_request.py,sha256=PPhHfrwpwMdNZ33P1Z_0h1Zaz9ao9VFiDr_MJrBS3Lg,11492
38
+ test/test_common/test_result.py,sha256=b96bCfyW0ukdTcCsl01jS_l5YhfzXFVYs_VPOwz7IEU,41982
39
39
  test/test_common/test_task.py,sha256=jnfF68EgJIu30Pz_4jiJHkncfI-3XpGaut5r79KIXOA,18718
40
40
  test/test_common/test_utils.py,sha256=TbnBxqpS_ZC5ptXR9XJX3xtbItD0mTbtiBxxdyP8J5k,5904
41
- assemblyline_v4_service-4.5.1.dev71.dist-info/LICENCE.md,sha256=NSkYo9EH8h5oOkzg4VhjAHF4339MqPP2cQ8msTPgl-c,1396
42
- assemblyline_v4_service-4.5.1.dev71.dist-info/METADATA,sha256=TCHvOZ28H5YkRA2nGTlA3jv9y8r8Ty2w6HPQ6o5vGSc,9498
43
- assemblyline_v4_service-4.5.1.dev71.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
44
- assemblyline_v4_service-4.5.1.dev71.dist-info/top_level.txt,sha256=LpTOEaVCatkrvbVq3EZseMSIa2PQZU-2rhuO_FTpZgY,29
45
- assemblyline_v4_service-4.5.1.dev71.dist-info/RECORD,,
41
+ assemblyline_v4_service-4.5.1.dev75.dist-info/LICENCE.md,sha256=NSkYo9EH8h5oOkzg4VhjAHF4339MqPP2cQ8msTPgl-c,1396
42
+ assemblyline_v4_service-4.5.1.dev75.dist-info/METADATA,sha256=G0uwI6VSb0U0defzTlpY1uKuw0qji1s4OGUTxvPuZig,9498
43
+ assemblyline_v4_service-4.5.1.dev75.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
44
+ assemblyline_v4_service-4.5.1.dev75.dist-info/top_level.txt,sha256=LpTOEaVCatkrvbVq3EZseMSIa2PQZU-2rhuO_FTpZgY,29
45
+ assemblyline_v4_service-4.5.1.dev75.dist-info/RECORD,,
@@ -3,8 +3,7 @@ import subprocess
3
3
 
4
4
  from assemblyline.common.version import FRAMEWORK_VERSION, SYSTEM_VERSION
5
5
 
6
- SERVICE_CONFIG_NAME = "service_manifest.yml"
7
- TEMP_SERVICE_CONFIG_PATH = os.path.join("/tmp", SERVICE_CONFIG_NAME)
6
+ TEMP_SERVICE_CONFIG_PATH ="/tmp/service_manifest.yml"
8
7
 
9
8
  ret = subprocess.run("dpkg -l | grep ^ii | awk '{print $2}' | grep -i 'tesseract'", capture_output=True, shell=True)
10
9
  TESSERACT_LIST = list(filter(None, ret.stdout.decode().split('\n')))
@@ -3,6 +3,12 @@ import os
3
3
  import time
4
4
  from logging import Logger
5
5
 
6
+
7
+ from test.test_common import setup_module
8
+
9
+ # Ensure service manifest is instantiated before loading assemblyline_v4_service module
10
+ setup_module()
11
+
6
12
  import pytest
7
13
  import requests_mock
8
14
  from assemblyline_v4_service.common.base import *
@@ -1,16 +1,13 @@
1
1
  import os
2
- from test.test_common import TESSERACT_LIST, setup_module
2
+ from test.test_common import TESSERACT_LIST
3
3
 
4
4
  import pytest
5
- from assemblyline_v4_service.common.ocr import *
6
5
 
6
+ from assemblyline_v4_service.common.ocr import ocr_detections, detections
7
7
 
8
8
  @pytest.mark.skipif(len(TESSERACT_LIST) < 1, reason="Requires tesseract-ocr apt package")
9
9
  def test_ocr_detections():
10
- if os.getcwd().endswith("/test"):
11
- file_path = os.path.join(os.getcwd(), "test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
12
- else:
13
- file_path = os.path.join(os.getcwd(), "test/test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
10
+ file_path = os.path.join(os.path.dirname(__file__), "b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
14
11
  assert ocr_detections(file_path) == {
15
12
  'ransomware': [
16
13
  "YOUR FILES HAVE BEEN ENCRYPTED AND YOU WON'T BE ABLE TO "
@@ -30,7 +27,6 @@ def test_ocr_detections():
30
27
 
31
28
 
32
29
  def test_detections():
33
- setup_module()
34
30
  # No detection
35
31
  assert detections("blah") == {}
36
32
 
@@ -3,6 +3,9 @@ import logging
3
3
  import os
4
4
  import tempfile
5
5
 
6
+ from test.test_common import setup_module
7
+ setup_module()
8
+
6
9
  import pytest
7
10
  from assemblyline_v4_service.common.ontology_helper import *
8
11
  from assemblyline_v4_service.common.result import ResultSection
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import tempfile
3
3
  from logging import Logger
4
- from test.test_common import TESSERACT_LIST
4
+ from test.test_common import TESSERACT_LIST, setup_module
5
5
 
6
6
  import pytest
7
7
  from assemblyline_v4_service.common.request import ServiceRequest
@@ -10,6 +10,8 @@ from assemblyline_v4_service.common.task import MaxExtractedExceeded, Task
10
10
 
11
11
  from assemblyline.odm.messages.task import Task as ServiceTask
12
12
 
13
+ # Ensure service manifest is instantiated before importing from OCR submodule
14
+ setup_module()
13
15
 
14
16
  @pytest.fixture
15
17
  def service_request():
@@ -105,10 +107,7 @@ def test_add_extracted(service_request):
105
107
 
106
108
  @pytest.mark.skipif(len(TESSERACT_LIST) < 1, reason="Requires tesseract-ocr apt package")
107
109
  def test_add_image(service_request):
108
- if os.getcwd().endswith("/test"):
109
- image_path = os.path.join(os.getcwd(), "test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
110
- else:
111
- image_path = os.path.join(os.getcwd(), "test/test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
110
+ image_path = os.path.join(os.path.dirname(__file__), "b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
112
111
 
113
112
  # Basic
114
113
  assert service_request.add_image(image_path, "image_name", "description of image") == {
@@ -196,10 +195,7 @@ def test_add_image(service_request):
196
195
  service_request.task.supplementary.clear()
197
196
 
198
197
  # Classification, OCR heuristic, OCR_IO, image with password
199
- if os.getcwd().endswith("/test"):
200
- image_path = os.path.join(os.getcwd(), "test_common/4031ed8786439eee24b87f84901e38038a76b8c55e9d87dd5a7d88df2806c1cf")
201
- else:
202
- image_path = os.path.join(os.getcwd(), "test/test_common/4031ed8786439eee24b87f84901e38038a76b8c55e9d87dd5a7d88df2806c1cf")
198
+ image_path = os.path.join(os.path.dirname(__file__), "4031ed8786439eee24b87f84901e38038a76b8c55e9d87dd5a7d88df2806c1cf")
203
199
  _, path = tempfile.mkstemp()
204
200
  ocr_io = open(path, "w")
205
201
  data = service_request.add_image(image_path, "image_name", "description of image", "TLP:A", ocr_heuristic_id, ocr_io)
@@ -1,6 +1,10 @@
1
1
  import os
2
2
  import tempfile
3
- from test.test_common import TESSERACT_LIST
3
+ from test.test_common import TESSERACT_LIST, setup_module
4
+
5
+ # Ensure service manifest is instantiated before importing from OCR submodule
6
+ setup_module()
7
+
4
8
 
5
9
  import pytest
6
10
  from assemblyline_v4_service.common.request import ServiceRequest
@@ -9,7 +13,6 @@ from assemblyline_v4_service.common.task import Task
9
13
 
10
14
  from assemblyline.odm.messages.task import Task as ServiceTask
11
15
 
12
-
13
16
  @pytest.fixture
14
17
  def heuristic():
15
18
  return Heuristic(1)
@@ -592,10 +595,7 @@ def test_imagesectionbody_init(service_request):
592
595
  @pytest.mark.skipif(len(TESSERACT_LIST) < 1, reason="Requires tesseract-ocr apt package")
593
596
  def test_imagesectionbody_add_image(service_request):
594
597
  isb = ImageSectionBody(service_request)
595
- if os.getcwd().endswith("/test"):
596
- image_path = os.path.join(os.getcwd(), "test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
597
- else:
598
- image_path = os.path.join(os.getcwd(), "test/test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
598
+ image_path = os.path.join(os.path.dirname(__file__), "b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
599
599
 
600
600
  # Basic
601
601
  assert isb.add_image(image_path, "image_name", "description of image") is None
@@ -1230,10 +1230,8 @@ def test_resultimagesection_init(service_request):
1230
1230
  def test_resultimagesection_add_image(service_request):
1231
1231
  ris = ResultImageSection(service_request, "title_text_as_str")
1232
1232
 
1233
- if os.getcwd().endswith("/test"):
1234
- image_path = os.path.join(os.getcwd(), "test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
1235
- else:
1236
- image_path = os.path.join(os.getcwd(), "test/test_common/b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
1233
+ image_path = os.path.join(os.path.dirname(__file__),
1234
+ "b32969aa664e3905c20f865cdd7b921f922678f5c3850c78e4c803fbc1757a8e")
1237
1235
 
1238
1236
  # Basic
1239
1237
  assert ris.add_image(image_path, "image_name", "description of image") is None