assemblyline-v4-service 4.4.0.24__py3-none-any.whl → 4.4.0.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of assemblyline-v4-service might be problematic. Click here for more details.

Files changed (42)
  1. assemblyline_v4_service/VERSION +1 -1
  2. assemblyline_v4_service/common/api.py +3 -2
  3. assemblyline_v4_service/common/base.py +3 -4
  4. assemblyline_v4_service/common/helper.py +1 -2
  5. assemblyline_v4_service/common/{extractor/ocr.py → ocr.py} +0 -1
  6. assemblyline_v4_service/common/ontology_helper.py +7 -8
  7. assemblyline_v4_service/common/request.py +4 -5
  8. assemblyline_v4_service/common/result.py +3 -3
  9. assemblyline_v4_service/common/task.py +3 -3
  10. assemblyline_v4_service/common/utils.py +2 -2
  11. assemblyline_v4_service/updater/helper.py +4 -0
  12. {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/METADATA +1 -1
  13. assemblyline_v4_service-4.4.0.26.dist-info/RECORD +28 -0
  14. assemblyline_v4_service/common/balbuzard/__init__.py +0 -0
  15. assemblyline_v4_service/common/balbuzard/balbuzard.py +0 -656
  16. assemblyline_v4_service/common/balbuzard/bbcrack.py +0 -830
  17. assemblyline_v4_service/common/balbuzard/patterns.py +0 -650
  18. assemblyline_v4_service/common/dynamic_service_helper.py +0 -3631
  19. assemblyline_v4_service/common/extractor/__init__.py +0 -1
  20. assemblyline_v4_service/common/extractor/base64.py +0 -86
  21. assemblyline_v4_service/common/extractor/pe_file.py +0 -51
  22. assemblyline_v4_service/common/icap.py +0 -149
  23. assemblyline_v4_service/common/keytool_parse.py +0 -66
  24. assemblyline_v4_service/common/pestudio/__init__.py +0 -0
  25. assemblyline_v4_service/common/pestudio/xml/__init__.py +0 -0
  26. assemblyline_v4_service/common/pestudio/xml/features.xml +0 -5607
  27. assemblyline_v4_service/common/pestudio/xml/functions.xml +0 -5824
  28. assemblyline_v4_service/common/pestudio/xml/languages.xml +0 -375
  29. assemblyline_v4_service/common/pestudio/xml/resources.xml +0 -511
  30. assemblyline_v4_service/common/pestudio/xml/signatures.xml +0 -29105
  31. assemblyline_v4_service/common/pestudio/xml/strings.xml +0 -2379
  32. assemblyline_v4_service/common/safelist_helper.py +0 -73
  33. assemblyline_v4_service/common/section_reducer.py +0 -43
  34. assemblyline_v4_service/common/tag_helper.py +0 -117
  35. assemblyline_v4_service/common/tag_reducer.py +0 -242
  36. assemblyline_v4_service/testing/__init__.py +0 -0
  37. assemblyline_v4_service/testing/helper.py +0 -463
  38. assemblyline_v4_service/testing/regenerate_results.py +0 -37
  39. assemblyline_v4_service-4.4.0.24.dist-info/RECORD +0 -53
  40. {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/LICENCE.md +0 -0
  41. {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/WHEEL +0 -0
  42. {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/top_level.txt +0 -0
assemblyline_v4_service/common/balbuzard/patterns.py (file removed)
@@ -1,650 +0,0 @@
1
- """
2
- Modified version of patterns.py found here:
3
- https://github.com/decalage2/balbuzard
4
-
5
- Info:
6
- balbuzard patterns - v0.07 2014-02-13 Philippe Lagadec
7
- For more info and updates: http://www.decalage.info/balbuzard
8
- """
9
-
10
- # LICENSE:
11
- #
12
- # balbuzard is copyright (c) 2007-2014, Philippe Lagadec (http://www.decalage.info)
13
- # All rights reserved.
14
- #
15
- # Redistribution and use in source and binary forms, with or without modification,
16
- # are permitted provided that the following conditions are met:
17
- #
18
- # * Redistributions of source code must retain the above copyright notice, this
19
- # list of conditions and the following disclaimer.
20
- # * Redistributions in binary form must reproduce the above copyright notice,
21
- # this list of conditions and the following disclaimer in the documentation
22
- # and/or other materials provided with the distribution.
23
- #
24
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
25
- # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
- # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
29
- # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
- # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31
- # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32
- # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
33
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
-
36
- import regex as re
37
- from os import path
38
- from xml.etree import ElementTree
39
-
40
- from fuzzywuzzy import process
41
-
42
- from assemblyline_v4_service.common.balbuzard.balbuzard import Pattern, Pattern_re
43
-
44
-
45
def get_xml_strings():
    """
    Load the PEStudio indicator strings shipped next to this package.

    Reads ``../pestudio/xml/strings.xml`` and ``../pestudio/xml/functions.xml``
    (both relative to this module's directory) and groups their entries by XML tag.

    Returns:
        tuple: ``(api, blacklist, powershell)``. Each is a dict mapping a tag name to
        the set of strings collected for that tag; a key only exists when at least
        one entry was kept.

        * ``api``: from functions.xml - 'fct' (the text before the first '::'),
          'lib' (the element's ``name`` attribute) and 'topapi'.
        * ``blacklist``: from strings.xml - one key per tag in ``blacklist_tags``.
        * ``powershell``: from strings.xml - the 'powershell' entries.

    An entry is kept when it is longer than ``pest_minlen`` characters. 'protocol'
    entries are the exception: no minimum length is applied to them.
    Elements without text (``.text is None``) or without a ``name`` attribute are
    skipped instead of raising.
    """
    pest_minlen = 6

    # strings.xml tags that feed the blacklist, in the order they are scanned.
    blacklist_tags = ('agent', 'av', 'event', 'guid', 'insult', 'key', 'oid', 'os', 'priv', 'product',
                      'protocol', 'reg', 'sid', 'string')

    api = {}
    blacklist = {}
    powershell = {}

    def keep(text, min_len=pest_minlen):
        # ElementTree reports an element with no text as None; len(None) would raise.
        return text is not None and len(text) > min_len

    with open(path.join(path.dirname(__file__), "../pestudio/xml/strings.xml"), 'rt') as f:
        tree = ElementTree.parse(f)

    for tag in blacklist_tags:
        # -1 means "any length": the 'protocol' tag never had a length check.
        min_len = -1 if tag == 'protocol' else pest_minlen
        for st in tree.findall('.//' + tag):
            if keep(st.text, min_len):
                blacklist.setdefault(tag, set()).add(st.text)

    # Powershell indicator strings
    for st in tree.findall('.//powershell'):
        if keep(st.text):
            powershell.setdefault('powershell', set()).add(st.text)

    # Adding Popular API
    with open(path.join(path.dirname(__file__), '../pestudio/xml/functions.xml'), 'rt') as f:
        tree = ElementTree.parse(f)

    for st in tree.findall(".//fct"):
        if keep(st.text):
            # Keep only the part before the first '::'.
            api.setdefault('fct', set()).add(st.text.split('::', 1)[0])
    for st in tree.findall(".//lib"):
        # .get() so that a <lib> element without a name attribute is skipped, not a KeyError.
        lib_name = st.attrib.get('name')
        if keep(lib_name):
            api.setdefault('lib', set()).add(lib_name)
    for st in tree.findall('.//topapi'):
        if keep(st.text):
            api.setdefault('topapi', set()).add(st.text)

    return api, blacklist, powershell
120
-
121
-
122
- class PatternMatch(object):
123
-
124
- # Curated list to avoid false positives.
125
- TDLS = {b'ac', b'aco', b'ad', b'adac', b'ads', b'ae', b'aeg', b'aero', b'af', b'afl', b'ag', b'agakhan', b'ai',
126
- b'aig', b'akdn', b'al', b'am', b'amica', b'anz', b'ao', b'apple', b'aq', b'ar', b'army', b'arpa', b'at',
127
- b'au', b'aw', b'aws', b'ax', b'axa', b'az', b'ba', b'baidu', b'bbc', b'bbva', b'bcg', b'bcn', b'bd', b'be',
128
- b'bf', b'bg', b'bh', b'bharti', b'bi', b'bing', b'biz', b'bj', b'blog', b'bm', b'bms', b'bn', b'bo',
129
- b'bom', b'bot', b'br', b'bs', b'bt', b'bv', b'bw', b'by', b'bz', b'bzh', b'ca', b'cba', b'cbn', b'cbre',
130
- b'cf', b'cfa', b'cfd', b'cg', b'ch', b'ci', b'ck', b'cl', b'cm', b'cn', b'co', b'com', b'cr',
131
- b'crs', b'csc', b'cu', b'cv', b'cw', b'cx', b'cy', b'cz', b'dclk', b'dds', b'de', b'dev', b'dhl', b'dj',
132
- b'dk', b'dm', b'dnp', b'do', b'docs', b'domains', b'download', b'drive', b'dtv', b'dubai', b'dvag',
133
- b'dz', b'ec', b'edu', b'er', b'erni', b'es', b'esq', b'et', b'eu', b'eurovision', b'eus', b'fi', b'fj',
134
- b'fk', b'flickr', b'flir', b'fly', b'fm', b'fo', b'foo', b'fr', b'frl', b'ftr', b'ga', b'gb',
135
- b'gbiz', b'gd', b'gdn', b'ge', b'gea', b'gl', b'gle', b'gm', b'gmail', b'gmbh', b'gmo', b'gmx', b'gn',
136
- b'goog', b'google', b'gop', b'got', b'gov', b'gp', b'gq', b'gr', b'gs', b'gt', b'gu', b'guru', b'gw', b'gy',
137
- b'hk', b'hkt', b'hm', b'hn', b'host', b'hotmail', b'hr', b'ht', b'hu', b'icu', b'id', b'ie',
138
- b'ifm', b'ikano', b'il', b'im', b'imamat', b'imdb', b'immo', b'immobilien', b'in', b'info',
139
- b'ing', b'ink', b'int', b'io', b'ipiranga', b'iq', b'ir', b'is', b'ist', b'istanbul', b'it', b'itau',
140
- b'itv', b'jaguar', b'jcb', b'je', b'jll', b'jm', b'jmp', b'jnj', b'jo', b'jot',
141
- b'jp', b'ke', b'kfh', b'kg', b'kh', b'ki', b'kia', b'kindle', b'km', b'kn', b'kp', b'kpmg', b'kpn', b'kr',
142
- b'krd', b'kw', b'ky', b'kyoto', b'kz', b'la', b'lat', b'lb', b'lc', b'lds', b'li', b'link', b'lk', b'lol',
143
- b'lr', b'ls', b'lt', b'ltd', b'ltda', b'lu', b'lv', b'ly', b'ma', b'madrid', b'mba', b'mc', b'md', b'me',
144
- b'med', b'meme', b'mg', b'mh', b'microsoft', b'mil', b'mk', b'ml', b'mlb', b'mls', b'mma',
145
- b'mn', b'mo', b'mobi', b'mov', b'mp', b'mq', b'mr', b'ms', b'mt', b'mtn', b'mtr',
146
- b'mu', b'mv', b'mw', b'mx', b'my', b'mz', b'na', b'navy', b'nc', b'ne', b'nec', b'net', b'netbank',
147
- b'neustar', b'nexus', b'nf', b'ng', b'ngo', b'nhk', b'ni', b'nico', b'nl', b'nowruz', b'nowtv', b'np',
148
- b'nr', b'nra', b'nrw', b'ntt', b'nu', b'nyc', b'nz', b'obi', b'ollo', b'om', b'ong', b'onl', b'org', b'ott',
149
- b'ovh', b'pa', b'pccw', b'pe', b'pet', b'pf', b'pg', b'ph', b'pid', b'pin', b'ping', b'pk', b'pl', b'pm',
150
- b'pn', b'pnc', b'pohl', b'porn', b'post', b'pr', b'pro', b'prod', b'ps', b'pt', b'pub', b'pw', b'pwc',
151
- b'py', b'qa', b'qpon', b'quebec', b're', b'ren', b'rio', b'ro', b'rocher', b'rs', b'rsvp', b'ru', b'ruhr',
152
- b'rw', b'rwe', b'ryukyu', b'sa', b'sap', b'sarl', b'sas', b'saxo', b'sb', b'sbi', b'sbs',
153
- b'sc', b'sca', b'scb', b'sd', b'se', b'sew', b'sex', b'sfr', b'sg', b'sh', b'si', b'sina', b'site',
154
- b'sj', b'sk', b'skype', b'sl', b'sm', b'sn', b'sncf', b'so', b'sr', b'srl', b'st', b'stc', b'stcgroup',
155
- b'su', b'sv', b'sx', b'sy', b'sydney', b'systems', b'sz', b'tab',
156
- b'taipei', b'taobao', b'tc', b'tci', b'td', b'tdk', b'tel', b'teva', b'tf', b'tg', b'th', b'thd', b'tj',
157
- b'tk', b'tl', b'tm', b'tmall', b'tn', b'to', b'tokyo', b'tr', b'trv', b'tt', b'tube', b'tui', b'tunes',
158
- b'tushu', b'tv', b'tw', b'tz', b'ua', b'ubs', b'ug', b'uk', b'uno', b'uol', b'ups', b'us', b'uy', b'uz',
159
- b'va', b'vc', b've', b'vet', b'vg', b'vi', b'vig', b'vin', b'vip', b'vn',
160
- b'vu', b'wed', b'weibo', b'weir', b'wf', b'whoswho', b'wien', b'wiki', b'win', b'windows', b'wme', b'ws',
161
- b'wtc', b'wtf', b'xbox', b'xerox', b'xihuan', b'xin', b'xn--11b4c3d', b'xn--1ck2e1b',
162
- b'xn--1qqw23a', b'xn--30rr7y', b'xn--3bst00m', b'xn--3ds443g', b'xn--3e0b707e', b'xn--3pxu8k',
163
- b'xn--42c2d9a', b'xn--45brj9c', b'xn--45q11c', b'xn--4gbrim', b'xn--55qw42g', b'xn--55qx5d',
164
- b'xn--5su34j936bgsg', b'xn--5tzm5g', b'xn--6frz82g', b'xn--6qq986b3xl', b'xn--80adxhks',
165
- b'xn--80ao21a', b'xn--80asehdb', b'xn--80aswg', b'xn--8y0a063a', b'xn--90a3ac', b'xn--90ae',
166
- b'xn--90ais', b'xn--9dbq2a', b'xn--9et52u', b'xn--9krt00a', b'xn--b4w605ferd', b'xn--bck1b9a5dre4c',
167
- b'xn--c1avg', b'xn--c2br7g', b'xn--cck2b3b', b'xn--cg4bki', b'xn--clchc0ea0b2g2a9gcd',
168
- b'xn--czr694b', b'xn--czrs0t', b'xn--czru2d', b'xn--d1acj3b', b'xn--d1alf', b'xn--e1a4c',
169
- b'xn--eckvdtc9d', b'xn--efvy88h', b'xn--fct429k', b'xn--fhbei', b'xn--fiq228c5hs',
170
- b'xn--fiq64b', b'xn--fiqs8s', b'xn--fiqz9s', b'xn--fjq720a', b'xn--flw351e', b'xn--fpcrj9c3d',
171
- b'xn--fzc2c9e2c', b'xn--fzys8d69uvgm', b'xn--g2xx48c', b'xn--gckr3f0f', b'xn--gecrj9c',
172
- b'xn--h2brj9c', b'xn--hxt814e', b'xn--i1b6b1a6a2e', b'xn--imr513n', b'xn--io0a7i', b'xn--j1aef',
173
- b'xn--j1amh', b'xn--j6w193g', b'xn--jlq61u9w7b', b'xn--jvr189m', b'xn--kcrx77d1x4a', b'xn--kprw13d',
174
- b'xn--kpry57d', b'xn--kput3i', b'xn--l1acc', b'xn--lgbbat1ad8j', b'xn--mgb9awbf',
175
- b'xn--mgba3a3ejt', b'xn--mgba3a4f16a', b'xn--mgba7c0bbn0a', b'xn--mgbaam7a8h', b'xn--mgbab2bd',
176
- b'xn--mgbayh7gpa', b'xn--mgbbh1a71e', b'xn--mgbc0a9azcg', b'xn--mgbca7dzdo',
177
- b'xn--mgberp4a5d4ar', b'xn--mgbpl2fh', b'xn--mgbt3dhd', b'xn--mgbtx2b', b'xn--mgbx4cd0ab',
178
- b'xn--mix891f', b'xn--mk1bu44c', b'xn--mxtq1m', b'xn--ngbc5azd', b'xn--ngbe9e0a', b'xn--node',
179
- b'xn--nqv7f', b'xn--nqv7fs00ema', b'xn--nyqy26a', b'xn--o3cw4h', b'xn--ogbpf8fl', b'xn--p1acf',
180
- b'xn--p1ai', b'xn--pgbs0dh', b'xn--pssy2u', b'xn--q9jyb4c', b'xn--qcka1pmc',
181
- b'xn--qxam', b'xn--rhqv96g', b'xn--rovu88b', b'xn--s9brj9c', b'xn--ses554g', b'xn--t60b56a',
182
- b'xn--tckwe', b'xn--unup4y', b'xn--vermgensberater-ctb', b'xn--vermgensberatung-pwb', b'xn--vhquv',
183
- b'xn--vuq861b', b'xn--w4r85el8fhu5dnra', b'xn--w4rs40l', b'xn--wgbh1c', b'xn--wgbl6a',
184
- b'xn--xhq521b', b'xn--xkc2al3hye2a', b'xn--xkc2dl3a5ee0h', b'xn--y9a3aq', b'xn--yfro4i67o',
185
- b'xn--ygbi2ammx', b'xn--zfr164b', b'xyz', b'yahoo', b'yamaxun',
186
- b'yandex', b'ye', b'yokohama', b'you', b'youtube', b'yt', b'yun', b'za', b'zappos',
187
- b'zara', b'zero', b'zm', b'zone', b'zuerich', b'zw'}
188
-
189
- # --- PEStudio Patterns ------------------------------------------------------------------------------------------------
190
-
191
- PEST_API, PEST_BLACKLIST, PEST_POWERSHELL = get_xml_strings()
192
-
193
- # --- Regex Patterns ---------------------------------------------------------------------------------------------------
194
-
195
- PAT_DOMAIN = rb'(?i)\b(?:[A-Z0-9-]+\.)+(?:XN--[A-Z0-9]{4,18}|[A-Z]{2,12})\b'
196
- PAT_FILECOM = rb'(?i)(?:\b[a-z]?[:]?[- _A-Z0-9.\\~]{0,75}[%]?' \
197
- rb'(?:ALLUSERPROFILE|APPDATA|commonappdata|CommonProgramFiles|HOMEPATH|LOCALAPPDATA|' \
198
- rb'ProgramData|ProgramFiles|PUBLIC|SystemDrive|SystemRoot|\\TEMP|USERPROFILE|' \
199
- rb'windir|system32|syswow64|\\user)[%]?\\[-_A-Z0-9\.\\]{1,200}\b|' \
200
- rb'/home/[-_A-Z0-9\./]{0,50}|/usr/local[-_A-Z0-9\./]{0,50}|/usr/bin[-_A-Z0-9\./]{0,50}|' \
201
- rb'/var/log[-_A-Z0-9\./]{0,50}|/etc/(?:shadow|group|passwd))'
202
- PAT_FILEEXT = rb'(?i)\b[a-z]?[:]?[- _A-Z0-9.\\~]{0,200}\w\.' \
203
- rb'(?:7Z|APK|APP|BAT|BIN|CLASS|CMD|DAT|DOC|DOCX|DLL|EML|EXE|JAR|JPEG|JPG|JS|JSE|LNK|LOG|MSI|' \
204
- rb'OSX|PAF|PDF|PNG|PPT|PPTX|PS1|RAR|RTF|SCR|SWF|SYS|[T]?BZ[2]?|TXT|TMP|VBE|VBS|WSF|WSH|XLS' \
205
- rb'|XLSX|ZIP)\b'
206
- PAT_FILEPDB = rb'(?i)\b[-_A-Z0-9.\\]{0,200}\w\.PDB\b'
207
- PAT_EMAIL = rb'(?i)\b[A-Z0-9._%+-]{3,200}@(?:[A-Z0-9-]+\.)+(?:XN--[A-Z0-9]{4,18}|[A-Z]{2,12})\b'
208
- PAT_IP = rb'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
209
- PAT_REGIS = rb'(?i)\b[- _A-Z0-9.\\]{0,25}' \
210
- rb'(?:controlset001|controlset002|currentcontrolset|currentversion|HKCC|HKCR|HKCU|HKDD|' \
211
- rb'hkey_classes_root|hkey_current_config|hkey_current_user|hkey_dyn_data|hkey_local_machine|' \
212
- rb'HKLM|hkey_performance_data|hkey_users|HKPD|internet settings|\\sam|\\software|\\system|' \
213
- rb'\\userinit)' \
214
- rb'\\[-_A-Z0-9.\\ ]{1,200}\b'
215
- PAT_URL = rb'(?i)(?:ftp|http|https)://' \
216
- rb'[A-Z0-9.-]{1,}\.(?:XN--[A-Z0-9]{4,18}|[a-z]{2,12}|[0-9]{1,3})' \
217
- rb'(?::[0-9]{1,5})?' \
218
- rb'(?:/[A-Z0-9/\-\.&%\$#=~\?_+]{3,200}){0,1}'
219
- PAT_ANYHTTP = rb'(?i)http://' \
220
- rb'[A-Z0-9.-]{6,}\.' \
221
- rb'(?:XN--[A-Z0-9]{4,18}|[a-z]{2,12}|[0-9]{1,3})' \
222
- rb'(?::[0-9]{1,5})?' \
223
- rb'/[A-Z0-9/\-\.&%\$#=~\?_+]{5,}[\r\n]*'
224
- PAT_ANYHTTPS = rb'(?i)https://' \
225
- rb'[A-Z0-9.-]{6,}\.' \
226
- rb'(?:XN--[A-Z0-9]{4,18}|[a-z]{2,12}|[0-9]{1,3})' \
227
- rb'(?::[0-9]{1,5})?' \
228
- rb'/[A-Z0-9/\-\.&%\$#=~\?_+]{5,}[\r\n]*'
229
- PAT_ANYFTP = rb'(?i)ftp://' \
230
- rb'[A-Z0-9.-]{6,}\.' \
231
- rb'(?:XN--[A-Z0-9]{4,18}|[a-z]{2,12}|[0-9]{1,3})' \
232
- rb'(?::[0-9]{1,5})?' \
233
- rb'/[A-Z0-9/\-\.&%\$#=~\?_+]{5,}[\r\n]*'
234
- PAT_URI_NO_PROTOCOL = rb'(?:(?:(?:[A-Za-z]*:)?//)' \
235
- rb'(?:[^"\']\S+(?::\S*)?@)?' \
236
- rb'(?:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}' \
237
- rb'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)' \
238
- rb'|(?:(?:[A-Za-z0-9\\u00a1-\\uffff][A-Za-z0-9\\u00a1-\\uffff_-]{0,62})?' \
239
- rb'[A-Za-z0-9\\u00a1-\\uffff]\.)+(?:xn--)?' \
240
- rb'(?:[A-Za-z0-9\\u00a1-\\uffff]{2,}\.?))' \
241
- rb'(?::\d{2,5})?)' \
242
- rb'(?:[/?#=&%][a-zA-Z0-9.\-_~\$\.\+\!\*\'\(\)\,]+)*'
243
-
244
- PAT_EXEDOS = rb'This program cannot be run in DOS mode'
245
- PAT_EXEHEADER = rb'(?s)MZ.{32,1024}PE\000\000'
246
-
247
- # --- Find Match for IOC Regex, Return Dictionary: {[AL Tag Type:(Match Values)]} --------------------------------------
248
-
249
def ioc_match(self, value, bogon_ip=None, just_network=None):
    """
    Scan ``value`` for indicators of compromise and group the hits by tag name.

    NOTE(review): this listing was recovered from a diff dump that had lost all
    indentation. The nesting below was reconstructed from the statement order; the
    places where more than one nesting would still parse are flagged inline and
    should be confirmed against the packaged 4.4.0.24 file.

    Checks run in a fixed order: IP addresses, URLs (plus the domain found inside
    each URL), e-mail addresses, domain names and then - unless ``just_network`` is
    truthy - PDB paths, file names, registry keys and the three PEStudio string
    lists. Several checks return straight away when one match is as long as the
    whole of ``value``.

    Args:
        value: the data to scan. The patterns are bytes patterns and the PEStudio
            entries are encoded to bytes before the ``in`` test, so this is
            presumably a bytes object - confirm against callers.
        bogon_ip: forwarded to ``ipv4_filter`` as its ``bogon`` argument.
        just_network: when truthy, return right after the domain check.

    Returns:
        dict mapping a tag name (for example 'network.static.ip') to the set of
        matched values for that tag.
    """
    # NOTES:
    # '(?i)' makes a regex case-insensitive
    # \b matches a word boundary, it can help speeding up regex search and avoiding some false positives.
    # See http://www.regular-expressions.info/wordboundaries.html
    value_extract = {}
    # ------------------------------------------------------------------------------
    # IP ADDRESSES
    # Pattern_re("IP addresses", r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", weight=10),
    # Here I use \b to make sure there is no other digit around and to speedup search
    find_ip = re.findall(self.PAT_IP, value)
    if len(find_ip) > 0:
        longeststring = max(find_ip, key=len)
        if len(longeststring) == len(value):
            not_filtered = self.ipv4_filter(value, bogon=bogon_ip)
            if not_filtered:
                value_extract.setdefault('network.static.ip', set()).add(value)
            # If the complete value matches the IP regex, not interested in other regex values
            # NOTE(review): return placed outside the filter check, as the comment above implies - confirm.
            return value_extract
        if len(find_ip) == 1:
            for val in find_ip:
                not_filtered = self.ipv4_filter(val, bogon=bogon_ip)
                if not_filtered:
                    value_extract.setdefault('network.static.ip', set()).add(val)
        else:
            # Several matches: keep the longest one plus every match whose score against it is below 99.
            # NOTE(review): str() of a bytes match yields its repr ("b'...'"); confirm this is intended.
            like_ls = process.extract(str(longeststring), find_ip, limit=50)
            final_values = list(filter(lambda ls: ls[1] < 99, like_ls))
            final_values.append((longeststring, 100))
            for val in final_values:
                not_filtered = self.ipv4_filter(val[0], bogon=bogon_ip)
                if not_filtered:
                    value_extract.setdefault('network.static.ip', set()).add(val[0])
    # ------------------------------------------------------------------------------
    # URLs
    find_url = re.findall(self.PAT_URL, value)
    if len(find_url) > 0:
        # ret remembers that one URL spans the whole value, so we return once its domain is recorded.
        ret = False
        longeststring = max(find_url, key=len)
        if len(longeststring) == len(value):
            ret = True
            final_values = [(value, 100)]
        elif len(find_url) == 1:
            final_values = [(find_url[0], 100)]
        else:
            like_ls = process.extract(str(longeststring), find_url, limit=50)
            final_values = list(filter(lambda ls: ls[1] < 95, like_ls))
            final_values.append((longeststring, 100))

        for val in final_values:
            value_extract.setdefault('network.static.uri', set()).add(val[0])

            # Extract domain from URL
            find_domain = re.findall(self.PAT_DOMAIN, val[0])
            if len(find_domain) != 0:
                longeststring = max(find_domain, key=len)
                not_filtered = self.domain_filter(longeststring)
                if not_filtered:
                    value_extract.setdefault('network.static.domain', set()).add(longeststring)
        if ret:
            return value_extract
    # ------------------------------------------------------------------------------
    # E-MAIL ADDRESSES
    # r'(?i)\b[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+(?:[A-Z]{2}|com|org|net|edu|gov|mil|int|biz|info|mobi|name|aero|asia|jobs|museum)\b',
    # changed to catch all current TLDs registered at IANA (in combination with filter function):
    # TLD = either only chars from 2 to 12, or 'XN--' followed by up to 18 chars and digits
    find_email = re.findall(self.PAT_EMAIL, value)
    if len(find_email) > 0:
        longeststring = max(find_email, key=len)
        if len(longeststring) == len(value):
            not_filtered = self.email_filter(value)
            if not_filtered:
                value_extract.setdefault('network.email.address', set()).add(value)
                # NOTE(review): return assumed to sit inside the filter check - confirm against the original.
                return value_extract
        if len(find_email) == 1:
            for val in find_email:
                not_filtered = self.email_filter(val)
                if not_filtered:
                    value_extract.setdefault('network.email.address', set()).add(val)
        else:
            like_ls = process.extract(str(longeststring), find_email, limit=50)
            final_values = list(filter(lambda ls: ls[1] < 95, like_ls))
            final_values.append((longeststring, 100))
            for val in final_values:
                not_filtered = self.email_filter(val[0])
                if not_filtered:
                    value_extract.setdefault('network.email.address', set()).add(val[0])
    # ------------------------------------------------------------------------------
    # DOMAIN NAMES
    # Old: r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(?<!-)\.?)+(?:[a-zA-Z]{2,})$)'
    # Below is taken from email regex above
    find_domain = re.findall(self.PAT_DOMAIN, value)
    # Only consider domains when the longest candidate is more than 11 bytes long.
    if len(find_domain) > 0 and len(max(find_domain, key=len)) > 11:
        longeststring = max(find_domain, key=len)
        if len(longeststring) == len(value):
            not_filtered = self.domain_filter(value)
            if not_filtered:
                value_extract.setdefault('network.static.domain', set()).add(value)
                # NOTE(review): return assumed to sit inside the filter check - confirm against the original.
                return value_extract
        if len(find_domain) == 1:
            for val in find_domain:
                not_filtered = self.domain_filter(val)
                if not_filtered:
                    value_extract.setdefault('network.static.domain', set()).add(val)
        else:
            like_ls = process.extract(str(longeststring), find_domain, limit=50)
            final_values = list(filter(lambda ls: ls[1] < 95, like_ls))
            final_values.append((longeststring, 100))
            for val in final_values:
                not_filtered = self.domain_filter(val[0])
                if not_filtered:
                    value_extract.setdefault('network.static.domain', set()).add(val[0])

    if just_network:
        return value_extract

    # ------------------------------------------------------------------------------
    # FILENAMES
    # Check length
    # Ends with extension of interest or contains strings of interest
    filefind_pdb = re.findall(self.PAT_FILEPDB, value)
    if len(filefind_pdb) > 0:
        if len(max(filefind_pdb, key=len)) > 6:
            longeststring = max(filefind_pdb, key=len)
            if len(longeststring) == len(value):
                value_extract.setdefault('file.pe.pdb_filename', set()).add(value)
                return value_extract
            if len(filefind_pdb) == 1:
                for val in filefind_pdb:
                    value_extract.setdefault('file.pe.pdb_filename', set()).add(val)
            else:
                like_ls = process.extract(str(longeststring), filefind_pdb, limit=50)
                final_values = list(filter(lambda ls: ls[1] < 95, like_ls))
                final_values.append((longeststring, 100))
                for val in final_values:
                    value_extract.setdefault('file.pe.pdb_filename', set()).add(val[0])
    filefind_ext = re.findall(self.PAT_FILEEXT, value)
    if len(filefind_ext) > 0:
        if len(max(filefind_ext, key=len)) > 6:
            longeststring = max(filefind_ext, key=len)
            if len(longeststring) == len(value):
                value_extract.setdefault('file.name.extracted', set()).add(value)
                return value_extract
            if len(filefind_ext) == 1:
                for val in filefind_ext:
                    value_extract.setdefault('file.name.extracted', set()).add(val)
            else:
                like_ls = process.extract(str(longeststring), filefind_ext, limit=50)
                final_values = list(filter(lambda ls: ls[1] < 95, like_ls))
                final_values.append((longeststring, 100))
                for val in final_values:
                    value_extract.setdefault('file.name.extracted', set()).add(val[0])
    filefind_com = re.findall(self.PAT_FILECOM, value)
    if len(filefind_com) > 0 and len(max(filefind_com, key=len)) > 6:
        longeststring = max(filefind_com, key=len)
        if len(longeststring) == len(value):
            value_extract.setdefault('file.name.extracted', set()).add(value)
            return value_extract
        if len(filefind_com) == 1:
            for val in filefind_com:
                value_extract.setdefault('file.name.extracted', set()).add(val)
        else:
            like_ls = process.extract(str(longeststring), filefind_com, limit=50)
            final_values = list(filter(lambda ls: ls[1] < 95, like_ls))
            final_values.append((longeststring, 100))
            for val in final_values:
                value_extract.setdefault('file.name.extracted', set()).add(val[0])
    # ------------------------------------------------------------------------------
    # REGISTRYKEYS
    # Looks for alphanumeric characters separated by at least two sets of '\'s
    regfind = re.findall(self.PAT_REGIS, value)
    if len(regfind) > 0 and len(max(regfind, key=len)) > 15:
        longeststring = max(regfind, key=len)
        if len(longeststring) == len(value):
            value_extract.setdefault('dynamic.registry_key', set()).add(value)
            return value_extract
        if len(regfind) == 1:
            for val in regfind:
                value_extract.setdefault('dynamic.registry_key', set()).add(val)
        else:
            like_ls = process.extract(str(longeststring), regfind, limit=50)
            final_values = list(filter(lambda ls: ls[1] < 90, like_ls))
            final_values.append((longeststring, 100))
            for val in final_values:
                value_extract.setdefault('dynamic.registry_key', set()).add(val[0])
    # ------------------------------------------------------------------------------
    # PEStudio Blacklist
    # Flags strings from PEStudio's Blacklist
    # Plain substring test: every blacklist entry, encoded as UTF-8, that occurs anywhere in value.
    final_values = []
    for k, i in self.PEST_BLACKLIST.items():
        for e in i:
            val = bytes(e, 'utf8')
            if val in value:
                final_values.append(val)
    if len(final_values) > 0:
        # Plain assignment (not setdefault): the key is created fresh here.
        value_extract['file.string.blacklisted'] = set()
    for val in final_values:
        value_extract['file.string.blacklisted'].add(val)
    # -----------------------------------------------------------------------------
    # Function/Library Strings
    # Win API strings from PEStudio's Blacklist
    final_values = []
    for k, i in self.PEST_API.items():
        for e in i:
            val = bytes(e, 'utf8')
            if val in value:
                final_values.append(val)
    if len(final_values) > 0:
        value_extract['file.string.api'] = set()
    for val in final_values:
        value_extract['file.string.api'].add(val)
    # -----------------------------------------------------------------------------
    # Powershell Strings
    # Powershell Cmdlets added to PEStudio's strings.xml list
    final_values = []
    for k, i in self.PEST_POWERSHELL.items():
        for e in i:
            val = bytes(e, 'utf8')
            if val in value:
                final_values.append(val)
    if len(final_values) > 0:
        value_extract['file.powershell.cmdlet'] = set()
    for val in final_values:
        value_extract['file.powershell.cmdlet'].add(val)

    return value_extract
480
-
481
- # --- Filters ----------------------------------------------------------------------------------------------------------
482
-
483
@staticmethod
def ipv4_filter(value, bogon=None, **_):
    """
    Decide whether a dotted-quad candidate is worth reporting as an IPv4 address.

    Args:
        value: the candidate, as bytes (for example ``b'93.184.216.34'``).
        bogon: when not None, addresses in the bogon (reserved / private /
            multicast) ranges listed below are rejected as well. Note that the test
            is ``is not None``, so ``bogon=False`` also enables the check.

    Returns:
        False when the candidate
          * starts with the character '0',
          * is not made of exactly four decimal octets, each at most 255,
          * ends with '.0' or '.255',
          * or (bogon check enabled) falls in a bogon range;
        True otherwise.

    Malformed input (empty or non-numeric octets, too few or too many octets)
    yields False instead of raising.
    """
    ip = value

    # Rejects 0.x.x.x as well as any first octet written with a leading zero.
    if ip.startswith(b'0'):
        return False

    parts = ip.split(b'.')
    if len(parts) != 4 or not all(part.isdigit() for part in parts):
        return False
    octets = [int(part) for part in parts]
    if any(octet > 255 for octet in octets):
        return False

    # also reject IPs ending with .0 or .255
    if ip.endswith((b'.0', b'.255')):
        return False

    # BOGON IP ADDRESS RANGES:
    # source: http://www.team-cymru.org/Services/Bogons/bogon-dd.html
    if bogon is not None:
        byte1, byte2 = octets[0], octets[1]

        # actually we might want to see the following bogon IPs if malware uses them
        # => this should be an option
        # Ranges that can be recognised from the textual prefix alone:
        #   10.0.0.0/8, 127.0.0.0/8, 169.254.0.0/16, 192.0.0.0/24, 192.0.2.0/24,
        #   192.168.0.0/16, 198.51.100.0/24, 203.0.113.0/24
        if ip.startswith((b'10.', b'127.', b'169.254.', b'192.0.0.', b'192.0.2.', b'192.168.',
                          b'198.51.100.', b'203.0.113.')):
            return False
        # 100.64.0.0 255.192.0.0
        if byte1 == 100 and (byte2 & 192 == 64):
            return False
        # 172.16.0.0 255.240.0.0
        if byte1 == 172 and (byte2 & 240 == 16):
            return False
        # 198.18.0.0 255.254.0.0
        if byte1 == 198 and (byte2 & 254 == 18):
            return False
        # 224.0.0.0 240.0.0.0 and 240.0.0.0 240.0.0.0
        if byte1 & 240 in (224, 240):
            return False

    # otherwise it's a valid IP address
    return True
559
-
560
def email_filter(self, value, **_):
    """
    Decide whether an e-mail candidate is worth reporting.

    Args:
        value: the candidate address, as bytes.

    Returns:
        True when the part before the first '@' has at least 3 bytes, the part
        after it has at least 5 bytes and its last dot-separated label (compared
        in lower case) is in ``self.TDLS``; False otherwise, including when the
        candidate has no '@' or its domain has no '.'.
    """
    # Ideas noted by the original author but not implemented: rejecting mixed-case
    # domains, and a cached DNS MX lookup.

    # partition() never raises: a candidate without '@' gives an empty separator.
    user, at_sign, domain = value.partition(b'@')
    if not at_sign:
        return False
    if len(user) < 3:
        return False
    if len(domain) < 5:
        return False

    # rpartition() likewise gives an empty separator when the domain has no dot.
    _, dot, tld = domain.rpartition(b'.')
    if not dot:
        return False

    return tld.lower() in self.TDLS
579
-
580
def domain_filter(self, value, **_):
    """
    Decide whether a domain-name candidate is worth reporting.

    Args:
        value: the candidate domain, as bytes.

    Returns:
        False when the candidate
          * is shorter than 10 bytes,
          * contains more than 3 dots,
          * uses fewer than 6 distinct byte values,
          * has no dot at all,
          * has exactly two labels and the first one is shorter than 6 bytes,
          * or ends in a label (compared in lower case) that is not in ``self.TDLS``;
        True otherwise.
    """
    if len(value) < 10:
        return False
    # No more than 3 domain names
    if value.count(b'.') > 3:
        return False
    # Count distinct bytes of the value itself. Going through str(value) would also
    # count the "b" and quote characters of the bytes repr and let low-variety
    # candidates slip through.
    if len(set(value)) < 6:
        return False

    labels = value.split(b'.')
    if len(labels) < 2:
        # No dot: there is no top-level label to look up.
        return False
    if len(labels) == 2 and len(labels[0]) < 6:
        return False

    return labels[-1].lower() in self.TDLS
597
-
598
@staticmethod
def str_filter(value, **_):
    """
    String filter: avoid false positives with random case. A typical string
    should be either:
    - all UPPERCASE
    - all lowercase
    - or Capitalized

    Args:
        value: the string (or bytes) to check.

    Returns:
        True if the case pattern is one of the above, False otherwise (always a
        bool, never None).

    Usage: This filter is meant to be used with string patterns that catch words
    with the option nocase=True, but where random case is not likely.
    Note 1: It is assumed the string only contains alphabetical characters (a-z)
    Note 2: this filter does not cover CamelCase strings.
    """
    # case 1: all UPPERCASE
    # case 2: all lowercase - tested on its own so that a one-character lowercase
    #         string passes (its value[1:] is empty, and ''.islower() is False)
    # case 3: all lowercase except 1st character which can be uppercase (Capitalized)
    return bool(value.isupper() or value.islower() or value[1:].islower())
    # Note: we could also use istitle() if strings are not only alphabetical.
617
-
618
@staticmethod
def len_filter(value, **_):
    """
    Length filter: accept only values that are at least 10 items long.

    Args:
        value: any sized object (string, bytes, ...).

    Returns:
        True if ``len(value) >= 10``, False otherwise.
    """
    return len(value) >= 10
623
-
624
- # --- BBCrack Patterns -------------------------------------------------------------------------------------------------
625
-
626
def bbcr(self, level=1):
    """
    Build the list of patterns for the requested level.

    Args:
        level: selects the pattern set.
            'small_string' - only the FTP / HTTP / HTTPS full-URI patterns.
            2 - the executable-header, DOS-stub and URL patterns, plus one
                pattern per PEStudio 'topapi' / 'lib' string longer than 6
                characters.
            any other value (default 1) - the executable-header, DOS-stub and
                URL patterns only.

    Returns:
        list of Pattern_re / Pattern objects.
    """

    if level == 'small_string':
        bbcrack_patterns = [
            Pattern_re("FTP://_NET_FULL_URI", self.PAT_ANYFTP, weight=100),
            Pattern_re("HTTP://_NET_FULL_URI", self.PAT_ANYHTTP, weight=100),
            Pattern_re("HTTPS://_NET_FULL_URI", self.PAT_ANYHTTPS, weight=100),
        ]
        return bbcrack_patterns

    bbcrack_patterns = [
        Pattern_re("EXE_HEAD", self.PAT_EXEHEADER, weight=100),
        Pattern_re("EXE_DOS", self.PAT_EXEDOS, weight=100),
        Pattern_re("NET_FULL_URI", self.PAT_URL, weight=100),
    ]

    if level == 2:
        # Add PEStudio's API strings from the 'topapi' and 'lib' groups.
        # NOTE(review): the previous comment said the weight "will default to 1",
        # but weight=1000 is passed explicitly below.
        for k, i in self.PEST_API.items():
            if k == "topapi" or k == "lib":
                for e in i:
                    if len(e) > 6:
                        bbcrack_patterns.append(Pattern('file.string.api', e, nocase=True, weight=1000))

    return bbcrack_patterns