xbase-util 0.0.7__tar.gz → 0.0.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {xbase_util-0.0.7 → xbase_util-0.0.9}/PKG-INFO +2 -1
- {xbase_util-0.0.7 → xbase_util-0.0.9}/setup.py +2 -1
- xbase_util-0.0.9/xbase_util/esreq.py +26 -0
- xbase_util-0.0.9/xbase_util/handle_features_util.py +161 -0
- xbase_util-0.0.9/xbase_util/xbase_util.py +86 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util.egg-info/PKG-INFO +2 -1
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util.egg-info/SOURCES.txt +2 -0
- xbase_util-0.0.7/xbase_util/xbase_util.py +0 -21
- {xbase_util-0.0.7 → xbase_util-0.0.9}/README.md +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/setup.cfg +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util/__init__.py +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util.egg-info/dependency_links.txt +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util.egg-info/not-zip-safe +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util.egg-info/top_level.txt +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util_assets/GeoLite2-City.mmdb +0 -0
- {xbase_util-0.0.7 → xbase_util-0.0.9}/xbase_util_assets/arkimeparse.js +0 -0
@@ -3,13 +3,14 @@ from distutils.core import setup
|
|
3
3
|
from setuptools import find_packages
|
4
4
|
|
5
5
|
setup(name="xbase_util",
|
6
|
-
version="0.0.
|
6
|
+
version="0.0.9",
|
7
7
|
description="网络安全基础工具",
|
8
8
|
long_description="包含提取,预测,训练的基础工具",
|
9
9
|
author="xyt",
|
10
10
|
author_email="2506564278@qq.com",
|
11
11
|
license="<MIT License>",
|
12
12
|
packages=find_packages(),
|
13
|
+
url="https://gitee.com/jimonik/xbase_util.git",
|
13
14
|
install_requires=[
|
14
15
|
],
|
15
16
|
zip_safe=False,
|
@@ -0,0 +1,26 @@
|
|
1
|
+
import requests
|
2
|
+
|
3
|
+
|
4
|
+
class EsReq:
    """Minimal Elasticsearch HTTP client used for scroll-based session export."""

    def __init__(self, url, timeout=120):
        # url: base address of the ES node, e.g. "http://host:9200".
        # timeout: per-request timeout in seconds, applied to every call.
        self.es_url = url
        self.timeout = timeout
        print("初始化自定义es请求类")

    def clear_all_scroll(self):
        """Drop every open scroll context on the cluster."""
        return requests.delete(self.es_url + "/_search/scroll", timeout=self.timeout,
                               json={'scroll_id': '_all'})

    def search(self, body, scroll):
        """POST *body* to the scroll endpoint and return the HTTP response.

        BUGFIX: the original did not return the response, so callers always
        received None.  NOTE(review): requests ignores json= whenever data=
        is non-empty, so the scroll id is only transmitted for an empty
        *body*; the endpoint also looks like it should be
        f"/_search?scroll={scroll}" — confirm the intended API before
        changing either.
        """
        return requests.post(self.es_url + "/_search/scroll", data=body, timeout=self.timeout,
                             json={'scroll_id': scroll})

    def start_scroll(self, exp, scroll):
        """Start a scrolling search with query *exp*.

        NOTE(review): *scroll* is accepted but unused, and the request goes
        to the scroll-continuation endpoint — presumably it should be
        f"/_search?scroll={scroll}"; verify against the callers.
        """
        return requests.post(self.es_url + "/_search/scroll", timeout=self.timeout,
                             json=exp)

    def scroll_by_id(self, scroll_id, scroll):
        """Fetch the next batch for an existing scroll context."""
        return requests.post(self.es_url + "/_search/scroll", timeout=self.timeout,
                             json={'scroll_id': scroll_id, 'scroll': scroll})

    def search_file(self, id):
        """Look up a capture-file record by document id in arkime_files_v30."""
        return requests.get(f"{self.es_url}/arkime_files_v30/_search", timeout=self.timeout,
                            json={"query": {"term": {"_id": id}}})
@@ -0,0 +1,161 @@
|
|
1
|
+
import json
|
2
|
+
import re
|
3
|
+
import traceback
|
4
|
+
from urllib.parse import unquote
|
5
|
+
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
|
9
|
+
def handle_uri(data):
    """Extract URI-based attack features from *data* (expects an 'http.uri' column).

    Each row's URIs are percent-decoded, scanned against attack-signature
    regexes, and summarized into URI_FEATURES_EXTRA_* columns.  Returns the
    DataFrame with the new feature columns concatenated.
    """
    print(f"处理URI:{len(data)}")
    # Attack-signature regexes, compiled once and shared by every row.
    regex_patterns = {
        "sql": re.compile(
            r"\b(select|union|insert|update|delete|drop|--|#| or |' or '|information_schema|database\(\)|version\(\))\b",
            re.IGNORECASE),
        "xss": re.compile(r"(<script\b|javascript:|onload=|onclick=|<iframe\b|src=)", re.IGNORECASE),
        "cmd": re.compile(
            r"(/etc/passwd\b|/etc/shadow\b|;|&&|\||\$\(.+\)|\bcurl\b|\bwget\b|\bexec\b|\bsystem\b|cmd=|proc/self/environ)",
            re.IGNORECASE),
        "path": re.compile(r"(\.\./|\.\.%2f|\.\.%5c|\.\.\\|\.\.;|%2f%2e%2e%2f)", re.IGNORECASE),
        "redirect": re.compile(r"(redirect=|url=|next=|redirect_uri=|redirect:|RedirectTo=)", re.IGNORECASE),
        "danger": re.compile(
            r"(%3C|%3E|%27|%22|%00|%2F|%5C|%3B|%7C|%28|%29|%20|%3D|%3A|%3F|%26|%23|%2B|%25|file://|<foo|xmlns:|/etc/passwd|windows/win\.ini)",
            re.IGNORECASE),
        "suspicious_ext": re.compile(
            r"\.(exe|sh|py|pl|bak|php5|jspx|bat|cmd|pif|js|vbs|vbe|sct|ini|inf|tmp|swp|jar|java|class|ps1)\b",
            re.IGNORECASE)
    }

    def fully_decode_uri(uri):
        """Repeatedly percent-decode *uri* to unwrap nested encodings."""
        try:
            decoded_uri = str(uri)
            for _ in range(3):  # three passes covers typical double/triple encoding
                decoded_uri = unquote(decoded_uri)
            return decoded_uri
        except Exception:
            # Best-effort: fall back to the raw value on any decode failure.
            return uri

    def process_row(row):
        """Compute the URI feature dict for a single DataFrame row."""
        uris = row['http.uri']
        # 'http.uri' may arrive as a list, a JSON-encoded list, or a scalar.
        if not isinstance(uris, list):
            try:
                uris = json.loads(uris)
                if not isinstance(uris, list):
                    uris = [str(uris)]
            except Exception:
                uris = [str(uris)]
        try:
            decoded_uris = [fully_decode_uri(uri) for uri in uris]
        except Exception:
            # BUGFIX: the original called exit(0) here, killing the whole
            # process with a *success* code; re-raise so callers see the error.
            traceback.print_exc()
            raise

        param_count = 0
        path_depth = 0
        param_lengths = []
        feature_flags = {key: False for key in regex_patterns}

        for uri in decoded_uris:
            param_count += uri.count('&') + 1
            path_depth += uri.count('/')
            # Collect the length of every parameter value after the '?'.
            if '?' in uri:
                for param in uri.split('?', 1)[-1].split('&'):
                    if '=' in param:
                        _, value = param.split('=', 1)
                        param_lengths.append(len(value))
            # Flag any attack signature found in the decoded URI.
            for key, pattern in regex_patterns.items():
                if pattern.search(uri):
                    feature_flags[key] = True

        avg_length = sum(param_lengths) / len(param_lengths) if param_lengths else 0
        max_length = max(param_lengths) if param_lengths else 0

        result = {
            "URI_FEATURES_EXTRA_param_count": param_count,
            "URI_FEATURES_EXTRA_path_depth": path_depth,
            "URI_FEATURES_EXTRA_param_length_avg": avg_length,
            "URI_FEATURES_EXTRA_param_length_max": max_length,
        }
        for key, value in feature_flags.items():
            result[f"URI_FEATURES_EXTRA_contains_{key}"] = value
        return result

    # ROBUSTNESS: progress_apply only exists after tqdm.pandas() has been
    # called; fall back to plain .apply so the function also works without it.
    apply_fn = getattr(data, 'progress_apply', data.apply)
    feature_data = apply_fn(process_row, axis=1, result_type="expand")
    data = pd.concat([data, feature_data], axis=1)
    return data
|
100
|
+
|
101
|
+
|
102
|
+
def handle_ua(data):
    """Derive User-Agent feature columns on *data* (expects 'http.useragent').

    Adds boolean attack/enterprise/bot flags plus parsed browser, OS, device,
    platform and language columns.  Mutates and returns *data*.
    """
    print("处理UA")
    data['http.useragent'] = data['http.useragent'].fillna('').astype(str)
    # Collapse newlines and repeated whitespace into single spaces.
    data['http.useragent'] = data['http.useragent'].str.replace(r'\s+', ' ', regex=True)
    # Common attack signatures (SQLi, XSS, command execution, traversal,
    # scanner/tooling UAs ...), matched case-insensitively.
    attack_patterns = '|'.join([
        r"\bselect\b", r"\bunion\b", r"\binsert\b", r"\bupdate\b", r"\bdelete\b", r"\bdrop\b", r"--", r"#", r" or ",
        r"' or '",
        r"information_schema", r"database\(\)", r"version\(\)",  # SQL injection
        r"<script>", r"javascript:", r"onload=", r"onclick=", r"<iframe>", r"src=",  # XSS
        r"/etc/passwd", r"/etc/shadow", r"\&\&", r"\|", r"\$\(\)", r"exec", r"system",  # command execution
        r"\.\./", r"\.\.%2f", r"\.\.%5c", r"%c0%af", r"%252e%252e%252f",  # path traversal
        r"\.php", r"\.asp", r"\.jsp", r"\.exe", r"\.sh", r"\.py", r"\.pl",  # file extensions
        r"redirect=", r"url=", r"next=",  # redirects
        r"%3C", r"%3E", r"%27", r"%22", r"%00", r"%2F", r"%5C", r"%3B", r"%7C", r"%2E", r"%28", r"%29",  # encodings
        r'Googlebot', r'Bingbot', r'Slurp', r'curl', r'wget', r'Nmap',
        r'SQLMap', r'Nikto', r'Dirbuster', r'python-requests', r'Apache-HttpClient',
        r'Postman', r'Burp Suite', r'Fuzzing', r'nessus'
    ])
    # Well-known enterprise / Chinese-ecosystem client UA fragments.
    enterprise_patterns = '|'.join([
        r'MicroMessenger', r'wxwork', r'QQ/', r'QQBrowser', r'Alipay', r'UCWEB'
    ])
    data['UserAgent_is_attack'] = data['http.useragent'].str.contains(attack_patterns, case=False, regex=True)
    data['UserAgent_is_enterprise'] = data['http.useragent'].str.contains(enterprise_patterns, case=False)
    # Browser family and version.
    data['UserAgent_browser'] = data['http.useragent'].str.extract(
        r'(Chrome|Firefox|Safari|MSIE|Edge|Opera|Trident)',
        expand=False, flags=re.IGNORECASE).fillna("Unknown")
    # Multiple capture groups -> DataFrame (expand=True made explicit);
    # backfill across columns and take the first non-null, i.e. whichever
    # version pattern matched.
    data['UserAgent_browser_version'] = data['http.useragent'].str.extract(
        r'Chrome/([\d\.]+)|Firefox/([\d\.]+)|Version/([\d\.]+).*Safari|MSIE ([\d\.]+)|Edge/([\d\.]+)|Opera/([\d\.]+)|Trident/([\d\.]+)',
        expand=True, flags=re.IGNORECASE).bfill(axis=1).fillna("Unknown").iloc[:, 0]
    # Operating system and version.
    os_info = data['http.useragent'].str.extract(
        r'(Windows NT [\d\.]+|Mac OS X [\d_\.]+|Linux|Android [\d\.]+|iOS [\d_\.]+|Ubuntu|Debian|CentOS|Red Hat)',
        expand=False, flags=re.IGNORECASE)
    data['UserAgent_os'] = os_info.str.extract(r'(Windows|Mac OS X|Linux|Android|iOS|Ubuntu|Debian|CentOS|Red Hat)',
                                               expand=False, flags=re.IGNORECASE).fillna("Unknown")
    data['UserAgent_os_version'] = os_info.str.extract(r'([\d\._]+)', expand=False).fillna("Unknown")
    # Device type: mobile keywords vs everything else.
    data['UserAgent_device_type'] = data['http.useragent'].str.contains('mobile|android|iphone', case=False).map(
        {True: 'Mobile', False: 'Desktop'})
    # Hardware platform (includes bare x64 tokens).
    data['UserAgent_platform'] = data['http.useragent'].str.extract(r'(x86|x86_64|arm|arm64|x64)', expand=False,
                                                                    flags=re.IGNORECASE).fillna('Unknown')
    data['UserAgent_is_bot'] = data['http.useragent'].str.contains('bot|crawler|spider|slurp|curl|wget|httpclient',
                                                                   case=False)
    # Language preference tag (e.g. zh-CN) when present.
    data['UserAgent_language'] = data['http.useragent'].str.extract(r'\b([a-z]{2}-[A-Z]{2})\b', expand=False,
                                                                    flags=re.IGNORECASE).fillna("Unknown")
    # Count of special characters; ROBUSTNESS: fall back to plain .apply when
    # tqdm's progress_apply has not been registered on pandas.
    special_char_re = re.compile(r'[!@#$%^&*\'=:|{}]')
    ua_series = data['http.useragent']
    apply_fn = getattr(ua_series, 'progress_apply', ua_series.apply)
    data['UserAgent_special_char_count'] = apply_fn(lambda x: len(special_char_re.findall(x)))
    # BUGFIX: the original checked .isna() AFTER the fillna("Unknown") calls
    # above, so UserAgent_is_unknown was always False.  Flag rows where any
    # parsed field actually stayed "Unknown".
    data['UserAgent_is_unknown'] = (
        data[['UserAgent_browser', 'UserAgent_os', 'UserAgent_platform']] == "Unknown").any(axis=1)
    return data
|
@@ -0,0 +1,86 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
|
4
|
+
import execjs
|
5
|
+
import geoip2.database
|
6
|
+
|
7
|
+
# Resolve bundled asset paths relative to this module's location.
current_dir = os.path.dirname(__file__)
# JS helper that converts Arkime search expressions into ES query JSON.
parse_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'arkimeparse.js')
# MaxMind GeoLite2-City database consumed by geo_reader().
geo_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'GeoLite2-City.mmdb')
|
10
|
+
|
11
|
+
|
12
|
+
def parse_expression(expression):
    """Translate an Arkime search expression via the bundled JS parser.

    Returns the parser's result, or None for an empty/falsy expression.
    Requires a working nodejs runtime for execjs.
    """
    if not expression:
        return None
    with open(parse_path, "r") as script:
        compiled = execjs.compile(script.read())
    return compiled.call("parse_exp", expression)
|
19
|
+
|
20
|
+
|
21
|
+
def geo_reader():
    """Return a new geoip2 Reader over the bundled GeoLite2-City database.

    NOTE: the caller owns the reader and should close it when finished.
    """
    return geoip2.database.Reader(geo_path)
|
23
|
+
|
24
|
+
|
25
|
+
def split_samples(sample, per_subsection):
    """Split *sample* into chunks of at most *per_subsection* items.

    Returns (number_of_chunks, chunk_sizes); when the length is not an exact
    multiple, the final chunk holds the remainder.
    """
    full_chunks, leftover = divmod(len(sample), per_subsection)
    sizes = [per_subsection] * full_chunks
    if leftover > 0:
        sizes.append(leftover)
    return len(sizes), sizes
|
33
|
+
|
34
|
+
|
35
|
+
def split_process(subsection, process_count):
    """Distribute len(subsection) items as evenly as possible over workers.

    Returns one slice length per worker; the first *extra* workers receive
    one additional item, mirroring contiguous slicing of the input.
    """
    base, extra = divmod(len(subsection), process_count)
    return [base + 1 if worker < extra else base for worker in range(process_count)]
|
45
|
+
|
46
|
+
|
47
|
+
def build_es_expression(size, start_time, end_time, arkime_expression):
    """Assemble an Elasticsearch query dict from optional bounds and an Arkime expression.

    size limits the result count; start_time/end_time (datetime) bound
    firstPacket/lastPacket in epoch milliseconds.  Exits the process when the
    JS expression parser is unavailable (nodejs missing) or parsing fails.
    """
    expression = {"query": {"bool": {"filter": []}}}
    try:
        filters = expression['query']['bool']['filter']
        if size:
            expression['size'] = size
        if start_time:
            filters.append({"range": {"firstPacket": {"gte": round(start_time.timestamp() * 1000)}}})
        if end_time:
            filters.append({"range": {"lastPacket": {"lte": round(end_time.timestamp() * 1000)}}})
        arkime_2_es = parse_expression(arkime_expression)
        if arkime_2_es:
            filters.append(arkime_2_es)
        return expression
    except Exception as e:
        print(f"请安装nodejs{e}")
        print(arkime_expression)
        exit(1)
|
66
|
+
|
67
|
+
|
68
|
+
def get_uri_depth(url):
    """Return the number of non-empty path segments in *url*.

    The query string and any scheme://host prefix are ignored, so
    "https://h/a/b?x=1" has depth 2.
    """
    # Everything before the first '?'.  (The original used
    # re.match(r'^[^?]*'), which always matches, so its else-branch
    # returning 0 was unreachable dead code.)
    path = url.split('?', 1)[0]
    # Strip scheme and authority when present.
    path = re.sub(r'^https?://[^/]+', '', path)
    return len([segment for segment in path.split('/') if segment])
|
77
|
+
|
78
|
+
|
79
|
+
def firstOrZero(param):
    """Return the first element of *param* when it is a non-empty list, else 0.

    Uses isinstance instead of the original type-name string comparison, so
    list subclasses are handled too; any non-list (tuples, strings, None)
    still yields 0, preserving backward compatibility.
    """
    if isinstance(param, list) and len(param) != 0:
        return param[0]
    return 0
|
@@ -1,21 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
|
3
|
-
import execjs
|
4
|
-
import geoip2.database
|
5
|
-
|
6
|
-
current_dir = os.path.dirname(__file__)
|
7
|
-
parse_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'arkimeparse.js')
|
8
|
-
geo_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'GeoLite2-City.mmdb')
|
9
|
-
|
10
|
-
|
11
|
-
def parse_expression(expression):
|
12
|
-
if expression:
|
13
|
-
with open(parse_path, "r") as f:
|
14
|
-
ctx = execjs.compile(f.read())
|
15
|
-
return ctx.call("parse_exp", expression)
|
16
|
-
else:
|
17
|
-
return None
|
18
|
-
|
19
|
-
|
20
|
-
def geo_reader():
|
21
|
-
return geoip2.database.Reader(geo_path)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|