xbase-util 0.0.8__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xbase_util-0.0.8 → xbase_util-0.0.9}/PKG-INFO +2 -1
- {xbase_util-0.0.8 → xbase_util-0.0.9}/setup.py +2 -1
- xbase_util-0.0.9/xbase_util/esreq.py +26 -0
- xbase_util-0.0.9/xbase_util/handle_features_util.py +161 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util.egg-info/PKG-INFO +2 -1
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util.egg-info/SOURCES.txt +2 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/README.md +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/setup.cfg +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util/__init__.py +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util/xbase_util.py +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util.egg-info/dependency_links.txt +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util.egg-info/not-zip-safe +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util.egg-info/top_level.txt +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util_assets/GeoLite2-City.mmdb +0 -0
- {xbase_util-0.0.8 → xbase_util-0.0.9}/xbase_util_assets/arkimeparse.js +0 -0
{xbase_util-0.0.8 → xbase_util-0.0.9}/setup.py
@@ -3,13 +3,14 @@ from distutils.core import setup
 from setuptools import find_packages
 
 setup(name="xbase_util",
-      version="0.0.8",
+      version="0.0.9",
       description="网络安全基础工具",
       long_description="包含提取,预测,训练的基础工具",
       author="xyt",
       author_email="2506564278@qq.com",
       license="<MIT License>",
       packages=find_packages(),
+      url="https://gitee.com/jimonik/xbase_util.git",
       install_requires=[
       ],
       zip_safe=False,
xbase_util-0.0.9/xbase_util/esreq.py
@@ -0,0 +1,26 @@
+import requests
+
+
+class EsReq:
+    def __init__(self, url, timeout=120):
+        self.es_url = url
+        self.timeout = timeout
+        print("初始化自定义es请求类")
+
+    def clear_all_scroll(self):
+        return requests.delete(self.es_url + "/_search/scroll", timeout=self.timeout, json={'scroll_id': '_all'})
+
+    def search(self, body, scroll):
+        requests.post(self.es_url + "/_search/scroll", data=body, timeout=self.timeout, json={'scroll_id': scroll})
+
+    def start_scroll(self, exp, scroll):
+        return requests.post(self.es_url + "/_search/scroll", timeout=self.timeout,
+                             json=exp)
+
+    def scroll_by_id(self, scroll_id, scroll):
+        return requests.post(self.es_url + "/_search/scroll", timeout=self.timeout,
+                             json={'scroll_id': scroll_id, 'scroll': scroll})
+
+    def search_file(self, id):
+        return requests.get(f"{self.es_url}/arkime_files_v30/_search", timeout=self.timeout,
+                            json={"query": {"term": {"_id": id}}})
xbase_util-0.0.9/xbase_util/handle_features_util.py
@@ -0,0 +1,161 @@
+import json
+import re
+import traceback
+from urllib.parse import unquote
+
+import pandas as pd
+
+
+def handle_uri(data):
+    print(f"处理URI:{len(data)}")
+    # 定义正则表达式,确保精确匹配各种攻击特征
+    regex_patterns = {
+        "sql": re.compile(
+            r"\b(select|union|insert|update|delete|drop|--|#| or |' or '|information_schema|database\(\)|version\(\))\b",
+            re.IGNORECASE),
+        "xss": re.compile(r"(<script\b|javascript:|onload=|onclick=|<iframe\b|src=)", re.IGNORECASE),
+        "cmd": re.compile(
+            r"(/etc/passwd\b|/etc/shadow\b|;|&&|\||\$\(.+\)|\bcurl\b|\bwget\b|\bexec\b|\bsystem\b|cmd=|proc/self/environ)",
+            re.IGNORECASE),
+        "path": re.compile(r"(\.\./|\.\.%2f|\.\.%5c|\.\.\\|\.\.;|%2f%2e%2e%2f)", re.IGNORECASE),
+        "redirect": re.compile(r"(redirect=|url=|next=|redirect_uri=|redirect:|RedirectTo=)", re.IGNORECASE),
+        "danger": re.compile(
+            r"(%3C|%3E|%27|%22|%00|%2F|%5C|%3B|%7C|%28|%29|%20|%3D|%3A|%3F|%26|%23|%2B|%25|file://|<foo|xmlns:|/etc/passwd|windows/win\.ini)",
+            re.IGNORECASE),
+        "suspicious_ext": re.compile(
+            r"\.(exe|sh|py|pl|bak|php5|jspx|bat|cmd|pif|js|vbs|vbe|sct|ini|inf|tmp|swp|jar|java|class|ps1)\b",
+            re.IGNORECASE)
+    }
+
+    # 定义多层解码函数,确保完全解码 URI
+    def fully_decode_uri(uri):
+        try:
+            decoded_uri = str(uri)
+            for _ in range(3):  # 尝试多次解码嵌套的编码
+                decoded_uri = unquote(decoded_uri)
+            return decoded_uri
+        except Exception as e:
+            return uri
+
+    def process_row(row):
+        uris = row['http.uri']
+        if not isinstance(uris, list):
+            try:
+                uris = json.loads(uris)
+                if not isinstance(uris, list):
+                    uris = [str(uris)]
+            except Exception:
+                uris = [str(uris)]
+        try:
+            decoded_uris = [fully_decode_uri(uri) for uri in uris]
+        except Exception as e:
+            traceback.print_exc()
+            exit(0)
+
+
+        # 初始化统计变量
+        param_count = 0
+        path_depth = 0
+        param_lengths = []
+        feature_flags = {key: False for key in regex_patterns.keys()}
+
+        # 遍历解码后的 URI
+        for uri in decoded_uris:
+            param_count += uri.count('&') + 1
+            path_depth += uri.count('/')
+
+            # 提取参数长度
+            if '?' in uri:
+                params = uri.split('?', 1)[-1].split('&')
+                for param in params:
+                    if '=' in param:
+                        _, value = param.split('=', 1)
+                        param_lengths.append(len(value))
+
+            # 检查正则匹配特征
+            for key, pattern in regex_patterns.items():
+                if pattern.search(uri):
+                    feature_flags[key] = True
+
+        # 计算参数长度的统计值
+        avg_length = sum(param_lengths) / len(param_lengths) if param_lengths else 0
+        max_length = max(param_lengths) if param_lengths else 0
+
+        # 创建返回结果字典
+        result = {
+            "URI_FEATURES_EXTRA_param_count": param_count,
+            "URI_FEATURES_EXTRA_path_depth": path_depth,
+            "URI_FEATURES_EXTRA_param_length_avg": avg_length,
+            "URI_FEATURES_EXTRA_param_length_max": max_length,
+        }
+
+        # 添加特征标志到结果
+        for key, value in feature_flags.items():
+            result[f"URI_FEATURES_EXTRA_contains_{key}"] = value
+
+        return result
+    feature_data = data.progress_apply(process_row, axis=1, result_type="expand")
+    data = pd.concat([data, feature_data], axis=1)
+    return data
+
+
+def handle_ua(data):
+    print("处理UA")
+    data['http.useragent'] = data['http.useragent'].fillna('').astype(str)
+    # 处理换行符及多余空格
+    data['http.useragent'] = data['http.useragent'].str.replace(r'\s+', ' ', regex=True)
+    # 常见攻击的 User-Agent 字符串匹配模式,忽略大小写
+    attack_patterns = '|'.join([
+        r"\bselect\b", r"\bunion\b", r"\binsert\b", r"\bupdate\b", r"\bdelete\b", r"\bdrop\b", r"--", r"#", r" or ",
+        r"' or '",
+        r"information_schema", r"database\(\)", r"version\(\)",  # SQL注入相关
+        r"<script>", r"javascript:", r"onload=", r"onclick=", r"<iframe>", r"src=",  # XSS相关
+        r"/etc/passwd", r"/etc/shadow", r"\&\&", r"\|", r"\$\(\)", r"exec", r"system",  # 命令执行相关
+        r"\.\./", r"\.\.%2f", r"\.\.%5c", r"%c0%af", r"%252e%252e%252f",  # 路径遍历
+        r"\.php", r"\.asp", r"\.jsp", r"\.exe", r"\.sh", r"\.py", r"\.pl",  # 文件扩展名
+        r"redirect=", r"url=", r"next=",  # 重定向
+        r"%3C", r"%3E", r"%27", r"%22", r"%00", r"%2F", r"%5C", r"%3B", r"%7C", r"%2E", r"%28", r"%29",  # 编码
+        r'Googlebot', r'Bingbot', r'Slurp', r'curl', r'wget', r'Nmap',
+        r'SQLMap', r'Nikto', r'Dirbuster', r'python-requests', r'Apache-HttpClient',
+        r'Postman', r'Burp Suite', r'Fuzzing', r'nessus'
+    ])
+    # 企业客户端 User-Agent 模式
+    enterprise_patterns = '|'.join([
+        r'MicroMessenger', r'wxwork', r'QQ/', r'QQBrowser', r'Alipay', r'UCWEB'
+    ])
+    # 批量检查是否为攻击的 User-Agent,忽略大小写
+    data['UserAgent_is_attack'] = data['http.useragent'].str.contains(attack_patterns, case=False, regex=True)
+    # 批量检查是否为企业客户端,忽略大小写
+    data['UserAgent_is_enterprise'] = data['http.useragent'].str.contains(enterprise_patterns, case=False)
+    # 提取浏览器和版本
+    data['UserAgent_browser'] = data['http.useragent'].str.extract(r'(Chrome|Firefox|Safari|MSIE|Edge|Opera|Trident)',
+                                                                   expand=False, flags=re.IGNORECASE).fillna("Unknown")
+    data['UserAgent_browser_version'] = data['http.useragent'].str.extract(
+        r'Chrome/([\d\.]+)|Firefox/([\d\.]+)|Version/([\d\.]+).*Safari|MSIE ([\d\.]+)|Edge/([\d\.]+)|Opera/([\d\.]+)|Trident/([\d\.]+)',
+        expand=False, flags=re.IGNORECASE).bfill(axis=1).fillna("Unknown").iloc[:, 0]
+    # 提取操作系统和版本
+    os_info = data['http.useragent'].str.extract(
+        r'(Windows NT [\d\.]+|Mac OS X [\d_\.]+|Linux|Android [\d\.]+|iOS [\d_\.]+|Ubuntu|Debian|CentOS|Red Hat)',
+        expand=False, flags=re.IGNORECASE)
+    data['UserAgent_os'] = os_info.str.extract(r'(Windows|Mac OS X|Linux|Android|iOS|Ubuntu|Debian|CentOS|Red Hat)',
+                                               expand=False, flags=re.IGNORECASE).fillna("Unknown")
+    data['UserAgent_os_version'] = os_info.str.extract(r'([\d\._]+)', expand=False).fillna("Unknown")
+    # 提取设备类型,忽略大小写
+    data['UserAgent_device_type'] = data['http.useragent'].str.contains('mobile|android|iphone', case=False).map(
+        {True: 'Mobile', False: 'Desktop'})
+    # 提取硬件平台,增加对 x64 的匹配
+    data['UserAgent_platform'] = data['http.useragent'].str.extract(r'(x86|x86_64|arm|arm64|x64)', expand=False,
+                                                                    flags=re.IGNORECASE).fillna('Unknown')
+    # 判断是否为爬虫,忽略大小写
+    data['UserAgent_is_bot'] = data['http.useragent'].str.contains('bot|crawler|spider|slurp|curl|wget|httpclient',
+                                                                   case=False)
+    # 提取语言偏好(如果存在),忽略大小写
+    data['UserAgent_language'] = data['http.useragent'].str.extract(r'\b([a-z]{2}-[A-Z]{2})\b', expand=False,
+                                                                    flags=re.IGNORECASE).fillna("Unknown")
+    # 统计 User-Agent 中的特殊字符个数
+    data['UserAgent_special_char_count'] = data['http.useragent'].progress_apply(
+        lambda x: len(re.findall(r'[!@#$%^&*\'=:|{}]', x, flags=re.IGNORECASE)))
+    # 更新 UserAgent_is_unknown 的计算逻辑
+    data['UserAgent_is_unknown'] = data[['UserAgent_browser', 'UserAgent_os', 'UserAgent_platform']].isna().any(
+        axis=1).fillna("Unknown")
+    return data
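
Both new feature helpers rely on pandas' progress_apply, which only exists after tqdm's pandas integration has been registered. A minimal usage sketch under that assumption follows; the sample rows are illustrative and not part of the package:

```python
import pandas as pd
from tqdm import tqdm

from xbase_util.handle_features_util import handle_ua, handle_uri

# progress_apply is injected into pandas by tqdm; without this call the helpers raise AttributeError.
tqdm.pandas()

# Illustrative session data containing the two columns the helpers expect.
df = pd.DataFrame({
    "http.uri": ['["/index.php?id=1 union select 1", "/static/app.js"]'],
    "http.useragent": ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0 Safari/537.36"],
})

df = handle_uri(df)   # adds the URI_FEATURES_EXTRA_* columns
df = handle_ua(df)    # adds the UserAgent_* columns
print(df.filter(like="URI_FEATURES_EXTRA_contains_").iloc[0])
```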
The remaining files listed above (README.md, setup.cfg, xbase_util/__init__.py, xbase_util/xbase_util.py, the egg-info metadata, and the bundled assets) are unchanged between 0.0.8 and 0.0.9.