xbase-util 0.0.8__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xbase_util
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: 网络安全基础工具
5
+ Home-page: https://gitee.com/jimonik/xbase_util.git
5
6
  Author: xyt
6
7
  Author-email: 2506564278@qq.com
7
8
  License: <MIT License>
@@ -3,13 +3,14 @@ from distutils.core import setup
3
3
  from setuptools import find_packages
4
4
 
5
5
  setup(name="xbase_util",
6
- version="0.0.8",
6
+ version="0.0.9",
7
7
  description="网络安全基础工具",
8
8
  long_description="包含提取,预测,训练的基础工具",
9
9
  author="xyt",
10
10
  author_email="2506564278@qq.com",
11
11
  license="<MIT License>",
12
12
  packages=find_packages(),
13
+ url="https://gitee.com/jimonik/xbase_util.git",
13
14
  install_requires=[
14
15
  ],
15
16
  zip_safe=False,
@@ -0,0 +1,26 @@
1
+ import requests
2
+
3
+
4
class EsReq:
    """Thin wrapper around ``requests`` for a few Elasticsearch REST calls.

    Each method issues a single HTTP request against ``es_url`` and returns
    the ``requests.Response`` object untouched; no status checking or JSON
    decoding happens here.
    """

    def __init__(self, url, timeout=120):
        """Store the base URL and the timeout used by every request.

        Args:
            url: Base URL that the endpoint paths are appended to.
            timeout: Value forwarded as ``timeout=`` to every ``requests`` call.
        """
        self.es_url = url
        self.timeout = timeout
        print("初始化自定义es请求类")

    def clear_all_scroll(self):
        """Send ``DELETE /_search/scroll`` with ``scroll_id`` set to ``_all``."""
        return requests.delete(
            self.es_url + "/_search/scroll",
            timeout=self.timeout,
            json={'scroll_id': '_all'},
        )

    def search(self, body, scroll):
        """POST ``body`` to ``/_search/scroll`` and return the response.

        Args:
            body: Raw request payload, passed as ``data=``.
            scroll: Value placed under ``scroll_id`` in the ``json=`` payload.

        Returns:
            The ``requests.Response`` (previously the response was discarded
            and the method returned ``None``).
        """
        # NOTE(review): requests ignores json= whenever data= is non-empty, so
        # the scroll_id payload is only sent when `body` is empty/None.
        # Confirm which of the two payloads is actually intended.
        return requests.post(
            self.es_url + "/_search/scroll",
            data=body,
            timeout=self.timeout,
            json={'scroll_id': scroll},
        )

    def start_scroll(self, exp, scroll):
        """POST the query ``exp`` as JSON to ``/_search/scroll``.

        Args:
            exp: JSON-serialisable request body.
            scroll: Currently unused.
        """
        # NOTE(review): `scroll` is never sent and the URL is the same one
        # scroll_by_id uses; presumably this should target an index search
        # with a scroll keep-alive instead - confirm against the ES version.
        return requests.post(
            self.es_url + "/_search/scroll",
            timeout=self.timeout,
            json=exp,
        )

    def scroll_by_id(self, scroll_id, scroll):
        """POST ``scroll_id`` and ``scroll`` as JSON to ``/_search/scroll``."""
        return requests.post(
            self.es_url + "/_search/scroll",
            timeout=self.timeout,
            json={'scroll_id': scroll_id, 'scroll': scroll},
        )

    def search_file(self, id):
        """Run a ``term`` query on ``_id`` against ``arkime_files_v30``.

        Args:
            id: Document id to look up (name kept for caller compatibility
                even though it shadows the builtin).
        """
        return requests.get(
            f"{self.es_url}/arkime_files_v30/_search",
            timeout=self.timeout,
            json={"query": {"term": {"_id": id}}},
        )
@@ -0,0 +1,161 @@
1
+ import json
2
+ import re
3
+ import traceback
4
+ from urllib.parse import unquote
5
+
6
+ import pandas as pd
7
+
8
+
9
# Attack-signature patterns, compiled once at import time instead of on every
# handle_uri() call.  The key becomes the suffix of the
# "URI_FEATURES_EXTRA_contains_<key>" output column.
_URI_REGEX_PATTERNS = {
    "sql": re.compile(
        r"\b(select|union|insert|update|delete|drop|--|#| or |' or '|information_schema|database\(\)|version\(\))\b",
        re.IGNORECASE),
    "xss": re.compile(r"(<script\b|javascript:|onload=|onclick=|<iframe\b|src=)", re.IGNORECASE),
    "cmd": re.compile(
        r"(/etc/passwd\b|/etc/shadow\b|;|&&|\||\$\(.+\)|\bcurl\b|\bwget\b|\bexec\b|\bsystem\b|cmd=|proc/self/environ)",
        re.IGNORECASE),
    "path": re.compile(r"(\.\./|\.\.%2f|\.\.%5c|\.\.\\|\.\.;|%2f%2e%2e%2f)", re.IGNORECASE),
    "redirect": re.compile(r"(redirect=|url=|next=|redirect_uri=|redirect:|RedirectTo=)", re.IGNORECASE),
    "danger": re.compile(
        r"(%3C|%3E|%27|%22|%00|%2F|%5C|%3B|%7C|%28|%29|%20|%3D|%3A|%3F|%26|%23|%2B|%25|file://|<foo|xmlns:|/etc/passwd|windows/win\.ini)",
        re.IGNORECASE),
    "suspicious_ext": re.compile(
        r"\.(exe|sh|py|pl|bak|php5|jspx|bat|cmd|pif|js|vbs|vbe|sct|ini|inf|tmp|swp|jar|java|class|ps1)\b",
        re.IGNORECASE)
}

# Upper bound on percent-decoding passes applied to nested encodings.
_URI_DECODE_ROUNDS = 3


def _fully_decode_uri(uri):
    """Percent-decode ``uri`` up to ``_URI_DECODE_ROUNDS`` times; always returns str."""
    decoded = str(uri)
    for _ in range(_URI_DECODE_ROUNDS):
        try:
            once_more = unquote(decoded)
        except (TypeError, ValueError):
            break  # best effort: keep whatever was decoded so far
        if once_more == decoded:
            break  # fixed point reached, further passes cannot change anything
        decoded = once_more
    return decoded


def _coerce_uri_list(uris):
    """Turn a cell value into a list of URIs (list, JSON-encoded list, or scalar)."""
    if isinstance(uris, list):
        return uris
    try:
        parsed = json.loads(uris)
    except (TypeError, ValueError, RecursionError):
        # Not JSON (the common case for a plain URI string) or not a string at all.
        return [str(uris)]
    return parsed if isinstance(parsed, list) else [str(parsed)]


def _extract_uri_features(raw_uris):
    """Compute the URI_FEATURES_EXTRA_* values for one row's URI cell."""
    param_count = 0
    path_depth = 0
    param_lengths = []
    feature_flags = dict.fromkeys(_URI_REGEX_PATTERNS, False)

    for uri in (_fully_decode_uri(item) for item in _coerce_uri_list(raw_uris)):
        # Every URI contributes at least 1 here, even without a query string.
        param_count += uri.count('&') + 1
        path_depth += uri.count('/')

        # Lengths of the values of "key=value" pairs after the first '?'.
        if '?' in uri:
            query = uri.split('?', 1)[1]
            param_lengths.extend(
                len(pair.split('=', 1)[1]) for pair in query.split('&') if '=' in pair
            )

        # Patterns are evaluated on the decoded URI; a flag stays set once hit.
        for key, pattern in _URI_REGEX_PATTERNS.items():
            if not feature_flags[key] and pattern.search(uri):
                feature_flags[key] = True

    result = {
        "URI_FEATURES_EXTRA_param_count": param_count,
        "URI_FEATURES_EXTRA_path_depth": path_depth,
        "URI_FEATURES_EXTRA_param_length_avg": sum(param_lengths) / len(param_lengths) if param_lengths else 0,
        "URI_FEATURES_EXTRA_param_length_max": max(param_lengths) if param_lengths else 0,
    }
    for key, value in feature_flags.items():
        result[f"URI_FEATURES_EXTRA_contains_{key}"] = value
    return result


def handle_uri(data):
    """Append URI-derived feature columns to ``data``.

    Each row's ``'http.uri'`` cell may be a list, a JSON-encoded list, or any
    other value (which is stringified and treated as a single URI).  URIs are
    percent-decoded repeatedly before counting and pattern matching.

    Args:
        data: DataFrame with an ``'http.uri'`` column.

    Returns:
        A new DataFrame: ``data`` with the ``URI_FEATURES_EXTRA_*`` columns
        concatenated on the right.
    """
    print(f"处理URI:{len(data)}")

    if data.empty:
        # DataFrame.apply(..., result_type="expand") on an empty frame hands
        # back a copy of the input, which would duplicate every column in the
        # concat below; build the (empty) feature columns explicitly instead.
        feature_data = pd.DataFrame(columns=list(_extract_uri_features([])), index=data.index)
    else:
        # progress_apply only exists after tqdm.pandas() has been registered
        # by the caller; fall back to the plain apply otherwise.
        apply_rows = getattr(data, "progress_apply", data.apply)
        feature_data = apply_rows(
            lambda row: _extract_uri_features(row['http.uri']),
            axis=1,
            result_type="expand",
        )
    return pd.concat([data, feature_data], axis=1)
100
+
101
+
102
def handle_ua(data):
    """Derive User-Agent feature columns from ``data['http.useragent']``.

    The ``'http.useragent'`` column is normalised in place (missing values
    become ``''``, whitespace runs collapse to one space) and the
    ``UserAgent_*`` columns are added to the same DataFrame.

    Args:
        data: DataFrame with an ``'http.useragent'`` column.

    Returns:
        The same ``data`` object, mutated with the extra columns.
    """
    print("处理UA")
    ua = data['http.useragent'].fillna('').astype(str)
    # Collapse newlines and repeated whitespace.
    ua = ua.str.replace(r'\s+', ' ', regex=True)
    data['http.useragent'] = ua

    # Signatures treated as attack-like User-Agents (matched case-insensitively).
    attack_patterns = '|'.join([
        r"\bselect\b", r"\bunion\b", r"\binsert\b", r"\bupdate\b", r"\bdelete\b", r"\bdrop\b", r"--", r"#", r" or ",
        r"' or '",
        r"information_schema", r"database\(\)", r"version\(\)",  # SQL injection
        r"<script>", r"javascript:", r"onload=", r"onclick=", r"<iframe>", r"src=",  # XSS
        r"/etc/passwd", r"/etc/shadow", r"\&\&", r"\|", r"\$\(\)", r"exec", r"system",  # command execution
        r"\.\./", r"\.\.%2f", r"\.\.%5c", r"%c0%af", r"%252e%252e%252f",  # path traversal
        r"\.php", r"\.asp", r"\.jsp", r"\.exe", r"\.sh", r"\.py", r"\.pl",  # file extensions
        r"redirect=", r"url=", r"next=",  # redirects
        r"%3C", r"%3E", r"%27", r"%22", r"%00", r"%2F", r"%5C", r"%3B", r"%7C", r"%2E", r"%28", r"%29",  # encodings
        r'Googlebot', r'Bingbot', r'Slurp', r'curl', r'wget', r'Nmap',
        r'SQLMap', r'Nikto', r'Dirbuster', r'python-requests', r'Apache-HttpClient',
        r'Postman', r'Burp Suite', r'Fuzzing', r'nessus'
    ])
    # Signatures of enterprise client applications.
    enterprise_patterns = '|'.join([
        r'MicroMessenger', r'wxwork', r'QQ/', r'QQBrowser', r'Alipay', r'UCWEB'
    ])
    data['UserAgent_is_attack'] = ua.str.contains(attack_patterns, case=False, regex=True)
    data['UserAgent_is_enterprise'] = ua.str.contains(enterprise_patterns, case=False)

    # Browser family and version.
    data['UserAgent_browser'] = ua.str.extract(
        r'(Chrome|Firefox|Safari|MSIE|Edge|Opera|Trident)',
        expand=False, flags=re.IGNORECASE).fillna("Unknown")
    browser_versions = ua.str.extract(
        r'Chrome/([\d\.]+)|Firefox/([\d\.]+)|Version/([\d\.]+).*Safari|MSIE ([\d\.]+)|Edge/([\d\.]+)|Opera/([\d\.]+)|Trident/([\d\.]+)',
        expand=False, flags=re.IGNORECASE)
    # Only one alternative's group is populated per row; back-fill across the
    # group columns so the first column holds whichever one matched.
    data['UserAgent_browser_version'] = browser_versions.bfill(axis=1).fillna("Unknown").iloc[:, 0]

    # Operating system and version, taken from the first OS token found.
    os_info = ua.str.extract(
        r'(Windows NT [\d\.]+|Mac OS X [\d_\.]+|Linux|Android [\d\.]+|iOS [\d_\.]+|Ubuntu|Debian|CentOS|Red Hat)',
        expand=False, flags=re.IGNORECASE)
    data['UserAgent_os'] = os_info.str.extract(
        r'(Windows|Mac OS X|Linux|Android|iOS|Ubuntu|Debian|CentOS|Red Hat)',
        expand=False, flags=re.IGNORECASE).fillna("Unknown")
    data['UserAgent_os_version'] = os_info.str.extract(r'([\d\._]+)', expand=False).fillna("Unknown")

    # Device type.
    data['UserAgent_device_type'] = ua.str.contains('mobile|android|iphone', case=False).map(
        {True: 'Mobile', False: 'Desktop'})

    # Hardware platform.  Longer alternatives must come first: with
    # "x86|x86_64" the regex engine stops at "x86" and can never report
    # "x86_64" (likewise "arm" vs "arm64").
    data['UserAgent_platform'] = ua.str.extract(
        r'(x86_64|x86|arm64|arm|x64)', expand=False, flags=re.IGNORECASE).fillna('Unknown')

    # Crawler / automation client detection.
    data['UserAgent_is_bot'] = ua.str.contains(
        'bot|crawler|spider|slurp|curl|wget|httpclient', case=False)

    # Language tag such as "en-US", if present.
    data['UserAgent_language'] = ua.str.extract(
        r'\b([a-z]{2}-[A-Z]{2})\b', expand=False, flags=re.IGNORECASE).fillna("Unknown")

    # Number of special characters; str.count is the vectorised equivalent of
    # len(re.findall(...)) per row and does not require tqdm's progress_apply.
    data['UserAgent_special_char_count'] = ua.str.count(r'[!@#$%^&*\'=:|{}]')

    # The three columns were already filled with "Unknown", so isna() could
    # never be True; test for the placeholder value instead.
    data['UserAgent_is_unknown'] = data[
        ['UserAgent_browser', 'UserAgent_os', 'UserAgent_platform']
    ].eq("Unknown").any(axis=1)
    return data
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xbase-util
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: 网络安全基础工具
5
+ Home-page: https://gitee.com/jimonik/xbase_util.git
5
6
  Author: xyt
6
7
  Author-email: 2506564278@qq.com
7
8
  License: <MIT License>
@@ -1,6 +1,8 @@
1
1
  README.md
2
2
  setup.py
3
3
  xbase_util/__init__.py
4
+ xbase_util/esreq.py
5
+ xbase_util/handle_features_util.py
4
6
  xbase_util/xbase_util.py
5
7
  xbase_util.egg-info/PKG-INFO
6
8
  xbase_util.egg-info/SOURCES.txt
File without changes
File without changes