xbase-util 0.0.7__tar.gz → 0.0.9__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xbase_util
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: 网络安全基础工具
5
+ Home-page: https://gitee.com/jimonik/xbase_util.git
5
6
  Author: xyt
6
7
  Author-email: 2506564278@qq.com
7
8
  License: <MIT License>
@@ -3,13 +3,14 @@ from distutils.core import setup
3
3
  from setuptools import find_packages
4
4
 
5
5
  setup(name="xbase_util",
6
- version="0.0.7",
6
+ version="0.0.9",
7
7
  description="网络安全基础工具",
8
8
  long_description="包含提取,预测,训练的基础工具",
9
9
  author="xyt",
10
10
  author_email="2506564278@qq.com",
11
11
  license="<MIT License>",
12
12
  packages=find_packages(),
13
+ url="https://gitee.com/jimonik/xbase_util.git",
13
14
  install_requires=[
14
15
  ],
15
16
  zip_safe=False,
@@ -0,0 +1,26 @@
1
+ import requests
2
+
3
+
4
class EsReq:
    """Small helper that issues a few Elasticsearch HTTP calls via ``requests``.

    Every call is sent to a path below ``es_url`` and uses the configured
    ``timeout``. Methods return the ``requests`` response object unchanged;
    no status checking or JSON decoding is done here.
    """

    def __init__(self, url, timeout=120):
        """Remember the base URL and the timeout passed to every request.

        Args:
            url: Base URL of the Elasticsearch endpoint (no trailing path).
            timeout: Value forwarded as ``timeout=`` to ``requests``.
        """
        self.es_url = url
        self.timeout = timeout
        print("初始化自定义es请求类")

    def clear_all_scroll(self):
        """Send DELETE ``/_search/scroll`` with ``scroll_id`` set to ``_all``."""
        return requests.delete(self.es_url + "/_search/scroll", timeout=self.timeout, json={'scroll_id': '_all'})

    def search(self, body, scroll):
        """POST ``body`` to ``/_search/scroll`` and return the response.

        The original implementation dropped the response; it is now returned
        like in the other methods.
        """
        # NOTE(review): requests only uses `json=` when `data=` is not given,
        # so the scroll_id payload is presumably ignored whenever `body` is
        # non-empty - confirm which of the two is actually intended.
        return requests.post(self.es_url + "/_search/scroll", data=body, timeout=self.timeout,
                             json={'scroll_id': scroll})

    def start_scroll(self, exp, scroll):
        """POST the query ``exp`` as JSON to ``/_search/scroll``."""
        # NOTE(review): `scroll` is accepted but never sent with the request -
        # verify against the callers whether a keep-alive should be passed.
        return requests.post(self.es_url + "/_search/scroll", timeout=self.timeout,
                             json=exp)

    def scroll_by_id(self, scroll_id, scroll):
        """POST ``scroll_id`` and ``scroll`` as JSON to ``/_search/scroll``."""
        return requests.post(self.es_url + "/_search/scroll", timeout=self.timeout,
                             json={'scroll_id': scroll_id, 'scroll': scroll})

    def search_file(self, id):
        """Query ``arkime_files_v30`` for the document whose ``_id`` equals ``id``."""
        return requests.get(f"{self.es_url}/arkime_files_v30/_search", timeout=self.timeout,
                            json={"query": {"term": {"_id": id}}})
@@ -0,0 +1,161 @@
1
+ import json
2
+ import re
3
+ import traceback
4
+ from urllib.parse import unquote
5
+
6
+ import pandas as pd
7
+
8
+
9
def handle_uri(data):
    """Append URI-derived feature columns to ``data`` and return the result.

    Each row's ``http.uri`` value is read as a list of URIs (a non-list value
    is parsed as JSON, falling back to a single-element list of its string
    form). Every URI is percent-decoded up to three times and the following
    columns are computed over all URIs of the row:

    * ``URI_FEATURES_EXTRA_param_count`` - number of ``&`` plus one, per URI.
    * ``URI_FEATURES_EXTRA_path_depth`` - number of ``/`` characters.
    * ``URI_FEATURES_EXTRA_param_length_avg`` / ``_max`` - statistics over the
      lengths of ``key=value`` values found after the first ``?``.
    * ``URI_FEATURES_EXTRA_contains_<name>`` - True when the named regular
      expression matches at least one decoded URI.

    Args:
        data: pandas DataFrame with an ``http.uri`` column.

    Returns:
        A new DataFrame: ``data`` concatenated column-wise with the features.
    """
    print(f"处理URI:{len(data)}")
    # Regular expressions for the individual attack signatures.
    regex_patterns = {
        "sql": re.compile(
            r"\b(select|union|insert|update|delete|drop|--|#| or |' or '|information_schema|database\(\)|version\(\))\b",
            re.IGNORECASE),
        "xss": re.compile(r"(<script\b|javascript:|onload=|onclick=|<iframe\b|src=)", re.IGNORECASE),
        "cmd": re.compile(
            r"(/etc/passwd\b|/etc/shadow\b|;|&&|\||\$\(.+\)|\bcurl\b|\bwget\b|\bexec\b|\bsystem\b|cmd=|proc/self/environ)",
            re.IGNORECASE),
        "path": re.compile(r"(\.\./|\.\.%2f|\.\.%5c|\.\.\\|\.\.;|%2f%2e%2e%2f)", re.IGNORECASE),
        "redirect": re.compile(r"(redirect=|url=|next=|redirect_uri=|redirect:|RedirectTo=)", re.IGNORECASE),
        "danger": re.compile(
            r"(%3C|%3E|%27|%22|%00|%2F|%5C|%3B|%7C|%28|%29|%20|%3D|%3A|%3F|%26|%23|%2B|%25|file://|<foo|xmlns:|/etc/passwd|windows/win\.ini)",
            re.IGNORECASE),
        "suspicious_ext": re.compile(
            r"\.(exe|sh|py|pl|bak|php5|jspx|bat|cmd|pif|js|vbs|vbe|sct|ini|inf|tmp|swp|jar|java|class|ps1)\b",
            re.IGNORECASE)
    }

    def fully_decode_uri(uri):
        """Percent-decode ``uri`` three times to unwrap nested encodings."""
        decoded_uri = str(uri)
        for _ in range(3):
            # unquote() replaces undecodable byte sequences instead of raising.
            decoded_uri = unquote(decoded_uri)
        return decoded_uri

    def process_row(row):
        """Compute the feature dictionary for one DataFrame row."""
        uris = row['http.uri']
        if not isinstance(uris, list):
            try:
                parsed = json.loads(uris)
            except (TypeError, ValueError, RecursionError):
                # Not JSON (or not even a string): treat it as a single URI.
                uris = [str(uris)]
            else:
                uris = parsed if isinstance(parsed, list) else [str(parsed)]
        decoded_uris = [fully_decode_uri(uri) for uri in uris]

        # Accumulators shared by all URIs of this row.
        param_count = 0
        path_depth = 0
        param_lengths = []
        feature_flags = {key: False for key in regex_patterns}

        for uri in decoded_uris:
            param_count += uri.count('&') + 1
            path_depth += uri.count('/')

            # Collect the length of every value in the query string.
            if '?' in uri:
                for param in uri.split('?', 1)[-1].split('&'):
                    if '=' in param:
                        _, value = param.split('=', 1)
                        param_lengths.append(len(value))

            # A flag stays True once any URI of the row matched.
            for key, pattern in regex_patterns.items():
                if pattern.search(uri):
                    feature_flags[key] = True

        avg_length = sum(param_lengths) / len(param_lengths) if param_lengths else 0
        max_length = max(param_lengths) if param_lengths else 0

        result = {
            "URI_FEATURES_EXTRA_param_count": param_count,
            "URI_FEATURES_EXTRA_path_depth": path_depth,
            "URI_FEATURES_EXTRA_param_length_avg": avg_length,
            "URI_FEATURES_EXTRA_param_length_max": max_length,
        }
        for key, value in feature_flags.items():
            result[f"URI_FEATURES_EXTRA_contains_{key}"] = value
        return result

    # progress_apply only exists after tqdm's pandas integration has been
    # registered; fall back to the plain apply so the function works either way.
    apply_rows = getattr(data, "progress_apply", data.apply)
    feature_data = apply_rows(process_row, axis=1, result_type="expand")
    return pd.concat([data, feature_data], axis=1)
100
+
101
+
102
def handle_ua(data):
    """Derive User-Agent feature columns from ``data['http.useragent']``.

    The input frame is modified in place (the ``http.useragent`` column is
    normalised and the ``UserAgent_*`` columns are added) and also returned.

    Args:
        data: pandas DataFrame with an ``http.useragent`` column.

    Returns:
        The same DataFrame with the additional ``UserAgent_*`` columns.
    """
    print("处理UA")
    data['http.useragent'] = data['http.useragent'].fillna('').astype(str)
    # Collapse line breaks and runs of whitespace into a single space.
    data['http.useragent'] = data['http.useragent'].str.replace(r'\s+', ' ', regex=True)
    # Signatures of common attacks/tools seen in User-Agent strings.
    attack_patterns = '|'.join([
        r"\bselect\b", r"\bunion\b", r"\binsert\b", r"\bupdate\b", r"\bdelete\b", r"\bdrop\b", r"--", r"#", r" or ",
        r"' or '",
        r"information_schema", r"database\(\)", r"version\(\)",  # SQL injection
        r"<script>", r"javascript:", r"onload=", r"onclick=", r"<iframe>", r"src=",  # XSS
        r"/etc/passwd", r"/etc/shadow", r"\&\&", r"\|", r"\$\(\)", r"exec", r"system",  # command execution
        r"\.\./", r"\.\.%2f", r"\.\.%5c", r"%c0%af", r"%252e%252e%252f",  # path traversal
        r"\.php", r"\.asp", r"\.jsp", r"\.exe", r"\.sh", r"\.py", r"\.pl",  # file extensions
        r"redirect=", r"url=", r"next=",  # redirects
        r"%3C", r"%3E", r"%27", r"%22", r"%00", r"%2F", r"%5C", r"%3B", r"%7C", r"%2E", r"%28", r"%29",  # encodings
        r'Googlebot', r'Bingbot', r'Slurp', r'curl', r'wget', r'Nmap',
        r'SQLMap', r'Nikto', r'Dirbuster', r'python-requests', r'Apache-HttpClient',
        r'Postman', r'Burp Suite', r'Fuzzing', r'nessus'
    ])
    # User-Agent fragments of enterprise client applications.
    enterprise_patterns = '|'.join([
        r'MicroMessenger', r'wxwork', r'QQ/', r'QQBrowser', r'Alipay', r'UCWEB'
    ])
    user_agent = data['http.useragent']
    # Case-insensitive signature checks.
    data['UserAgent_is_attack'] = user_agent.str.contains(attack_patterns, case=False, regex=True)
    data['UserAgent_is_enterprise'] = user_agent.str.contains(enterprise_patterns, case=False)
    # Browser family and version.
    data['UserAgent_browser'] = user_agent.str.extract(r'(Chrome|Firefox|Safari|MSIE|Edge|Opera|Trident)',
                                                       expand=False, flags=re.IGNORECASE).fillna("Unknown")
    # One capture group per browser; bfill(axis=1) moves whichever group
    # matched into the first column, which is then selected.
    data['UserAgent_browser_version'] = user_agent.str.extract(
        r'Chrome/([\d\.]+)|Firefox/([\d\.]+)|Version/([\d\.]+).*Safari|MSIE ([\d\.]+)|Edge/([\d\.]+)|Opera/([\d\.]+)|Trident/([\d\.]+)',
        expand=False, flags=re.IGNORECASE).bfill(axis=1).fillna("Unknown").iloc[:, 0]
    # Operating system and version, taken from the first OS token found.
    os_info = user_agent.str.extract(
        r'(Windows NT [\d\.]+|Mac OS X [\d_\.]+|Linux|Android [\d\.]+|iOS [\d_\.]+|Ubuntu|Debian|CentOS|Red Hat)',
        expand=False, flags=re.IGNORECASE)
    data['UserAgent_os'] = os_info.str.extract(r'(Windows|Mac OS X|Linux|Android|iOS|Ubuntu|Debian|CentOS|Red Hat)',
                                               expand=False, flags=re.IGNORECASE).fillna("Unknown")
    data['UserAgent_os_version'] = os_info.str.extract(r'([\d\._]+)', expand=False).fillna("Unknown")
    # Device type.
    data['UserAgent_device_type'] = user_agent.str.contains('mobile|android|iphone', case=False).map(
        {True: 'Mobile', False: 'Desktop'})
    # Hardware platform. Longer alternatives must come first: with "x86"
    # listed before "x86_64" (or "arm" before "arm64") the shorter one wins
    # at the same position and the longer name can never be captured.
    data['UserAgent_platform'] = user_agent.str.extract(r'(x86_64|x86|arm64|arm|x64)', expand=False,
                                                        flags=re.IGNORECASE).fillna('Unknown')
    # Crawler / automated client detection.
    data['UserAgent_is_bot'] = user_agent.str.contains('bot|crawler|spider|slurp|curl|wget|httpclient',
                                                       case=False)
    # Language tag such as "en-US", when present.
    data['UserAgent_language'] = user_agent.str.extract(r'\b([a-z]{2}-[A-Z]{2})\b', expand=False,
                                                        flags=re.IGNORECASE).fillna("Unknown")
    # Number of special characters; vectorised, so no tqdm hook is required.
    data['UserAgent_special_char_count'] = user_agent.str.count(r'[!@#$%^&*\'=:|{}]')
    # The three columns were already filled with "Unknown", so they never
    # contain NaN; compare against that placeholder instead of using isna().
    data['UserAgent_is_unknown'] = data[['UserAgent_browser', 'UserAgent_os', 'UserAgent_platform']].eq(
        "Unknown").any(axis=1)
    return data
@@ -0,0 +1,86 @@
1
+ import os
2
+ import re
3
+
4
+ import execjs
5
+ import geoip2.database
6
+
7
+ current_dir = os.path.dirname(__file__)
8
+ parse_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'arkimeparse.js')
9
+ geo_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'GeoLite2-City.mmdb')
10
+
11
+
12
def parse_expression(expression):
    """Run ``expression`` through the ``parse_exp`` function of the JS helper.

    The JavaScript source is read from ``parse_path`` on every call and
    evaluated with ``execjs``. A falsy ``expression`` yields ``None`` without
    touching the file.
    """
    if not expression:
        return None
    with open(parse_path, "r") as script_file:
        js_source = script_file.read()
    js_context = execjs.compile(js_source)
    return js_context.call("parse_exp", expression)
19
+
20
+
21
def geo_reader():
    """Create a ``geoip2.database.Reader`` for the database file at ``geo_path``."""
    reader = geoip2.database.Reader(geo_path)
    return reader
23
+
24
+
25
def split_samples(sample, per_subsection):
    """Partition ``len(sample)`` into chunks of ``per_subsection`` items.

    Returns:
        A ``(count, sizes)`` tuple: ``sizes`` lists the size of every chunk
        (the last one holds the remainder when the length does not divide
        evenly) and ``count`` is the number of chunks.
    """
    full_chunks, leftover = divmod(len(sample), per_subsection)
    sizes = [per_subsection] * full_chunks
    count = full_chunks
    if leftover > 0:
        # A final, smaller chunk carries the items that did not fill a chunk.
        sizes.append(leftover)
        count += 1
    return count, sizes
33
+
34
+
35
def split_process(subsection, process_count):
    """Distribute ``len(subsection)`` items over ``process_count`` workers.

    Returns:
        A list with one entry per worker giving how many items it receives;
        the first ``len(subsection) % process_count`` workers get one extra.
    """
    base, extra = divmod(len(subsection), process_count)
    return [base + 1 if worker < extra else base for worker in range(process_count)]
45
+
46
+
47
def build_es_expression(size, start_time, end_time, arkime_expression):
    """Assemble an Elasticsearch bool/filter query dictionary.

    Args:
        size: When truthy, stored under the ``size`` key of the query.
        start_time: When truthy, adds a ``firstPacket >= start`` range filter;
            its ``timestamp()`` is converted to rounded milliseconds.
        end_time: When truthy, adds a ``lastPacket <= end`` range filter,
            converted the same way.
        arkime_expression: Passed to ``parse_expression``; a truthy result is
            appended as an additional filter.

    Returns:
        The query dictionary. On any exception the error and the expression
        are printed and the interpreter is asked to exit with status 1.
    """
    filters = []
    expression = {"query": {"bool": {"filter": filters}}}
    try:
        if size:
            expression['size'] = size
        if start_time:
            start_ms = round(start_time.timestamp() * 1000)
            filters.append({"range": {"firstPacket": {"gte": start_ms}}})
        if end_time:
            end_ms = round(end_time.timestamp() * 1000)
            filters.append({"range": {"lastPacket": {"lte": end_ms}}})
        arkime_2_es = parse_expression(arkime_expression)
        if arkime_2_es:
            filters.append(arkime_2_es)
    except Exception as e:
        # The message blames a missing Node.js runtime whatever the cause was.
        print(f"请安装nodejs{e}")
        print(arkime_expression)
        exit(1)
    else:
        return expression
66
+
67
+
68
def get_uri_depth(url):
    """Count the non-empty path segments of ``url``.

    Everything from the first ``?`` onwards is ignored, and a leading
    ``http://host`` or ``https://host`` prefix is stripped before counting.
    """
    before_query = re.match(r'^[^?]*', url)
    if not before_query:
        return 0
    # Drop the scheme and host so only the path is left.
    path = re.sub(r'^https?://[^/]+', '', before_query.group(0))
    return sum(1 for part in path.split('/') if part)
77
+
78
+
79
def firstOrZero(param):
    """Return the first element of a non-empty list, otherwise ``0``.

    Any value that is not a list (including ``None``, strings and tuples) and
    any empty list yields ``0``. The type check uses ``isinstance`` so list
    subclasses are treated as lists too.
    """
    if isinstance(param, list) and param:
        return param[0]
    return 0
@@ -1,7 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xbase-util
3
- Version: 0.0.7
3
+ Version: 0.0.9
4
4
  Summary: 网络安全基础工具
5
+ Home-page: https://gitee.com/jimonik/xbase_util.git
5
6
  Author: xyt
6
7
  Author-email: 2506564278@qq.com
7
8
  License: <MIT License>
@@ -1,6 +1,8 @@
1
1
  README.md
2
2
  setup.py
3
3
  xbase_util/__init__.py
4
+ xbase_util/esreq.py
5
+ xbase_util/handle_features_util.py
4
6
  xbase_util/xbase_util.py
5
7
  xbase_util.egg-info/PKG-INFO
6
8
  xbase_util.egg-info/SOURCES.txt
@@ -1,21 +0,0 @@
1
- import os
2
-
3
- import execjs
4
- import geoip2.database
5
-
6
- current_dir = os.path.dirname(__file__)
7
- parse_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'arkimeparse.js')
8
- geo_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'GeoLite2-City.mmdb')
9
-
10
-
11
- def parse_expression(expression):
12
- if expression:
13
- with open(parse_path, "r") as f:
14
- ctx = execjs.compile(f.read())
15
- return ctx.call("parse_exp", expression)
16
- else:
17
- return None
18
-
19
-
20
- def geo_reader():
21
- return geoip2.database.Reader(geo_path)
File without changes
File without changes