xbase-util 0.3.9__tar.gz → 0.4.1__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {xbase_util-0.3.9 → xbase_util-0.4.1}/PKG-INFO +1 -1
- {xbase_util-0.3.9 → xbase_util-0.4.1}/setup.py +1 -1
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/add_column_util.py +33 -17
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/handle_features_util.py +18 -32
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/xbase_constant.py +19 -1
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util.egg-info/PKG-INFO +1 -1
- {xbase_util-0.3.9 → xbase_util-0.4.1}/README.md +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/setup.cfg +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/__init__.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/__init__.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/bean/ConfigBean.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/bean/CurrentConfigBean.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/bean/FlowBean.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/bean/TaskTemplateBean.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/bean/__init__.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/dao/ConfigDao.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/dao/CurrentConfigDao.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/dao/FlowDao.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/dao/TaskTemplateDao.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/dao/__init__.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/db/initsqlite3.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/es_db_util.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/esreq.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/geo_util.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/pcap_util.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util/xbase_util.py +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util.egg-info/SOURCES.txt +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util.egg-info/dependency_links.txt +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util.egg-info/not-zip-safe +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util.egg-info/top_level.txt +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util_assets/GeoLite2-City.mmdb +0 -0
- {xbase_util-0.3.9 → xbase_util-0.4.1}/xbase_util_assets/arkimeparse.js +0 -0
@@ -128,24 +128,40 @@ def parse_list(x):
|
|
128
128
|
return x
|
129
129
|
|
130
130
|
|
131
|
-
def handle_dns(origin_list, isDataFrame=False):
|
132
|
-
print("handle_dnslist")
|
131
|
+
def handle_dns(origin_list, isDataFrame=False,use_tqdm=False):
|
133
132
|
if not isDataFrame:
|
134
133
|
origin_list = pd.DataFrame(origin_list)
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
134
|
+
if use_tqdm:
|
135
|
+
origin_list["dnslist"] = origin_list['dns.host'].progress_apply(parse_list)
|
136
|
+
origin_list['dns_host_is_long_domain'] = origin_list['dnslist'].progress_apply(
|
137
|
+
lambda x: any(is_long_domain(domain) for domain in x))
|
138
|
+
origin_list['dns_host_is_random_characters'] = origin_list['dnslist'].progress_apply(
|
139
|
+
lambda x: any(has_random_characters(domain) for domain in x))
|
140
|
+
origin_list['dns_host_is_special_characters'] = origin_list['dnslist'].progress_apply(
|
141
|
+
lambda x: any(has_special_characters(domain) for domain in x))
|
142
|
+
origin_list['dns_host_is_large_subdomains'] = origin_list['dnslist'].progress_apply(
|
143
|
+
lambda x: any(has_large_number_of_subdomains(domain) for domain in x))
|
144
|
+
origin_list['dns_host_is_danger_domain'] = origin_list['dnslist'].progress_apply(
|
145
|
+
lambda x: any(is_danger_domain(domain) for domain in x))
|
146
|
+
origin_list['dns_host_is_danger_subdomain'] = origin_list['dnslist'].progress_apply(
|
147
|
+
lambda x: any(is_danger_subdomain(domain) for domain in x))
|
148
|
+
origin_list['dns_host_is_uncommon_tld'] = origin_list['dnslist'].progress_apply(
|
149
|
+
lambda x: any(has_uncommon_tld(domain) for domain in x))
|
150
|
+
else:
|
151
|
+
origin_list["dnslist"] = origin_list['dns.host'].apply(parse_list)
|
152
|
+
origin_list['dns_host_is_long_domain'] = origin_list['dnslist'].apply(
|
153
|
+
lambda x: any(is_long_domain(domain) for domain in x))
|
154
|
+
origin_list['dns_host_is_random_characters'] = origin_list['dnslist'].apply(
|
155
|
+
lambda x: any(has_random_characters(domain) for domain in x))
|
156
|
+
origin_list['dns_host_is_special_characters'] = origin_list['dnslist'].apply(
|
157
|
+
lambda x: any(has_special_characters(domain) for domain in x))
|
158
|
+
origin_list['dns_host_is_large_subdomains'] = origin_list['dnslist'].apply(
|
159
|
+
lambda x: any(has_large_number_of_subdomains(domain) for domain in x))
|
160
|
+
origin_list['dns_host_is_danger_domain'] = origin_list['dnslist'].apply(
|
161
|
+
lambda x: any(is_danger_domain(domain) for domain in x))
|
162
|
+
origin_list['dns_host_is_danger_subdomain'] = origin_list['dnslist'].apply(
|
163
|
+
lambda x: any(is_danger_subdomain(domain) for domain in x))
|
164
|
+
origin_list['dns_host_is_uncommon_tld'] = origin_list['dnslist'].apply(
|
165
|
+
lambda x: any(has_uncommon_tld(domain) for domain in x))
|
150
166
|
origin_list.drop(columns=['dnslist'], inplace=True)
|
151
167
|
return origin_list
|
@@ -4,32 +4,11 @@ import traceback
|
|
4
4
|
from urllib.parse import unquote
|
5
5
|
|
6
6
|
import pandas as pd
|
7
|
-
from tqdm import tqdm
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def handle_uri(data):
|
12
|
-
tqdm.pandas()
|
13
|
-
print(f"处理URI:{len(data)}")
|
14
|
-
# 定义正则表达式,确保精确匹配各种攻击特征
|
15
|
-
regex_patterns = {
|
16
|
-
"sql": re.compile(
|
17
|
-
r"\b(select|union|insert|update|delete|drop|--|#| or |' or '|information_schema|database\(\)|version\(\))\b",
|
18
|
-
re.IGNORECASE),
|
19
|
-
"xss": re.compile(r"(<script\b|javascript:|onload=|onclick=|<iframe\b|src=)", re.IGNORECASE),
|
20
|
-
"cmd": re.compile(
|
21
|
-
r"(/etc/passwd\b|/etc/shadow\b|;|&&|\||\$\(.+\)|\bcurl\b|\bwget\b|\bexec\b|\bsystem\b|cmd=|proc/self/environ)",
|
22
|
-
re.IGNORECASE),
|
23
|
-
"path": re.compile(r"(\.\./|\.\.%2f|\.\.%5c|\.\.\\|\.\.;|%2f%2e%2e%2f)", re.IGNORECASE),
|
24
|
-
"redirect": re.compile(r"(redirect=|url=|next=|redirect_uri=|redirect:|RedirectTo=)", re.IGNORECASE),
|
25
|
-
"danger": re.compile(
|
26
|
-
r"(%3C|%3E|%27|%22|%00|%2F|%5C|%3B|%7C|%28|%29|%20|%3D|%3A|%3F|%26|%23|%2B|%25|file://|<foo|xmlns:|/etc/passwd|windows/win\.ini)",
|
27
|
-
re.IGNORECASE),
|
28
|
-
"suspicious_ext": re.compile(
|
29
|
-
r"\.(exe|sh|py|pl|bak|php5|jspx|bat|cmd|pif|js|vbs|vbe|sct|ini|inf|tmp|swp|jar|java|class|ps1)\b",
|
30
|
-
re.IGNORECASE)
|
31
|
-
}
|
32
7
|
|
8
|
+
from xbase_util.xbase_constant import regex_patterns
|
9
|
+
|
10
|
+
|
11
|
+
def handle_uri(data, use_tqdm=True):
|
33
12
|
# 定义多层解码函数,确保完全解码 URI
|
34
13
|
def fully_decode_uri(uri):
|
35
14
|
try:
|
@@ -55,7 +34,6 @@ def handle_uri(data):
|
|
55
34
|
traceback.print_exc()
|
56
35
|
exit(0)
|
57
36
|
|
58
|
-
|
59
37
|
# 初始化统计变量
|
60
38
|
param_count = 0
|
61
39
|
path_depth = 0
|
@@ -97,14 +75,16 @@ def handle_uri(data):
|
|
97
75
|
result[f"URI_FEATURES_EXTRA_contains_{key}"] = value
|
98
76
|
|
99
77
|
return result
|
100
|
-
|
78
|
+
|
79
|
+
if use_tqdm:
|
80
|
+
feature_data = data.progress_apply(process_row, axis=1, result_type="expand")
|
81
|
+
else:
|
82
|
+
feature_data = data.apply(process_row, axis=1, result_type="expand")
|
101
83
|
data = pd.concat([data, feature_data], axis=1)
|
102
84
|
return data
|
103
85
|
|
104
86
|
|
105
|
-
def handle_ua(data):
|
106
|
-
tqdm.pandas()
|
107
|
-
print("处理UA")
|
87
|
+
def handle_ua(data, use_tqdm=True):
|
108
88
|
data['http.useragent'] = data['http.useragent'].fillna('').astype(str)
|
109
89
|
# 处理换行符及多余空格
|
110
90
|
data['http.useragent'] = data['http.useragent'].str.replace(r'\s+', ' ', regex=True)
|
@@ -157,8 +137,14 @@ def handle_ua(data):
|
|
157
137
|
data['UserAgent_language'] = data['http.useragent'].str.extract(r'\b([a-z]{2}-[A-Z]{2})\b', expand=False,
|
158
138
|
flags=re.IGNORECASE).fillna("Unknown")
|
159
139
|
# 统计 User-Agent 中的特殊字符个数
|
160
|
-
|
161
|
-
|
140
|
+
|
141
|
+
if use_tqdm:
|
142
|
+
data['UserAgent_special_char_count'] = data['http.useragent'].progress_apply(
|
143
|
+
lambda x: len(re.findall(r'[!@#$%^&*\'=:|{}]', x, flags=re.IGNORECASE)))
|
144
|
+
else:
|
145
|
+
data['UserAgent_special_char_count'] = data['http.useragent'].apply(
|
146
|
+
lambda x: len(re.findall(r'[!@#$%^&*\'=:|{}]', x, flags=re.IGNORECASE)))
|
147
|
+
|
162
148
|
# 更新 UserAgent_is_unknown 的计算逻辑
|
163
149
|
data['UserAgent_is_unknown'] = data[['UserAgent_browser', 'UserAgent_os', 'UserAgent_platform']].isna().any(
|
164
150
|
axis=1).fillna("Unknown")
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import os
|
2
|
+
import re
|
2
3
|
|
3
4
|
current_dir = os.path.dirname(__file__)
|
4
5
|
parse_path = os.path.join(current_dir, '..', 'xbase_util_assets', 'arkimeparse.js')
|
@@ -203,4 +204,21 @@ features_key = [
|
|
203
204
|
'URI_FEATURES_EXTRA_param_length_max', 'UserAgent_is_attack', 'UserAgent_is_enterprise', 'UserAgent_browser',
|
204
205
|
'UserAgent_browser_version', 'UserAgent_os', 'UserAgent_os_version', 'UserAgent_device_type',
|
205
206
|
'UserAgent_platform', 'UserAgent_is_bot', 'UserAgent_language', 'UserAgent_special_char_count',
|
206
|
-
'UserAgent_is_unknown']
|
207
|
+
'UserAgent_is_unknown']
|
208
|
+
regex_patterns = {
|
209
|
+
"sql": re.compile(
|
210
|
+
r"\b(select|union|insert|update|delete|drop|--|#| or |' or '|information_schema|database\(\)|version\(\))\b",
|
211
|
+
re.IGNORECASE),
|
212
|
+
"xss": re.compile(r"(<script\b|javascript:|onload=|onclick=|<iframe\b|src=)", re.IGNORECASE),
|
213
|
+
"cmd": re.compile(
|
214
|
+
r"(/etc/passwd\b|/etc/shadow\b|;|&&|\||\$\(.+\)|\bcurl\b|\bwget\b|\bexec\b|\bsystem\b|cmd=|proc/self/environ)",
|
215
|
+
re.IGNORECASE),
|
216
|
+
"path": re.compile(r"(\.\./|\.\.%2f|\.\.%5c|\.\.\\|\.\.;|%2f%2e%2e%2f)", re.IGNORECASE),
|
217
|
+
"redirect": re.compile(r"(redirect=|url=|next=|redirect_uri=|redirect:|RedirectTo=)", re.IGNORECASE),
|
218
|
+
"danger": re.compile(
|
219
|
+
r"(%3C|%3E|%27|%22|%00|%2F|%5C|%3B|%7C|%28|%29|%20|%3D|%3A|%3F|%26|%23|%2B|%25|file://|<foo|xmlns:|/etc/passwd|windows/win\.ini)",
|
220
|
+
re.IGNORECASE),
|
221
|
+
"suspicious_ext": re.compile(
|
222
|
+
r"\.(exe|sh|py|pl|bak|php5|jspx|bat|cmd|pif|js|vbs|vbe|sct|ini|inf|tmp|swp|jar|java|class|ps1)\b",
|
223
|
+
re.IGNORECASE)
|
224
|
+
}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|