xbase-util 0.3.3__tar.gz → 0.3.5__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {xbase_util-0.3.3 → xbase_util-0.3.5}/PKG-INFO +1 -1
- {xbase_util-0.3.3 → xbase_util-0.3.5}/setup.py +2 -2
- xbase_util-0.3.5/xbase_util/add_column_util.py +151 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/xbase_util.py +1 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/PKG-INFO +1 -1
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/SOURCES.txt +1 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/README.md +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/setup.cfg +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/__init__.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/__init__.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/ConfigBean.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/CurrentConfigBean.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/FlowBean.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/TaskTemplateBean.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/__init__.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/ConfigDao.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/CurrentConfigDao.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/FlowDao.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/TaskTemplateDao.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/__init__.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/initsqlite3.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/es_db_util.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/esreq.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/geo_util.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/handle_features_util.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/pcap_util.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/xbase_constant.py +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/dependency_links.txt +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/not-zip-safe +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/top_level.txt +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util_assets/GeoLite2-City.mmdb +0 -0
- {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util_assets/arkimeparse.js +0 -0
@@ -3,7 +3,7 @@ from distutils.core import setup
|
|
3
3
|
from setuptools import find_packages
|
4
4
|
|
5
5
|
setup(name="xbase_util",
|
6
|
-
version="0.3.3",
|
6
|
+
version="0.3.5",
|
7
7
|
description="网络安全基础工具",
|
8
8
|
long_description="包含提取,预测,训练的基础工具",
|
9
9
|
author="xyt",
|
@@ -15,6 +15,6 @@ setup(name="xbase_util",
|
|
15
15
|
],
|
16
16
|
zip_safe=False,
|
17
17
|
package_data={
|
18
|
-
'xbase_util': ['../xbase_util_assets/*']
|
18
|
+
'xbase_util': ['../xbase_util_assets/*']
|
19
19
|
},
|
20
20
|
include_package_data=True)
|
@@ -0,0 +1,151 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
import pandas as pd
|
4
|
+
import re
|
5
|
+
|
6
|
+
from nltk.corpus import words
|
7
|
+
from nltk.stem import WordNetLemmatizer
|
8
|
+
from tldextract import tldextract
|
9
|
+
|
10
|
+
# Initialise the shared WordNet lemmatizer used by is_meaningful_word.
lemmatizer = WordNetLemmatizer()
# NOTE(review): this variable is set after tldextract has already been
# imported above — confirm tldextract actually honours it at call time.
os.environ["TLDEXTRACT_DISABLE_UPDATE"] = "1" # disable updates
|
13
|
+
|
14
|
+
# Download the corpus (only needs to run once)
# nltk.download('wordnet')
# Build the lookup sets
# NOTE(review): every entry below carries a leading dot; has_uncommon_tld
# tests ext.suffix against this set — confirm both use the same format.
common_tlds = {
    # Common gTLDs
    ".com", ".org", ".net", ".info", ".biz", ".edu", ".gov", ".mil",
    # Newer gTLDs
    ".app", ".blog", ".shop", ".tech", ".xyz", ".online", ".me", ".co", ".tv",
    # Other gTLDs
    ".name", ".pro", ".mobi", ".aero", ".coop", ".museum",
    # Common ccTLDs (several countries, not only China)
    ".cn", ".us", ".uk", ".de", ".jp", ".fr", ".ca", ".in", ".br", ".ru", ".au", ".kr", ".it", ".es",
    # Other common ccTLDs
    ".ar", ".mx", ".ch", ".nl", ".se", ".pl", ".no", ".fi", ".be", ".dk", ".at",
    # Internationalised domain names (IDN)
    ".中国", ".한국", ".рф", ".印度"
}
word_set = set(words.words())  # a set gives fast membership tests
# Extra brand-like tokens accepted as "meaningful" in addition to the corpus.
word_set.update(
    ["baidu", "qq", "ali", "souhu", "douyin", "jd", "tencent", "taobao", "tianmao", "dewu", "sougou", "anmeng", "weibo",
     "douyu", "huya", "bilibili", "csnd", "zhihu", "huawei", "xiaomi", "vivo", "oppo", "qihu", "yahu", "fanke",
     "xunfei"])
|
36
|
+
|
37
|
+
|
38
|
+
def is_meaningful_word(word):
    """Return 1 if the noun lemma of the lower-cased ``word`` is in ``word_set``, else 0."""
    lemma = lemmatizer.lemmatize(word.lower(), pos='n')
    if lemma in word_set:
        return 1
    return 0
|
41
|
+
|
42
|
+
|
43
|
+
def is_meaningful_phrase(phrase):
    """Return True only if every dot-separated token of ``phrase`` is a meaningful word."""
    for token in phrase.split('.'):
        if not is_meaningful_word(token):
            return False
    return True
|
47
|
+
|
48
|
+
|
49
|
+
def is_danger_subdomain(uri):
    """Flag a host whose subdomain contains no dictionary-word label.

    Args:
        uri: Host name or URL; it is split with ``tldextract.extract``.

    Returns:
        1 if the host has at least one subdomain label other than "www" and
        none of those labels passes ``is_meaningful_word``; otherwise 0
        (including when there is no subdomain left to inspect).
    """
    ext = tldextract.extract(uri)
    # Discard "www" labels as whole labels. The earlier
    # ``replace("www.", "")`` left a bare "www" subdomain in place (so it was
    # then tested as if it were a word) and could also cut "www." out of the
    # tail of a longer label such as "awww.".
    labels = [
        label
        for label in ext.subdomain.split('.')
        if label and label.lower() != "www"
    ]
    if not labels:
        return 0
    # A single meaningful label is enough to treat the subdomain as benign.
    return int(not any(is_meaningful_word(label) for label in labels))
|
65
|
+
|
66
|
+
|
67
|
+
def is_danger_domain(uri):
    """Return 0 if the registered domain label of ``uri`` is a meaningful word, else 1."""
    registered = tldextract.extract(uri).domain
    return 0 if is_meaningful_word(registered) else 1
|
74
|
+
|
75
|
+
|
76
|
+
# Check whether the host name contains an overly long label.
def is_long_domain(uri):
    """Return 1 if the domain or any subdomain label of ``uri`` exceeds 10 characters, else 0."""
    parts = tldextract.extract(uri)
    if len(parts.domain) > 10:
        return 1
    if not parts.subdomain:
        return 0
    for label in parts.subdomain.split("."):
        if len(label) > 10:
            return 1
    return 0
|
87
|
+
|
88
|
+
|
89
|
+
def has_uncommon_tld(domain):
    """Report whether ``domain`` ends in a TLD that is not in ``common_tlds``.

    Args:
        domain: Host name or URL; it is split with ``tldextract.extract``.

    Returns:
        0 if the last label of the extracted suffix, written with a leading
        dot, is a member of ``common_tlds``; otherwise 1 (this includes
        inputs for which no suffix is extracted).
    """
    suffix = tldextract.extract(domain).suffix
    # The suffix comes back without a leading dot and may span several labels
    # ("com", "co.uk"), whereas common_tlds stores single ".com"-style
    # entries. Comparing the raw suffix therefore never matched; compare the
    # final label with the dot restored instead.
    tld = "." + suffix.rsplit(".", 1)[-1].lower()
    return int(tld not in common_tlds)
|
93
|
+
|
94
|
+
|
95
|
+
# Check for characters outside the usual host-name alphabet.
def has_random_characters(domain):
    """Return 1 if ``domain`` has a character other than ASCII letters, digits, '-', '_' or '.', else 0."""
    hit = re.search(r'[^a-zA-Z0-9-_.]', domain)
    return 0 if hit is None else 1
|
99
|
+
|
100
|
+
|
101
|
+
# Check for CJK ideographs or emoticon code points in the name.
def has_special_characters(domain):
    """Return 1 if ``domain`` contains a character in U+4E00-U+9FFF or U+1F600-U+1F64F, else 0."""
    hit = re.search(r'[\u4e00-\u9fff\U0001F600-\U0001F64F]', domain)
    return 0 if hit is None else 1
|
105
|
+
|
106
|
+
|
107
|
+
# Check whether the host has an unusually deep chain of subdomain labels.
def has_large_number_of_subdomains(uri, max_subdomains=1):
    """Report whether ``uri`` has more subdomain labels than allowed.

    Only the labels of the extracted subdomain are counted. The previous
    implementation split the whole input string on '.' and compared the
    piece count with 3, so the registered domain, every label of a
    multi-label suffix, and any dots elsewhere in the input were all counted
    as if they were subdomains.

    Args:
        uri: Host name or URL; it is split with ``tldextract.extract``.
        max_subdomains: Largest number of subdomain labels still considered
            normal. The default of 1 matches the old "more than 3 pieces"
            rule for a plain ``sub.domain.tld`` host.

    Returns:
        1 if the number of subdomain labels exceeds ``max_subdomains``,
        otherwise 0 (including when there is no subdomain).
    """
    subdomain = tldextract.extract(uri).subdomain
    if not subdomain:
        return 0
    return int(len(subdomain.split('.')) > max_subdomains)
|
115
|
+
|
116
|
+
|
117
|
+
def parse_list(x):
    """Normalise a list-like cell value into a list of strings.

    Args:
        x: Either a string rendering of a list (for example ``'["a", "b"]'``
            or ``"['a', 'b']"``), a real list, or anything else.

    Returns:
        For a string: the comma-separated items with brackets, surrounding
        whitespace and surrounding single/double quotes removed; empty items
        are dropped, so ``"[]"`` and ``""`` both give ``[]``.
        For a list: each element formatted as a string.
        For any other type: ``[]`` after printing a diagnostic line.
    """
    if isinstance(x, str):
        body = x.replace("[", "").replace("]", "")
        # Strip whitespace and quotes around each item so that '"a", "b"' and
        # "'a', 'b'" do not produce entries like ' b' or "'a'".
        items = (item.strip().strip("\"'").strip() for item in body.split(","))
        return [item for item in items if item]
    if isinstance(x, list):
        return [f"{item}" for item in x]
    print(f"unknown:{x} {type(x)}")
    return []
|
129
|
+
|
130
|
+
|
131
|
+
def handle_dns(origin_list, isDataFrame=False):
    """Add boolean DNS host feature columns derived from the 'dns.host' column.

    Args:
        origin_list: Records to process. Wrapped in ``pd.DataFrame`` unless
            ``isDataFrame`` is true, in which case it is used (and modified)
            directly.
        isDataFrame: Set to True when ``origin_list`` is already a DataFrame.

    Returns:
        The DataFrame with seven ``dns_host_is_*`` columns added; each cell
        is True when at least one host in that row triggers the check.
    """
    print("handle_dnslist")
    frame = origin_list if isDataFrame else pd.DataFrame(origin_list)

    # Output column -> per-host check, in the order the columns are added.
    feature_checks = (
        ('dns_host_is_long_domain', is_long_domain),
        ('dns_host_is_random_characters', has_random_characters),
        ('dns_host_is_special_characters', has_special_characters),
        ('dns_host_is_large_subdomains', has_large_number_of_subdomains),
        ('dns_host_is_danger_domain', is_danger_domain),
        ('dns_host_is_danger_subdomain', is_danger_subdomain),
        ('dns_host_is_uncommon_tld', has_uncommon_tld),
    )

    # Temporary helper column holding the parsed host list for each row.
    frame["dnslist"] = frame['dns.host'].apply(parse_list)
    for column, check in feature_checks:
        # Bind ``check`` as a default so each lambda keeps its own function.
        frame[column] = frame['dnslist'].apply(
            lambda hosts, check=check: any(check(host) for host in hosts))
    frame.drop(columns=['dnslist'], inplace=True)
    return frame
|
@@ -324,6 +324,7 @@ def extract_session_fields(origin_list, geoUtil):
|
|
324
324
|
"http.request-refererCnt": http.get("requestRefererCnt", 0),
|
325
325
|
"http.path": http.get("path", []),
|
326
326
|
"http.hostCnt": http.get("hostCnt", 0),
|
327
|
+
"http.host": http.get("host", []),
|
327
328
|
"http.response-server": http.get("response-server", []),
|
328
329
|
"http.pathCnt": http.get("pathCnt", 0),
|
329
330
|
"http.useragentTokens": http.get("useragentTokens", ""),
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|