xbase-util 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {xbase_util-0.3.3 → xbase_util-0.3.5}/PKG-INFO +1 -1
  2. {xbase_util-0.3.3 → xbase_util-0.3.5}/setup.py +2 -2
  3. xbase_util-0.3.5/xbase_util/add_column_util.py +151 -0
  4. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/xbase_util.py +1 -0
  5. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/PKG-INFO +1 -1
  6. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/SOURCES.txt +1 -0
  7. {xbase_util-0.3.3 → xbase_util-0.3.5}/README.md +0 -0
  8. {xbase_util-0.3.3 → xbase_util-0.3.5}/setup.cfg +0 -0
  9. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/__init__.py +0 -0
  10. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/__init__.py +0 -0
  11. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/ConfigBean.py +0 -0
  12. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/CurrentConfigBean.py +0 -0
  13. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/FlowBean.py +0 -0
  14. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/TaskTemplateBean.py +0 -0
  15. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/bean/__init__.py +0 -0
  16. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/ConfigDao.py +0 -0
  17. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/CurrentConfigDao.py +0 -0
  18. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/FlowDao.py +0 -0
  19. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/TaskTemplateDao.py +0 -0
  20. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/dao/__init__.py +0 -0
  21. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/db/initsqlite3.py +0 -0
  22. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/es_db_util.py +0 -0
  23. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/esreq.py +0 -0
  24. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/geo_util.py +0 -0
  25. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/handle_features_util.py +0 -0
  26. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/pcap_util.py +0 -0
  27. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util/xbase_constant.py +0 -0
  28. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/dependency_links.txt +0 -0
  29. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/not-zip-safe +0 -0
  30. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util.egg-info/top_level.txt +0 -0
  31. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util_assets/GeoLite2-City.mmdb +0 -0
  32. {xbase_util-0.3.3 → xbase_util-0.3.5}/xbase_util_assets/arkimeparse.js +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xbase_util
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: 网络安全基础工具
5
5
  Home-page: https://gitee.com/jimonik/xbase_util.git
6
6
  Author: xyt
@@ -3,7 +3,7 @@ from distutils.core import setup
3
3
  from setuptools import find_packages
4
4
 
5
5
  setup(name="xbase_util",
6
- version="0.3.3",
6
+ version="0.3.5",
7
7
  description="网络安全基础工具",
8
8
  long_description="包含提取,预测,训练的基础工具",
9
9
  author="xyt",
@@ -15,6 +15,6 @@ setup(name="xbase_util",
15
15
  ],
16
16
  zip_safe=False,
17
17
  package_data={
18
- 'xbase_util': ['../xbase_util_assets/*'],
18
+ 'xbase_util': ['../xbase_util_assets/*']
19
19
  },
20
20
  include_package_data=True)
@@ -0,0 +1,151 @@
1
+ import os
2
+
3
+ import pandas as pd
4
+ import re
5
+
6
+ from nltk.corpus import words
7
+ from nltk.stem import WordNetLemmatizer
8
+ from tldextract import tldextract
9
+
10
# Initialise the lemmatizer used to reduce words to their noun base form.
lemmatizer = WordNetLemmatizer()
# Intended to stop tldextract from refreshing its suffix list.
# NOTE(review): this is set after tldextract has been imported; confirm that
# tldextract actually honours this variable.
os.environ["TLDEXTRACT_DISABLE_UPDATE"] = "1"

# Download the corpus (only needs to be done once):
# nltk.download('wordnet')
# NOTE(review): words.words() below presumably needs the 'words' corpus as
# well -- confirm.

# TLDs treated as "common". Entries carry a leading dot.
common_tlds = {
    # Common gTLDs
    ".com", ".org", ".net", ".info", ".biz", ".edu", ".gov", ".mil",
    # Newer gTLDs
    ".app", ".blog", ".shop", ".tech", ".xyz", ".online", ".me", ".co", ".tv",
    # Other gTLDs
    ".name", ".pro", ".mobi", ".aero", ".coop", ".museum",
    # Common ccTLDs (several countries, not only China)
    ".cn", ".us", ".uk", ".de", ".jp", ".fr", ".ca", ".in", ".br", ".ru", ".au", ".kr", ".it", ".es",
    # Other common ccTLDs
    ".ar", ".mx", ".ch", ".nl", ".se", ".pl", ".no", ".fi", ".be", ".dk", ".at",
    # Internationalised TLDs (IDN)
    ".中国", ".한국", ".рф", ".印度"
}
# Keep the vocabulary in a set so membership tests are fast.
word_set = set(words.words())
# Brand names that are not dictionary words but should count as meaningful.
# "csnd" is kept for backward compatibility; "csdn" is the correct spelling.
word_set.update(
    ["baidu", "qq", "ali", "souhu", "douyin", "jd", "tencent", "taobao", "tianmao", "dewu", "sougou", "anmeng", "weibo",
     "douyu", "huya", "bilibili", "csnd", "csdn", "zhihu", "huawei", "xiaomi", "vivo", "oppo", "qihu", "yahu", "fanke",
     "xunfei"])
36
+
37
+
38
def is_meaningful_word(word):
    """Return 1 if the word is in the vocabulary, otherwise 0.

    The word is lower-cased and lemmatized as a noun before the lookup
    in ``word_set``.
    """
    lemma = lemmatizer.lemmatize(word.lower(), pos='n')
    if lemma in word_set:
        return 1
    return 0
41
+
42
+
43
def is_meaningful_phrase(phrase):
    """Return True only if every dot-separated piece of ``phrase`` is meaningful.

    Each piece is checked with ``is_meaningful_word``; the first piece that
    fails makes the whole phrase fail.
    """
    for piece in phrase.split('.'):
        if not is_meaningful_word(piece):
            return False
    return True
47
+
48
+
49
def is_danger_subdomain(uri):
    """Return 1 if the host has subdomain labels and none of them is meaningful.

    The subdomain is extracted with tldextract and split on dots. Labels
    equal to ``www`` (case-insensitive) are ignored. Returns 0 when no
    labels remain or when at least one remaining label passes
    ``is_meaningful_word``; returns 1 otherwise.

    ``www`` is filtered per label rather than by substring replacement of
    ``"www."``. The substring form left a bare ``www`` subdomain in place
    and also altered labels that merely end in ``www``.
    """
    subdomain = tldextract.extract(uri).subdomain
    labels = [
        label for label in subdomain.split('.')
        if label and label.lower() != "www"
    ]
    if not labels:
        return 0
    return int(not any(is_meaningful_word(label) for label in labels))
65
+
66
+
67
def is_danger_domain(uri):
    """Return 0 if the registered domain label is meaningful, otherwise 1.

    The label is the ``domain`` field of the tldextract result and is
    judged by ``is_meaningful_word``.
    """
    registered_label = tldextract.extract(uri).domain
    return 0 if is_meaningful_word(registered_label) else 1
74
+
75
+
76
def is_long_domain(uri):
    """Return 1 if any host label is longer than 10 characters, otherwise 0.

    Both the registered domain label and every dot-separated subdomain
    label from the tldextract result are checked.
    """
    parts = tldextract.extract(uri)
    if len(parts.domain) > 10:
        return 1
    if not parts.subdomain:
        return 0
    for label in parts.subdomain.split("."):
        if len(label) > 10:
            return 1
    return 0
87
+
88
+
89
def has_uncommon_tld(domain):
    """Return 1 if the host's top-level label is not in ``common_tlds``, else 0.

    tldextract reports the suffix without a leading dot and possibly with
    several labels (for example ``co.uk``), while ``common_tlds`` stores
    single labels with a leading dot (for example ``.uk``). The suffix is
    therefore lower-cased, reduced to its last label and prefixed with a
    dot before the lookup. A host with no recognised suffix is reported
    as uncommon (1).
    """
    suffix = tldextract.extract(domain).suffix.lower()
    if not suffix:
        return 1
    top_label = "." + suffix.rsplit(".", 1)[-1]
    return int(top_label not in common_tlds)
93
+
94
+
95
def has_random_characters(domain):
    """Return 1 if ``domain`` contains a character outside the usual host alphabet.

    The allowed characters are ASCII letters, digits, hyphen, underscore
    and dot; anything else counts as a hit. Returns 0 otherwise.
    """
    hit = re.search(r'[^a-zA-Z0-9-_.]', domain)
    if hit is None:
        return 0
    return 1
99
+
100
+
101
def has_special_characters(domain):
    """Return 1 if ``domain`` contains a character from the flagged Unicode ranges.

    The ranges are U+4E00-U+9FFF (Chinese characters) and
    U+1F600-U+1F64F (emoticons). Returns 0 otherwise.
    """
    found = re.search(r'[\u4e00-\u9fff\U0001F600-\U0001F64F]', domain)
    return 1 if found else 0
105
+
106
+
107
def has_large_number_of_subdomains(uri, max_subdomains=1):
    """Return 1 if the host has more than ``max_subdomains`` subdomain labels.

    Only the labels of the subdomain reported by tldextract are counted.
    Counting every dot-separated piece of the raw input would also count
    the registered domain, the suffix labels (two for ``co.uk``) and any
    dots in a scheme or path.

    The default of 1 keeps the result unchanged for plain host names with
    a single-label suffix: ``a.b.example.com`` gives 1 and
    ``www.example.com`` gives 0.

    Args:
        uri: Host name or URL to inspect.
        max_subdomains: Largest number of subdomain labels still
            considered normal.

    Returns:
        1 if the number of subdomain labels exceeds ``max_subdomains``,
        otherwise 0 (including when there is no subdomain at all).
    """
    subdomain = tldextract.extract(uri).subdomain
    if not subdomain:
        return 0
    return int(len(subdomain.split('.')) > max_subdomains)
115
+
116
+
117
def parse_list(x):
    """Normalise a host-list value into a list of strings.

    * ``str``: treated as a serialised list. Square brackets and both
      single and double quotes are removed, the text is split on commas,
      each item is stripped of surrounding whitespace and empty items are
      dropped. ``"[]"`` and ``""`` therefore both give ``[]``.
    * ``list``: every element is converted to its string form.
    * anything else: a diagnostic line is printed and ``[]`` is returned.

    Stripping whitespace and single quotes matters because an item such
    as ``" b.com"`` or ``"'a.com'"`` contains characters outside the host
    alphabet and would be misjudged by character-based checks.
    """
    if isinstance(x, str):
        # Remove list punctuation in one pass, then split into items.
        cleaned = x.translate(str.maketrans("", "", "[]\"'"))
        candidates = (item.strip() for item in cleaned.split(","))
        return [item for item in candidates if item]
    if isinstance(x, list):
        return [f"{item}" for item in x]
    print(f"unknown:{x} {type(x)}")
    return []
129
+
130
+
131
def handle_dns(origin_list, isDataFrame=False):
    """Add DNS-host feature columns derived from the ``dns.host`` column.

    Args:
        origin_list: The records to process. When ``isDataFrame`` is
            false it is first passed to ``pd.DataFrame``; when true it is
            used as-is and the new columns are added to that same object.
        isDataFrame: Whether ``origin_list`` is already a DataFrame.

    Returns:
        The DataFrame with one boolean column per check. Each value is
        true when at least one host in that row's parsed ``dns.host``
        list triggers the check. The temporary ``dnslist`` column is
        removed before returning.
    """
    print("handle_dnslist")
    frame = origin_list if isDataFrame else pd.DataFrame(origin_list)
    frame["dnslist"] = frame['dns.host'].apply(parse_list)

    # Output column paired with the per-host check that feeds it.
    # The order here is the order in which columns are added.
    feature_checks = (
        ('dns_host_is_long_domain', is_long_domain),
        ('dns_host_is_random_characters', has_random_characters),
        ('dns_host_is_special_characters', has_special_characters),
        ('dns_host_is_large_subdomains', has_large_number_of_subdomains),
        ('dns_host_is_danger_domain', is_danger_domain),
        ('dns_host_is_danger_subdomain', is_danger_subdomain),
        ('dns_host_is_uncommon_tld', has_uncommon_tld),
    )
    for column, check in feature_checks:
        # Bind ``check`` as a default so each lambda keeps its own function.
        frame[column] = frame['dnslist'].apply(
            lambda hosts, check=check: any(check(host) for host in hosts))

    frame.drop(columns=['dnslist'], inplace=True)
    return frame
@@ -324,6 +324,7 @@ def extract_session_fields(origin_list, geoUtil):
324
324
  "http.request-refererCnt": http.get("requestRefererCnt", 0),
325
325
  "http.path": http.get("path", []),
326
326
  "http.hostCnt": http.get("hostCnt", 0),
327
+ "http.host": http.get("host", []),
327
328
  "http.response-server": http.get("response-server", []),
328
329
  "http.pathCnt": http.get("pathCnt", 0),
329
330
  "http.useragentTokens": http.get("useragentTokens", ""),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xbase-util
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: 网络安全基础工具
5
5
  Home-page: https://gitee.com/jimonik/xbase_util.git
6
6
  Author: xyt
@@ -1,6 +1,7 @@
1
1
  README.md
2
2
  setup.py
3
3
  xbase_util/__init__.py
4
+ xbase_util/add_column_util.py
4
5
  xbase_util/es_db_util.py
5
6
  xbase_util/esreq.py
6
7
  xbase_util/geo_util.py
File without changes
File without changes