oneforall-kjl 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- OneForAll/__init__.py +15 -0
- OneForAll/brute.py +503 -0
- OneForAll/common/check.py +41 -0
- OneForAll/common/crawl.py +10 -0
- OneForAll/common/database.py +277 -0
- OneForAll/common/domain.py +63 -0
- OneForAll/common/ipasn.py +42 -0
- OneForAll/common/ipreg.py +139 -0
- OneForAll/common/lookup.py +28 -0
- OneForAll/common/module.py +369 -0
- OneForAll/common/query.py +9 -0
- OneForAll/common/records.py +363 -0
- OneForAll/common/request.py +264 -0
- OneForAll/common/resolve.py +173 -0
- OneForAll/common/search.py +78 -0
- OneForAll/common/similarity.py +138 -0
- OneForAll/common/tablib/__init__.py +0 -0
- OneForAll/common/tablib/format.py +89 -0
- OneForAll/common/tablib/tablib.py +360 -0
- OneForAll/common/tldextract.py +240 -0
- OneForAll/common/utils.py +789 -0
- OneForAll/config/__init__.py +17 -0
- OneForAll/config/api.py +94 -0
- OneForAll/config/default.py +255 -0
- OneForAll/config/log.py +38 -0
- OneForAll/config/setting.py +108 -0
- OneForAll/export.py +72 -0
- OneForAll/modules/altdns.py +216 -0
- OneForAll/modules/autotake/github.py +105 -0
- OneForAll/modules/certificates/censys_api.py +73 -0
- OneForAll/modules/certificates/certspotter.py +48 -0
- OneForAll/modules/certificates/crtsh.py +84 -0
- OneForAll/modules/certificates/google.py +48 -0
- OneForAll/modules/certificates/myssl.py +46 -0
- OneForAll/modules/certificates/racent.py +49 -0
- OneForAll/modules/check/axfr.py +97 -0
- OneForAll/modules/check/cdx.py +44 -0
- OneForAll/modules/check/cert.py +58 -0
- OneForAll/modules/check/csp.py +94 -0
- OneForAll/modules/check/nsec.py +58 -0
- OneForAll/modules/check/robots.py +44 -0
- OneForAll/modules/check/sitemap.py +44 -0
- OneForAll/modules/collect.py +70 -0
- OneForAll/modules/crawl/archivecrawl.py +59 -0
- OneForAll/modules/crawl/commoncrawl.py +59 -0
- OneForAll/modules/datasets/anubis.py +45 -0
- OneForAll/modules/datasets/bevigil.py +50 -0
- OneForAll/modules/datasets/binaryedge_api.py +50 -0
- OneForAll/modules/datasets/cebaidu.py +45 -0
- OneForAll/modules/datasets/chinaz.py +45 -0
- OneForAll/modules/datasets/chinaz_api.py +49 -0
- OneForAll/modules/datasets/circl_api.py +49 -0
- OneForAll/modules/datasets/cloudflare_api.py +130 -0
- OneForAll/modules/datasets/dnsdb_api.py +51 -0
- OneForAll/modules/datasets/dnsdumpster.py +52 -0
- OneForAll/modules/datasets/dnsgrep.py +44 -0
- OneForAll/modules/datasets/fullhunt.py +48 -0
- OneForAll/modules/datasets/hackertarget.py +45 -0
- OneForAll/modules/datasets/ip138.py +45 -0
- OneForAll/modules/datasets/ipv4info_api.py +73 -0
- OneForAll/modules/datasets/netcraft.py +66 -0
- OneForAll/modules/datasets/passivedns_api.py +51 -0
- OneForAll/modules/datasets/qianxun.py +61 -0
- OneForAll/modules/datasets/rapiddns.py +45 -0
- OneForAll/modules/datasets/riddler.py +45 -0
- OneForAll/modules/datasets/robtex.py +58 -0
- OneForAll/modules/datasets/securitytrails_api.py +56 -0
- OneForAll/modules/datasets/sitedossier.py +57 -0
- OneForAll/modules/datasets/spyse_api.py +62 -0
- OneForAll/modules/datasets/sublist3r.py +45 -0
- OneForAll/modules/datasets/urlscan.py +45 -0
- OneForAll/modules/datasets/windvane.py +92 -0
- OneForAll/modules/dnsquery/mx.py +35 -0
- OneForAll/modules/dnsquery/ns.py +35 -0
- OneForAll/modules/dnsquery/soa.py +35 -0
- OneForAll/modules/dnsquery/spf.py +35 -0
- OneForAll/modules/dnsquery/txt.py +35 -0
- OneForAll/modules/enrich.py +72 -0
- OneForAll/modules/finder.py +206 -0
- OneForAll/modules/intelligence/alienvault.py +50 -0
- OneForAll/modules/intelligence/riskiq_api.py +58 -0
- OneForAll/modules/intelligence/threatbook_api.py +50 -0
- OneForAll/modules/intelligence/threatminer.py +45 -0
- OneForAll/modules/intelligence/virustotal.py +60 -0
- OneForAll/modules/intelligence/virustotal_api.py +59 -0
- OneForAll/modules/iscdn.py +86 -0
- OneForAll/modules/search/ask.py +69 -0
- OneForAll/modules/search/baidu.py +96 -0
- OneForAll/modules/search/bing.py +79 -0
- OneForAll/modules/search/bing_api.py +78 -0
- OneForAll/modules/search/fofa_api.py +74 -0
- OneForAll/modules/search/gitee.py +71 -0
- OneForAll/modules/search/github_api.py +86 -0
- OneForAll/modules/search/google.py +83 -0
- OneForAll/modules/search/google_api.py +77 -0
- OneForAll/modules/search/hunter_api.py +72 -0
- OneForAll/modules/search/quake_api.py +72 -0
- OneForAll/modules/search/shodan_api.py +53 -0
- OneForAll/modules/search/so.py +75 -0
- OneForAll/modules/search/sogou.py +72 -0
- OneForAll/modules/search/wzsearch.py +68 -0
- OneForAll/modules/search/yahoo.py +81 -0
- OneForAll/modules/search/yandex.py +80 -0
- OneForAll/modules/search/zoomeye_api.py +73 -0
- OneForAll/modules/srv.py +75 -0
- OneForAll/modules/wildcard.py +319 -0
- OneForAll/oneforall.py +275 -0
- OneForAll/takeover.py +168 -0
- OneForAll/test.py +23 -0
- oneforall_kjl-0.1.1.dist-info/METADATA +18 -0
- oneforall_kjl-0.1.1.dist-info/RECORD +114 -0
- oneforall_kjl-0.1.1.dist-info/WHEEL +5 -0
- oneforall_kjl-0.1.1.dist-info/entry_points.txt +2 -0
- oneforall_kjl-0.1.1.dist-info/top_level.txt +1 -0
OneForAll/common/resolve.py
@@ -0,0 +1,173 @@
import gc
import json

from config.log import logger
from config import settings
from common import utils


def filter_subdomain(data):
    """
    Filter subdomains that have no resolved content into a new list

    :param list data: data list to filter
    :return: list of qualifying subdomains
    """
    logger.log('DEBUG', f'Filtering subdomains to be resolved')
    subdomains = []
    for infos in data:
        if not infos.get('ip'):
            subdomain = infos.get('subdomain')
            if subdomain:
                subdomains.append(subdomain)
    return subdomains


def update_data(data, infos):
    """
    Update resolved results

    :param list data: data list to update
    :param dict infos: resolution result info keyed by subdomain
    :return: updated data list
    """
    logger.log('DEBUG', f'Updating resolved results')
    if not infos:
        logger.log('ALERT', f'No valid resolved result')
        return data
    new_data = list()
    for index, items in enumerate(data):
        if items.get('ip'):
            new_data.append(items)
            continue
        subdomain = items.get('subdomain')
        record = infos.get(subdomain)
        if record:
            items.update(record)
            new_data.append(items)
        else:
            logger.log('DEBUG', f'{subdomain} resolution has no result')
    return new_data


def save_db(name, data):
    """
    Save resolved results to database

    :param str name: table name
    :param list data: data to be saved
    """
    logger.log('INFOR', f'Saving resolved results')
    utils.save_to_db(name, data, 'resolve')


def save_subdomains(save_path, subdomain_list):
    logger.log('DEBUG', f'Saving resolved subdomain')
    subdomain_data = '\n'.join(subdomain_list)
    if not utils.save_to_file(save_path, subdomain_data):
        logger.log('FATAL', 'Save resolved subdomain error')
        exit(1)


def gen_infos(data, qname, info, infos):
    flag = False
    cnames = list()
    ips = list()
    ttl = list()
    answers = data.get('answers')
    for answer in answers:
        if answer.get('type') == 'A':
            flag = True
            name = answer.get('name')
            cname = name[:-1].lower()  # strip the rightmost '.'
            cnames.append(cname)
            ip = answer.get('data')
            ips.append(ip)
            ttl.append(str(answer.get('ttl')))
            info['resolve'] = 1
            info['reason'] = 'OK'
            info['cname'] = ','.join(cnames)
            info['ip'] = ','.join(ips)
            info['ttl'] = ','.join(ttl)
            infos[qname] = info
    if not flag:
        logger.log('DEBUG', f'Resolving {qname} has no A record')
        info['alive'] = 0
        info['resolve'] = 0
        info['reason'] = 'NoARecord'
        infos[qname] = info
    return infos


def deal_output(output_path):
    logger.log('INFOR', f'Processing resolved results')
    infos = dict()  # records resolution info for all domains
    with open(output_path) as fd:
        for line in fd:
            line = line.strip()
            try:
                items = json.loads(line)
            except Exception as e:
                logger.log('ERROR', e.args)
                logger.log('ERROR', f'Error resolve line {line}, skip this line')
                continue
            info = dict()
            info['resolver'] = items.get('resolver')
            qname = items.get('name')[:-1]  # strip the rightmost '.'
            status = items.get('status')
            if status != 'NOERROR':
                logger.log('DEBUG', f'Resolving {qname}: {status}')
                continue
            data = items.get('data')
            if 'answers' not in data:
                logger.log('DEBUG', f'Resolving {qname} has no answers')
                info['alive'] = 0
                info['resolve'] = 0
                info['reason'] = 'NoAnswer'
                infos[qname] = info
                continue
            infos = gen_infos(data, qname, info, infos)
    return infos


def run_resolve(domain, data):
    """
    Entry function for subdomain resolution

    :param str domain: main domain to resolve
    :param list data: list of subdomain data to resolve
    :return: list of resolution results
    :rtype: list
    """
    logger.log('INFOR', f'Start resolving subdomains of {domain}')
    subdomains = filter_subdomain(data)
    if not subdomains:
        return data

    massdns_dir = settings.third_party_dir.joinpath('massdns')
    result_dir = settings.result_save_dir
    temp_dir = result_dir.joinpath('temp')
    utils.check_dir(temp_dir)
    massdns_path = utils.get_massdns_path(massdns_dir)
    timestring = utils.get_timestring()

    save_name = f'collected_subdomains_{domain}_{timestring}.txt'
    save_path = temp_dir.joinpath(save_name)
    save_subdomains(save_path, subdomains)
    del subdomains
    gc.collect()

    output_name = f'resolved_result_{domain}_{timestring}.json'
    output_path = temp_dir.joinpath(output_name)
    log_path = result_dir.joinpath('massdns.log')
    ns_path = utils.get_ns_path()

    logger.log('INFOR', f'Running massdns to resolve subdomains')
    utils.call_massdns(massdns_path, save_path, ns_path,
                       output_path, log_path, quiet_mode=True)

    infos = deal_output(output_path)
    data = update_data(data, infos)
    logger.log('INFOR', f'Finished resolving subdomains of {domain}')
    return data
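For context, a minimal sketch (not part of the package) of the data shape that deal_output() parses: massdns writes one JSON object per line, and update_data() merges the per-subdomain records back into the collected data. The sample line below is illustrative, built only from the fields the code above reads, not captured massdns output.

```python
import json

# Illustrative massdns-style JSON line; real output may differ slightly.
sample_line = ('{"name": "www.example.com.", "status": "NOERROR",'
               ' "resolver": "1.1.1.1:53", "data": {"answers": [{"type": "A",'
               ' "name": "www.example.com.", "data": "93.184.216.34", "ttl": 300}]}}')

items = json.loads(sample_line)
qname = items['name'][:-1]            # trailing dot stripped, as in deal_output()
answer = items['data']['answers'][0]
record = {'resolve': 1, 'reason': 'OK',
          'cname': answer['name'][:-1].lower(),
          'ip': answer['data'],
          'ttl': str(answer['ttl'])}
print(qname, record)  # www.example.com {'resolve': 1, 'reason': 'OK', ...}
```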
OneForAll/common/search.py
@@ -0,0 +1,78 @@
from config import settings
from common.module import Module


class Search(Module):
    """
    Search engine base class
    """
    def __init__(self):
        Module.__init__(self)
        self.page_num = 0  # offset at which search results start
        self.per_page_num = 50  # number of search results per page
        self.recursive_search = settings.enable_recursive_search
        self.recursive_times = settings.search_recursive_times
        self.full_search = settings.enable_full_search

    @staticmethod
    def filter(domain, subdomain):
        """
        Generate search filter statements

        Use the -site: syntax supported by search engines to exclude
        subdomains that occupy many result pages, so that new domains surface

        :param str domain: domain
        :param set subdomain: set of subdomains
        :return: filter statements
        :rtype: list
        """
        statements_list = []
        subdomains_temp = set(map(lambda x: x + '.' + domain, settings.common_subnames))
        subdomains_temp = list(subdomain.intersection(subdomains_temp))
        for i in range(0, len(subdomains_temp), 2):  # exclude 2 subdomains at a time
            statements_list.append(''.join(set(map(lambda s: ' -site:' + s,
                                                   subdomains_temp[i:i + 2]))))
        return statements_list

    def match_location(self, url):
        """
        Match subdomains in the URL redirected to

        Some search engines (such as Baidu) truncate the URLs they display,
        so this function sends a HEAD request to each result link and
        matches subdomains against the Location response header

        :param str url: URL of the displayed result
        :return: matched subdomains
        :rtype: set
        """
        resp = self.head(url, check=False, allow_redirects=False)
        if not resp:
            return set()
        location = resp.headers.get('location')
        if not location:
            return set()
        return set(self.match_subdomains(location))

    def check_subdomains(self, subdomains):
        """
        Check whether the searched subdomain results justify continuing

        :param subdomains: subdomain results
        :return: whether to continue searching
        """
        if not subdomains:
            # stop searching when no subdomains were found
            return False
        if not self.full_search and subdomains.issubset(self.subdomains):
            # outside full-search mode, stop when the results
            # are complete duplicates of known subdomains
            return False
        return True

    def recursive_subdomain(self):
        # recursively search the next layer of subdomains
        # start from 1 because one layer has already been searched;
        # the actual recursion depth is layer_num + 1
        subdomains = self.subdomains.copy()
        for layer_num in range(1, self.recursive_times):
            for subdomain in subdomains:
                # condition restricting the next layer of subdomain search
                count = subdomain.count('.') - self.domain.count('.')
                if count == layer_num:
                    yield subdomain
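A hedged sketch of what Search.filter() produces, assuming settings.common_subnames contains names such as 'www' and 'mail' (the names and domain below are placeholders): already-discovered common subdomains are paired into -site: exclusion clauses that a search module can append to its query so new subdomains surface.

```python
common_subnames = {'www', 'mail', 'blog'}          # stand-in for settings.common_subnames
found = {'www.example.com', 'mail.example.com'}    # subdomains already discovered

# Same logic as Search.filter(): intersect, then exclude 2 subdomains at a time.
candidates = {name + '.example.com' for name in common_subnames}
overlap = list(found.intersection(candidates))
statements = [''.join(' -site:' + s for s in overlap[i:i + 2])
              for i in range(0, len(overlap), 2)]
print(statements)  # e.g. [' -site:www.example.com -site:mail.example.com']
```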
OneForAll/common/similarity.py
@@ -0,0 +1,138 @@
"""
Determine page similarity based on HTML page structure

Method: derive a template eigenvector from the page's DOM tree,
then compute structural similarity between the template eigenvectors.
Source: https://github.com/SPuerBRead/HTMLSimilarity
Calculation reference: https://patents.google.com/patent/CN101694668B/zh
"""
from treelib import Tree
from bs4 import BeautifulSoup
import bs4


class DOMTree(object):
    def __init__(self, label, attrs):
        self.label = label
        self.attrs = attrs


class HTMLParser(object):
    def __init__(self, html):
        self.dom_id = 1
        self.dom_tree = Tree()
        self.bs_html = BeautifulSoup(html, 'html.parser')

    def get_dom_structure_tree(self):
        for content in self.bs_html.contents:
            if isinstance(content, bs4.element.Tag):
                self.bs_html = content
        self.recursive_descendants(self.bs_html, 1)
        return self.dom_tree

    def recursive_descendants(self, descendants, parent_id):
        if self.dom_id == 1:
            self.dom_tree.create_node(descendants.name, self.dom_id,
                                      data=DOMTree(descendants.name, descendants.attrs))
            self.dom_id = self.dom_id + 1
        for child in descendants.contents:
            if isinstance(child, bs4.element.Tag):
                self.dom_tree.create_node(child.name, self.dom_id, parent_id,
                                          data=DOMTree(child.name, child.attrs))
                self.dom_id = self.dom_id + 1
                self.recursive_descendants(child, self.dom_id - 1)


class Converter(object):
    def __init__(self, dom_tree, dimension):
        self.dom_tree = dom_tree
        self.node_info_list = []
        self.dimension = dimension
        self.initial_weight = 1
        self.attenuation_ratio = 0.6
        self.dom_eigenvector = {}.fromkeys(range(0, dimension), 0)

    def get_eigenvector(self):
        for node_id in range(1, self.dom_tree.size() + 1):
            node = self.dom_tree.get_node(node_id)
            node_feature = self.create_feature(node)
            feature_hash = self.feature_hash(node_feature)
            node_weight = self.calculate_weight(node, node_id, feature_hash)
            self.construct_eigenvector(feature_hash, node_weight)
        return self.dom_eigenvector

    @staticmethod
    def create_feature(node):
        node_attr_list = []
        node_feature = node.data.label + '|'
        for attr in node.data.attrs.keys():
            node_attr_list.append(attr + ':' + str(node.data.attrs[attr]))
        node_feature += '|'.join(node_attr_list)
        return node_feature

    @staticmethod
    def feature_hash(node_feature):
        return abs(hash(node_feature)) % (10 ** 8)

    def calculate_weight(self, node, node_id, feature_hash):
        brother_node_count = 0
        depth = self.dom_tree.depth(node)
        for brother_node in self.dom_tree.siblings(node_id):
            brother_node_feature = self.create_feature(brother_node)
            brother_node_feature_hash = self.feature_hash(brother_node_feature)
            if brother_node_feature_hash == feature_hash:
                brother_node_count = brother_node_count + 1
        if brother_node_count:
            node_weight = self.initial_weight * self.attenuation_ratio ** depth \
                          * self.attenuation_ratio ** brother_node_count
        else:
            node_weight = self.initial_weight * self.attenuation_ratio ** depth
        return node_weight

    def construct_eigenvector(self, feature_hash, node_weight):
        feature_hash = feature_hash % self.dimension
        self.dom_eigenvector[feature_hash] += node_weight


def calc_pseudodistance(dom1_eigenvector, dom2_eigenvector, dimension):
    a, b = 0, 0
    for i in range(dimension):
        a += dom1_eigenvector[i] - dom2_eigenvector[i]
        if dom1_eigenvector[i] and dom2_eigenvector[i]:
            b += dom1_eigenvector[i] + dom2_eigenvector[i]
    pseudodistance = abs(a) / b
    return pseudodistance


def get_pseudodistance(html_doc1, html_doc2, dimension=5000):
    """
    Get the structural similarity (pseudo-distance) of two HTML documents

    :param str html_doc1: HTML document
    :param str html_doc2: HTML document
    :param int dimension: number of dimensions after reduction
    :return: pseudo-distance
    """
    hp1 = HTMLParser(html_doc1)
    html_doc1_dom_tree = hp1.get_dom_structure_tree()
    hp2 = HTMLParser(html_doc2)
    html_doc2_dom_tree = hp2.get_dom_structure_tree()
    converter = Converter(html_doc1_dom_tree, dimension)
    dom1_eigenvector = converter.get_eigenvector()
    converter = Converter(html_doc2_dom_tree, dimension)
    dom2_eigenvector = converter.get_eigenvector()
    return calc_pseudodistance(dom1_eigenvector, dom2_eigenvector, dimension)


def is_similar(html_doc1, html_doc2, dimension=5000):
    """
    Judge whether two HTML page structures are similar
    based on the computed pseudo-distance

    :param str html_doc1: HTML document
    :param str html_doc2: HTML document
    :param int dimension: number of dimensions after reduction
    :return: whether similar (similar when pseudo-distance < 0.2,
        dissimilar when > 0.2)
    """
    value = get_pseudodistance(html_doc1, html_doc2, dimension)
    if value > 0.2:
        return False
    else:
        return True
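A minimal usage sketch for the module above (requires treelib and beautifulsoup4; the import path depends on how the wheel lands on sys.path, since the package's own modules import it as common.similarity): two pages with the same tag-and-attribute template compare as similar regardless of text content, because only tag names and attributes feed the eigenvector.

```python
# Import path is an assumption about the installed layout.
from common.similarity import get_pseudodistance, is_similar

page_a = '<html><body><div class="item"><p>first</p></div></body></html>'
page_b = '<html><body><div class="item"><p>second</p></div></body></html>'

# Identical DOM templates yield identical eigenvectors, hence distance 0.0.
print(get_pseudodistance(page_a, page_b))  # 0.0
print(is_similar(page_a, page_b))          # True (pseudo-distance < 0.2)
```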
OneForAll/common/tablib/__init__.py: file without changes
OneForAll/common/tablib/format.py
@@ -0,0 +1,89 @@
import decimal
import json
import csv
from io import StringIO
from uuid import UUID

""" Tablib - formats
"""
from collections import OrderedDict


class Registry:
    _formats = OrderedDict()

    def register(self, key, format_or_path):
        # Create Databook.<format> read or read/write properties

        # Create Dataset.<format> read or read/write properties,
        # and Dataset.get_<format>/set_<format> methods.
        self._formats[key] = format_or_path

    def register_builtins(self):
        # Registration ordering matters for autodetection.
        self.register('csv', CSVFormat())
        self.register('json', JSONFormat())

    def get_format(self, key):
        if key not in self._formats:
            raise Exception("OneForAll has no format '%s'." % key)
        return self._formats[key]


registry = Registry()


def serialize_objects_handler(obj):
    if isinstance(obj, (decimal.Decimal, UUID)):
        return str(obj)
    elif hasattr(obj, 'isoformat'):
        return obj.isoformat()
    else:
        return obj


"""
Tablib - JSON Support
"""


class JSONFormat:
    title = 'json'
    extensions = ('json',)

    @classmethod
    def export_set(cls, dataset):
        """Returns JSON representation of Dataset."""
        return json.dumps(dataset.dict, default=serialize_objects_handler)


""" Tablib - CSV Support.
"""


class CSVFormat:
    title = 'csv'
    extensions = ('csv',)

    DEFAULT_DELIMITER = ','

    @classmethod
    def export_stream_set(cls, dataset, **kwargs):
        """Returns CSV representation of Dataset as file-like."""
        stream = StringIO()

        kwargs.setdefault('delimiter', cls.DEFAULT_DELIMITER)

        _csv = csv.writer(stream, **kwargs)

        for row in dataset._package(dicts=False):
            _csv.writerow(row)

        stream.seek(0)
        return stream

    @classmethod
    def export_set(cls, dataset, **kwargs):
        """Returns CSV representation of Dataset."""
        stream = cls.export_stream_set(dataset, **kwargs)
        return stream.getvalue()