oneforall-kjl 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- OneForAll/__init__.py +15 -0
- OneForAll/brute.py +503 -0
- OneForAll/common/check.py +41 -0
- OneForAll/common/crawl.py +10 -0
- OneForAll/common/database.py +277 -0
- OneForAll/common/domain.py +63 -0
- OneForAll/common/ipasn.py +42 -0
- OneForAll/common/ipreg.py +139 -0
- OneForAll/common/lookup.py +28 -0
- OneForAll/common/module.py +369 -0
- OneForAll/common/query.py +9 -0
- OneForAll/common/records.py +363 -0
- OneForAll/common/request.py +264 -0
- OneForAll/common/resolve.py +173 -0
- OneForAll/common/search.py +78 -0
- OneForAll/common/similarity.py +138 -0
- OneForAll/common/tablib/__init__.py +0 -0
- OneForAll/common/tablib/format.py +89 -0
- OneForAll/common/tablib/tablib.py +360 -0
- OneForAll/common/tldextract.py +240 -0
- OneForAll/common/utils.py +789 -0
- OneForAll/config/__init__.py +17 -0
- OneForAll/config/api.py +94 -0
- OneForAll/config/default.py +255 -0
- OneForAll/config/log.py +38 -0
- OneForAll/config/setting.py +108 -0
- OneForAll/export.py +72 -0
- OneForAll/modules/altdns.py +216 -0
- OneForAll/modules/autotake/github.py +105 -0
- OneForAll/modules/certificates/censys_api.py +73 -0
- OneForAll/modules/certificates/certspotter.py +48 -0
- OneForAll/modules/certificates/crtsh.py +84 -0
- OneForAll/modules/certificates/google.py +48 -0
- OneForAll/modules/certificates/myssl.py +46 -0
- OneForAll/modules/certificates/racent.py +49 -0
- OneForAll/modules/check/axfr.py +97 -0
- OneForAll/modules/check/cdx.py +44 -0
- OneForAll/modules/check/cert.py +58 -0
- OneForAll/modules/check/csp.py +94 -0
- OneForAll/modules/check/nsec.py +58 -0
- OneForAll/modules/check/robots.py +44 -0
- OneForAll/modules/check/sitemap.py +44 -0
- OneForAll/modules/collect.py +70 -0
- OneForAll/modules/crawl/archivecrawl.py +59 -0
- OneForAll/modules/crawl/commoncrawl.py +59 -0
- OneForAll/modules/datasets/anubis.py +45 -0
- OneForAll/modules/datasets/bevigil.py +50 -0
- OneForAll/modules/datasets/binaryedge_api.py +50 -0
- OneForAll/modules/datasets/cebaidu.py +45 -0
- OneForAll/modules/datasets/chinaz.py +45 -0
- OneForAll/modules/datasets/chinaz_api.py +49 -0
- OneForAll/modules/datasets/circl_api.py +49 -0
- OneForAll/modules/datasets/cloudflare_api.py +130 -0
- OneForAll/modules/datasets/dnsdb_api.py +51 -0
- OneForAll/modules/datasets/dnsdumpster.py +52 -0
- OneForAll/modules/datasets/dnsgrep.py +44 -0
- OneForAll/modules/datasets/fullhunt.py +48 -0
- OneForAll/modules/datasets/hackertarget.py +45 -0
- OneForAll/modules/datasets/ip138.py +45 -0
- OneForAll/modules/datasets/ipv4info_api.py +73 -0
- OneForAll/modules/datasets/netcraft.py +66 -0
- OneForAll/modules/datasets/passivedns_api.py +51 -0
- OneForAll/modules/datasets/qianxun.py +61 -0
- OneForAll/modules/datasets/rapiddns.py +45 -0
- OneForAll/modules/datasets/riddler.py +45 -0
- OneForAll/modules/datasets/robtex.py +58 -0
- OneForAll/modules/datasets/securitytrails_api.py +56 -0
- OneForAll/modules/datasets/sitedossier.py +57 -0
- OneForAll/modules/datasets/spyse_api.py +62 -0
- OneForAll/modules/datasets/sublist3r.py +45 -0
- OneForAll/modules/datasets/urlscan.py +45 -0
- OneForAll/modules/datasets/windvane.py +92 -0
- OneForAll/modules/dnsquery/mx.py +35 -0
- OneForAll/modules/dnsquery/ns.py +35 -0
- OneForAll/modules/dnsquery/soa.py +35 -0
- OneForAll/modules/dnsquery/spf.py +35 -0
- OneForAll/modules/dnsquery/txt.py +35 -0
- OneForAll/modules/enrich.py +72 -0
- OneForAll/modules/finder.py +206 -0
- OneForAll/modules/intelligence/alienvault.py +50 -0
- OneForAll/modules/intelligence/riskiq_api.py +58 -0
- OneForAll/modules/intelligence/threatbook_api.py +50 -0
- OneForAll/modules/intelligence/threatminer.py +45 -0
- OneForAll/modules/intelligence/virustotal.py +60 -0
- OneForAll/modules/intelligence/virustotal_api.py +59 -0
- OneForAll/modules/iscdn.py +86 -0
- OneForAll/modules/search/ask.py +69 -0
- OneForAll/modules/search/baidu.py +96 -0
- OneForAll/modules/search/bing.py +79 -0
- OneForAll/modules/search/bing_api.py +78 -0
- OneForAll/modules/search/fofa_api.py +74 -0
- OneForAll/modules/search/gitee.py +71 -0
- OneForAll/modules/search/github_api.py +86 -0
- OneForAll/modules/search/google.py +83 -0
- OneForAll/modules/search/google_api.py +77 -0
- OneForAll/modules/search/hunter_api.py +72 -0
- OneForAll/modules/search/quake_api.py +72 -0
- OneForAll/modules/search/shodan_api.py +53 -0
- OneForAll/modules/search/so.py +75 -0
- OneForAll/modules/search/sogou.py +72 -0
- OneForAll/modules/search/wzsearch.py +68 -0
- OneForAll/modules/search/yahoo.py +81 -0
- OneForAll/modules/search/yandex.py +80 -0
- OneForAll/modules/search/zoomeye_api.py +73 -0
- OneForAll/modules/srv.py +75 -0
- OneForAll/modules/wildcard.py +319 -0
- OneForAll/oneforall.py +275 -0
- OneForAll/takeover.py +168 -0
- OneForAll/test.py +23 -0
- oneforall_kjl-0.1.1.dist-info/METADATA +18 -0
- oneforall_kjl-0.1.1.dist-info/RECORD +114 -0
- oneforall_kjl-0.1.1.dist-info/WHEEL +5 -0
- oneforall_kjl-0.1.1.dist-info/entry_points.txt +2 -0
- oneforall_kjl-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,360 @@
|
|
1
|
+
from collections import OrderedDict
|
2
|
+
from .format import registry
|
3
|
+
|
4
|
+
|
5
|
+
class Row:
    """A single dataset row.

    Thin wrapper around a plain list that additionally carries a list of
    ``tags``, which the surrounding code uses for row filtering.
    """

    # No per-instance __dict__: rows are numerous, keep them small.
    __slots__ = ['_row', 'tags']

    def __init__(self, row=None, tags=None):
        # Copy the inputs so the Row never aliases caller-owned lists.
        self._row = [] if row is None else list(row)
        self.tags = [] if tags is None else list(tags)

    def __iter__(self):
        return iter(self._row)

    def __len__(self):
        return len(self._row)

    def __repr__(self):
        return repr(self._row)

    def __getitem__(self, i):
        return self._row[i]

    def __setitem__(self, i, value):
        self._row[i] = value

    def __delitem__(self, i):
        del self._row[i]

    def __getstate__(self):
        # Slotted classes have no __dict__, so pickling needs an explicit
        # slot-name -> value mapping.
        return {slot: getattr(self, slot) for slot in self.__slots__}

    def __setstate__(self, state):
        for key, value in state.items():
            setattr(self, key, value)

    def rpush(self, value):
        """Append *value* at the right end of the row."""
        self.insert(len(self._row), value)

    def append(self, value):
        """Alias for :meth:`rpush`."""
        self.rpush(value)

    def insert(self, index, value):
        """Insert *value* at position *index*."""
        self._row.insert(index, value)

    def __contains__(self, item):
        return item in self._row

    @property
    def tuple(self):
        """Tuple representation of :class:`Row`."""
        return tuple(self._row)
|
66
|
+
|
67
|
+
|
68
|
+
class Dataset:
    """The :class:`Dataset` object is the heart of Tablib. It provides all core
    functionality.

    Usually you create a :class:`Dataset` instance in your main module, and append
    rows as you collect data. ::

        data = tablib.Dataset()
        data.headers = ('name', 'age')

        for (name, age) in some_collector():
            data.append((name, age))


    Setting columns is similar. The column data length must equal the
    current height of the data and headers must be set. ::

        data = tablib.Dataset()
        data.headers = ('first_name', 'last_name')

        data.append(('John', 'Adams'))
        data.append(('George', 'Washington'))

        data.append_col((90, 67), header='age')


    You can also set rows and headers upon instantiation. This is useful if
    dealing with dozens or hundreds of :class:`Dataset` objects. ::

        headers = ('first_name', 'last_name')
        data = [('John', 'Adams'), ('George', 'Washington')]

        data = tablib.Dataset(*data, headers=headers)

    :param \\*args: (optional) list of rows to populate Dataset
    :param headers: (optional) list strings for Dataset header row
    :param title: (optional) string to use as title of the Dataset


    .. admonition:: Format Attributes Definition

        If you look at the code, the various output/import formats are not
        defined within the :class:`Dataset` object. To add support for a new format, see
        :ref:`Adding New Formats <newformats>`.

    """

    def __init__(self, *args, **kwargs):
        # Each positional arg is one row; wrap every row in a Row object.
        self._data = list(Row(arg) for arg in args)
        self.__headers = None

        # ('title', index) tuples
        self._separators = []

        # (column, callback) tuples
        self._formatters = []

        # Assigning via the property so the header length is validated
        # against the rows passed above.
        self.headers = kwargs.get('headers')

        self.title = kwargs.get('title')

    def __len__(self):
        # Dataset length is its row count, not its column count.
        return self.height

    def _validate(self, row=None, col=None, safety=False):
        """Assures size of every row in dataset is of proper proportions.

        With *row* given, checks that row's length against the current width;
        with *col* given, checks the column's length against the current
        height; with neither, re-checks every stored row.  Returns True when
        valid; otherwise raises :class:`InvalidDimensions`, unless *safety*
        is true, in which case False is returned instead.
        """
        if row:
            is_valid = (len(row) == self.width) if self.width else True
        elif col:
            if len(col) < 1:
                is_valid = True
            else:
                is_valid = (len(col) == self.height) if self.height else True
        else:
            is_valid = all(len(x) == self.width for x in self._data)

        if is_valid:
            return True
        if not safety:
            raise InvalidDimensions
        return False

    def _package(self, dicts=True, ordered=True):
        """Packages Dataset into lists of dictionaries for transmission.

        With headers set and *dicts* true, returns a list of per-row dicts
        (ordered dicts when *ordered*); with *dicts* false, returns the
        header row followed by the data rows as plain lists.
        """
        # TODO: Dicts default to false?

        # Shallow copy of the row list; note the Row objects themselves are
        # shared, so formatters below mutate the stored rows in place.
        _data = list(self._data)

        if ordered:
            dict_pack = OrderedDict
        else:
            dict_pack = dict

        # Execute formatters
        if self._formatters:
            for row_i, row in enumerate(_data):
                for col, callback in self._formatters:
                    try:
                        # col is None means "apply to every column".
                        if col is None:
                            for j, c in enumerate(row):
                                _data[row_i][j] = callback(c)
                        else:
                            _data[row_i][col] = callback(row[col])
                    except IndexError:
                        raise InvalidDatasetIndex

        if self.headers:
            if dicts:
                data = [dict_pack(list(zip(self.headers, data_row)))
                        for data_row in _data]
            else:
                data = [list(self.headers)] + list(_data)
        else:
            data = [list(row) for row in _data]

        return data

    def _get_headers(self):
        """An *optional* list of strings to be used for header rows and attribute names.

        This must be set manually. The given list length must equal :class:`Dataset.width`.

        """
        return self.__headers

    def _set_headers(self, collection):
        """Validating headers setter."""
        # Validated like a row: header count must match the current width.
        self._validate(collection)
        if collection:
            try:
                self.__headers = list(collection)
            except TypeError:
                raise TypeError
        else:
            self.__headers = None

    headers = property(_get_headers, _set_headers)

    def _get_dict(self):
        """A native Python representation of the :class:`Dataset` object. If headers have
        been set, a list of Python dictionaries will be returned. If no headers have been
        set, a list of tuples (rows) will be returned instead.

        A dataset object can also be imported by setting the `Dataset.dict` attribute: ::

            data = tablib.Dataset()
            data.dict = [{'age': 90, 'first_name': 'Kenneth', 'last_name': 'Reitz'}]

        """
        return self._package()

    def _set_dict(self, pickle):
        """A native Python representation of the Dataset object. If headers have been
        set, a list of Python dictionaries will be returned. If no headers have been
        set, a list of tuples (rows) will be returned instead.

        A dataset object can also be imported by setting the :class:`Dataset.dict` attribute. ::

            data = tablib.Dataset()
            data.dict = [{'age': 90, 'first_name': 'Kenneth', 'last_name': 'Reitz'}]

        """
        # NOTE: the parameter name shadows the stdlib `pickle` module; it is
        # simply the incoming list of rows or dicts.

        if not len(pickle):
            return

        # if list of rows
        if isinstance(pickle[0], list):
            self.wipe()
            for row in pickle:
                self.append(Row(row))

        # if list of objects
        elif isinstance(pickle[0], dict):
            self.wipe()
            # Headers are taken from the first dict; later dicts are assumed
            # to share the same key order.
            self.headers = list(pickle[0].keys())
            for row in pickle:
                self.append(Row(list(row.values())))
        else:
            raise UnsupportedFormat

    # NOTE: `dict` shadows the builtin inside the class namespace only.
    dict = property(_get_dict, _set_dict)

    @property
    def height(self):
        """The number of rows currently in the :class:`Dataset`.
        Cannot be directly modified.
        """
        return len(self._data)

    @property
    def width(self):
        """The number of columns currently in the :class:`Dataset`.
        Cannot be directly modified.
        """

        # Width is derived from the first row; with no rows, fall back to
        # the header count, and with no headers either, zero.
        try:
            return len(self._data[0])
        except IndexError:
            try:
                return len(self.headers)
            except TypeError:
                return 0

    def export(self, format, **kwargs):
        """
        Export :class:`Dataset` object to `format`.

        :param format: export format
        :param kwargs: (optional) custom configuration to the format `export_set`.
        """
        # Formats live in the external registry, not on the Dataset itself.
        fmt = registry.get_format(format)
        if not hasattr(fmt, 'export_set'):
            raise Exception('Format {} cannot be exported.'.format(format))

        return fmt.export_set(self, **kwargs)

    # ----
    # Rows
    # ----

    def insert(self, index, row, tags=None):
        """Inserts a row to the :class:`Dataset` at the given index.

        Rows inserted must be the correct size (height or width).

        The default behaviour is to insert the given row to the :class:`Dataset`
        object at the given index.
        """

        if tags is None:
            tags = list()
        self._validate(row)
        self._data.insert(index, Row(row, tags=tags))

    def rpush(self, row, tags=None):
        """Adds a row to the end of the :class:`Dataset`.
        See :class:`Dataset.insert` for additional documentation.
        """

        if tags is None:
            tags = list()
        self.insert(self.height, row=row, tags=tags)

    def append(self, row, tags=None):
        """Adds a row to the :class:`Dataset`.
        See :class:`Dataset.insert` for additional documentation.
        """

        if tags is None:
            tags = list()
        self.rpush(row, tags)

    def extend(self, rows, tags=None):
        """Adds a list of rows to the :class:`Dataset` using
        :class:`Dataset.append`
        """

        if tags is None:
            tags = list()
        for row in rows:
            self.append(row, tags)

    # ----
    # Misc
    # ----

    def remove_duplicates(self):
        """Removes all duplicate rows from the :class:`Dataset` object
        while maintaining the original order."""
        seen = set()
        # `seen.add` returns None, so the right-hand side of `or` both
        # records the row and keeps the filter condition falsy.
        self._data[:] = [row for row in self._data if
                         not (tuple(row) in seen or seen.add(tuple(row)))]

    def wipe(self):
        """Removes all content and headers from the :class:`Dataset` object."""
        self._data = list()
        self.__headers = None
|
346
|
+
|
347
|
+
|
348
|
+
registry.register_builtins()
|
349
|
+
|
350
|
+
|
351
|
+
class InvalidDimensions(Exception):
    """Raised when a row or column does not match the Dataset's current size."""
|
353
|
+
|
354
|
+
|
355
|
+
class InvalidDatasetIndex(Exception):
    """Raised when a formatter's column index falls outside the Dataset."""
|
357
|
+
|
358
|
+
|
359
|
+
class UnsupportedFormat(NotImplementedError):
    """Raised when imported data is neither a list of rows nor a list of dicts."""
|
@@ -0,0 +1,240 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
"""`tldextract` accurately separates the gTLD or ccTLD (generic or country code
|
3
|
+
top-level domain) from the registered domain and subdomains of a URL.
|
4
|
+
|
5
|
+
>>> import tldextract
|
6
|
+
|
7
|
+
>>> tldextract.extract('http://forums.news.cnn.com/')
|
8
|
+
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
|
9
|
+
|
10
|
+
>>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
|
11
|
+
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
|
12
|
+
|
13
|
+
>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
|
14
|
+
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg')
|
15
|
+
|
16
|
+
`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
|
17
|
+
|
18
|
+
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
19
|
+
>>> (ext.subdomain, ext.domain, ext.suffix)
|
20
|
+
('forums', 'bbc', 'co.uk')
|
21
|
+
>>> # rejoin subdomain and domain
|
22
|
+
>>> '.'.join(ext[:2])
|
23
|
+
'forums.bbc'
|
24
|
+
>>> # a common alias
|
25
|
+
>>> ext.registered_domain
|
26
|
+
'bbc.co.uk'
|
27
|
+
|
28
|
+
Note subdomain and suffix are _optional_. Not all URL-like inputs have a
|
29
|
+
subdomain or a valid suffix.
|
30
|
+
|
31
|
+
>>> tldextract.extract('google.com')
|
32
|
+
ExtractResult(subdomain='', domain='google', suffix='com')
|
33
|
+
|
34
|
+
>>> tldextract.extract('google.notavalidsuffix')
|
35
|
+
ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='')
|
36
|
+
|
37
|
+
>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
|
38
|
+
ExtractResult(subdomain='', domain='127.0.0.1', suffix='')
|
39
|
+
|
40
|
+
If you want to rejoin the whole namedtuple, regardless of whether a subdomain
|
41
|
+
or suffix were found:
|
42
|
+
|
43
|
+
>>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
|
44
|
+
>>> # this has unwanted dots
|
45
|
+
>>> '.'.join(ext)
|
46
|
+
'.127.0.0.1.'
|
47
|
+
"""
|
48
|
+
|
49
|
+
|
50
|
+
import os
|
51
|
+
import re
|
52
|
+
import json
|
53
|
+
import collections
|
54
|
+
from urllib.parse import scheme_chars
|
55
|
+
from functools import wraps
|
56
|
+
|
57
|
+
import idna
|
58
|
+
|
59
|
+
from common import utils
|
60
|
+
|
61
|
+
# Dotted-quad IPv4 address: four octets, each 0-255 with no extra padding.
IP_RE = re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')  # pylint: disable=line-too-long

# Optional URL scheme (e.g. 'http:') followed by '//'; used to strip the
# scheme prefix before parsing the host out of a URL.
SCHEME_RE = re.compile(r'^([' + scheme_chars + ']+:)?//')
|
64
|
+
|
65
|
+
|
66
|
+
class ExtractResult(collections.namedtuple('ExtractResult', 'subdomain domain suffix')):
    """namedtuple of a URL's subdomain, domain, and suffix."""

    # Necessary for __dict__ member to get populated in Python 3+
    __slots__ = ()

    @property
    def registered_domain(self):
        """
        Joins the domain and suffix fields with a dot, if they're both set.

        >>> extract('http://forums.bbc.co.uk').registered_domain
        'bbc.co.uk'
        >>> extract('http://localhost:8080').registered_domain
        ''
        """
        if not (self.domain and self.suffix):
            return ''
        return '.'.join((self.domain, self.suffix))

    @property
    def fqdn(self):
        """
        Returns a Fully Qualified Domain Name, if there is a proper domain/suffix.

        >>> extract('http://forums.bbc.co.uk/path/to/file').fqdn
        'forums.bbc.co.uk'
        >>> extract('http://localhost:8080').fqdn
        ''
        """
        if not (self.domain and self.suffix):
            return ''
        # Join every non-empty field of the tuple (subdomain, domain, suffix).
        return '.'.join(part for part in (self.subdomain, self.domain, self.suffix) if part)

    @property
    def ipv4(self):
        """
        Returns the ipv4 if that is what the presented domain/url is

        >>> extract('http://127.0.0.1/path/to/file').ipv4
        '127.0.0.1'
        >>> extract('http://127.0.0.1.1/path/to/file').ipv4
        ''
        >>> extract('http://256.1.1.1').ipv4
        ''
        """
        # Only a bare dotted quad counts: no subdomain, no suffix.
        if self.suffix or self.subdomain:
            return ''
        if IP_RE.match(self.domain):
            return self.domain
        return ''
|
116
|
+
|
117
|
+
|
118
|
+
class TLDExtract(object):
    """A callable for extracting, subdomain, domain, and suffix components from a URL."""

    def __init__(self, cache_file=None):
        """
        Constructs a callable for extracting subdomain, domain, and suffix
        components from a URL.

        :param cache_file: optional path of a JSON file containing the list
            of known suffixes ('~' is expanded). Falsy disables the cache.
        """

        self.cache_file = os.path.expanduser(cache_file or '')
        self._extractor = None  # memoized _PublicSuffixListTLDExtractor

    def __call__(self, url):
        """
        Takes a string URL and splits it into its subdomain, domain, and
        suffix (effective TLD, gTLD, ccTLD, etc.) component.

        >>> ext = TLDExtract()
        >>> ext('http://forums.news.cnn.com/')
        ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
        >>> ext('http://forums.bbc.co.uk/')
        ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
        """
        # Reduce the URL to its host: drop scheme, path, query, fragment,
        # userinfo, port, and a trailing dot (in that order).
        netloc = SCHEME_RE.sub("", url) \
            .partition("/")[0] \
            .partition("?")[0] \
            .partition("#")[0] \
            .split("@")[-1] \
            .partition(":")[0] \
            .strip() \
            .rstrip(".")

        labels = netloc.split(".")

        # Match suffixes against lower-cased, punycode-decoded labels, but
        # build the result from the original labels.
        translations = [_decode_punycode(label) for label in labels]
        suffix_index = self._get_tld_extractor().suffix_index(translations)

        suffix = ".".join(labels[suffix_index:])
        # No recognized suffix: the whole netloc may simply be an IP address.
        if not suffix and netloc and utils.looks_like_ip(netloc):
            return ExtractResult('', netloc, '')

        subdomain = ".".join(labels[:suffix_index - 1]) if suffix_index else ""
        domain = labels[suffix_index - 1] if suffix_index else ""
        return ExtractResult(subdomain, domain, suffix)

    @property
    def tlds(self):
        """The frozenset of known suffixes (loads the cache on first use)."""
        return self._get_tld_extractor().tlds

    def _get_tld_extractor(self):
        """Get or compute this object's TLDExtractor. Looks up the TLDExtractor
        in roughly the following order, based on the settings passed to
        __init__:

        1. Memoized on `self`
        2. Local system cache file

        :raises Exception: when no TLD data could be loaded.
        """
        # pylint: disable=no-else-return

        if self._extractor:
            return self._extractor
        tlds = self._get_cached_tlds()
        if tlds:
            self._extractor = _PublicSuffixListTLDExtractor(tlds)
            return self._extractor
        else:
            raise Exception("tlds is empty, cannot proceed without tlds.")

    def _get_cached_tlds(self):
        """Read the local TLD cache file. Returns None on IOError or other
        error, or if this object is not set to use the cache
        file."""
        if not self.cache_file:
            return None

        # Bug fix: the documented contract is "None on IOError or other
        # error", but read/parse failures used to propagate to the caller.
        # OSError covers IOError (an alias since Python 3.3); ValueError
        # covers json.JSONDecodeError.
        try:
            with open(self.cache_file) as cache_file:
                return json.loads(cache_file.read())
        except (OSError, ValueError):
            return None
|
194
|
+
|
195
|
+
|
196
|
+
# Module-level singleton (no cache file configured) backing the
# convenience function below.
TLD_EXTRACTOR = TLDExtract()


# wraps copies TLDExtract.__call__'s docstring onto this wrapper.
@wraps(TLD_EXTRACTOR.__call__)
def extract(url):
    # Delegate to the shared extractor instance.
    return TLD_EXTRACTOR(url)
|
202
|
+
|
203
|
+
|
204
|
+
class _PublicSuffixListTLDExtractor(object):
|
205
|
+
"""Wrapper around this project's main algo for PSL
|
206
|
+
lookups.
|
207
|
+
"""
|
208
|
+
def __init__(self, tlds):
|
209
|
+
self.tlds = frozenset(tlds)
|
210
|
+
|
211
|
+
def suffix_index(self, lower_spl):
|
212
|
+
"""Returns the index of the first suffix label.
|
213
|
+
Returns len(spl) if no suffix is found
|
214
|
+
"""
|
215
|
+
length = len(lower_spl)
|
216
|
+
for i in range(length):
|
217
|
+
maybe_tld = '.'.join(lower_spl[i:])
|
218
|
+
exception_tld = '!' + maybe_tld
|
219
|
+
if exception_tld in self.tlds:
|
220
|
+
return i + 1
|
221
|
+
|
222
|
+
if maybe_tld in self.tlds:
|
223
|
+
return i
|
224
|
+
|
225
|
+
wildcard_tld = '*.' + '.'.join(lower_spl[i + 1:])
|
226
|
+
if wildcard_tld in self.tlds:
|
227
|
+
return i
|
228
|
+
|
229
|
+
return length
|
230
|
+
|
231
|
+
|
232
|
+
def _decode_punycode(label):
|
233
|
+
lowered = label.lower()
|
234
|
+
looks_like_puny = lowered.startswith('xn--')
|
235
|
+
if looks_like_puny:
|
236
|
+
try:
|
237
|
+
return idna.decode(label.encode('ascii')).lower()
|
238
|
+
except (UnicodeError, IndexError):
|
239
|
+
pass
|
240
|
+
return lowered
|