webtoolkit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webtoolkit/__init__.py +15 -0
- webtoolkit/contentmoderation.py +226 -0
- webtoolkit/ipc.py +131 -0
- webtoolkit/pages.py +2026 -0
- webtoolkit/remoteserver.py +219 -0
- webtoolkit/urllocation.py +684 -0
- webtoolkit/webconfig.py +131 -0
- webtoolkit/webtools.py +1074 -0
- webtoolkit-0.0.1.dist-info/LICENSE +674 -0
- webtoolkit-0.0.1.dist-info/METADATA +30 -0
- webtoolkit-0.0.1.dist-info/RECORD +12 -0
- webtoolkit-0.0.1.dist-info/WHEEL +4 -0
webtoolkit/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Similar project: https://pypi.org/project/abstract-webtools/
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .webtools import *
|
|
6
|
+
from .pages import *
|
|
7
|
+
from .webconfig import WebConfig
|
|
8
|
+
from .urllocation import UrlLocation
|
|
9
|
+
|
|
10
|
+
from .remoteserver import RemoteServer
|
|
11
|
+
from .contentmoderation import (
|
|
12
|
+
UrlPropertyValidator,
|
|
13
|
+
UrlPropertyValidator,
|
|
14
|
+
UrlAgeModerator,
|
|
15
|
+
)
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
from .webtools import WebLogger
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class UrlContentsModerator(object):
    """
    Base class for moderators that inspect a page's "title" and
    "description" properties.

    The properties are read with ``in`` and ``[]``, so any mapping-like
    object works.
    """

    def __init__(self, page_object=None, properties=None, blocked_keywords=None):
        """
        @param page_object optional object exposing get_properties()
        @param properties optional mapping of properties; when given (and
               non-empty) it takes precedence over page_object
        @param blocked_keywords accepted for signature compatibility with the
               subclasses; not used by the base class
        """
        # An empty mapping, so that the getters below return "" by default.
        self.properties = {}
        if page_object:
            self.properties = page_object.get_properties()
        if properties:
            self.properties = properties

    def _get_text_property(self, name):
        """Return the property stored under name, or "" when missing or None."""
        if name in self.properties:
            value = self.properties[name]
            if value is not None:
                return value
        return ""

    def get_title(self):
        """
        @return the "title" property, or "" when it is missing or None
        """
        return self._get_text_property("title")

    def get_description(self):
        """
        @return the "description" property, or "" when it is missing or None
        """
        return self._get_text_property("description")

    def get_descriptive_pulp(self):
        """
        @return lower-cased title and lower-cased description joined by a
                newline
        """
        title = self.get_title().lower()
        description = self.get_description().lower()

        return title + "\n" + description
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class UrlPropertyValidator(UrlContentsModerator):
    """
    Decides whether a page is worth keeping, based on its title and
    description: error / parked pages, adult content, gambling spam and
    caller-supplied blocked keywords are rejected.
    """

    def __init__(self, page_object=None, properties=None, blocked_keywords=None):
        """
        @param page_object optional object exposing get_properties()
        @param properties optional mapping of properties; when given (and
               non-empty) it takes precedence over page_object
        @param blocked_keywords optional collection of lower-case substrings
               that invalidate a title; a built-in list is used when it is
               missing or empty
        """
        self.properties = {}
        if page_object:
            self.properties = page_object.get_properties()
        if properties:
            self.properties = properties

        if blocked_keywords:
            self.blocked_keywords = blocked_keywords
        else:
            self.blocked_keywords = [
                "masturbat",
                "porn",
                "xxx",
                "sex",
                "slutt",
                "nude",
                "chaturbat",
            ]

    def is_valid(self):
        """
        @return True when none of the checks below flags the page
        """
        # Checks run in the original order and stop at the first hit.
        return not (
            self.is_site_not_found()
            or self.is_porn_blocked()
            or self.is_casino_blocked()
            or self.is_blocked_keywords()
        )

    def is_blocked_keywords(self):
        """
        TODO This should be configurable - move to configuration

        @return True when the lower-cased title contains a blocked keyword
        """
        title = self.get_title().lower()

        return any(keyword in title for keyword in self.blocked_keywords)

    def is_site_not_found(self):
        """
        @return True when the lower-cased title looks like an error,
                expired or parked page, False otherwise
        """
        title = self.get_title()
        title = title.lower() if title else ""

        markers = (
            "forbidden",
            "access denied",
            "site not found",
            "page not found",
            # Kept for backward compatibility; looks like a typo of the
            # phrase on the next line.
            "this page could not found",
            "this page could not be found",
            "404 not found",
            "404: not found",
            "404 not_found",
            "404 - not found",
            "404 error",
            "404 page",
            "404 file not found",
            "error 404",
            "purged account",
            "410 gone",
            "squarespace - website expired",
            "domain name for sale",
            "account suspended",
            "the request could not be satisfied",
        )

        if any(marker in title for marker in markers):
            WebLogger.debug("Title is invalid {}".format(title))
            return True

        # Explicit False (previously the method fell through and returned None).
        return False

    def is_porn_blocked(self):
        """
        TODO This should be configurable - move to configuration

        @return True when the lower-cased title contains a strong keyword,
                or more than three of the weak keywords
        """
        title = self.get_title().lower()

        porn_keywords = [
            "masturbat",
            "porn",
            "xxx",
            "sex",
            "slutt",
            "nude",
            "chaturbat",
        ]

        if any(keyword in title for keyword in porn_keywords):
            return True

        keywords = [
            "live",
            "nast",
            "slut",
            "webcam",
        ]

        points = 0
        for keyword in keywords:
            if keyword in title:
                points += 1

        # NOTE(review): with four weak keywords, "> 3" requires all of them
        # to be present - confirm this threshold is intended.
        return points > 3

    def is_casino_blocked(self):
        """
        TODO This should be configurable - move to configuration

        @return True when the title mentions "slot server", or when gambling
                keywords occur more than three times in title + description
        """
        title = self.get_title().lower()

        if "slot server" in title:
            return True

        description = self.get_description().lower()

        text = title + "\n" + description

        keywords = [
            "casino",
            "lotter",
            "bingo",
            "slot",
            "poker",
            "jackpot",
            "gacor",
            "bandar judi",
            "pagcor",
            "slotlara kadar",
            "canli bahis",
            "terpopuler",
            "deposit",
        ]

        # Every occurrence counts, not just the presence of a keyword.
        occurrences = 0
        for keyword in keywords:
            occurrences += text.count(keyword)

        return occurrences > 3
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class UrlAgeModerator(UrlContentsModerator):
    """
    Estimates a minimum viewer age from a page's title and description.
    """

    def __init__(self, page_object=None, properties=None, blocked_keywords=None):
        """
        @param page_object optional object exposing get_properties()
        @param properties optional mapping of properties; when given (and
               non-empty) it takes precedence over page_object
        @param blocked_keywords accepted for signature compatibility; unused
        """
        self.properties = {}
        if page_object:
            self.properties = page_object.get_properties()
        if properties:
            self.properties = properties

    def get_age(self):
        """
        implement more types of checks?

        @return age requirement, or None
        """
        return self.get_age__sexual()

    def get_age__sexual(self):
        """
        Counts keyword occurrences in the lower-cased title + description.

        @return 15 when the keywords occur more than once in total,
                otherwise None
        """
        text = self.get_descriptive_pulp()

        keywords = [
            "sexua",
            "lesbian",
            "bisexual",
            "queer ",
            "drag quee",
            "fuck",
            "porn",
            "nsfw",
        ]

        # NOTE(review): keywords overlap ("bisexual" also contains "sexua"),
        # so a single word can be counted twice - confirm this is intended.
        occurrences = 0
        for keyword in keywords:
            occurrences += text.count(keyword)

        if occurrences > 1:
            return 15

        return None
|
webtoolkit/ipc.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Inter process communication. Communication between scraping/crawl server
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import pickle
|
|
7
|
+
import socket
|
|
8
|
+
import time
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
DEFAULT_PORT = 5007
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def object_to_command(command_string, input_object):
    """
    Serialises input_object and frames it as a command.

    The wire format uses a 0 byte to signal the end of a command, so the
    payload must not contain one. Pickle output may contain 0 bytes; JSON
    text produced with the default ASCII escaping does not, which is why
    JSON is used here.

    @param command_string name of the command
    @param input_object JSON-serialisable object
    @return framed command, as produced by bytes_to_command
    @raises TypeError when input_object cannot be serialised to JSON
    """
    data = json.dumps(input_object).encode()
    return bytes_to_command(command_string, data)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def bytes_to_command(command_string, bytes):
    """
    Builds one wire frame: command name, ':', payload, terminating 0 byte.

    @param command_string name of the command
    @param bytes payload placed after the ':' separator
    @return bytearray holding the complete frame
    """
    frame = bytearray((command_string + ":").encode())
    frame += bytearray(bytes)
    # A single 0 byte marks the end of the command.
    frame.append(0)
    return frame
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def string_to_command(command_string, string):
    """
    Frames a text payload as a command.

    @param command_string name of the command
    @param string text payload; encoded with str.encode() defaults
    @return framed command, as produced by bytes_to_command
    """
    payload = string.encode()
    return bytes_to_command(command_string, payload)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def commands_from_bytes(read_message):
    """
    Splits a received buffer into its complete commands.

    Parsing stops at the first frame that cannot be parsed; bytes after the
    last terminator (an incomplete command) are not returned.

    @param read_message received bytes, possibly holding several commands
    @return vector of [command, data]
    """
    result = []

    while True:
        command, data, read_message = get_command_and_data(read_message)
        if not command:
            break

        result.append([command, data])

    return result
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_command_and_data(read_message):
    """
    Extracts the first command from the buffer.

    @param read_message received bytes
    @return [command name, payload bytes, remaining bytes], or
            [None, None, None] when there is no complete, well-formed
            command at the start of the buffer
    """
    command, remaining = get_command_bytes(read_message)

    # No terminated frame, or an empty one: nothing to parse.
    if not command:
        return [None, None, None]

    wh = command.find(b":")
    # find() returns -1 when ':' is absent, and -1 is truthy, so the result
    # has to be compared explicitly. 0 means the command name is empty,
    # which is rejected as before.
    if wh <= 0:
        print("Cannot find ':' in response")
        return [None, None, None]

    command_string = command[:wh].decode()
    data = command[wh + 1 :]
    return [command_string, data, remaining]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_command_bytes(read_message):
    """
    Cuts the first 0-terminated frame off the buffer.

    @param read_message received bytes
    @return [frame without its terminator, bytes after the terminator], or
            [None, None] when the buffer holds no terminator
    """
    frame, terminator, rest = read_message.partition(b"\x00")

    # An empty separator means no 0 byte was found.
    if not terminator:
        return [None, None]

    return [frame, rest]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class SocketConnection(object):
    """
    Small wrapper around a socket-like object that remembers whether it has
    been closed.
    """

    def __init__(self, conn=None):
        """
        @param conn optional, already created connection object
        """
        self.conn = conn
        self.read_message = bytearray()
        self.closed = False

    def __del__(self):
        self.close()

    @staticmethod
    def gethostname():
        """
        @return host name reported by socket.gethostname()
        """
        # Declared static so that it works on the class and on instances.
        return socket.gethostname()

    def connect(self, host, port):
        """
        Creates a new socket and connects it.

        @param host host to connect to; the local host name when falsy
        @param port port to connect to; DEFAULT_PORT when falsy
        @return True when the connection was established, False otherwise
        """
        self.host = host if host else SocketConnection.gethostname()
        self.port = port if port else DEFAULT_PORT

        self.conn = socket.socket()
        self.conn.settimeout(1.0)  # to be able to make ctrl-c
        # A fresh socket is in use: without this reset, a connection made
        # after close() would be reported as closed and never be closed.
        self.closed = False

        try:
            self.conn.connect((self.host, self.port))
        except Exception:
            # Any failure is reported through the return value.
            return False

        return True

    def close(self):
        """
        Closes the connection once; later calls do nothing.
        """
        if not self.is_closed():
            if self.conn is not None:
                try:
                    self.conn.close()
                except Exception:
                    # Best effort: this also runs from __del__.
                    pass

            self.closed = True

    def is_closed(self):
        """
        @return True once close() has been called
        """
        return self.closed

    def __str__(self):
        return "{} {}".format(self.conn, self.closed)
|