webtoolkit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
webtoolkit/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Similar project: https://pypi.org/project/abstract-webtools/
3
+ """
4
+
5
+ from .webtools import *
6
+ from .pages import *
7
+ from .webconfig import WebConfig
8
+ from .urllocation import UrlLocation
9
+
10
+ from .remoteserver import RemoteServer
11
+ from .contentmoderation import (
12
+ UrlPropertyValidator,
13
+ UrlPropertyValidator,
14
+ UrlAgeModerator,
15
+ )
@@ -0,0 +1,226 @@
1
+ from .webtools import WebLogger
2
+
3
+
4
class UrlContentsModerator(object):
    """
    Base class for moderators that look at a page's textual properties.

    Stores a ``properties`` mapping and offers None-safe accessors for its
    "title" and "description" entries.
    """

    def __init__(self, page_object=None, properties=None, blocked_keywords=None):
        """
        @param page_object optional object exposing get_properties()
        @param properties optional mapping; when truthy it takes precedence
               over whatever page_object provides
        @param blocked_keywords accepted so that all moderators share one
               constructor signature; not used by this class
        """
        # The arguments used to be ignored here, which left the accessors
        # permanently empty when this class was instantiated directly.
        self.properties = {}
        if page_object:
            self.properties = page_object.get_properties() or {}
        if properties:
            self.properties = properties

    def _get_text_property(self, name):
        """Return properties[name], or "" when the key is missing or None."""
        if name not in self.properties:
            return ""
        value = self.properties[name]
        return "" if value is None else value

    def get_title(self):
        """
        @return the "title" property, or "" when absent or None
        """
        return self._get_text_property("title")

    def get_description(self):
        """
        @return the "description" property, or "" when absent or None
        """
        return self._get_text_property("description")

    def get_descriptive_pulp(self):
        """
        @return lower-cased title and description joined by a newline
        """
        title = self.get_title().lower()
        description = self.get_description().lower()
        return title + "\n" + description
32
+
33
+
34
class UrlPropertyValidator(UrlContentsModerator):
    """
    Decides whether a page should be kept, based on its title and description.

    A page is rejected when its title looks like an error / parked-domain
    page, or when its text trips the adult, gambling or caller-supplied
    keyword heuristics.

    TODO The keyword lists should be configurable - move to configuration
    """

    # Title fragments that flag adult content on their own. Also the default
    # for ``blocked_keywords`` when the caller supplies none.
    _PORN_KEYWORDS = (
        "masturbat",
        "porn",
        "xxx",
        "sex",
        "slutt",
        "nude",
        "chaturbat",
    )

    # Weaker adult-content hints; each one found in the title scores a point.
    _PORN_HINT_KEYWORDS = (
        "live",
        "nast",
        "slut",
        "webcam",
    )

    # Gambling-related fragments; occurrences in title + description are summed.
    _CASINO_KEYWORDS = (
        "casino",
        "lotter",
        "bingo",
        "slot",
        "poker",
        "jackpot",
        "gacor",
        "bandar judi",
        "pagcor",
        "slotlara kadar",
        "canli bahis",
        "terpopuler",
        "deposit",
    )

    # Title fragments of error pages, expired sites and parked domains.
    _NOT_FOUND_MARKERS = (
        "forbidden",
        "access denied",
        "site not found",
        "page not found",
        # Original spelling, kept so nothing that matched before stops matching.
        "this page could not found",
        # Grammatical form of the marker above, which the typo never matched.
        "this page could not be found",
        "404 not found",
        "404: not found",
        "404 not_found",
        "404 - not found",
        "404 error",
        "404 page",
        "404 file not found",
        "error 404",
        "purged account",
        "410 gone",
        "squarespace - website expired",
        "domain name for sale",
        "account suspended",
        "the request could not be satisfied",
    )

    def __init__(self, page_object=None, properties=None, blocked_keywords=None):
        """
        @param page_object optional object exposing get_properties()
        @param properties optional mapping; when truthy it takes precedence
               over whatever page_object provides
        @param blocked_keywords optional collection of lower-case title
               fragments; defaults to the built-in adult keyword list
        """
        self.properties = {}
        if page_object:
            self.properties = page_object.get_properties() or {}
        if properties:
            self.properties = properties

        if blocked_keywords:
            self.blocked_keywords = blocked_keywords
        else:
            self.blocked_keywords = list(self._PORN_KEYWORDS)

    def _lower_title(self):
        """Return the title in lower case ("" when there is none)."""
        return (self.get_title() or "").lower()

    def is_valid(self):
        """
        @return True when none of the blocking checks fires
        """
        # Checks run in the original order and stop at the first hit.
        return not (
            self.is_site_not_found()
            or self.is_porn_blocked()
            or self.is_casino_blocked()
            or self.is_blocked_keywords()
        )

    def is_blocked_keywords(self):
        """
        @return True when the title contains any of self.blocked_keywords
        """
        title = self._lower_title()
        return any(keyword in title for keyword in self.blocked_keywords)

    def is_site_not_found(self):
        """
        @return True when the title looks like an error, expired-site or
                parked-domain page, False otherwise
        """
        title = self._lower_title()

        if any(marker in title for marker in self._NOT_FOUND_MARKERS):
            WebLogger.debug("Title is invalid {}".format(title))
            return True

        # Explicit False; this used to fall through and return None.
        return False

    def is_porn_blocked(self):
        """
        @return True when the title contains a strong adult keyword, or when
                more than three of the weaker hint keywords appear in it
        """
        title = self._lower_title()

        if any(keyword in title for keyword in self._PORN_KEYWORDS):
            return True

        points = sum(1 for keyword in self._PORN_HINT_KEYWORDS if keyword in title)

        # With four hint keywords this requires every one of them to be present.
        return points > 3

    def is_casino_blocked(self):
        """
        @return True when the title mentions "slot server", or when gambling
                keywords occur more than three times in title + description
        """
        title = self._lower_title()

        if "slot server" in title:
            return True

        text = title + "\n" + self.get_description().lower()

        # Occurrences are counted, so one keyword repeated four times is enough.
        hits = sum(text.count(keyword) for keyword in self._CASINO_KEYWORDS)

        return hits > 3
187
+
188
+
189
class UrlAgeModerator(UrlContentsModerator):
    """
    Estimates a minimum viewer age from a page's title and description.
    """

    # Lower-case fragments counted by get_age__sexual(). The trailing space in
    # "queer " is part of the pattern and is deliberate in the original list.
    _SEXUAL_KEYWORDS = (
        "sexua",
        "lesbian",
        "bisexual",
        "queer ",
        "drag quee",
        "fuck",
        "porn",
        "nsfw",
    )

    def __init__(self, page_object=None, properties=None, blocked_keywords=None):
        """
        @param page_object optional object exposing get_properties()
        @param properties optional mapping; when truthy it takes precedence
               over whatever page_object provides
        @param blocked_keywords accepted so that all moderators share one
               constructor signature; not used by this class
        """
        self.properties = {}
        if page_object:
            self.properties = page_object.get_properties() or {}
        if properties:
            self.properties = properties

    def get_age(self):
        """
        implement more types of checks?

        @return age requirement, or None
        """
        return self.get_age__sexual()

    def get_age__sexual(self):
        """
        Count keyword occurrences in the lower-cased title + description.

        @return 15 when more than one occurrence is found, otherwise None
        """
        text = self.get_descriptive_pulp()

        # Overlapping fragments each count: "bisexual" also contains "sexua".
        hits = sum(text.count(keyword) for keyword in self._SEXUAL_KEYWORDS)

        if hits > 1:
            return 15
        return None
webtoolkit/ipc.py ADDED
@@ -0,0 +1,131 @@
1
+ """
2
+ Inter process communication. Communication between scraping/crawl server
3
+ """
4
+
5
+ import json
6
+ import pickle
7
+ import socket
8
+ import time
9
+
10
+
11
+ DEFAULT_PORT = 5007
12
+
13
+
14
def object_to_command(command_string, input_object):
    """
    Build a framed command whose payload is a serialized object.

    TODO: not currently used.

    The wire format ends every command with a 0 byte, so the payload must not
    contain one. Pickle output does, which is why pickle cannot be used here.
    The object is encoded as JSON instead: json.dumps() emits ASCII only by
    default and escapes control characters, so the payload is NUL-free.

    NOTE(review): this function previously called object_to_bytes(), which is
    not defined in this module, so every call raised NameError. JSON is a
    best guess at a compatible encoding - confirm against the receiving side.

    @param command_string command name placed before the ':' separator
    @param input_object JSON-serializable object
    @return bytearray "<command>:<json>\\0"
    """
    data = json.dumps(input_object).encode("utf-8")
    return bytes_to_command(command_string, data)
24
+
25
+
26
def bytes_to_command(command_string, bytes):
    """
    Frame a raw payload as "<command>:<payload>" followed by a single 0 byte.

    @param command_string command name (text)
    @param bytes payload, anything bytearray() accepts
    @return bytearray holding the framed command
    """
    framed = bytearray((command_string + ":").encode())
    framed += bytearray(bytes)
    # The 0 byte marks the end of the command.
    framed.append(0)
    return framed
33
+
34
+
35
def string_to_command(command_string, string):
    """
    Frame a text payload as a command.

    @param command_string command name
    @param string payload text, encoded with str.encode() defaults
    @return whatever bytes_to_command() returns for the encoded payload
    """
    payload = string.encode()
    return bytes_to_command(command_string, payload)
37
+
38
+
39
def commands_from_bytes(read_message):
    """
    Split a receive buffer into its complete commands.

    Parsing stops at the first position where get_command_and_data() yields
    no command name; anything after that point is not returned.

    @param read_message bytes-like buffer holding zero or more framed commands
    @return list of [command, data] pairs, in buffer order
    """
    result = []

    while True:
        command, data, read_message = get_command_and_data(read_message)
        if not command:
            break

        result.append([command, data])

    return result
54
+
55
+
56
def get_command_and_data(read_message):
    """
    Take the first framed command off the buffer and split name from payload.

    @param read_message bytes-like buffer
    @return [command_string, data, remaining]; [None, None, None] when the
            buffer holds no complete, non-empty command or the command has
            no ':' separator
    """
    command, remaining = get_command_bytes(read_message)

    if not command:
        return [None, None, None]

    wh = command.find(b":")
    # find() reports "missing" as -1. The previous `not wh` test treated -1 as
    # found and a separator at index 0 as missing.
    if wh < 0:
        print("Cannot find ':' in response")
        return [None, None, None]

    command_string = command[:wh].decode()
    data = command[wh + 1 :]
    return [command_string, data, remaining]
71
+
72
+
73
def get_command_bytes(read_message):
    """
    Cut the buffer at the first 0 byte.

    @param read_message bytes or bytearray buffer
    @return [command, rest] with the 0 byte dropped, or [None, None] when the
            buffer holds no 0 byte
    """
    head, terminator, tail = read_message.partition(b"\x00")

    # An empty separator means no terminator was found: no complete command.
    if not terminator:
        return [None, None]

    return [head, tail]
83
+
84
+
85
class SocketConnection(object):
    """
    Thin wrapper around a socket that remembers whether it has been closed.
    """

    def __init__(self, conn=None):
        """
        @param conn optional, already created connection object to wrap
        """
        self.conn = conn
        self.read_message = bytearray()
        self.closed = False

    def __del__(self):
        self.close()

    @staticmethod
    def gethostname():
        """
        @return the local host name as reported by socket.gethostname()
        """
        # Declared static so it also works when called on an instance.
        return socket.gethostname()

    def connect(self, host, port):
        """
        Open a new socket and connect it.

        @param host target host; the local host name is used when falsy
        @param port target port; DEFAULT_PORT is used when falsy
        @return True on success, False when the connection attempt raised
        """
        self.host = host if host else SocketConnection.gethostname()
        self.port = port if port else DEFAULT_PORT

        self.conn = socket.socket()
        self.conn.settimeout(1.0)  # to be able to make ctrl-c

        try:
            self.conn.connect((self.host, self.port))
        except Exception:
            # Best effort by design: any failure is reported as False.
            return False

        # A fresh connection is open even if this object was closed before;
        # without this reset a later close() would skip the new socket.
        self.closed = False
        return True

    def close(self):
        """
        Close the wrapped connection once; later calls do nothing.
        """
        if self.is_closed():
            return

        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                # close() also runs from __del__, so it must never raise.
                pass

        self.closed = True

    def is_closed(self):
        """
        @return True once close() has been called
        """
        return self.closed

    def __str__(self):
        return "{} {}".format(self.conn, self.closed)