webtoolkit 0.0.122__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webtoolkit/__init__.py +38 -0
- webtoolkit/baseurl.py +617 -0
- webtoolkit/contentinterface.py +414 -0
- webtoolkit/contentlinkparser.py +270 -0
- webtoolkit/contentmoderation.py +226 -0
- webtoolkit/contenttext.py +89 -0
- webtoolkit/crawlers/__init__.py +7 -0
- webtoolkit/crawlers/crawlerinterface.py +196 -0
- webtoolkit/crawlers/crawlers.py +305 -0
- webtoolkit/domaincache.py +246 -0
- webtoolkit/handlers/__init__.py +9 -0
- webtoolkit/handlers/defaulturlhandler.py +215 -0
- webtoolkit/handlers/handlerchannelodysee.py +103 -0
- webtoolkit/handlers/handlerchannelyoutube.py +185 -0
- webtoolkit/handlers/handlerhttppage.py +363 -0
- webtoolkit/handlers/handlerinterface.py +213 -0
- webtoolkit/handlers/handlers.py +499 -0
- webtoolkit/handlers/handlervideoodysee.py +93 -0
- webtoolkit/handlers/handlervideoyoutube.py +164 -0
- webtoolkit/pages.py +1613 -0
- webtoolkit/remoteserver.py +281 -0
- webtoolkit/remoteurl.py +152 -0
- webtoolkit/request.py +178 -0
- webtoolkit/response.py +594 -0
- webtoolkit/statuses.py +158 -0
- webtoolkit/tests/fake/__init__.py +3 -0
- webtoolkit/tests/fake/codeproject.py +282 -0
- webtoolkit/tests/fake/firebog.py +49 -0
- webtoolkit/tests/fake/geekwirecom.py +149 -0
- webtoolkit/tests/fake/githubcom.py +128 -0
- webtoolkit/tests/fake/hackernews.py +431 -0
- webtoolkit/tests/fake/instance.py +239 -0
- webtoolkit/tests/fake/opmlfile.py +24 -0
- webtoolkit/tests/fake/reddit.py +498 -0
- webtoolkit/tests/fake/returndislike.py +3 -0
- webtoolkit/tests/fake/robotstxtcom.py +4 -0
- webtoolkit/tests/fake/thehill.py +1952 -0
- webtoolkit/tests/fake/warhammercommunity.py +245 -0
- webtoolkit/tests/fake/youtube.py +2320 -0
- webtoolkit/tests/fakeinternet.py +272 -0
- webtoolkit/tests/fakeinternetcontents.py +183 -0
- webtoolkit/tests/fakeresponse.py +947 -0
- webtoolkit/tests/mocks.py +103 -0
- webtoolkit/urllocation.py +734 -0
- webtoolkit/utils/dateutils.py +152 -0
- webtoolkit/utils/logger.py +169 -0
- webtoolkit/webconfig.py +130 -0
- webtoolkit/webtools.py +230 -0
- webtoolkit-0.0.122.dist-info/LICENSE +674 -0
- webtoolkit-0.0.122.dist-info/METADATA +267 -0
- webtoolkit-0.0.122.dist-info/RECORD +52 -0
- webtoolkit-0.0.122.dist-info/WHEEL +4 -0
webtoolkit/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Similar project: https://pypi.org/project/abstract-webtools/
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .webtools import *
|
|
6
|
+
from .statuses import *
|
|
7
|
+
from .webconfig import WebConfig
|
|
8
|
+
from .response import *
|
|
9
|
+
from .request import *
|
|
10
|
+
|
|
11
|
+
from .contentinterface import ContentInterface
|
|
12
|
+
from .contentlinkparser import ContentLinkParser
|
|
13
|
+
from .contenttext import ContentText
|
|
14
|
+
from .urllocation import UrlLocation
|
|
15
|
+
from .remoteserver import RemoteServer
|
|
16
|
+
from .remoteurl import RemoteUrl
|
|
17
|
+
|
|
18
|
+
from .pages import (
|
|
19
|
+
DefaultContentPage,
|
|
20
|
+
HtmlPage,
|
|
21
|
+
RssPage,
|
|
22
|
+
RssContentReader,
|
|
23
|
+
OpmlPage,
|
|
24
|
+
JsonPage,
|
|
25
|
+
PageFactory,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
from .contentmoderation import (
|
|
29
|
+
UrlPropertyValidator,
|
|
30
|
+
UrlPropertyValidator,
|
|
31
|
+
UrlAgeModerator,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
from .crawlers import *
|
|
35
|
+
from .handlers import *
|
|
36
|
+
|
|
37
|
+
from .baseurl import BaseUrl
|
|
38
|
+
from .domaincache import *
|
webtoolkit/baseurl.py
ADDED
|
@@ -0,0 +1,617 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main Url handling class
|
|
3
|
+
|
|
4
|
+
@example
|
|
5
|
+
url = Url(link = "https://google.com")
|
|
6
|
+
response = url.get_response()
|
|
7
|
+
|
|
8
|
+
options.request.url
|
|
9
|
+
options.mode_mapping
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import base64
|
|
14
|
+
|
|
15
|
+
from .utils.dateutils import DateUtils
|
|
16
|
+
|
|
17
|
+
from .pages import (
|
|
18
|
+
ContentInterface,
|
|
19
|
+
DefaultContentPage,
|
|
20
|
+
RssPage,
|
|
21
|
+
HtmlPage,
|
|
22
|
+
)
|
|
23
|
+
from .webtools import (
|
|
24
|
+
calculate_hash,
|
|
25
|
+
WebLogger,
|
|
26
|
+
)
|
|
27
|
+
from .urllocation import (
|
|
28
|
+
UrlLocation,
|
|
29
|
+
URL_TYPE_RSS,
|
|
30
|
+
URL_TYPE_CSS,
|
|
31
|
+
URL_TYPE_JAVASCRIPT,
|
|
32
|
+
URL_TYPE_HTML,
|
|
33
|
+
URL_TYPE_FONT,
|
|
34
|
+
URL_TYPE_UNKNOWN,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
from .statuses import status_code_to_text
|
|
38
|
+
from .response import response_to_json
|
|
39
|
+
from .request import request_to_json, PageRequestObject
|
|
40
|
+
from .handlers import (
|
|
41
|
+
HandlerInterface,
|
|
42
|
+
HttpPageHandler,
|
|
43
|
+
OdyseeVideoHandler,
|
|
44
|
+
OdyseeChannelHandler,
|
|
45
|
+
RedditUrlHandler,
|
|
46
|
+
ReturnDislike,
|
|
47
|
+
GitHubUrlHandler,
|
|
48
|
+
HackerNewsHandler,
|
|
49
|
+
InternetArchive,
|
|
50
|
+
FourChanChannelHandler,
|
|
51
|
+
TwitterUrlHandler,
|
|
52
|
+
YouTubeVideoHandler,
|
|
53
|
+
YouTubeChannelHandler,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
from .crawlers import (
|
|
57
|
+
RequestsCrawler,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class BaseUrl(ContentInterface):
    """
    Encapsulates data page, and builder to make request.

    Main Url handling class; usage:
        url = BaseUrl(url="https://google.com")
        response = url.get_response()
    """

    def __init__(self, url=None, request=None, url_builder=None):
        """
        @param url Link to process; cleaned up on construction (trailing
               slashes/dots removed, domain lower-cased).
        @param request Pre-built PageRequestObject; takes precedence over url.
        @param url_builder Class used to construct child URLs (defaults to BaseUrl).

        There are various ways url can be specified. For simplicity we cleanup it.
        I do not like trailing slashes, no google stupid links, etc.

        NOTE(review): if both url and request are None, the else branch reads
        request.url and raises AttributeError — confirm callers always pass one.
        """
        if not request and url:
            self.request_url = url
            self.request = self.get_request_for_url(url)
        else:
            self.request_url = request.url
            self.request = request

        # Requests without a crawler get the default RequestsCrawler attached.
        if not self.request.crawler_type:
            self.request = self.get_request_for_request(self.request)

        self.url = self.request.url
        self.handler = None
        self.response = None
        self.url_builder = url_builder

        if self.request.url:
            self.request.url = self.get_cleaned_link()
        else:
            # NOTE(review): this early return leaves url_builder possibly None.
            WebLogger.error("Url needs to be specified")
            return

        if not self.url_builder:
            self.url_builder = BaseUrl
|
|
96
|
+
def get_request_for_url(self, url):
|
|
97
|
+
request = PageRequestObject(url)
|
|
98
|
+
request.crawler_name = "RequestsCrawler"
|
|
99
|
+
request.crawler_type = RequestsCrawler(url)
|
|
100
|
+
|
|
101
|
+
return request
|
|
102
|
+
|
|
103
|
+
def get_request_for_request(self, request):
|
|
104
|
+
request.crawler_name = "RequestsCrawler"
|
|
105
|
+
request.crawler_type = RequestsCrawler(request.url)
|
|
106
|
+
|
|
107
|
+
return request
|
|
108
|
+
|
|
109
|
+
def get_handlers(self):
|
|
110
|
+
#fmt off
|
|
111
|
+
|
|
112
|
+
return [
|
|
113
|
+
YouTubeVideoHandler,
|
|
114
|
+
OdyseeVideoHandler,
|
|
115
|
+
OdyseeChannelHandler,
|
|
116
|
+
RedditUrlHandler,
|
|
117
|
+
ReturnDislike,
|
|
118
|
+
GitHubUrlHandler,
|
|
119
|
+
HackerNewsHandler,
|
|
120
|
+
InternetArchive,
|
|
121
|
+
FourChanChannelHandler,
|
|
122
|
+
TwitterUrlHandler,
|
|
123
|
+
YouTubeChannelHandler, # present here, if somebody wants to call it by name
|
|
124
|
+
HttpPageHandler, # default
|
|
125
|
+
]
|
|
126
|
+
#fmt on
|
|
127
|
+
|
|
128
|
+
def get_handler_by_name(self, handler_name):
|
|
129
|
+
handlers = self.get_handlers()
|
|
130
|
+
for handler in handlers:
|
|
131
|
+
if handler.__name__ == handler_name:
|
|
132
|
+
return handler
|
|
133
|
+
|
|
134
|
+
def get_handler(self):
|
|
135
|
+
"""
|
|
136
|
+
This function does not fetch anything by itself
|
|
137
|
+
"""
|
|
138
|
+
if self.handler:
|
|
139
|
+
return self.handler
|
|
140
|
+
|
|
141
|
+
self.handler = self.get_handler_implementation()
|
|
142
|
+
return self.handler
|
|
143
|
+
|
|
144
|
+
    def get_type(self):
        """
        Based on link structure identify type.
        Should provide a faster means of obtaining handler, without the need
        to obtain the page.

        Returns a page object (HtmlPage/RssPage) for HTTP links, a handler
        instance otherwise, or None.

        TODO maybe we should 'ping page' to see status
        """
        # based on link 'appearance'

        url = self.request.url

        if not url:
            return

        p = UrlLocation(url)
        short_url = p.get_protocolless()
        if not short_url:
            return

        handlers = self.get_handlers()
        for handler in handlers:
            if handler(url=url).is_handled_by():
                if handler == HttpPageHandler:
                    page_type = UrlLocation(url).get_type()

                    # TODO this should return HttpPageHandler?

                    if page_type == URL_TYPE_HTML:
                        return HtmlPage(url, "")

                    if page_type == URL_TYPE_RSS:
                        return RssPage(url, "")

                    # Heuristic fallback: "rss"/"feed" anywhere in the URL.
                    if url.find("rss") >= 0:
                        return RssPage(url, "")
                    if url.find("feed") >= 0:
                        return RssPage(url, "")

                    return

                # NOTE(review): url passed positionally here, but by keyword
                # above (handler(url=url)) — confirm handlers accept both.
                return handler(url)
|
|
187
|
+
def get_contents(self):
|
|
188
|
+
"""
|
|
189
|
+
Returns text
|
|
190
|
+
"""
|
|
191
|
+
if self.get_response():
|
|
192
|
+
return self.get_response().get_text()
|
|
193
|
+
|
|
194
|
+
def get_binary(self):
|
|
195
|
+
"""
|
|
196
|
+
Returns binary
|
|
197
|
+
"""
|
|
198
|
+
response = self.get_response()
|
|
199
|
+
if response:
|
|
200
|
+
return self.response.get_binary()
|
|
201
|
+
|
|
202
|
+
    def get_response(self):
        """
        Returns full response, with page handling object.

        The response is fetched once and cached on the instance; when the
        request asks to respect robots.txt and the URL is disallowed,
        returns None without fetching.
        """
        if self.response:
            return self.response

        if not self.handler:
            self.handler = self.get_handler_implementation()

        if self.handler:
            if self.request.respect_robots:
                if not self.is_allowed():
                    return

            self.response = self.handler.get_response()
            if self.response:
                # Invalid responses are logged but still cached and returned.
                if not self.response.is_valid():
                    WebLogger.error(
                        "Url:{} Response is invalid:{}".format(self.request.url, self.response),
                        detail_text = str(response_to_json(self.response))
                    )

        return self.response
|
|
227
|
+
def get_streams(self):
|
|
228
|
+
streams = []
|
|
229
|
+
streams_data = []
|
|
230
|
+
|
|
231
|
+
handler = self.get_handler()
|
|
232
|
+
|
|
233
|
+
if handler:
|
|
234
|
+
if self.request.respect_robots:
|
|
235
|
+
if not self.is_allowed():
|
|
236
|
+
return []
|
|
237
|
+
|
|
238
|
+
streams = self.handler.get_streams()
|
|
239
|
+
|
|
240
|
+
if streams:
|
|
241
|
+
for response in streams.values():
|
|
242
|
+
response_json = response_to_json(response)
|
|
243
|
+
streams_data.append(response_json)
|
|
244
|
+
|
|
245
|
+
return streams_data
|
|
246
|
+
|
|
247
|
+
def get_headers(self):
|
|
248
|
+
# TODO implement
|
|
249
|
+
pass
|
|
250
|
+
|
|
251
|
+
    def ping(self, timeout_s=20, user_agent=None):
        """Ping the request URL; returns the crawler's ping result."""
        # NOTE(review): timeout_s and user_agent are currently ignored — the
        # crawler pings with its defaults. TODO pass them through once
        # RequestsCrawler.ping supports them; confirm its signature.
        # TODO if that fails we would have to find suitable agent, and then ping
        return RequestsCrawler(self.request.url).ping()
|
|
255
|
+
    def get_handler_implementation(self):
        """
        Instantiate the first handler that claims this URL.

        Honors request.handler_name / request.handler_type filters; falls back
        to HttpPageHandler for http(s) links; raises NotImplementedError for
        smb/ftp. NOTE: on a match, self.request.url is overwritten with the
        handler's (possibly rewritten) URL.
        """
        url = self.request.url
        if not url:
            return

        p = UrlLocation(url)
        short_url = p.get_protocolless()

        if not short_url:
            return

        handlers = self.get_handlers()
        for handler in handlers:
            # Skip handlers excluded by the request's name/type constraints.
            if self.request.handler_name and self.request.handler_name != "" and self.request.handler_name != handler.__name__:
                continue
            if self.request.handler_type and self.request.handler_type != handler:
                continue

            h = handler(
                url=self.request.url, request=self.request, url_builder=self.url_builder
            )
            if h.is_handled_by():
                # Side effect: adopt the handler's canonicalized URL.
                self.request.url = h.url
                return h

        if url.startswith("https") or url.startswith("http"):
            return HttpPageHandler(
                url=url, request=self.request, url_builder=self.url_builder
            )
        elif url.startswith("smb") or url.startswith("ftp"):
            raise NotImplementedError("Protocol has not been implemented")
|
|
287
|
+
def get_cleaned_link(self):
|
|
288
|
+
url = self.request.url
|
|
289
|
+
|
|
290
|
+
url = url.strip()
|
|
291
|
+
|
|
292
|
+
if url.endswith("/"):
|
|
293
|
+
url = url[:-1]
|
|
294
|
+
if url.endswith("."):
|
|
295
|
+
url = url[:-1]
|
|
296
|
+
|
|
297
|
+
# domain is lowercase
|
|
298
|
+
return UrlLocation.get_cleaned_link(url)
|
|
299
|
+
|
|
300
|
+
def get_url(self):
|
|
301
|
+
self.get_handler()
|
|
302
|
+
if self.handler:
|
|
303
|
+
return self.handler.get_url()
|
|
304
|
+
else:
|
|
305
|
+
return self.request.url
|
|
306
|
+
|
|
307
|
+
def get_canonical_url(self):
|
|
308
|
+
if self.handler:
|
|
309
|
+
return self.handler.get_canonical_url()
|
|
310
|
+
|
|
311
|
+
handlers = self.get_handlers()
|
|
312
|
+
for handler_class in handlers:
|
|
313
|
+
handler = handler_class(url=self.request.url)
|
|
314
|
+
if handler.is_handled_by():
|
|
315
|
+
return handler.get_canonical_url()
|
|
316
|
+
|
|
317
|
+
def get_urls(self):
|
|
318
|
+
properties = {}
|
|
319
|
+
properties["link"] = self.request.url
|
|
320
|
+
properties["link_request"] = self.request_url
|
|
321
|
+
canonical = self.get_canonical_url()
|
|
322
|
+
if canonical:
|
|
323
|
+
properties["link_canonical"] = canonical
|
|
324
|
+
return properties
|
|
325
|
+
|
|
326
|
+
def get_urls_archive(self):
|
|
327
|
+
p = UrlLocation(self.request.url)
|
|
328
|
+
short_url = p.get_protocolless()
|
|
329
|
+
|
|
330
|
+
properties = []
|
|
331
|
+
|
|
332
|
+
archive = InternetArchive(self.request.url)
|
|
333
|
+
properties.append(archive.get_archive_url())
|
|
334
|
+
|
|
335
|
+
properties.append("https://archive.ph/" + short_url)
|
|
336
|
+
|
|
337
|
+
return properties
|
|
338
|
+
|
|
339
|
+
def __str__(self):
|
|
340
|
+
return "{}".format(self.request)
|
|
341
|
+
|
|
342
|
+
def is_valid(self):
|
|
343
|
+
if not self.handler:
|
|
344
|
+
return False
|
|
345
|
+
|
|
346
|
+
if self.response is None:
|
|
347
|
+
return False
|
|
348
|
+
|
|
349
|
+
if self.response and not self.response.is_valid():
|
|
350
|
+
return False
|
|
351
|
+
|
|
352
|
+
if not self.handler.is_valid():
|
|
353
|
+
return False
|
|
354
|
+
|
|
355
|
+
return True
|
|
356
|
+
|
|
357
|
+
def get_title(self):
|
|
358
|
+
if self.handler:
|
|
359
|
+
return self.handler.get_title()
|
|
360
|
+
|
|
361
|
+
def get_description(self):
|
|
362
|
+
if self.handler:
|
|
363
|
+
return self.handler.get_description()
|
|
364
|
+
|
|
365
|
+
def get_language(self):
|
|
366
|
+
if self.handler:
|
|
367
|
+
return self.handler.get_language()
|
|
368
|
+
|
|
369
|
+
def get_thumbnail(self):
|
|
370
|
+
if self.handler:
|
|
371
|
+
return self.handler.get_thumbnail()
|
|
372
|
+
|
|
373
|
+
def get_author(self):
|
|
374
|
+
if self.handler:
|
|
375
|
+
return self.handler.get_author()
|
|
376
|
+
|
|
377
|
+
def get_album(self):
|
|
378
|
+
if self.handler:
|
|
379
|
+
return self.handler.get_album()
|
|
380
|
+
|
|
381
|
+
def get_tags(self):
|
|
382
|
+
if self.handler:
|
|
383
|
+
return self.handler.get_tags()
|
|
384
|
+
|
|
385
|
+
def get_date_published(self):
|
|
386
|
+
if self.handler:
|
|
387
|
+
return self.handler.get_date_published()
|
|
388
|
+
|
|
389
|
+
def get_status_code(self):
|
|
390
|
+
if self.response:
|
|
391
|
+
return self.response.get_status_code()
|
|
392
|
+
|
|
393
|
+
return 0
|
|
394
|
+
|
|
395
|
+
def get_entries(self):
|
|
396
|
+
handler = self.get_handler()
|
|
397
|
+
if handler:
|
|
398
|
+
return handler.get_entries()
|
|
399
|
+
else:
|
|
400
|
+
return []
|
|
401
|
+
|
|
402
|
+
    def find_rss_url(self):
        """
        Return a url-builder object pointing at this link's RSS feed:
        self when this link already is a feed, otherwise a new object for
        the first advertised feed. Returns None when nothing is found.

        TODO remove
        """
        url = self.url

        if not url:
            return

        handler = self.get_handler()

        if handler:
            # This link already serves RSS directly.
            if type(handler) is HttpPageHandler:
                if type(handler.p) is RssPage:
                    return self

        # maybe our handler is able to produce feed without asking for response

        feeds = self.get_feeds()
        if url in feeds:
            return self

        if feeds and len(feeds) > 0:
            u = self.url_builder(url=feeds[0])
            return u
|
|
428
|
+
def get_feeds(self):
|
|
429
|
+
result = []
|
|
430
|
+
|
|
431
|
+
handler = self.get_handler()
|
|
432
|
+
if handler:
|
|
433
|
+
return handler.get_feeds()
|
|
434
|
+
|
|
435
|
+
return result
|
|
436
|
+
|
|
437
|
+
def get_contents_hash(self):
|
|
438
|
+
handler = self.get_handler()
|
|
439
|
+
if handler:
|
|
440
|
+
return handler.get_contents_hash()
|
|
441
|
+
|
|
442
|
+
def get_contents_body_hash(self):
|
|
443
|
+
handler = self.get_handler()
|
|
444
|
+
if handler:
|
|
445
|
+
return handler.get_contents_body_hash()
|
|
446
|
+
|
|
447
|
+
    def get_properties(self, full=False, include_social=False, check_robots=False):
        """
        Return page properties.

        @param full When False, return just the flat properties dict; when
               True, return a list of {"name", "data"} sections (Properties,
               PropertiesHash, Text/Binary, Streams, Request, Response,
               Headers, optionally Social, Entries).
        @param include_social Append the "Social" section when data exists.
        @param check_robots NOTE(review): accepted but never read — confirm
               whether robots handling was meant to hook in here.
        """
        response = self.get_response()

        properties_data = self.get_properties_data()
        if not full:
            return properties_data

        all_properties = []

        all_properties.append({"name": "Properties", "data": properties_data})

        # Hash of the stringified properties, base64-encoded for transport.
        properties_hash = self.property_encode(calculate_hash(str(properties_data)))
        all_properties.append({"name": "PropertiesHash", "data": properties_hash})

        if response:
            # Text wins over binary; binary payloads are base64-encoded.
            if response.get_text():
                all_properties.append(
                    {"name": "Text", "data": {"Contents": response.get_text()}}
                )
            elif response.get_binary():
                all_properties.append(
                    {
                        "name": "Binary",
                        "data": {
                            "Contents": self.property_encode(response.get_binary())
                        },
                    }
                )

        streams = self.get_streams()
        all_properties.append({"name": "Streams", "data": streams})

        # TODO request is part of response now. Should we include it?
        request_data = request_to_json(self.request)
        # Crawler instance is not JSON-serializable; keep only its class name.
        request_data["crawler_type"] = type(request_data["crawler_type"]).__name__
        all_properties.append({"name": "Request", "data": request_data})

        response_data = self.get_response_data()
        all_properties.append({"name": "Response", "data": response_data})
        if response:
            raw_headers_data = response.get_headers()
            all_properties.append({"name": "Headers", "data": raw_headers_data})
        else:
            all_properties.append({"name": "Headers", "data": None})

        if include_social:
            # NOTE(review): get_social_properties is declared without a url
            # parameter — this call passes one; verify the signature.
            social_data = self.get_social_properties(self.request.url)
            if social_data:
                all_properties.append({"name": "Social", "data": social_data})

        entries_data = self.get_entry_data()
        all_properties.append({"name": "Entries", "data": entries_data})

        return all_properties
|
|
502
|
+
def get_properties_data(self):
|
|
503
|
+
properties = super().get_properties()
|
|
504
|
+
page_handler = self.get_handler()
|
|
505
|
+
|
|
506
|
+
properties["link_request"] = self.request_url
|
|
507
|
+
|
|
508
|
+
feeds = self.get_feeds()
|
|
509
|
+
if len(feeds) > 0:
|
|
510
|
+
properties["feeds"] = []
|
|
511
|
+
for key, feed in enumerate(feeds):
|
|
512
|
+
properties["feeds"].append(feed)
|
|
513
|
+
|
|
514
|
+
is_channel = False
|
|
515
|
+
channel_handler = YouTubeChannelHandler(url=self.url)
|
|
516
|
+
if channel_handler.is_handled_by():
|
|
517
|
+
is_channel = True
|
|
518
|
+
|
|
519
|
+
if page_handler:
|
|
520
|
+
"""
|
|
521
|
+
TODO detect type of handler. IsChannel?
|
|
522
|
+
"""
|
|
523
|
+
if is_channel:
|
|
524
|
+
if page_handler.get_channel_name():
|
|
525
|
+
properties["channel_name"] = page_handler.get_channel_name()
|
|
526
|
+
properties["channel_url"] = page_handler.get_channel_url()
|
|
527
|
+
|
|
528
|
+
if type(page_handler) is HttpPageHandler and type(page_handler.p) is HtmlPage:
|
|
529
|
+
properties["favicon"] = page_handler.p.get_favicon()
|
|
530
|
+
properties["meta title"] = page_handler.p.get_meta_field("title")
|
|
531
|
+
properties["meta description"] = page_handler.p.get_meta_field(
|
|
532
|
+
"description"
|
|
533
|
+
)
|
|
534
|
+
properties["meta keywords"] = page_handler.p.get_meta_field("keywords")
|
|
535
|
+
|
|
536
|
+
properties["og:title"] = page_handler.p.get_og_field("title")
|
|
537
|
+
properties["og:description"] = page_handler.p.get_og_field("description")
|
|
538
|
+
properties["og:image"] = page_handler.p.get_og_field("image")
|
|
539
|
+
properties["og:site_name"] = page_handler.p.get_og_field("site_name")
|
|
540
|
+
properties["schema:thumbnailUrl"] = page_handler.p.get_schema_field(
|
|
541
|
+
"thumbnailUrl"
|
|
542
|
+
)
|
|
543
|
+
|
|
544
|
+
properties["link_archives"] = self.get_urls_archive()
|
|
545
|
+
|
|
546
|
+
return properties
|
|
547
|
+
|
|
548
|
+
def response_to_data(self, response):
|
|
549
|
+
response_data = response_to_json(response)
|
|
550
|
+
|
|
551
|
+
respect_robots_txt = False
|
|
552
|
+
is_allowed = True
|
|
553
|
+
if (self.request.respect_robots):
|
|
554
|
+
is_allowed = self.is_allowed()
|
|
555
|
+
|
|
556
|
+
response_data["is_allowed"] = is_allowed
|
|
557
|
+
|
|
558
|
+
return response_data
|
|
559
|
+
|
|
560
|
+
def get_response_data(self):
|
|
561
|
+
"""
|
|
562
|
+
Easy digestible response data
|
|
563
|
+
"""
|
|
564
|
+
response = self.get_response()
|
|
565
|
+
response_data = self.response_to_data(response)
|
|
566
|
+
return response_data
|
|
567
|
+
|
|
568
|
+
def get_entry_data(self):
|
|
569
|
+
index = 0
|
|
570
|
+
result = []
|
|
571
|
+
|
|
572
|
+
entries = self.get_entries()
|
|
573
|
+
|
|
574
|
+
if entries:
|
|
575
|
+
for entry in entries:
|
|
576
|
+
if "feed_entry" in entry:
|
|
577
|
+
del entry["feed_entry"]
|
|
578
|
+
result.append(entry)
|
|
579
|
+
|
|
580
|
+
return result
|
|
581
|
+
|
|
582
|
+
def property_encode(self, byte_property):
|
|
583
|
+
return base64.b64encode(byte_property).decode("utf-8")
|
|
584
|
+
|
|
585
|
+
def is_allowed(self):
|
|
586
|
+
"""
|
|
587
|
+
TODO remove?
|
|
588
|
+
"""
|
|
589
|
+
domain_info = self.get_domain_info()
|
|
590
|
+
return domain_info.is_allowed(self.request.url)
|
|
591
|
+
|
|
592
|
+
def get_social_properties(self):
|
|
593
|
+
url = self.request.url
|
|
594
|
+
|
|
595
|
+
json_obj = {}
|
|
596
|
+
|
|
597
|
+
handler = self.get_handler()
|
|
598
|
+
if not handler:
|
|
599
|
+
i = HandlerInterface()
|
|
600
|
+
return i.get_social_data()
|
|
601
|
+
|
|
602
|
+
json_data = handler.get_json_data()
|
|
603
|
+
return handler.get_social_data()
|
|
604
|
+
|
|
605
|
+
    def get_properties_section(self, section_name, all_properties):
        """
        Find the "data" of the section named *section_name* in a
        get_properties(full=True)-style list of {"name", "data"} dicts.

        Returns None for empty input, False for an error payload, otherwise
        the matching section's data (or None when absent).
        """
        if not all_properties:
            return

        # NOTE(review): this branch only fires when all_properties is a
        # dict-shaped error payload ({"success": False, ...}); for the list
        # produced by get_properties() the membership test compares "success"
        # against dict elements and is always False — confirm both shapes
        # are really passed in here.
        if "success" in all_properties and not all_properties["success"]:
            # print("Url:{} Remote error. Not a success".format(link))
            print("Remote error. Not a success")
            # WebLogger.error(all_properties["error"])
            return False

        for properties in all_properties:
            if section_name == properties["name"]:
                return properties["data"]