webtoolkit 0.0.182__tar.gz → 0.0.184__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/PKG-INFO +1 -1
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/pyproject.toml +1 -1
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/baseurl.py +55 -39
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/contentinterface.py +1 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/contentlinkparser.py +9 -7
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/crawlers/crawlerinterface.py +2 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/crawlers/requestscrawler.py +2 -4
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/defaulturlhandler.py +6 -2
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerchannelodysee.py +3 -1
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerchannelyoutube.py +5 -5
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerinterface.py +2 -1
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlervideoodysee.py +1 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlervideoyoutube.py +3 -1
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/pages.py +18 -14
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/remoteurl.py +77 -44
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/request.py +5 -6
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/response.py +9 -2
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/__init__.py +0 -1
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fakeresponse.py +6 -4
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/mocks.py +12 -4
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/urllocation.py +1 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/webtools.py +126 -31
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/LICENSE +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/README.md +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/__init__.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/contentmoderation.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/contenttext.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/crawlers/__init__.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/domaincache.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/__init__.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerhttppage.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlers.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/remoteserver.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/statuses.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/__init__.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/codeproject.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/firebog.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/geekwirecom.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/githubcom.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/hackernews.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/instance.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/opmlfile.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/reddit.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/returndislike.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/robotstxtcom.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/thehill.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/warhammercommunity.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fake/youtube.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fakeinternet.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/tests/fakeinternetcontents.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/utils/dateutils.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/utils/logger.py +0 -0
- {webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/webconfig.py +0 -0
|
@@ -7,6 +7,7 @@ response = url.get_response()
|
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
9
|
import base64
|
|
10
|
+
from typing import Any, Callable, Optional, Type
|
|
10
11
|
|
|
11
12
|
from .utils.dateutils import DateUtils
|
|
12
13
|
|
|
@@ -59,6 +60,7 @@ class BaseUrl(ContentInterface):
|
|
|
59
60
|
"""
|
|
60
61
|
Base Url class capable of reading network pages.
|
|
61
62
|
"""
|
|
63
|
+
|
|
62
64
|
def __init__(self, url=None, request=None, url_builder=None):
|
|
63
65
|
"""
|
|
64
66
|
Constructor
|
|
@@ -105,7 +107,7 @@ class BaseUrl(ContentInterface):
|
|
|
105
107
|
Returns available handlers.
|
|
106
108
|
Order is important - from the most precise handler to the most general.
|
|
107
109
|
"""
|
|
108
|
-
#fmt off
|
|
110
|
+
# fmt off
|
|
109
111
|
|
|
110
112
|
return [
|
|
111
113
|
YouTubeVideoHandler,
|
|
@@ -118,13 +120,13 @@ class BaseUrl(ContentInterface):
|
|
|
118
120
|
InternetArchive,
|
|
119
121
|
FourChanChannelHandler,
|
|
120
122
|
TwitterUrlHandler,
|
|
121
|
-
YouTubeChannelHandler,
|
|
122
|
-
HttpPageHandler,
|
|
123
|
+
YouTubeChannelHandler, # present here, if somebody wants to call it by name
|
|
124
|
+
HttpPageHandler, # default
|
|
123
125
|
]
|
|
124
|
-
#fmt on
|
|
126
|
+
# fmt on
|
|
125
127
|
|
|
126
128
|
def get_handler_by_name(self, handler_name):
|
|
127
|
-
"""
|
|
129
|
+
"""Returns handler class"""
|
|
128
130
|
handlers = self.get_handlers()
|
|
129
131
|
for handler in handlers:
|
|
130
132
|
if handler.__name__ == handler_name:
|
|
@@ -218,8 +220,10 @@ class BaseUrl(ContentInterface):
|
|
|
218
220
|
if self.response:
|
|
219
221
|
if not self.response.is_valid():
|
|
220
222
|
WebLogger.error(
|
|
221
|
-
"Url:{} Response is invalid:{}".format(
|
|
222
|
-
|
|
223
|
+
"Url:{} Response is invalid:{}".format(
|
|
224
|
+
self.request.url, self.response
|
|
225
|
+
),
|
|
226
|
+
detail_text=str(response_to_json(self.response)),
|
|
223
227
|
)
|
|
224
228
|
|
|
225
229
|
return self.response
|
|
@@ -256,7 +260,7 @@ class BaseUrl(ContentInterface):
|
|
|
256
260
|
return RequestsCrawler(self.request.url).ping()
|
|
257
261
|
|
|
258
262
|
def get_handler_implementation(self):
|
|
259
|
-
"""
|
|
263
|
+
"""Returns handler"""
|
|
260
264
|
url = self.request.url
|
|
261
265
|
if not url:
|
|
262
266
|
return
|
|
@@ -269,7 +273,11 @@ class BaseUrl(ContentInterface):
|
|
|
269
273
|
|
|
270
274
|
handlers = self.get_handlers()
|
|
271
275
|
for handler in handlers:
|
|
272
|
-
if
|
|
276
|
+
if (
|
|
277
|
+
self.request.handler_name
|
|
278
|
+
and self.request.handler_name != ""
|
|
279
|
+
and self.request.handler_name != handler.__name__
|
|
280
|
+
):
|
|
273
281
|
continue
|
|
274
282
|
if self.request.handler_type and self.request.handler_type != handler:
|
|
275
283
|
continue
|
|
@@ -289,7 +297,7 @@ class BaseUrl(ContentInterface):
|
|
|
289
297
|
raise NotImplementedError("Protocol has not been implemented")
|
|
290
298
|
|
|
291
299
|
def get_cleaned_link(self):
|
|
292
|
-
"""
|
|
300
|
+
"""Returns cleaned up link. Free of unwanted args, tracking, sanitized."""
|
|
293
301
|
url = self.request.url
|
|
294
302
|
|
|
295
303
|
url = url.strip()
|
|
@@ -310,7 +318,7 @@ class BaseUrl(ContentInterface):
|
|
|
310
318
|
return self.request.url
|
|
311
319
|
|
|
312
320
|
def get_urls(self):
|
|
313
|
-
"""
|
|
321
|
+
"""Returns various link versions for URL"""
|
|
314
322
|
properties = {}
|
|
315
323
|
properties["link"] = self.request.url
|
|
316
324
|
properties["link_request"] = self.request_url
|
|
@@ -320,7 +328,7 @@ class BaseUrl(ContentInterface):
|
|
|
320
328
|
return properties
|
|
321
329
|
|
|
322
330
|
def get_canonical_url(self):
|
|
323
|
-
"""
|
|
331
|
+
"""Returns canonical link"""
|
|
324
332
|
if self.handler:
|
|
325
333
|
return self.handler.get_canonical_url()
|
|
326
334
|
|
|
@@ -331,7 +339,7 @@ class BaseUrl(ContentInterface):
|
|
|
331
339
|
return handler.get_canonical_url()
|
|
332
340
|
|
|
333
341
|
def get_urls_archive(self):
|
|
334
|
-
"""
|
|
342
|
+
"""Returns archive link for URL"""
|
|
335
343
|
p = UrlLocation(self.request.url)
|
|
336
344
|
short_url = p.get_protocolless()
|
|
337
345
|
|
|
@@ -348,7 +356,7 @@ class BaseUrl(ContentInterface):
|
|
|
348
356
|
return "{}".format(self.request)
|
|
349
357
|
|
|
350
358
|
def is_valid(self):
|
|
351
|
-
"""
|
|
359
|
+
"""Returns indication if URL is valid"""
|
|
352
360
|
if not self.handler:
|
|
353
361
|
return False
|
|
354
362
|
|
|
@@ -364,54 +372,54 @@ class BaseUrl(ContentInterface):
|
|
|
364
372
|
return True
|
|
365
373
|
|
|
366
374
|
def get_title(self):
|
|
367
|
-
"""
|
|
375
|
+
"""Returns title"""
|
|
368
376
|
if self.handler:
|
|
369
377
|
return self.handler.get_title()
|
|
370
378
|
|
|
371
379
|
def get_description(self):
|
|
372
|
-
"""
|
|
380
|
+
"""Returns description"""
|
|
373
381
|
if self.handler:
|
|
374
382
|
return self.handler.get_description()
|
|
375
383
|
|
|
376
384
|
def get_language(self):
|
|
377
|
-
"""
|
|
385
|
+
"""Returns language"""
|
|
378
386
|
if self.handler:
|
|
379
387
|
return self.handler.get_language()
|
|
380
388
|
|
|
381
389
|
def get_thumbnail(self):
|
|
382
|
-
"""
|
|
390
|
+
"""Returns thumbnail"""
|
|
383
391
|
if self.handler:
|
|
384
392
|
return self.handler.get_thumbnail()
|
|
385
393
|
|
|
386
394
|
def get_author(self):
|
|
387
|
-
"""
|
|
395
|
+
"""Returns author"""
|
|
388
396
|
if self.handler:
|
|
389
397
|
return self.handler.get_author()
|
|
390
398
|
|
|
391
399
|
def get_album(self):
|
|
392
|
-
"""
|
|
400
|
+
"""Returns album"""
|
|
393
401
|
if self.handler:
|
|
394
402
|
return self.handler.get_album()
|
|
395
403
|
|
|
396
404
|
def get_tags(self):
|
|
397
|
-
"""
|
|
405
|
+
"""Returns tags"""
|
|
398
406
|
if self.handler:
|
|
399
407
|
return self.handler.get_tags()
|
|
400
408
|
|
|
401
409
|
def get_date_published(self):
|
|
402
|
-
"""
|
|
410
|
+
"""Returns date published"""
|
|
403
411
|
if self.handler:
|
|
404
412
|
return self.handler.get_date_published()
|
|
405
413
|
|
|
406
414
|
def get_status_code(self) -> int | None:
|
|
407
|
-
"""
|
|
415
|
+
"""Returns status code"""
|
|
408
416
|
if self.response:
|
|
409
417
|
return self.response.get_status_code()
|
|
410
418
|
|
|
411
419
|
return 0
|
|
412
420
|
|
|
413
421
|
def get_entries(self):
|
|
414
|
-
"""
|
|
422
|
+
"""Returns entries list"""
|
|
415
423
|
|
|
416
424
|
handler = self.get_handler()
|
|
417
425
|
if handler:
|
|
@@ -446,7 +454,7 @@ class BaseUrl(ContentInterface):
|
|
|
446
454
|
return u
|
|
447
455
|
|
|
448
456
|
def get_feeds(self):
|
|
449
|
-
"""
|
|
457
|
+
"""Returns feeds found for URL"""
|
|
450
458
|
result = []
|
|
451
459
|
|
|
452
460
|
handler = self.get_handler()
|
|
@@ -459,23 +467,24 @@ class BaseUrl(ContentInterface):
|
|
|
459
467
|
return calculate_hash(text)
|
|
460
468
|
|
|
461
469
|
def get_hash(self):
|
|
462
|
-
"""
|
|
470
|
+
"""Returns hash for URL"""
|
|
463
471
|
handler = self.get_handler()
|
|
464
472
|
if handler:
|
|
465
473
|
return handler.get_hash()
|
|
466
474
|
|
|
467
475
|
def get_body_hash(self):
|
|
468
|
-
"""
|
|
476
|
+
"""Returns body hash for URL"""
|
|
469
477
|
handler = self.get_handler()
|
|
470
478
|
if handler:
|
|
471
479
|
return handler.get_body_hash()
|
|
472
480
|
|
|
473
|
-
def get_meta_hash(self):
|
|
474
|
-
"""
|
|
475
|
-
|
|
476
|
-
|
|
481
|
+
def get_meta_hash(self) -> Optional[str]:
|
|
482
|
+
"""
|
|
483
|
+
Calculates and returns a hash of the page's metadata properties.
|
|
484
|
+
:return: A base64-encoded hash of the properties.
|
|
485
|
+
"""
|
|
486
|
+
self.get_response()
|
|
477
487
|
properties_data = self.get_properties_data()
|
|
478
|
-
|
|
479
488
|
properties_hash = self.property_encode(calculate_hash(str(properties_data)))
|
|
480
489
|
return properties_hash
|
|
481
490
|
|
|
@@ -486,7 +495,7 @@ class BaseUrl(ContentInterface):
|
|
|
486
495
|
return self.get_properties_data()
|
|
487
496
|
|
|
488
497
|
def get_all_properties(self, include_social=False):
|
|
489
|
-
"""
|
|
498
|
+
"""Returns all URL properties"""
|
|
490
499
|
response = self.get_response()
|
|
491
500
|
|
|
492
501
|
properties_data = self.get_properties()
|
|
@@ -540,8 +549,8 @@ class BaseUrl(ContentInterface):
|
|
|
540
549
|
return all_properties
|
|
541
550
|
|
|
542
551
|
def get_properties_data(self):
|
|
543
|
-
"""
|
|
544
|
-
TODO there should two functions: get_all_properties and get_properties
|
|
552
|
+
"""Returns simple meta properties.
|
|
553
|
+
TODO there should two functions: get_all_properties and get_properties"""
|
|
545
554
|
properties = super().get_properties()
|
|
546
555
|
page_handler = self.get_handler()
|
|
547
556
|
|
|
@@ -567,7 +576,10 @@ class BaseUrl(ContentInterface):
|
|
|
567
576
|
properties["channel_name"] = page_handler.get_channel_name()
|
|
568
577
|
properties["channel_url"] = page_handler.get_channel_url()
|
|
569
578
|
|
|
570
|
-
if
|
|
579
|
+
if (
|
|
580
|
+
type(page_handler) is HttpPageHandler
|
|
581
|
+
and type(page_handler.p) is HtmlPage
|
|
582
|
+
):
|
|
571
583
|
properties["favicon"] = page_handler.p.get_favicon()
|
|
572
584
|
properties["meta title"] = page_handler.p.get_meta_field("title")
|
|
573
585
|
properties["meta description"] = page_handler.p.get_meta_field(
|
|
@@ -576,7 +588,9 @@ class BaseUrl(ContentInterface):
|
|
|
576
588
|
properties["meta keywords"] = page_handler.p.get_meta_field("keywords")
|
|
577
589
|
|
|
578
590
|
properties["og:title"] = page_handler.p.get_og_field("title")
|
|
579
|
-
properties["og:description"] = page_handler.p.get_og_field(
|
|
591
|
+
properties["og:description"] = page_handler.p.get_og_field(
|
|
592
|
+
"description"
|
|
593
|
+
)
|
|
580
594
|
properties["og:image"] = page_handler.p.get_og_field("image")
|
|
581
595
|
properties["og:site_name"] = page_handler.p.get_og_field("site_name")
|
|
582
596
|
properties["schema:thumbnailUrl"] = page_handler.p.get_schema_field(
|
|
@@ -631,11 +645,13 @@ class BaseUrl(ContentInterface):
|
|
|
631
645
|
"""
|
|
632
646
|
Returns indication is access is allowed for bots, robots
|
|
633
647
|
"""
|
|
634
|
-
domain_info = DomainCache.get_object(
|
|
648
|
+
domain_info = DomainCache.get_object(
|
|
649
|
+
url=self.request.url, url_builder=self.url_builder
|
|
650
|
+
)
|
|
635
651
|
return domain_info.is_allowed(self.request.url)
|
|
636
652
|
|
|
637
653
|
def get_social_properties(self):
|
|
638
|
-
"""
|
|
654
|
+
"""Returns social properties"""
|
|
639
655
|
url = self.request.url
|
|
640
656
|
|
|
641
657
|
json_obj = {}
|
|
@@ -164,10 +164,12 @@ class ContentLinkParser(ContentInterface):
|
|
|
164
164
|
item = item[wh + 1 :]
|
|
165
165
|
|
|
166
166
|
# not absolute path
|
|
167
|
-
if (
|
|
167
|
+
if (
|
|
168
|
+
not item.startswith("http")
|
|
168
169
|
and not item.startswith("https")
|
|
169
170
|
and not item.startswith("ftp")
|
|
170
|
-
and not item.startswith("smb")
|
|
171
|
+
and not item.startswith("smb")
|
|
172
|
+
):
|
|
171
173
|
|
|
172
174
|
location = UrlLocation("https://" + item)
|
|
173
175
|
domain = location.get_domain_only()
|
|
@@ -179,15 +181,15 @@ class ContentLinkParser(ContentInterface):
|
|
|
179
181
|
return
|
|
180
182
|
item = self.join_url_parts(url, item)
|
|
181
183
|
|
|
182
|
-
if (
|
|
184
|
+
if (
|
|
185
|
+
not item.startswith("http")
|
|
183
186
|
and not item.startswith("https")
|
|
184
187
|
and not item.startswith("ftp")
|
|
185
|
-
and not item.startswith("smb")
|
|
188
|
+
and not item.startswith("smb")
|
|
189
|
+
):
|
|
186
190
|
item = "https://" + item
|
|
187
191
|
|
|
188
|
-
if item.startswith("https://") or item.startswith(
|
|
189
|
-
"http://"
|
|
190
|
-
):
|
|
192
|
+
if item.startswith("https://") or item.startswith("http://"):
|
|
191
193
|
item = ContentLinkParser.decode_url(item)
|
|
192
194
|
return item
|
|
193
195
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Crawler interface can be implemented to provide new mechanisms of crawling
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
import json
|
|
5
6
|
import os
|
|
6
7
|
import base64
|
|
@@ -46,6 +47,7 @@ class CrawlerInterface(object):
|
|
|
46
47
|
Crawler is a tool that allows to obtain contents from the internet.
|
|
47
48
|
There are various tools.
|
|
48
49
|
"""
|
|
50
|
+
|
|
49
51
|
def __init__(self, url=None, request=None):
|
|
50
52
|
"""
|
|
51
53
|
@param response_file If set, response is stored in a file
|
|
@@ -202,9 +202,7 @@ class RequestsCrawler(CrawlerInterface):
|
|
|
202
202
|
|
|
203
203
|
def request_with_timeout(request, stream, result):
|
|
204
204
|
try:
|
|
205
|
-
result["response"] = self.make_requests_call(
|
|
206
|
-
request, stream
|
|
207
|
-
)
|
|
205
|
+
result["response"] = self.make_requests_call(request, stream)
|
|
208
206
|
except Exception as e:
|
|
209
207
|
result["exception"] = e
|
|
210
208
|
|
|
@@ -297,5 +295,5 @@ class RequestsCrawler(CrawlerInterface):
|
|
|
297
295
|
|
|
298
296
|
def update_request(self):
|
|
299
297
|
self.request.timeout_s = self.get_timeout_s()
|
|
300
|
-
#TODO - headers are not set
|
|
298
|
+
# TODO - headers are not set
|
|
301
299
|
# self.request.request_headers = self.get_request_headers()
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Default url handler.
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
import copy
|
|
5
6
|
from collections import OrderedDict
|
|
6
7
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -42,7 +43,7 @@ class DefaultUrlHandler(HttpPageHandler):
|
|
|
42
43
|
else:
|
|
43
44
|
request = PageRequestObject(url)
|
|
44
45
|
request.url = url
|
|
45
|
-
#request.handler_type = HttpPageHandler # object will be assigned by builder
|
|
46
|
+
# request.handler_type = HttpPageHandler # object will be assigned by builder
|
|
46
47
|
|
|
47
48
|
# if we will not hardcode this handler, then it will recursively loop
|
|
48
49
|
request.handler_name = "HttpPageHandler"
|
|
@@ -108,6 +109,7 @@ class DefaultCompoundChannelHandler(DefaultChannelHandler):
|
|
|
108
109
|
"""
|
|
109
110
|
Default URL handler which is capable of obtaining data from many network sources automatically.
|
|
110
111
|
"""
|
|
112
|
+
|
|
111
113
|
def __init__(self, url=None, contents=None, request=None, url_builder=None):
|
|
112
114
|
self.responses = []
|
|
113
115
|
self.channel_sources_urls = OrderedDict()
|
|
@@ -158,7 +160,9 @@ class DefaultCompoundChannelHandler(DefaultChannelHandler):
|
|
|
158
160
|
with ThreadPoolExecutor() as executor:
|
|
159
161
|
for channel_source in channel_sources:
|
|
160
162
|
if channel_source not in self.channel_sources_urls:
|
|
161
|
-
handles.append(
|
|
163
|
+
handles.append(
|
|
164
|
+
executor.submit(self.get_response_source, channel_source)
|
|
165
|
+
)
|
|
162
166
|
|
|
163
167
|
for handle in handles:
|
|
164
168
|
url = handle.result()
|
|
@@ -8,7 +8,9 @@ from .handlerhttppage import HttpPageHandler
|
|
|
8
8
|
|
|
9
9
|
class OdyseeChannelHandler(DefaultCompoundChannelHandler):
|
|
10
10
|
|
|
11
|
-
def __init__(
|
|
11
|
+
def __init__(
|
|
12
|
+
self, url=None, contents=None, request=None, url_builder=None, channel_code=None
|
|
13
|
+
):
|
|
12
14
|
if channel_code is not None:
|
|
13
15
|
url = self.code2url(channel_code)
|
|
14
16
|
|
|
@@ -150,20 +150,20 @@ class YouTubeChannelHandler(DefaultCompoundChannelHandler):
|
|
|
150
150
|
wh1 = url.find("youtube.com/user")
|
|
151
151
|
if wh1 >= 0:
|
|
152
152
|
start = wh1 + len("youtube.com/user") + 1
|
|
153
|
-
wh2 = url.find("/", start+1)
|
|
153
|
+
wh2 = url.find("/", start + 1)
|
|
154
154
|
if wh2 == -1:
|
|
155
|
-
return url[start-1:]
|
|
155
|
+
return url[start - 1 :]
|
|
156
156
|
else:
|
|
157
|
-
return url[start-1:wh2]
|
|
157
|
+
return url[start - 1 : wh2]
|
|
158
158
|
|
|
159
159
|
wh1 = url.find("youtube.com/@")
|
|
160
160
|
if wh1 >= 0:
|
|
161
161
|
start = wh1 + len("youtube.com/@") + 1
|
|
162
162
|
wh2 = url.find("/", start + 1)
|
|
163
163
|
if wh2 == -1:
|
|
164
|
-
return url[start-1:]
|
|
164
|
+
return url[start - 1 :]
|
|
165
165
|
else:
|
|
166
|
-
return url[start-1:wh2]
|
|
166
|
+
return url[start - 1 : wh2]
|
|
167
167
|
|
|
168
168
|
def input2code_channel(self, url):
|
|
169
169
|
wh = url.rfind("/")
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Handler interface that can be implemented to provide more complex logic for reading meta data.
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
from webtoolkit import DefaultContentPage, calculate_hash_binary, calculate_hash
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class HandlerInterface(DefaultContentPage):
|
|
8
9
|
"""
|
|
9
10
|
Handler interface can be implemented to provide more complex means for obtaining data from the internet.
|
|
10
|
-
For example to obtain data about YouTube video you can fetch JSON file from yt-dlp, but also ask
|
|
11
|
+
For example to obtain data about YouTube video you can fetch JSON file from yt-dlp, but also ask
|
|
11
12
|
return dislike page to obtain dislike ratio.
|
|
12
13
|
"""
|
|
13
14
|
|
|
@@ -137,7 +137,9 @@ class YouTubeVideoHandler(DefaultCompoundChannelHandler):
|
|
|
137
137
|
return super().get_social_data()
|
|
138
138
|
|
|
139
139
|
def get_return_dislike_url_link(self):
|
|
140
|
-
return
|
|
140
|
+
return (
|
|
141
|
+
"https://returnyoutubedislikeapi.com/votes?videoId=" + self.get_video_code()
|
|
142
|
+
)
|
|
141
143
|
|
|
142
144
|
def get_view_count(self):
|
|
143
145
|
""" """
|
|
@@ -25,6 +25,7 @@ class DefaultContentPage(ContentInterface):
|
|
|
25
25
|
"""
|
|
26
26
|
Default content page that does not throw exceptions
|
|
27
27
|
"""
|
|
28
|
+
|
|
28
29
|
def __init__(self, url, contents=""):
|
|
29
30
|
super().__init__(url=url, contents=contents)
|
|
30
31
|
|
|
@@ -66,8 +67,9 @@ class JsonPage(ContentInterface):
|
|
|
66
67
|
"""
|
|
67
68
|
JSON page
|
|
68
69
|
"""
|
|
70
|
+
|
|
69
71
|
def __init__(self, url, contents):
|
|
70
|
-
"""
|
|
72
|
+
"""Constructor"""
|
|
71
73
|
super().__init__(url=url, contents=contents)
|
|
72
74
|
|
|
73
75
|
self.json_obj = None
|
|
@@ -80,9 +82,9 @@ class JsonPage(ContentInterface):
|
|
|
80
82
|
# to be expected
|
|
81
83
|
pass
|
|
82
84
|
|
|
83
|
-
#try:
|
|
85
|
+
# try:
|
|
84
86
|
# WebLogger.debug(f"Invalid json:{contents}")
|
|
85
|
-
#except Exception as E:
|
|
87
|
+
# except Exception as E:
|
|
86
88
|
# print(str(E))
|
|
87
89
|
|
|
88
90
|
def is_valid(self) -> bool:
|
|
@@ -128,7 +130,7 @@ class JsonPage(ContentInterface):
|
|
|
128
130
|
|
|
129
131
|
class RssPageEntry(ContentInterface):
|
|
130
132
|
def __init__(self, feed_index, feed_entry, url, contents, page_object_properties):
|
|
131
|
-
"""
|
|
133
|
+
"""Constructor"""
|
|
132
134
|
self.feed_index = feed_index
|
|
133
135
|
self.feed_entry = feed_entry
|
|
134
136
|
self.url = url
|
|
@@ -138,7 +140,7 @@ class RssPageEntry(ContentInterface):
|
|
|
138
140
|
super().__init__(url=self.url, contents=contents)
|
|
139
141
|
|
|
140
142
|
def get_properties(self):
|
|
141
|
-
"""
|
|
143
|
+
"""Returns map of properties"""
|
|
142
144
|
output_map = {}
|
|
143
145
|
|
|
144
146
|
link = None
|
|
@@ -310,7 +312,7 @@ class RssPage(ContentInterface):
|
|
|
310
312
|
"""
|
|
311
313
|
|
|
312
314
|
def __init__(self, url, contents):
|
|
313
|
-
"""
|
|
315
|
+
"""Constructor"""
|
|
314
316
|
self.feed = None
|
|
315
317
|
|
|
316
318
|
"""
|
|
@@ -582,8 +584,9 @@ class RssContentReader(object):
|
|
|
582
584
|
"""
|
|
583
585
|
RSS reader
|
|
584
586
|
"""
|
|
587
|
+
|
|
585
588
|
def __init__(self, url, contents):
|
|
586
|
-
"""
|
|
589
|
+
"""Constructor"""
|
|
587
590
|
self.contents = contents
|
|
588
591
|
self.process()
|
|
589
592
|
|
|
@@ -608,9 +611,10 @@ class RssContentReader(object):
|
|
|
608
611
|
|
|
609
612
|
|
|
610
613
|
class OpmlPageEntry(ContentInterface):
|
|
611
|
-
"""
|
|
614
|
+
"""OPML Page entry"""
|
|
615
|
+
|
|
612
616
|
def __init__(self, url, contents, opml_entry):
|
|
613
|
-
"""
|
|
617
|
+
"""Constructor"""
|
|
614
618
|
super().__init__(url=url, contents=contents)
|
|
615
619
|
self.opml_entry = opml_entry
|
|
616
620
|
self.title = None
|
|
@@ -718,7 +722,7 @@ class HtmlPage(ContentInterface):
|
|
|
718
722
|
"""
|
|
719
723
|
|
|
720
724
|
def __init__(self, url, contents):
|
|
721
|
-
"""
|
|
725
|
+
"""Constructor"""
|
|
722
726
|
super().__init__(url=url, contents=contents)
|
|
723
727
|
|
|
724
728
|
if self.contents:
|
|
@@ -1139,9 +1143,9 @@ class HtmlPage(ContentInterface):
|
|
|
1139
1143
|
# props["robots_txt_url"] = UrlLocation(self.url).get_robots_txt_url()
|
|
1140
1144
|
# props["site_maps_urls"] = self.get_site_maps()
|
|
1141
1145
|
|
|
1142
|
-
#props["links"] = self.get_links()
|
|
1143
|
-
#props["links_inner"] = self.get_links_inner()
|
|
1144
|
-
#props["links_outer"] = self.get_links_outer()
|
|
1146
|
+
# props["links"] = self.get_links()
|
|
1147
|
+
# props["links_inner"] = self.get_links_inner()
|
|
1148
|
+
# props["links_outer"] = self.get_links_outer()
|
|
1145
1149
|
|
|
1146
1150
|
props["favicons"] = self.get_favicons()
|
|
1147
1151
|
props["contents"] = self.get_contents()
|
|
@@ -1289,7 +1293,7 @@ class XmlPage(ContentInterface):
|
|
|
1289
1293
|
"""
|
|
1290
1294
|
|
|
1291
1295
|
def __init__(self, url, contents):
|
|
1292
|
-
"""
|
|
1296
|
+
"""Constructor"""
|
|
1293
1297
|
super().__init__(url=url, contents=contents)
|
|
1294
1298
|
|
|
1295
1299
|
def is_valid(self) -> bool:
|