webtoolkit 0.0.122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. webtoolkit/__init__.py +38 -0
  2. webtoolkit/baseurl.py +617 -0
  3. webtoolkit/contentinterface.py +414 -0
  4. webtoolkit/contentlinkparser.py +270 -0
  5. webtoolkit/contentmoderation.py +226 -0
  6. webtoolkit/contenttext.py +89 -0
  7. webtoolkit/crawlers/__init__.py +7 -0
  8. webtoolkit/crawlers/crawlerinterface.py +196 -0
  9. webtoolkit/crawlers/crawlers.py +305 -0
  10. webtoolkit/domaincache.py +246 -0
  11. webtoolkit/handlers/__init__.py +9 -0
  12. webtoolkit/handlers/defaulturlhandler.py +215 -0
  13. webtoolkit/handlers/handlerchannelodysee.py +103 -0
  14. webtoolkit/handlers/handlerchannelyoutube.py +185 -0
  15. webtoolkit/handlers/handlerhttppage.py +363 -0
  16. webtoolkit/handlers/handlerinterface.py +213 -0
  17. webtoolkit/handlers/handlers.py +499 -0
  18. webtoolkit/handlers/handlervideoodysee.py +93 -0
  19. webtoolkit/handlers/handlervideoyoutube.py +164 -0
  20. webtoolkit/pages.py +1613 -0
  21. webtoolkit/remoteserver.py +281 -0
  22. webtoolkit/remoteurl.py +152 -0
  23. webtoolkit/request.py +178 -0
  24. webtoolkit/response.py +594 -0
  25. webtoolkit/statuses.py +158 -0
  26. webtoolkit/tests/fake/__init__.py +3 -0
  27. webtoolkit/tests/fake/codeproject.py +282 -0
  28. webtoolkit/tests/fake/firebog.py +49 -0
  29. webtoolkit/tests/fake/geekwirecom.py +149 -0
  30. webtoolkit/tests/fake/githubcom.py +128 -0
  31. webtoolkit/tests/fake/hackernews.py +431 -0
  32. webtoolkit/tests/fake/instance.py +239 -0
  33. webtoolkit/tests/fake/opmlfile.py +24 -0
  34. webtoolkit/tests/fake/reddit.py +498 -0
  35. webtoolkit/tests/fake/returndislike.py +3 -0
  36. webtoolkit/tests/fake/robotstxtcom.py +4 -0
  37. webtoolkit/tests/fake/thehill.py +1952 -0
  38. webtoolkit/tests/fake/warhammercommunity.py +245 -0
  39. webtoolkit/tests/fake/youtube.py +2320 -0
  40. webtoolkit/tests/fakeinternet.py +272 -0
  41. webtoolkit/tests/fakeinternetcontents.py +183 -0
  42. webtoolkit/tests/fakeresponse.py +947 -0
  43. webtoolkit/tests/mocks.py +103 -0
  44. webtoolkit/urllocation.py +734 -0
  45. webtoolkit/utils/dateutils.py +152 -0
  46. webtoolkit/utils/logger.py +169 -0
  47. webtoolkit/webconfig.py +130 -0
  48. webtoolkit/webtools.py +230 -0
  49. webtoolkit-0.0.122.dist-info/LICENSE +674 -0
  50. webtoolkit-0.0.122.dist-info/METADATA +267 -0
  51. webtoolkit-0.0.122.dist-info/RECORD +52 -0
  52. webtoolkit-0.0.122.dist-info/WHEEL +4 -0
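To inspect this release locally, the wheel can be opened like any zip archive. A minimal sketch, assuming the wheel file has already been downloaded (e.g. via `pip download webtoolkit==0.0.122 --no-deps`):

    # List the files contained in the wheel (a wheel is a zip archive)
    from zipfile import ZipFile

    with ZipFile("webtoolkit-0.0.122-py3-none-any.whl") as wheel:
        for name in wheel.namelist():
            print(name)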
webtoolkit/__init__.py ADDED
@@ -0,0 +1,38 @@
+ """
+ Similar project: https://pypi.org/project/abstract-webtools/
+ """
+
+ from .webtools import *
+ from .statuses import *
+ from .webconfig import WebConfig
+ from .response import *
+ from .request import *
+
+ from .contentinterface import ContentInterface
+ from .contentlinkparser import ContentLinkParser
+ from .contenttext import ContentText
+ from .urllocation import UrlLocation
+ from .remoteserver import RemoteServer
+ from .remoteurl import RemoteUrl
+
+ from .pages import (
+     DefaultContentPage,
+     HtmlPage,
+     RssPage,
+     RssContentReader,
+     OpmlPage,
+     JsonPage,
+     PageFactory,
+ )
+
+ from .contentmoderation import (
+     UrlPropertyValidator,
+     UrlAgeModerator,
+ )
+
+ from .crawlers import *
+ from .handlers import *
+
+ from .baseurl import BaseUrl
+ from .domaincache import *
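The top-level namespace re-exports the package's main building blocks, so typical use only needs the root import. A minimal usage sketch based on the API shown in this diff (the example address is illustrative):

    import webtoolkit

    # BaseUrl is the main entry point (see webtoolkit/baseurl.py below)
    url = webtoolkit.BaseUrl(url="https://example.com")
    response = url.get_response()
    if response and response.is_valid():
        print(url.get_title())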
webtoolkit/baseurl.py ADDED
@@ -0,0 +1,617 @@
+ """
+ Main Url handling class.
+
+ @example
+ url = BaseUrl(url="https://google.com")
+ response = url.get_response()
+
+ options.request.url
+ options.mode_mapping
+ """
+
+ import base64
+
+ from .utils.dateutils import DateUtils
+
+ from .pages import (
+     ContentInterface,
+     DefaultContentPage,
+     RssPage,
+     HtmlPage,
+ )
+ from .webtools import (
+     calculate_hash,
+     WebLogger,
+ )
+ from .urllocation import (
+     UrlLocation,
+     URL_TYPE_RSS,
+     URL_TYPE_CSS,
+     URL_TYPE_JAVASCRIPT,
+     URL_TYPE_HTML,
+     URL_TYPE_FONT,
+     URL_TYPE_UNKNOWN,
+ )
+
+ from .statuses import status_code_to_text
+ from .response import response_to_json
+ from .request import request_to_json, PageRequestObject
+ from .handlers import (
+     HandlerInterface,
+     HttpPageHandler,
+     OdyseeVideoHandler,
+     OdyseeChannelHandler,
+     RedditUrlHandler,
+     ReturnDislike,
+     GitHubUrlHandler,
+     HackerNewsHandler,
+     InternetArchive,
+     FourChanChannelHandler,
+     TwitterUrlHandler,
+     YouTubeVideoHandler,
+     YouTubeChannelHandler,
+ )
+
+ from .crawlers import (
+     RequestsCrawler,
+ )
+
+
+ class BaseUrl(ContentInterface):
+     """
+     Encapsulates the page data and the builder used to make the request.
+     """
+
+     def __init__(self, url=None, request=None, url_builder=None):
+         """
+         @param url Link to be processed
+         @param request Request object to use; takes precedence over url
+         @param url_builder Class used to construct further urls (defaults to BaseUrl)
+
+         There are various ways a url can be specified, so for simplicity we
+         clean it up. I do not like trailing slashes, google redirect links, etc.
+         """
+         if not request and url:
+             self.request_url = url
+             self.request = self.get_request_for_url(url)
+         else:
+             self.request_url = request.url
+             self.request = request
+
+         if not self.request.crawler_type:
+             self.request = self.get_request_for_request(self.request)
+
+         self.url = self.request.url
+         self.handler = None
+         self.response = None
+         self.url_builder = url_builder
+
+         if self.request.url:
+             self.request.url = self.get_cleaned_link()
+         else:
+             WebLogger.error("Url needs to be specified")
+             return
+
+         if not self.url_builder:
+             self.url_builder = BaseUrl
+     def get_request_for_url(self, url):
+         request = PageRequestObject(url)
+         request.crawler_name = "RequestsCrawler"
+         request.crawler_type = RequestsCrawler(url)
+
+         return request
+
+     def get_request_for_request(self, request):
+         request.crawler_name = "RequestsCrawler"
+         request.crawler_type = RequestsCrawler(request.url)
+
+         return request
+
+     def get_handlers(self):
+         # fmt: off
+
+         return [
+             YouTubeVideoHandler,
+             OdyseeVideoHandler,
+             OdyseeChannelHandler,
+             RedditUrlHandler,
+             ReturnDislike,
+             GitHubUrlHandler,
+             HackerNewsHandler,
+             InternetArchive,
+             FourChanChannelHandler,
+             TwitterUrlHandler,
+             YouTubeChannelHandler,  # present here, in case somebody wants to call it by name
+             HttpPageHandler,  # default
+         ]
+         # fmt: on
+
+     def get_handler_by_name(self, handler_name):
+         handlers = self.get_handlers()
+         for handler in handlers:
+             if handler.__name__ == handler_name:
+                 return handler
+
+     def get_handler(self):
+         """
+         This function does not fetch anything by itself.
+         """
+         if self.handler:
+             return self.handler
+
+         self.handler = self.get_handler_implementation()
+         return self.handler
+
+     def get_type(self):
+         """
+         Identifies the type based on the link structure alone.
+         Should provide a faster means of obtaining a handler, without the
+         need to fetch the page.
+
+         TODO maybe we should 'ping' the page to see its status
+         """
+         # based on the link's 'appearance'
+
+         url = self.request.url
+
+         if not url:
+             return
+
+         p = UrlLocation(url)
+         short_url = p.get_protocolless()
+         if not short_url:
+             return
+
+         handlers = self.get_handlers()
+         for handler in handlers:
+             if handler(url=url).is_handled_by():
+                 if handler == HttpPageHandler:
+                     page_type = UrlLocation(url).get_type()
+
+                     # TODO this should return HttpPageHandler?
+
+                     if page_type == URL_TYPE_HTML:
+                         return HtmlPage(url, "")
+
+                     if page_type == URL_TYPE_RSS:
+                         return RssPage(url, "")
+
+                     if url.find("rss") >= 0:
+                         return RssPage(url, "")
+                     if url.find("feed") >= 0:
+                         return RssPage(url, "")
+
+                     return
+
+                 return handler(url)
+
+     def get_contents(self):
+         """
+         Returns text.
+         """
+         if self.get_response():
+             return self.get_response().get_text()
+
+     def get_binary(self):
+         """
+         Returns binary contents.
+         """
+         response = self.get_response()
+         if response:
+             return response.get_binary()
+
+     def get_response(self):
+         """
+         Returns the full response, with the page handling object.
+         """
+         if self.response:
+             return self.response
+
+         if not self.handler:
+             self.handler = self.get_handler_implementation()
+
+         if self.handler:
+             if self.request.respect_robots:
+                 if not self.is_allowed():
+                     return
+
+             self.response = self.handler.get_response()
+             if self.response:
+                 if not self.response.is_valid():
+                     WebLogger.error(
+                         "Url:{} Response is invalid:{}".format(self.request.url, self.response),
+                         detail_text=str(response_to_json(self.response)),
+                     )
+
+         return self.response
+
+     def get_streams(self):
+         streams = []
+         streams_data = []
+
+         handler = self.get_handler()
+
+         if handler:
+             if self.request.respect_robots:
+                 if not self.is_allowed():
+                     return []
+
+             streams = handler.get_streams()
+
+         if streams:
+             for response in streams.values():
+                 response_json = response_to_json(response)
+                 streams_data.append(response_json)
+
+         return streams_data
+
+     def get_headers(self):
+         # TODO implement
+         pass
+
+     def ping(self, timeout_s=20, user_agent=None):
+         # TODO if that fails we would have to find a suitable agent, and then ping
+         return RequestsCrawler(self.request.url).ping()
+
+     def get_handler_implementation(self):
+         url = self.request.url
+         if not url:
+             return
+
+         p = UrlLocation(url)
+         short_url = p.get_protocolless()
+
+         if not short_url:
+             return
+
+         handlers = self.get_handlers()
+         for handler in handlers:
+             if self.request.handler_name and self.request.handler_name != handler.__name__:
+                 continue
+             if self.request.handler_type and self.request.handler_type != handler:
+                 continue
+
+             h = handler(
+                 url=self.request.url, request=self.request, url_builder=self.url_builder
+             )
+             if h.is_handled_by():
+                 self.request.url = h.url
+                 return h
+
+         if url.startswith("https") or url.startswith("http"):
+             return HttpPageHandler(
+                 url=url, request=self.request, url_builder=self.url_builder
+             )
+         elif url.startswith("smb") or url.startswith("ftp"):
+             raise NotImplementedError("Protocol has not been implemented")
+
+     def get_cleaned_link(self):
+         url = self.request.url
+
+         url = url.strip()
+
+         if url.endswith("/"):
+             url = url[:-1]
+         if url.endswith("."):
+             url = url[:-1]
+
+         # domain is lowercase
+         return UrlLocation.get_cleaned_link(url)
+
+     def get_url(self):
+         self.get_handler()
+         if self.handler:
+             return self.handler.get_url()
+         else:
+             return self.request.url
+
+     def get_canonical_url(self):
+         if self.handler:
+             return self.handler.get_canonical_url()
+
+         handlers = self.get_handlers()
+         for handler_class in handlers:
+             handler = handler_class(url=self.request.url)
+             if handler.is_handled_by():
+                 return handler.get_canonical_url()
+
+     def get_urls(self):
+         properties = {}
+         properties["link"] = self.request.url
+         properties["link_request"] = self.request_url
+         canonical = self.get_canonical_url()
+         if canonical:
+             properties["link_canonical"] = canonical
+         return properties
+
+     def get_urls_archive(self):
+         p = UrlLocation(self.request.url)
+         short_url = p.get_protocolless()
+
+         properties = []
+
+         archive = InternetArchive(self.request.url)
+         properties.append(archive.get_archive_url())
+
+         properties.append("https://archive.ph/" + short_url)
+
+         return properties
+
+     def __str__(self):
+         return "{}".format(self.request)
+
+     def is_valid(self):
+         if not self.handler:
+             return False
+
+         if self.response is None:
+             return False
+
+         if self.response and not self.response.is_valid():
+             return False
+
+         if not self.handler.is_valid():
+             return False
+
+         return True
+
+     def get_title(self):
+         if self.handler:
+             return self.handler.get_title()
+
+     def get_description(self):
+         if self.handler:
+             return self.handler.get_description()
+
+     def get_language(self):
+         if self.handler:
+             return self.handler.get_language()
+
+     def get_thumbnail(self):
+         if self.handler:
+             return self.handler.get_thumbnail()
+
+     def get_author(self):
+         if self.handler:
+             return self.handler.get_author()
+
+     def get_album(self):
+         if self.handler:
+             return self.handler.get_album()
+
+     def get_tags(self):
+         if self.handler:
+             return self.handler.get_tags()
+
+     def get_date_published(self):
+         if self.handler:
+             return self.handler.get_date_published()
+
+     def get_status_code(self):
+         if self.response:
+             return self.response.get_status_code()
+
+         return 0
+
+     def get_entries(self):
+         handler = self.get_handler()
+         if handler:
+             return handler.get_entries()
+         else:
+             return []
+
+     def find_rss_url(self):
+         """
+         TODO remove
+         """
+         url = self.url
+
+         if not url:
+             return
+
+         handler = self.get_handler()
+
+         if handler:
+             if type(handler) is HttpPageHandler:
+                 if type(handler.p) is RssPage:
+                     return self
+
+         # maybe our handler is able to produce a feed without asking for a response
+
+         feeds = self.get_feeds()
+         if url in feeds:
+             return self
+
+         if feeds and len(feeds) > 0:
+             u = self.url_builder(url=feeds[0])
+             return u
+
+     def get_feeds(self):
+         result = []
+
+         handler = self.get_handler()
+         if handler:
+             return handler.get_feeds()
+
+         return result
+
+     def get_contents_hash(self):
+         handler = self.get_handler()
+         if handler:
+             return handler.get_contents_hash()
+
+     def get_contents_body_hash(self):
+         handler = self.get_handler()
+         if handler:
+             return handler.get_contents_body_hash()
+
+     def get_properties(self, full=False, include_social=False, check_robots=False):
+         response = self.get_response()
+
+         properties_data = self.get_properties_data()
+         if not full:
+             return properties_data
+
+         all_properties = []
+
+         all_properties.append({"name": "Properties", "data": properties_data})
+
+         properties_hash = self.property_encode(calculate_hash(str(properties_data)))
+         all_properties.append({"name": "PropertiesHash", "data": properties_hash})
+
+         if response:
+             if response.get_text():
+                 all_properties.append(
+                     {"name": "Text", "data": {"Contents": response.get_text()}}
+                 )
+             elif response.get_binary():
+                 all_properties.append(
+                     {
+                         "name": "Binary",
+                         "data": {
+                             "Contents": self.property_encode(response.get_binary())
+                         },
+                     }
+                 )
+
+         streams = self.get_streams()
+         all_properties.append({"name": "Streams", "data": streams})
+
+         # TODO request is part of response now. Should we include it?
+         request_data = request_to_json(self.request)
+         request_data["crawler_type"] = type(request_data["crawler_type"]).__name__
+         all_properties.append({"name": "Request", "data": request_data})
+
+         response_data = self.get_response_data()
+         all_properties.append({"name": "Response", "data": response_data})
+         if response:
+             raw_headers_data = response.get_headers()
+             all_properties.append({"name": "Headers", "data": raw_headers_data})
+         else:
+             all_properties.append({"name": "Headers", "data": None})
+
+         if include_social:
+             social_data = self.get_social_properties()
+             if social_data:
+                 all_properties.append({"name": "Social", "data": social_data})
+
+         entries_data = self.get_entry_data()
+         all_properties.append({"name": "Entries", "data": entries_data})
+
+         return all_properties
+
+     def get_properties_data(self):
+         properties = super().get_properties()
+         page_handler = self.get_handler()
+
+         properties["link_request"] = self.request_url
+
+         feeds = self.get_feeds()
+         if len(feeds) > 0:
+             properties["feeds"] = list(feeds)
+
+         is_channel = YouTubeChannelHandler(url=self.url).is_handled_by()
+
+         if page_handler:
+             """
+             TODO detect the type of handler. IsChannel?
+             """
+             if is_channel:
+                 if page_handler.get_channel_name():
+                     properties["channel_name"] = page_handler.get_channel_name()
+                     properties["channel_url"] = page_handler.get_channel_url()
+
+             if type(page_handler) is HttpPageHandler and type(page_handler.p) is HtmlPage:
+                 properties["favicon"] = page_handler.p.get_favicon()
+                 properties["meta title"] = page_handler.p.get_meta_field("title")
+                 properties["meta description"] = page_handler.p.get_meta_field(
+                     "description"
+                 )
+                 properties["meta keywords"] = page_handler.p.get_meta_field("keywords")
+
+                 properties["og:title"] = page_handler.p.get_og_field("title")
+                 properties["og:description"] = page_handler.p.get_og_field("description")
+                 properties["og:image"] = page_handler.p.get_og_field("image")
+                 properties["og:site_name"] = page_handler.p.get_og_field("site_name")
+                 properties["schema:thumbnailUrl"] = page_handler.p.get_schema_field(
+                     "thumbnailUrl"
+                 )
+
+         properties["link_archives"] = self.get_urls_archive()
+
+         return properties
+
+     def response_to_data(self, response):
+         response_data = response_to_json(response)
+
+         is_allowed = True
+         if self.request.respect_robots:
+             is_allowed = self.is_allowed()
+
+         response_data["is_allowed"] = is_allowed
+
+         return response_data
+
+     def get_response_data(self):
+         """
+         Easily digestible response data.
+         """
+         response = self.get_response()
+         response_data = self.response_to_data(response)
+         return response_data
+
+     def get_entry_data(self):
+         result = []
+
+         entries = self.get_entries()
+
+         if entries:
+             for entry in entries:
+                 if "feed_entry" in entry:
+                     del entry["feed_entry"]
+                 result.append(entry)
+
+         return result
+
+     def property_encode(self, byte_property):
+         return base64.b64encode(byte_property).decode("utf-8")
+
+     def is_allowed(self):
+         """
+         TODO remove?
+         """
+         domain_info = self.get_domain_info()
+         return domain_info.is_allowed(self.request.url)
+
+     def get_social_properties(self):
+         handler = self.get_handler()
+         if not handler:
+             i = HandlerInterface()
+             return i.get_social_data()
+
+         handler.get_json_data()
+         return handler.get_social_data()
+
+     def get_properties_section(self, section_name, all_properties):
+         if not all_properties:
+             return
+
+         if "success" in all_properties and not all_properties["success"]:
+             # print("Url:{} Remote error. Not a success".format(link))
+             print("Remote error. Not a success")
+             # WebLogger.error(all_properties["error"])
+             return False
+
+         for properties in all_properties:
+             if section_name == properties["name"]:
+                 return properties["data"]
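Taken together, get_properties and get_properties_section form a simple property-inspection flow: build the full section list, then pull out one section by name. A minimal usage sketch, assuming a reachable URL (the example address is illustrative):

    from webtoolkit import BaseUrl

    url = BaseUrl(url="https://example.com")
    all_properties = url.get_properties(full=True)

    # Each section is a {"name": ..., "data": ...} entry; fetch one by name.
    response_section = url.get_properties_section("Response", all_properties)
    if response_section is not None:
        print(response_section)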