PyPI - webtoolkit - Versions diffs - 0.0.182__tar.gz → 0.0.184__tar.gz - Mend

webtoolkit 0.0.182.tar.gz → 0.0.184.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: webtoolkit
-Version: 0.0.182
+Version: 0.0.184
 Summary: Web tools and interfaces for Internet data processing.
 License: GPL3
 Author: Iwan Grozny

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@
 [tool.poetry]
 name = "webtoolkit"
-version = "0.0.182"
+version = "0.0.184"
 description = "Web tools and interfaces for Internet data processing."
 authors = ["Iwan Grozny <renegat@renegat0x0.ddns.net>"]
 license = "GPL3"

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/baseurl.py RENAMED Viewed

@@ -7,6 +7,7 @@ response = url.get_response()
 """
 import base64
+from typing import Any, Callable, Optional, Type
 from .utils.dateutils import DateUtils
@@ -59,6 +60,7 @@ class BaseUrl(ContentInterface):
     """
     Base Url class capable of reading network pages.
     """
     def __init__(self, url=None, request=None, url_builder=None):
         """
         Constructor
@@ -105,7 +107,7 @@ class BaseUrl(ContentInterface):
         Returns available handlers.
         Order is important - from the most precise handler to the most general.
         """
-        #fmt off
+        # fmt off
         return [
             YouTubeVideoHandler,
@@ -118,13 +120,13 @@ class BaseUrl(ContentInterface):
             InternetArchive,
             FourChanChannelHandler,
             TwitterUrlHandler,
-            YouTubeChannelHandler,      # present here, if somebody wants to call it by name
-            HttpPageHandler,            # default
+            YouTubeChannelHandler,  # present here, if somebody wants to call it by name
+            HttpPageHandler,  # default
         ]
-        #fmt on
+        # fmt on
     def get_handler_by_name(self, handler_name):
-        """ Returns handler class """
+        """Returns handler class"""
         handlers = self.get_handlers()
         for handler in handlers:
             if handler.__name__ == handler_name:
@@ -218,8 +220,10 @@ class BaseUrl(ContentInterface):
             if self.response:
                 if not self.response.is_valid():
                     WebLogger.error(
-                        "Url:{} Response is invalid:{}".format(self.request.url, self.response),
-                        detail_text = str(response_to_json(self.response))
+                        "Url:{} Response is invalid:{}".format(
+                            self.request.url, self.response
+                        ),
+                        detail_text=str(response_to_json(self.response)),
                     )
             return self.response
@@ -256,7 +260,7 @@ class BaseUrl(ContentInterface):
         return RequestsCrawler(self.request.url).ping()
     def get_handler_implementation(self):
-        """ Returns handler """
+        """Returns handler"""
         url = self.request.url
         if not url:
             return
@@ -269,7 +273,11 @@ class BaseUrl(ContentInterface):
         handlers = self.get_handlers()
         for handler in handlers:
-            if self.request.handler_name and self.request.handler_name != "" and self.request.handler_name != handler.__name__:
+            if (
+                self.request.handler_name
+                and self.request.handler_name != ""
+                and self.request.handler_name != handler.__name__
+            ):
                 continue
             if self.request.handler_type and self.request.handler_type != handler:
                 continue
@@ -289,7 +297,7 @@ class BaseUrl(ContentInterface):
             raise NotImplementedError("Protocol has not been implemented")
     def get_cleaned_link(self):
-        """ Returns cleaned up link. Free of unwanted args, tracking, sanitized. """
+        """Returns cleaned up link. Free of unwanted args, tracking, sanitized."""
         url = self.request.url
         url = url.strip()
@@ -310,7 +318,7 @@ class BaseUrl(ContentInterface):
             return self.request.url
     def get_urls(self):
-        """ Returns various link versions for URL """
+        """Returns various link versions for URL"""
         properties = {}
         properties["link"] = self.request.url
         properties["link_request"] = self.request_url
@@ -320,7 +328,7 @@ class BaseUrl(ContentInterface):
         return properties
     def get_canonical_url(self):
-        """ Returns canonical link """
+        """Returns canonical link"""
         if self.handler:
             return self.handler.get_canonical_url()
@@ -331,7 +339,7 @@ class BaseUrl(ContentInterface):
                 return handler.get_canonical_url()
     def get_urls_archive(self):
-        """ Returns archive link for URL """
+        """Returns archive link for URL"""
         p = UrlLocation(self.request.url)
         short_url = p.get_protocolless()
@@ -348,7 +356,7 @@ class BaseUrl(ContentInterface):
         return "{}".format(self.request)
     def is_valid(self):
-        """ Returns indication if URL is valid """
+        """Returns indication if URL is valid"""
         if not self.handler:
             return False
@@ -364,54 +372,54 @@ class BaseUrl(ContentInterface):
         return True
     def get_title(self):
-        """ Returns title """
+        """Returns title"""
         if self.handler:
             return self.handler.get_title()
     def get_description(self):
-        """ Returns description """
+        """Returns description"""
         if self.handler:
             return self.handler.get_description()
     def get_language(self):
-        """ Returns language """
+        """Returns language"""
         if self.handler:
             return self.handler.get_language()
     def get_thumbnail(self):
-        """ Returns thumbnail """
+        """Returns thumbnail"""
         if self.handler:
             return self.handler.get_thumbnail()
     def get_author(self):
-        """ Returns author """
+        """Returns author"""
         if self.handler:
             return self.handler.get_author()
     def get_album(self):
-        """ Returns album """
+        """Returns album"""
         if self.handler:
             return self.handler.get_album()
     def get_tags(self):
-        """ Returns tags """
+        """Returns tags"""
         if self.handler:
             return self.handler.get_tags()
     def get_date_published(self):
-        """ Returns date published """
+        """Returns date published"""
         if self.handler:
             return self.handler.get_date_published()
     def get_status_code(self) -> int | None:
-        """ Returns status code """
+        """Returns status code"""
         if self.response:
             return self.response.get_status_code()
         return 0
     def get_entries(self):
-        """ Returns entries list """
+        """Returns entries list"""
         handler = self.get_handler()
         if handler:
@@ -446,7 +454,7 @@ class BaseUrl(ContentInterface):
             return u
     def get_feeds(self):
-        """ Returns feeds found for URL """
+        """Returns feeds found for URL"""
         result = []
         handler = self.get_handler()
@@ -459,23 +467,24 @@ class BaseUrl(ContentInterface):
         return calculate_hash(text)
     def get_hash(self):
-        """ Returns hash for URL """
+        """Returns hash for URL"""
         handler = self.get_handler()
         if handler:
             return handler.get_hash()
     def get_body_hash(self):
-        """ Returns body hash for URL """
+        """Returns body hash for URL"""
         handler = self.get_handler()
         if handler:
             return handler.get_body_hash()
-    def get_meta_hash(self):
-        """ Returns meta hash for URL """
-        response = self.get_response()
+    def get_meta_hash(self) -> Optional[str]:
+        """
+        Calculates and returns a hash of the page's metadata properties.
+        :return: A base64-encoded hash of the properties.
+        """
+        self.get_response()
         properties_data = self.get_properties_data()
         properties_hash = self.property_encode(calculate_hash(str(properties_data)))
         return properties_hash
@@ -486,7 +495,7 @@ class BaseUrl(ContentInterface):
         return self.get_properties_data()
     def get_all_properties(self, include_social=False):
-        """ Returns all URL properties """
+        """Returns all URL properties"""
         response = self.get_response()
         properties_data = self.get_properties()
@@ -540,8 +549,8 @@ class BaseUrl(ContentInterface):
         return all_properties
     def get_properties_data(self):
-        """ Returns simple meta properties.
-        TODO there should two functions: get_all_properties and get_properties """
+        """Returns simple meta properties.
+        TODO there should two functions: get_all_properties and get_properties"""
         properties = super().get_properties()
         page_handler = self.get_handler()
@@ -567,7 +576,10 @@ class BaseUrl(ContentInterface):
                     properties["channel_name"] = page_handler.get_channel_name()
                     properties["channel_url"] = page_handler.get_channel_url()
-            if type(page_handler) is HttpPageHandler and type(page_handler.p) is HtmlPage:
+            if (
+                type(page_handler) is HttpPageHandler
+                and type(page_handler.p) is HtmlPage
+            ):
                 properties["favicon"] = page_handler.p.get_favicon()
                 properties["meta title"] = page_handler.p.get_meta_field("title")
                 properties["meta description"] = page_handler.p.get_meta_field(
@@ -576,7 +588,9 @@ class BaseUrl(ContentInterface):
                 properties["meta keywords"] = page_handler.p.get_meta_field("keywords")
                 properties["og:title"] = page_handler.p.get_og_field("title")
-                properties["og:description"] = page_handler.p.get_og_field("description")
+                properties["og:description"] = page_handler.p.get_og_field(
+                    "description"
+                )
                 properties["og:image"] = page_handler.p.get_og_field("image")
                 properties["og:site_name"] = page_handler.p.get_og_field("site_name")
                 properties["schema:thumbnailUrl"] = page_handler.p.get_schema_field(
@@ -631,11 +645,13 @@ class BaseUrl(ContentInterface):
         """
         Returns indication is access is allowed for bots, robots
         """
-        domain_info = DomainCache.get_object(url =self.request.url, url_builder=self.url_builder)
+        domain_info = DomainCache.get_object(
+            url=self.request.url, url_builder=self.url_builder
+        )
         return domain_info.is_allowed(self.request.url)
     def get_social_properties(self):
-        """ Returns social properties """
+        """Returns social properties"""
         url = self.request.url
         json_obj = {}

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/contentinterface.py RENAMED Viewed

@@ -20,6 +20,7 @@ class ContentInterface(object):
     """
     Content interface
     """
     def __init__(self, url, contents):
         self.url = url
         self.contents = contents

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/contentlinkparser.py RENAMED Viewed

@@ -164,10 +164,12 @@ class ContentLinkParser(ContentInterface):
             item = item[wh + 1 :]
         # not absolute path
-        if (not item.startswith("http")
+        if (
+            not item.startswith("http")
             and not item.startswith("https")
             and not item.startswith("ftp")
-            and not item.startswith("smb")):
+            and not item.startswith("smb")
+        ):
             location = UrlLocation("https://" + item)
             domain = location.get_domain_only()
@@ -179,15 +181,15 @@ class ContentLinkParser(ContentInterface):
                     return
                 item = self.join_url_parts(url, item)
-        if (not item.startswith("http")
+        if (
+            not item.startswith("http")
             and not item.startswith("https")
             and not item.startswith("ftp")
-            and not item.startswith("smb")):
+            and not item.startswith("smb")
+        ):
             item = "https://" + item
-        if item.startswith("https:&#x2F;&#x2F") or item.startswith(
-            "http:&#x2F;&#x2F"
-        ):
+        if item.startswith("https:&#x2F;&#x2F") or item.startswith("http:&#x2F;&#x2F"):
             item = ContentLinkParser.decode_url(item)
         return item

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/crawlers/crawlerinterface.py RENAMED Viewed

@@ -1,6 +1,7 @@
 """
 Crawler interface can be implemented to provide new mechanisms of crawling
 """
 import json
 import os
 import base64
@@ -46,6 +47,7 @@ class CrawlerInterface(object):
     Crawler is a tool that allows to obtain contents from the internet.
     There are various tools.
     """
     def __init__(self, url=None, request=None):
         """
         @param response_file If set, response is stored in a file

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/crawlers/requestscrawler.py RENAMED Viewed

@@ -202,9 +202,7 @@ class RequestsCrawler(CrawlerInterface):
         def request_with_timeout(request, stream, result):
             try:
-                result["response"] = self.make_requests_call(
-                    request, stream
-                )
+                result["response"] = self.make_requests_call(request, stream)
             except Exception as e:
                 result["exception"] = e
@@ -297,5 +295,5 @@ class RequestsCrawler(CrawlerInterface):
     def update_request(self):
         self.request.timeout_s = self.get_timeout_s()
-        #TODO - headers are not set
+        # TODO - headers are not set
         # self.request.request_headers = self.get_request_headers()

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/defaulturlhandler.py RENAMED Viewed

@@ -1,6 +1,7 @@
 """
 Default url handler.
 """
 import copy
 from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor
@@ -42,7 +43,7 @@ class DefaultUrlHandler(HttpPageHandler):
             else:
                 request = PageRequestObject(url)
             request.url = url
-            #request.handler_type = HttpPageHandler # object will be assigned by builder
+            # request.handler_type = HttpPageHandler # object will be assigned by builder
         # if we will not hardcode this handler, then it will recursively loop
         request.handler_name = "HttpPageHandler"
@@ -108,6 +109,7 @@ class DefaultCompoundChannelHandler(DefaultChannelHandler):
     """
     Default URL handler which is capable of obtaining data from many network sources automatically.
     """
     def __init__(self, url=None, contents=None, request=None, url_builder=None):
         self.responses = []
         self.channel_sources_urls = OrderedDict()
@@ -158,7 +160,9 @@ class DefaultCompoundChannelHandler(DefaultChannelHandler):
         with ThreadPoolExecutor() as executor:
             for channel_source in channel_sources:
                 if channel_source not in self.channel_sources_urls:
-                    handles.append(executor.submit(self.get_response_source, channel_source))
+                    handles.append(
+                        executor.submit(self.get_response_source, channel_source)
+                    )
             for handle in handles:
                 url = handle.result()

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerchannelodysee.py RENAMED Viewed

@@ -8,7 +8,9 @@ from .handlerhttppage import HttpPageHandler
 class OdyseeChannelHandler(DefaultCompoundChannelHandler):
-    def __init__(self, url=None, contents=None, request=None, url_builder=None, channel_code=None):
+    def __init__(
+        self, url=None, contents=None, request=None, url_builder=None, channel_code=None
+    ):
         if channel_code is not None:
             url = self.code2url(channel_code)

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerchannelyoutube.py RENAMED Viewed

@@ -150,20 +150,20 @@ class YouTubeChannelHandler(DefaultCompoundChannelHandler):
         wh1 = url.find("youtube.com/user")
         if wh1 >= 0:
             start = wh1 + len("youtube.com/user") + 1
-            wh2 = url.find("/", start+1)
+            wh2 = url.find("/", start + 1)
             if wh2 == -1:
-                return url[start-1:]
+                return url[start - 1 :]
             else:
-                return url[start-1:wh2]
+                return url[start - 1 : wh2]
         wh1 = url.find("youtube.com/@")
         if wh1 >= 0:
             start = wh1 + len("youtube.com/@") + 1
             wh2 = url.find("/", start + 1)
             if wh2 == -1:
-                return url[start-1:]
+                return url[start - 1 :]
             else:
-                return url[start-1:wh2]
+                return url[start - 1 : wh2]
     def input2code_channel(self, url):
         wh = url.rfind("/")

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlerinterface.py RENAMED Viewed

@@ -1,13 +1,14 @@
 """
 Handler interface that can be implemented to provide more complex logic for reading meta data.
 """
 from webtoolkit import DefaultContentPage, calculate_hash_binary, calculate_hash
 class HandlerInterface(DefaultContentPage):
     """
     Handler interface can be implemented to provide more complex means for obtaining data from the internet.
-    For example to obtain data about YouTube video you can fetch JSON file from yt-dlp, but also ask
+    For example to obtain data about YouTube video you can fetch JSON file from yt-dlp, but also ask
     return dislike page to obtain dislike ratio.
     """

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlervideoodysee.py RENAMED Viewed

@@ -89,6 +89,7 @@ class OdyseeVideoHandler(DefaultUrlHandler):
     def get_feeds(self):
         from .handlerchannelodysee import OdyseeChannelHandler
         feeds = OdyseeChannelHandler(channel_code=self.channel_code).get_feeds()
         return feeds

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/handlers/handlervideoyoutube.py RENAMED Viewed

@@ -137,7 +137,9 @@ class YouTubeVideoHandler(DefaultCompoundChannelHandler):
         return super().get_social_data()
     def get_return_dislike_url_link(self):
-        return "https://returnyoutubedislikeapi.com/votes?videoId=" + self.get_video_code()
+        return (
+            "https://returnyoutubedislikeapi.com/votes?videoId=" + self.get_video_code()
+        )
     def get_view_count(self):
         """ """

{webtoolkit-0.0.182 → webtoolkit-0.0.184}/webtoolkit/pages.py RENAMED Viewed

@@ -25,6 +25,7 @@ class DefaultContentPage(ContentInterface):
     """
     Default content page that does not throw exceptions
     """
     def __init__(self, url, contents=""):
         super().__init__(url=url, contents=contents)
@@ -66,8 +67,9 @@ class JsonPage(ContentInterface):
     """
     JSON page
     """
     def __init__(self, url, contents):
-        """ Constructor """
+        """Constructor"""
         super().__init__(url=url, contents=contents)
         self.json_obj = None
@@ -80,9 +82,9 @@ class JsonPage(ContentInterface):
             # to be expected
             pass
-            #try:
+            # try:
             #    WebLogger.debug(f"Invalid json:{contents}")
-            #except Exception as E:
+            # except Exception as E:
             #    print(str(E))
     def is_valid(self) -> bool:
@@ -128,7 +130,7 @@ class JsonPage(ContentInterface):
 class RssPageEntry(ContentInterface):
     def __init__(self, feed_index, feed_entry, url, contents, page_object_properties):
-        """ Constructor """
+        """Constructor"""
         self.feed_index = feed_index
         self.feed_entry = feed_entry
         self.url = url
@@ -138,7 +140,7 @@ class RssPageEntry(ContentInterface):
         super().__init__(url=self.url, contents=contents)
     def get_properties(self):
-        """ Returns map of properties """
+        """Returns map of properties"""
         output_map = {}
         link = None
@@ -310,7 +312,7 @@ class RssPage(ContentInterface):
     """
     def __init__(self, url, contents):
-        """ Constructor """
+        """Constructor"""
         self.feed = None
         """
@@ -582,8 +584,9 @@ class RssContentReader(object):
     """
     RSS reader
     """
     def __init__(self, url, contents):
-        """ Constructor """
+        """Constructor"""
         self.contents = contents
         self.process()
@@ -608,9 +611,10 @@ class RssContentReader(object):
 class OpmlPageEntry(ContentInterface):
-    """ OPML Page entry """
+    """OPML Page entry"""
     def __init__(self, url, contents, opml_entry):
-        """ Constructor """
+        """Constructor"""
         super().__init__(url=url, contents=contents)
         self.opml_entry = opml_entry
         self.title = None
@@ -718,7 +722,7 @@ class HtmlPage(ContentInterface):
     """
     def __init__(self, url, contents):
-        """ Constructor """
+        """Constructor"""
         super().__init__(url=url, contents=contents)
         if self.contents:
@@ -1139,9 +1143,9 @@ class HtmlPage(ContentInterface):
         #        props["robots_txt_url"] = UrlLocation(self.url).get_robots_txt_url()
         #        props["site_maps_urls"] = self.get_site_maps()
-        #props["links"] = self.get_links()
-        #props["links_inner"] = self.get_links_inner()
-        #props["links_outer"] = self.get_links_outer()
+        # props["links"] = self.get_links()
+        # props["links_inner"] = self.get_links_inner()
+        # props["links_outer"] = self.get_links_outer()
         props["favicons"] = self.get_favicons()
         props["contents"] = self.get_contents()
@@ -1289,7 +1293,7 @@ class XmlPage(ContentInterface):
     """
     def __init__(self, url, contents):
-        """ Constructor """
+        """Constructor"""
         super().__init__(url=url, contents=contents)
     def is_valid(self) -> bool:

webtoolkit 0.0.182__tar.gz → 0.0.184__tar.gz

webtoolkit 0.0.182.tar.gz → 0.0.184.tar.gz