urlicon 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
urlicon/string_cache.py ADDED
@@ -0,0 +1,116 @@
+ import os
+ import tempfile
+
+
+ class string_cache:
+     cache_folder: str | None = None
+     cache_files_extension: str = "html"
+
+     def __init__(self, cache_folder: str | None = None):
+         if cache_folder is not None:
+             self.cache_folder = cache_folder
+         else:
+             self.cache_folder = self.get_cache_folder()
+
+     def safe_cache_id(func):
+         # Decorator: strips quotes/backslashes from `cache_id` and wraps it in
+         # quotes, so the id can be searched for safely in the index file.
+         def filter_cache_id(cache_id):
+             cache_id = cache_id.replace('"', "").replace("\\", "")
+             cache_id = f'"{cache_id}"'
+             return cache_id
+
+         def _filter_cache_id_func(*args, **kwargs):
+             # Positional args are an immutable tuple without names, so the id
+             # can only be rewritten when passed as a keyword argument (which
+             # is how `set` and `get` are called throughout this package).
+             if "cache_id" in kwargs:
+                 kwargs["cache_id"] = filter_cache_id(cache_id=kwargs["cache_id"])
+
+             return func(*args, **kwargs)
+
+         return _filter_cache_id_func
+
+     @safe_cache_id
+     def set(self, text: str, cache_id: str):
+         cache_folder = self.get_cache_folder()
+         cache_index_file_path = self.get_cache_index_path()
+
+         cache_folder_files = os.listdir(cache_folder)
+         cached_file_index = self.get_index_from_file_index(_safe_cache_id=cache_id)
+         if cached_file_index is not None:
+             # Already indexed: reuse the index and overwrite the cached file.
+             new_file_index = cached_file_index
+         else:
+             new_file_index = len(cache_folder_files)
+             with open(cache_index_file_path, "a") as cache_index_file_writer:
+                 cache_index_file_writer.write(f"\n{new_file_index}: {cache_id}")
+
+         new_file_name = f"{new_file_index}.{self.cache_files_extension}"
+         new_file_path = os.path.join(cache_folder, new_file_name)
+         with open(new_file_path, "w+") as new_file_writer:
+             new_file_writer.write(text)
+
+     @safe_cache_id
+     def get(self, cache_id: str) -> str | None:
+         cached_file_index = self.get_index_from_file_index(_safe_cache_id=cache_id)
+         code = self.get_cached_file_by_index(cached_file_index=cached_file_index)
+         return code
+
+     def get_index_from_file_index(self, _safe_cache_id):
+         # The index file holds one `{file_index}: {quoted_cache_id}` per line.
+         cache_index_file = self.get_cache_index_file()
+         if cache_index_file.find(_safe_cache_id) == -1:
+             return None
+         # Cut just before the `: ` separator; the last line of the remainder
+         # is then the file index that belongs to this id.
+         cache_index_file = cache_index_file[: cache_index_file.find(_safe_cache_id) - 2]
+         cached_file_index = int(cache_index_file.split("\n")[-1].strip())
+         return cached_file_index
+
+     def get_cache_index_path(self) -> str:
+         cache_index_file_name = "cache_index.yaml"
+         cache_folder = self.get_cache_folder()
+         cache_index_file_path = os.path.join(cache_folder, cache_index_file_name)
+
+         if not os.path.exists(cache_index_file_path):
+             # Index 0 is reserved for the index file itself.
+             with open(cache_index_file_path, "w+") as cache_index_file_writer:
+                 cache_index_file_writer.write(f"0: {cache_index_file_name}")
+
+         return cache_index_file_path
+
+     def get_cache_index_file(self) -> str:
+         with open(self.get_cache_index_path(), "r") as f:
+             cache_index_file_content = f.read()
+
+         return cache_index_file_content
+
+     def get_cached_file_by_index(self, cached_file_index: int) -> str | None:
+         cache_folder = self.get_cache_folder()
+         cached_file_name = f"{cached_file_index}.{self.cache_files_extension}"
+         cached_file_path = os.path.join(cache_folder, cached_file_name)
+         if not os.path.exists(cached_file_path):
+             return None
+         with open(cached_file_path, "r") as cached_file_reader:
+             code = cached_file_reader.read()
+         return code
+
+     def clean(self):
+         cache_folder = self.get_cache_folder()
+         cache_folder_files = os.listdir(cache_folder)
+         for file in cache_folder_files:
+             file_to_clean = os.path.join(cache_folder, file)
+             if os.path.exists(file_to_clean):
+                 os.remove(file_to_clean)
+
+     def get_cache_folder(self):
+         if self.cache_folder is not None:
+             return self.cache_folder
+
+         # No folder configured: fall back to a fresh per-run temp directory.
+         tmpdirname = tempfile.mkdtemp()
+         return tmpdirname
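A note on the layout `string_cache` produces: `set` writes each string to a numbered `.html` file and records the id in `cache_index.yaml`; `get` resolves the id back through that index. A minimal round-trip sketch (the throwaway temp folder is just for illustration):

```python
import tempfile

from urlicon.string_cache import string_cache

# Any existing, writable directory works; mkdtemp guarantees one.
cache = string_cache(cache_folder=tempfile.mkdtemp())

# set() stores the text as "<index>.html" and appends '<index>: "<id>"'
# to cache_index.yaml inside the cache folder.
cache.set(text="<html>hello</html>", cache_id="demo-id")

# get() looks the id up in the index file; misses return None.
assert cache.get(cache_id="demo-id") == "<html>hello</html>"
assert cache.get(cache_id="never-stored") is None

cache.clean()  # removes every file in the folder, index included
```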
urlicon/urlicon.py ADDED
@@ -0,0 +1,157 @@
+ import os
+ import re
+ import urllib.parse
+
+ import requests
+ from bs4 import BeautifulSoup
+ from dotenv import load_dotenv
+
+ from urlicon import urls
+ from urlicon.string_cache import string_cache
+
+ load_dotenv()
+
+ STRING_CACHE_ROOT_DIR = os.getenv("STRING_CACHE_ROOT_DIR", None)
+ cache = string_cache(cache_folder=STRING_CACHE_ROOT_DIR)
+
+
+ def get_url_icon(url):
+     domain = urls.extract_domain_from_url(url=url)
+     if len(domain) == 0:
+         return get_default_img(text=url)
+
+     meta_icon, url_soup = get_meta_icon_from_url(url=url)
+     if meta_icon is not None:
+         return meta_icon
+
+     favicon = get_favicon_from_url(url=url)
+     if favicon is not None:
+         return favicon
+
+     url_first_img = get_first_img_from_url(url=url, url_soup=url_soup)
+     if url_first_img is not None:
+         return urls.ensure_domain(url_first_img, domain)
+
+     domain_first_img = get_first_img_from_url(url=domain)
+     if domain_first_img is not None:
+         return urls.ensure_domain(domain_first_img, domain)
+
+     return get_default_img(text=url)
+
+
+ def get_meta_icon_from_url(url, url_soup=None):
+     soup_icons, url_soup = get_soup_icons_from_url(url, url_soup)
+
+     if soup_icons is None or len(soup_icons) == 0:
+         img = None
+         return img, url_soup
+
+     # Prefer the largest icon declared in the page head.
+     final_icon = soup_icons[0]
+     for soup_icon in soup_icons:
+         if get_soup_icon_size(soup_icon) >= get_soup_icon_size(final_icon):
+             final_icon = soup_icon
+     img = final_icon["href"]
+
+     img = urls.ensure_relative_path(img, url)
+     return img, url_soup
+
+
+ def get_soup_icons_from_url(url, url_soup=None):
+     if url_soup is None:
+         try:
+             url_request = requests_get(url)
+             url_soup = BeautifulSoup(url_request, features="html.parser")
+         except Exception:
+             return None, None
+
+     soup_icons = url_soup.find_all(
+         "link",
+         attrs={
+             "rel": [
+                 "icon",
+                 "apple-touch-icon",
+             ]
+         },
+     )
+     return soup_icons, url_soup
+
+
+ def get_soup_icon_size(soup_icon):
+     if not soup_icon.has_attr("sizes"):
+         return 1
+     size = soup_icon["sizes"].split("x")[0]
+     # `sizes` may hold a non-numeric value such as "any".
+     return int(size) if size.isdigit() else 1
+
+
+ def get_favicon_from_url(url):
+     domain = urls.extract_domain_from_url(url=url)
+     favicon = domain + "/favicon.ico"
+     try:
+         favicon_request = requests_get(favicon)
+     except Exception:
+         return None
+
+     # An HTML body usually means a soft-404 page rather than an actual icon.
+     if favicon_request is None or "html" in favicon_request[:150]:
+         return None
+     return favicon
+
+
+ def get_first_img_from_url(url, url_soup=None):
+     try:
+         url_request = requests_get(url)
+     except Exception:
+         return None
+
+     if url_request is None:
+         return None
+     if url_soup is None:
+         url_soup = BeautifulSoup(url_request, features="html.parser")
+
+     first_img = url_soup.find("img")
+     if first_img is None or not first_img.has_attr("src"):
+         return None
+     img = urls.ensure_domain(first_img["src"], url)
+     return img
+
+
+ def get_default_img(text):
+     text = re.sub(r"[^a-zA-Z0-9./+]", "", text)
+     text = text.replace("www", "").replace("https", "")
+     text = text.replace(".", " ").replace("/", " ").strip()
+     text = urllib.parse.quote(text, safe="=?&")
+     return f"https://ui-avatars.com/api/?name={text}"
+
+
+ def get_img_from_a_soup_item(soup_item, domain):
+     soup_item["href"] = urls.ensure_domain(url=soup_item["href"], domain=domain)
+
+     # Keep the anchor's own <img> only when its `src` is long (e.g. an inline
+     # data URI); for short src values, resolve an icon for the link target.
+     use_soup_img = False
+     if len(soup_item.find_all("img")) > 0:
+         if len(soup_item.find("img")["src"]) < 200:
+             use_soup_img = False
+         else:
+             use_soup_img = True
+
+     if use_soup_img:
+         img = soup_item.find("img")["src"]
+     else:
+         img = get_url_icon(soup_item["href"])
+     return img
+
+
+ def requests_get(url):
+     cache_prefix = "sniff-urf:"
+     cached_code = cache.get(cache_id=cache_prefix + url)
+
+     if cached_code is not None:
+         return cached_code
+
+     req = requests.get(url=url, timeout=5)
+     if req.status_code != 200:
+         return None
+
+     code = req.text
+     cache.set(text=code, cache_id=cache_prefix + url)
+     return code
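Every fetch above goes through `requests_get`, so repeated lookups of the same URL within a run come from the string cache rather than the network. A small sketch of that behavior (hypothetical URL; assumes the first request succeeds with HTTP 200):

```python
from urlicon import urlicon

# The first call performs requests.get(url, timeout=5) and stores the body
# under the cache id "sniff-urf:<url>"; the second call returns the cached
# body without touching the network.
html = urlicon.requests_get("https://example.com/")
html_again = urlicon.requests_get("https://example.com/")
assert html == html_again
```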
urlicon/urls.py ADDED
@@ -0,0 +1,85 @@
+ import urllib.parse
+
+ import requests
+
+
+ def ensure_domain(url, domain):
+     if (
+         urllib.parse.urlparse(url).netloc == ""
+         and urllib.parse.urlparse(domain).netloc == ""
+     ):
+         raise ValueError("`url` or `domain` must contain a domain")
+
+     if urllib.parse.urlparse(url).netloc == "":
+         parsed_url = urllib.parse.urlparse(url)
+         parsed_domain = urllib.parse.urlparse(domain)
+         url = parsed_url._replace(scheme="https", netloc=parsed_domain.netloc).geturl()
+
+     url = urllib.parse.urlparse(url)._replace(scheme="https").geturl()
+     return url
+
+
+ def get_name_from_domain(url):
+     href_parse = urllib.parse.urlparse(url)
+     country_code_len = 2
+
+     if href_parse.netloc == "":
+         raise ValueError("`url` must have a domain")
+     name = href_parse.netloc.split(".")
+
+     # Drop a trailing two-letter country-code TLD.
+     if len(name[-1]) == country_code_len:
+         name = name[:-1]
+
+     if name[0] == "www":
+         name = name[1:]
+
+     if len(name) > 2:
+         name = "_".join(name[-3:-1][::-1])
+     elif len(name) == 1:
+         name = name[0]
+     else:
+         name = name[-2]
+     return name
+
+
+ def extract_domain_from_url(url):
+     domain = ""
+
+     url_parse = urllib.parse.urlparse(url)
+     if len(url_parse.netloc) > 0:
+         domain = "://".join([url_parse.scheme, url_parse.netloc])
+     return domain
+
+
+ def read_from_url_or_path(url_path):
+     if str(url_path).startswith("https://"):
+         try:
+             content = requests.get(url_path, timeout=5).text
+         except Exception:
+             return ""
+     else:
+         url_path = url_path.split(".")[0] + ".html"
+         with open(url_path, "r") as f:
+             content = f.read()
+     return content
+
+
+ def ensure_relative_path(path, url):
+     # Root-relative paths only need the domain prepended.
+     if path.startswith("/"):
+         return ensure_domain(url=path, domain=url)
+
+     # Absolute URLs are already complete.
+     if len(extract_domain_from_url(path)) > 0:
+         return path
+
+     # Otherwise resolve against the URL's directory: drop the last path
+     # element when it looks like a file name, then join.
+     parsed_url = urllib.parse.urlparse(url)
+     last_url_element = parsed_url.path.split("/")[-1]
+     if last_url_element.find(".") >= 0:
+         parsed_url_path = parsed_url.path.split("/")[:-1]
+     else:
+         parsed_url_path = parsed_url.path.split("/")
+     url = parsed_url._replace(
+         path="/".join(parsed_url_path), query="", params="", fragment=""
+     ).geturl()
+
+     url = f"{url}/{path}"
+     return url
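Worked examples for the URL helpers above (inputs are hypothetical; the outputs follow from the code as written):

```python
from urlicon import urls

# A root-relative path gets the page's domain (and https) prepended.
print(urls.ensure_domain("/img/icon.png", "https://example.com/page"))
# -> https://example.com/img/icon.png

# A document-relative path resolves against the URL's directory:
# "b.html" looks like a file name, so it is dropped before joining.
print(urls.ensure_relative_path("icon.png", "https://example.com/a/b.html"))
# -> https://example.com/a/icon.png

# An already-absolute path is returned unchanged.
print(urls.ensure_relative_path("https://cdn.example.com/i.png", "https://example.com/"))
# -> https://cdn.example.com/i.png
```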
urlicon-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,66 @@
+ Metadata-Version: 2.4
+ Name: urlicon
+ Version: 0.1.0
+ Summary: `URLicon` helps you to discover a possible icon from a URL.
+ Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
+ License-Expression: MIT
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: bs4>=0.0.2
+ Requires-Dist: dotenv>=0.9.9
+ Requires-Dist: requests>=2.32.5
+
+ # URLicon - v0.1.0
+
+ `URLicon` helps you to discover a possible icon from a URL.
+
+ We check for icon meta tags and favicons and, if we don't find any, we check
+ the first image in the URL's HTML code. Finally, if nothing is found, we use
+ [https://ui-avatars.com/api/](https://ui-avatars.com/api/) to bring you at
+ least an avatar-like icon.
+
+ ### How to install and use
+
+ Install with `uv` or `pip`:
+ ```shell
+ uv add urlicon
+ # or
+ pip install urlicon
+ ```
+
+ Usage:
+ ```python
+ from urlicon import urlicon
+
+ url = "https://this-is.your-url.com/some-path"
+
+ icon_url = urlicon.get_url_icon(url)
+
+ print("icon:", icon_url)
+ # icon: "https://this-is.your-url.com/icon.jpeg"
+ ```
+
43
+ ### Caching
44
+
45
+ `URLicon` use a simple "cache" method to avoid unecessary URL requests.
46
+ It uses a [temp dir](https://docs.python.org/3/library/tempfile.html) for each
47
+ execution. But you can define a your own directory and use the cache as much as
48
+ you want setting `STRING_CACHE_ROOT_DIR` env var.
49
+
50
+ ```python
51
+ STRING_CACHE_ROOT_DIR = os.getenv("STRING_CACHE_ROOT_DIR", None)
52
+ cache = string_cache(cache_folder=STRING_CACHE_ROOT_DIR)
53
+ ```
54
+
55
+ And you can clean the cache with:
56
+ ```python
57
+ urlicon.string_cache.clean()
58
+ ```
59
+
60
+ ## See Also
61
+
62
+ - Github: https://github.com/bouli/urlicon
63
+ - PyPI: https://pypi.org/project/urlicon/
64
+
65
+ ## License
66
+ This package is distributed under the [MIT license](https://opensource.org/license/MIT).
urlicon-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ urlicon/string_cache.py,sha256=fY4ZtAdZC3PDfdotuHZmuSAcQxQLA4WyJJ6pmOUZZMc,4130
+ urlicon/urlicon.py,sha256=zcbevt29l2xbr14z9jFxz64TrvyS8O-fk29mCSQIoqI,4066
+ urlicon/urls.py,sha256=ErdlgRva3bxs6HuBX2iHng21PqpWGIF1etZFToJq4jc,2235
+ urlicon-0.1.0.dist-info/METADATA,sha256=1SEq8BOcKYjoGQhcFFrgQFscuJoe0Sv0wDcXyq3YBLw,1713
+ urlicon-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ urlicon-0.1.0.dist-info/top_level.txt,sha256=Jts8QbeWp-6xANJ9KVj3CHk6L-8D950AQ_ZKXmF4YGI,8
+ urlicon-0.1.0.dist-info/RECORD,,
urlicon-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
urlicon-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ urlicon