urlicon 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- urlicon/string_cache.py +116 -0
- urlicon/urlicon.py +157 -0
- urlicon/urls.py +85 -0
- urlicon-0.1.0.dist-info/METADATA +66 -0
- urlicon-0.1.0.dist-info/RECORD +7 -0
- urlicon-0.1.0.dist-info/WHEEL +5 -0
- urlicon-0.1.0.dist-info/top_level.txt +1 -0
urlicon/string_cache.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class string_cache:
|
|
5
|
+
cache_folder: str | None = None
|
|
6
|
+
cache_files_extension: str = "html"
|
|
7
|
+
|
|
8
|
+
def __init__(self, cache_folder: str | None = None):
|
|
9
|
+
if cache_folder is not None:
|
|
10
|
+
self.cache_folder = cache_folder
|
|
11
|
+
else:
|
|
12
|
+
self.cache_folder = self.get_cache_folder()
|
|
13
|
+
pass
|
|
14
|
+
|
|
15
|
+
def safe_cache_id(func):
|
|
16
|
+
def filter_cache_id(cache_id):
|
|
17
|
+
cache_id = cache_id.replace('"', "").replace("\\", "")
|
|
18
|
+
cache_id = f'"{cache_id}"'
|
|
19
|
+
return cache_id
|
|
20
|
+
|
|
21
|
+
def _filter_cache_id_func(*args, **kwargs):
|
|
22
|
+
if "cache_id" in args:
|
|
23
|
+
args["cache_id"] = filter_cache_id(cache_id=args["cache_id"])
|
|
24
|
+
|
|
25
|
+
if "cache_id" in kwargs:
|
|
26
|
+
kwargs["cache_id"] = filter_cache_id(cache_id=kwargs["cache_id"])
|
|
27
|
+
|
|
28
|
+
return func(*args, **kwargs)
|
|
29
|
+
|
|
30
|
+
return _filter_cache_id_func
|
|
31
|
+
|
|
32
|
+
@safe_cache_id
|
|
33
|
+
def set(self, text: str, cache_id: str):
|
|
34
|
+
cache_folder = self.get_cache_folder()
|
|
35
|
+
cache_index_file_path = self.get_cache_index_path()
|
|
36
|
+
|
|
37
|
+
cache_folder_files = os.listdir(cache_folder)
|
|
38
|
+
cached_file_index = self.get_index_from_file_index(_safe_cache_id=cache_id)
|
|
39
|
+
if cached_file_index is not None:
|
|
40
|
+
new_file_index = cached_file_index
|
|
41
|
+
else:
|
|
42
|
+
new_file_index = len(cache_folder_files)
|
|
43
|
+
with open(cache_index_file_path, "a") as cache_index_file_writer:
|
|
44
|
+
cache_index_file_writer.write(f"\n{new_file_index}: {cache_id}")
|
|
45
|
+
|
|
46
|
+
new_file_name = f"{new_file_index}.{self.cache_files_extension}"
|
|
47
|
+
new_file_path = os.path.join(cache_folder, new_file_name)
|
|
48
|
+
with open(new_file_path, "w+") as new_file_writer:
|
|
49
|
+
new_file_writer.write(text)
|
|
50
|
+
|
|
51
|
+
@safe_cache_id
|
|
52
|
+
def get(self, cache_id: str) -> str:
|
|
53
|
+
cached_file_index = self.get_index_from_file_index(_safe_cache_id=cache_id)
|
|
54
|
+
code = self.get_cached_file_by_index(cached_file_index=cached_file_index)
|
|
55
|
+
return code
|
|
56
|
+
|
|
57
|
+
def get_index_from_file_index(self, _safe_cache_id):
|
|
58
|
+
cache_index_file = self.get_cache_index_file()
|
|
59
|
+
if cache_index_file.find(_safe_cache_id) < 1:
|
|
60
|
+
return None
|
|
61
|
+
cache_index_file = cache_index_file[: cache_index_file.find(_safe_cache_id) - 2]
|
|
62
|
+
cached_file_index = int(cache_index_file.split("\n")[-1].strip())
|
|
63
|
+
return cached_file_index
|
|
64
|
+
|
|
65
|
+
def get_cache_index_path(
|
|
66
|
+
self,
|
|
67
|
+
) -> str:
|
|
68
|
+
cache_index_file_name = "cache_index.yaml"
|
|
69
|
+
cache_folder = self.get_cache_folder()
|
|
70
|
+
cache_index_file_path = os.path.join(cache_folder, cache_index_file_name)
|
|
71
|
+
|
|
72
|
+
if not os.path.exists(cache_index_file_path):
|
|
73
|
+
with open(cache_index_file_path, "w+") as cache_index_file_writer:
|
|
74
|
+
cache_index_file_writer.write(f"0: {cache_index_file_name}")
|
|
75
|
+
|
|
76
|
+
return cache_index_file_path
|
|
77
|
+
|
|
78
|
+
def get_cache_index_file(
|
|
79
|
+
self,
|
|
80
|
+
) -> str:
|
|
81
|
+
with open(self.get_cache_index_path(), "r") as f:
|
|
82
|
+
cache_index_file_content = f.read()
|
|
83
|
+
|
|
84
|
+
return cache_index_file_content
|
|
85
|
+
|
|
86
|
+
def get_cached_file_by_index(self, cached_file_index: int) -> str:
|
|
87
|
+
code = None
|
|
88
|
+
cache_folder = self.get_cache_folder()
|
|
89
|
+
cached_file_name = f"{cached_file_index}.{self.cache_files_extension}"
|
|
90
|
+
cached_file_path = os.path.join(cache_folder, cached_file_name)
|
|
91
|
+
if not os.path.exists(cached_file_path):
|
|
92
|
+
return None
|
|
93
|
+
with open(cached_file_path, "r") as cached_file_reader:
|
|
94
|
+
code = cached_file_reader.read()
|
|
95
|
+
return code
|
|
96
|
+
|
|
97
|
+
def clean(
|
|
98
|
+
self,
|
|
99
|
+
):
|
|
100
|
+
cache_folder = self.get_cache_folder()
|
|
101
|
+
cache_folder_files = os.listdir(cache_folder)
|
|
102
|
+
for file in cache_folder_files:
|
|
103
|
+
file_to_clean = os.path.join(cache_folder, file)
|
|
104
|
+
if os.path.exists(file_to_clean):
|
|
105
|
+
os.remove(file_to_clean)
|
|
106
|
+
|
|
107
|
+
def get_cache_folder(
|
|
108
|
+
self,
|
|
109
|
+
):
|
|
110
|
+
import tempfile
|
|
111
|
+
|
|
112
|
+
if self.cache_folder is not None:
|
|
113
|
+
return self.cache_folder
|
|
114
|
+
|
|
115
|
+
tmpdirname = tempfile.mkdtemp()
|
|
116
|
+
return tmpdirname
|
urlicon/urlicon.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import urllib
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
|
|
9
|
+
from urlicon import urls
|
|
10
|
+
from urlicon.string_cache import string_cache
|
|
11
|
+
|
|
12
|
+
# Load variables from a local .env file into the environment (no-op if absent).
load_dotenv()

# Optional user-supplied cache directory; when unset, string_cache falls back
# to a per-process temporary directory.
STRING_CACHE_ROOT_DIR = os.getenv("STRING_CACHE_ROOT_DIR", None)
cache = string_cache(cache_folder=STRING_CACHE_ROOT_DIR)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_url_icon(url):
    """Best-effort icon URL for *url*.

    Tries, in order: declared ``<link rel="icon">`` metadata, the domain's
    ``/favicon.ico``, the first ``<img>`` on the page, the first ``<img>`` on
    the domain root, and finally a generated ui-avatars.com placeholder.
    """
    domain = urls.extract_domain_from_url(url=url)
    if not domain:
        # No network location to probe — go straight to the generated avatar.
        return get_default_img(text=url)

    icon, page_soup = get_meta_icon_from_url(url=url)
    if icon is not None:
        return icon

    icon = get_favicon_from_url(url=url)
    if icon is not None:
        return icon

    # Reuse the soup fetched above (when available) to avoid a second request.
    page_img = get_first_img_from_url(url=url, url_soup=page_soup)
    if page_img is not None:
        return urls.ensure_domain(page_img, domain)

    root_img = get_first_img_from_url(url=domain)
    if root_img is not None:
        return urls.ensure_domain(root_img, domain)

    return get_default_img(text=url)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_meta_icon_from_url(url, url_soup=None):
    """Pick the largest icon declared via <link rel="icon"/"apple-touch-icon">.

    Returns ``(icon_url_or_None, soup)`` so callers can reuse the parsed page.
    """
    soup_icons, url_soup = get_soup_icons_from_url(url, url_soup)

    if not soup_icons:
        # Covers both "page could not be fetched" (None) and "no icon tags".
        return None, url_soup

    # Keep the icon with the largest declared size; later ties win.
    best = soup_icons[0]
    for candidate in soup_icons[1:]:
        if get_soup_icon_size(candidate) >= get_soup_icon_size(best):
            best = candidate

    img = urls.ensure_relative_path(best["href"], url)
    return img, url_soup
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_soup_icons_from_url(url, url_soup=None):
    """Return ``(icon <link> tags, soup)`` for *url*.

    Fetches and parses the page unless a pre-parsed *url_soup* is supplied.
    Returns ``(None, None)`` when the page cannot be fetched or parsed.
    """
    if url_soup is None:
        try:
            url_request = requests_get(url)
            url_soup = BeautifulSoup(url_request, features="html.parser")
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # still propagate; any fetch/parse failure means "no icons".
            return None, None

    soup_icons = url_soup.find_all(
        "link",
        attrs={
            "rel": [
                "icon",
                "apple-touch-icon",
            ]
        },
    )
    return soup_icons, url_soup
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_soup_icon_size(soup_icon):
    """Numeric size of an icon ``<link>`` tag, used for ranking.

    Parses the leading width out of the ``sizes`` attribute (e.g. "32x32").
    Returns 1 when the attribute is missing or non-numeric — e.g. the valid
    HTML value ``sizes="any"``, which previously raised ValueError.
    """
    if not soup_icon.has_attr("sizes"):
        return 1
    try:
        return int(soup_icon["sizes"].split("x")[0])
    except ValueError:
        return 1
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def get_favicon_from_url(url):
    """Return ``<domain>/favicon.ico`` if it exists and is not an HTML page."""
    domain = urls.extract_domain_from_url(url=url)
    favicon = domain + "/favicon.ico"
    try:
        favicon_request = requests_get(favicon)
    except Exception:
        # Narrowed from a bare ``except:``; any fetch failure means no favicon.
        return None

    # Servers often answer 200 with an HTML error page instead of an icon:
    # sniff the first bytes for markup before trusting the URL.
    if favicon_request is None or "html" in favicon_request[:150]:
        return None
    else:
        return favicon
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_first_img_from_url(url, url_soup=None):
    """Return the absolute URL of the first ``<img>`` on the page, or None.

    When *url_soup* is supplied, the page is not fetched again (the original
    always re-fetched, wasting a request and failing needlessly when the
    network call errored despite a usable soup).
    """
    if url_soup is None:
        try:
            url_request = requests_get(url)
        except Exception:
            # Narrowed from a bare ``except:``.
            return None
        if url_request is None:
            return None
        url_soup = BeautifulSoup(url_request, features="html.parser")

    first_img = url_soup.find("img")
    if first_img is None:
        return None
    src = first_img.get("src")
    if not src:
        # <img> without a usable src attribute (previously a KeyError).
        return None
    return urls.ensure_domain(src, url)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def get_default_img(text):
    """Build a ui-avatars.com placeholder URL from *text*.

    The text is stripped down to alphanumerics plus ``./+``, scheme/"www"
    noise is removed, and dots/slashes become spaces so the avatar service
    renders initials from the remaining words.
    """
    # Local import: the module only does ``import urllib``, which does not by
    # itself load the ``urllib.parse`` submodule (it worked before only as a
    # side effect of importing requests).
    from urllib.parse import quote

    text = re.sub(r"[^a-zA-Z0-9./+]", "", text)
    # NOTE(review): removes "www"/"https" anywhere in the string, not only as
    # prefixes — kept as-is to preserve behavior.
    text = text.replace("www", "").replace("https", "")
    text = text.replace(".", " ").replace("/", " ").strip()
    text = quote(text, safe="=?&")
    return f"https://ui-avatars.com/api/?name={text}"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def get_img_from_a_soup_item(soup_item, domain):
    """Icon for an ``<a>`` soup item: its embedded ``<img>`` when the src is
    long enough, otherwise the icon resolved from its (absolutized) href.

    NOTE(review): the threshold compares the *length of the src string* to
    200, so only long inline/data URLs are reused — presumably intentional;
    confirm before changing.
    """
    # Absolutize the link target in place (mutates the tag).
    soup_item["href"] = urls.ensure_domain(url=soup_item["href"], domain=domain)

    inline_img = soup_item.find("img")
    if inline_img is not None and len(inline_img["src"]) >= 200:
        return inline_img["src"]
    return get_url_icon(soup_item["href"])
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def requests_get(url):
    """GET *url* through the module-level cache.

    Returns the response body text, or ``None`` on a non-200 status.
    Successful bodies are cached under a prefixed key; failures are not
    cached, so they are retried on the next call.
    """
    cache_prefix = "sniff-urf:"
    cache_key = cache_prefix + url

    cached_code = cache.get(cache_id=cache_key)
    if cached_code is not None:
        return cached_code

    response = requests.get(url=url, timeout=5)
    if response.status_code != 200:
        return None

    body = response.text
    cache.set(text=body, cache_id=cache_key)
    return body
|
urlicon/urls.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import urllib
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def ensure_domain(url, domain):
    """Return *url* as an absolute https URL.

    When *url* has no network location, *domain*'s host is borrowed; the
    scheme is always normalized to https.

    Raises:
        ValueError: when neither argument carries a network location.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.netloc == "":
        host = urllib.parse.urlparse(domain).netloc
        if host == "":
            raise ValueError("`url` or `domain` must be a domain")
        parsed = parsed._replace(netloc=host)

    # Force https regardless of the input's original scheme.
    return parsed._replace(scheme="https").geturl()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_name_from_domain(url):
    """Derive a short name from a URL's host.

    Drops a trailing two-letter country code and a leading "www", then joins
    (up to) the last two remaining labels as ``domain_subdomain``. Returns ""
    when nothing remains (e.g. "https://www.uk", which previously raised
    IndexError).

    Raises:
        ValueError: when *url* has no network location.
    """
    href_parse = urllib.parse.urlparse(url)
    country_code_len = 2

    if href_parse.netloc == "":
        raise ValueError("`url` must have a domain")
    name = href_parse.netloc.split(".")

    if len(name[-1]) == country_code_len:
        name = name[:-1]

    # Guard: the list may already be empty after dropping the country code.
    if name and name[0] == "www":
        name = name[1:]

    if not name:
        return ""
    if len(name) > 2:
        # Join subdomain + domain, domain first: "sub.example.com" -> "example_sub".
        name = "_".join(name[-3:-1][::-1])
    elif len(name) == 1:
        name = name[0]
    else:
        name = name[-2]
    return name
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def extract_domain_from_url(url):
    """Return ``scheme://host`` for *url*, or "" when it has no host."""
    parsed = urllib.parse.urlparse(url)
    if not parsed.netloc:
        return ""
    return f"{parsed.scheme}://{parsed.netloc}"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def read_from_url_or_path(url_path):
    """Read content from an https URL or from a local file.

    For local paths the last extension is swapped for ".html" before reading.
    Returns "" when an https fetch fails.
    """
    if str(url_path).startswith("https://"):
        try:
            content = requests.get(url_path).text
        except Exception:
            # Narrowed from a bare ``except:``.
            return ""
    else:
        # rsplit keeps dots earlier in the path intact — the original
        # ``split(".")[0]`` truncated at the *first* dot, mangling any
        # dotted directory or multi-dot file name.
        url_path = str(url_path).rsplit(".", 1)[0] + ".html"
        with open(url_path, "r") as f:
            content = f.read()
    return content
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def ensure_relative_path(path, url):
    """Resolve *path* against *url*, returning an absolute URL.

    Three cases: root-relative paths ("/x") are joined onto *url*'s domain;
    paths that already carry their own domain are returned unchanged;
    otherwise *path* is appended to *url*'s directory portion.

    NOTE(review): ``path[0]`` raises IndexError on an empty path — confirm
    callers never pass "".
    """
    # Root-relative: delegate to ensure_domain with *url* supplying the host.
    if path[0] == "/":
        return ensure_domain(url=path, domain=url)

    # Already absolute (has its own domain): nothing to do.
    if len(extract_domain_from_url(path)) > 0:
        return path

    parsed_url = urllib.parse.urlparse(url)
    last_url_element = parsed_url.path.split("/")[-1]
    # If the URL's last path segment looks like a file (contains a dot),
    # drop it so we join relative to its directory instead.
    if last_url_element.find(".") >= 0:
        parsed_url_path = parsed_url.path.split("/")[:-1]
    else:
        parsed_url_path = parsed_url.path.split("/")
    # Strip query/params/fragment before appending the relative path.
    url = parsed_url._replace(
        path="/".join(parsed_url_path), query="", params="", fragment=""
    ).geturl()

    url = f"{url}/{path}"
    return url
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: urlicon
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: `URLicon` helps you to discover a possible icon from a URL.
|
|
5
|
+
Author-email: Cesar Cardoso <hello@cesarcardoso.cc>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: bs4>=0.0.2
|
|
10
|
+
Requires-Dist: dotenv>=0.9.9
|
|
11
|
+
Requires-Dist: requests>=2.32.5
|
|
12
|
+
|
|
13
|
+
# URLicon - v0.1.0
|
|
14
|
+
|
|
15
|
+
`URLicon` helps you to discover a possible icon from a URL.
|
|
16
|
+
|
|
17
|
+
We check for the metatag `icons`, `favicons` and, if we don't find, we check the
|
|
18
|
+
first image in the URL html code. Finally, if nothing is found, we use the
|
|
19
|
+
[https://ui-avatars.com/api/](https://ui-avatars.com/api/) to bring you at least
|
|
20
|
+
some avatar-like icon.
|
|
21
|
+
|
|
22
|
+
### How to install and use
|
|
23
|
+
|
|
24
|
+
Install with `uv` or `pip`
|
|
25
|
+
```shell
|
|
26
|
+
uv add urlicon
|
|
27
|
+
# or
|
|
28
|
+
pip install urlicon
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Usage:
|
|
32
|
+
```python
|
|
33
|
+
from urlicon import urlicon
|
|
34
|
+
|
|
35
|
+
url = "https://this-is.your-url.com/some-path"
|
|
36
|
+
|
|
37
|
+
icon_url = urlicon.get_url_icon(url)
|
|
38
|
+
|
|
39
|
+
print("icon:", icon_url)
|
|
40
|
+
# icon: "https://this-is.your-url.com/icon.jpeg"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Caching
|
|
44
|
+
|
|
45
|
+
`URLicon` uses a simple "cache" method to avoid unnecessary URL requests.
|
|
46
|
+
It uses a [temp dir](https://docs.python.org/3/library/tempfile.html) for each
|
|
47
|
+
execution. But you can define your own directory and reuse the cache as much as
|
|
48
|
+
you want by setting the `STRING_CACHE_ROOT_DIR` env var.
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
STRING_CACHE_ROOT_DIR = os.getenv("STRING_CACHE_ROOT_DIR", None)
|
|
52
|
+
cache = string_cache(cache_folder=STRING_CACHE_ROOT_DIR)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
And you can clean the cache with:
|
|
56
|
+
```python
|
|
57
|
+
urlicon.cache.clean()
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## See Also
|
|
61
|
+
|
|
62
|
+
- Github: https://github.com/bouli/urlicon
|
|
63
|
+
- PyPI: https://pypi.org/project/urlicon/
|
|
64
|
+
|
|
65
|
+
## License
|
|
66
|
+
This package is distributed under the [MIT license](https://opensource.org/license/MIT).
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
urlicon/string_cache.py,sha256=fY4ZtAdZC3PDfdotuHZmuSAcQxQLA4WyJJ6pmOUZZMc,4130
|
|
2
|
+
urlicon/urlicon.py,sha256=zcbevt29l2xbr14z9jFxz64TrvyS8O-fk29mCSQIoqI,4066
|
|
3
|
+
urlicon/urls.py,sha256=ErdlgRva3bxs6HuBX2iHng21PqpWGIF1etZFToJq4jc,2235
|
|
4
|
+
urlicon-0.1.0.dist-info/METADATA,sha256=1SEq8BOcKYjoGQhcFFrgQFscuJoe0Sv0wDcXyq3YBLw,1713
|
|
5
|
+
urlicon-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
6
|
+
urlicon-0.1.0.dist-info/top_level.txt,sha256=Jts8QbeWp-6xANJ9KVj3CHk6L-8D950AQ_ZKXmF4YGI,8
|
|
7
|
+
urlicon-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
urlicon
|