geler-CERTIC 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.1
2
+ Name: geler-CERTIC
3
+ Version: 0.1.0
4
+ Summary: Help convert dynamic websites to static ones.
5
+ License: CECILL-C
6
+ Author: Mickaël Desfrênes
7
+ Author-email: mickael.desfrenes@unicaen.fr
8
+ Requires-Python: >=3.9,<4.0
9
+ Classifier: License :: CeCILL-C Free Software License Agreement (CECILL-C)
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Dist: argh (>=0.31.3,<0.32.0)
16
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
17
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
18
+ Description-Content-Type: text/markdown
19
+
20
+ # Geler
21
+
22
+ Help convert dynamic websites to static ones.
23
+
24
+ ## Install
25
+
26
+ pip install geler-CERTIC
27
+
28
+ ## Usage
29
+
30
+ As a library in your own program:
31
+
32
+ from geler import freeze
33
+ freeze("https://acme.tld/", "/path/to/local/dir/")
34
+
35
+ As a CLI tool:
36
+
37
+ $> geler --help
38
+ usage: geler [-h] start-url destination-path
39
+
40
+ Freeze given site at start_url to directory at destination_path
41
+
42
+ positional arguments:
43
+ start-url -
44
+ destination-path -
45
+
46
+ optional arguments:
47
+ -h, --help show this help message and exit
48
+
49
+
50
+ ## Why ?
51
+
52
+ For [MaX](https://git.unicaen.fr/pdn-certic/MaX) and associated tools,
53
+ we needed a lightweight, portable, pure Python solution to convert
54
+ small dynamic websites to static ones.
55
+
56
+ ## Alternatives
57
+
58
+ This tool has a narrow scope, on purpose. Please turn to these solutions if you need more:
59
+
60
+ - [wget](https://www.gnu.org/software/wget/)
61
+ - [pywebcopy](https://pypi.org/project/pywebcopy/)
62
+ - [HTTrack](https://www.httrack.com)
63
+
64
+ ## Limitations
65
+
66
+ - only works with HTTP GET
67
+ - does not submit forms (even with GET method)
68
+ - only considers URLs in `src` or `href` attributes
69
+ - only considers URLs with `http` or `https` schemes
70
+ - only downloads what is in the same [netloc](https://docs.python.org/3/library/urllib.parse.html) (same domain, same port) as the start URL
71
+ - only patches URLs in `*.html` files, not in `*.js`, not in `*.css` (watch out for those `url(...)` in your CSS)
@@ -0,0 +1,52 @@
1
+ # Geler
2
+
3
+ Help convert dynamic websites to static ones.
4
+
5
+ ## Install
6
+
7
+ pip install geler-CERTIC
8
+
9
+ ## Usage
10
+
11
+ As a library in your own program:
12
+
13
+ from geler import freeze
14
+ freeze("https://acme.tld/", "/path/to/local/dir/")
15
+
16
+ As a CLI tool:
17
+
18
+ $> geler --help
19
+ usage: geler [-h] start-url destination-path
20
+
21
+ Freeze given site at start_url to directory at destination_path
22
+
23
+ positional arguments:
24
+ start-url -
25
+ destination-path -
26
+
27
+ optional arguments:
28
+ -h, --help show this help message and exit
29
+
30
+
31
+ ## Why ?
32
+
33
+ For [MaX](https://git.unicaen.fr/pdn-certic/MaX) and associated tools,
34
+ we needed a lightweight, portable, pure Python solution to convert
35
+ small dynamic websites to static ones.
36
+
37
+ ## Alternatives
38
+
39
+ This tool has a narrow scope, on purpose. Please turn to these solutions if you need more:
40
+
41
+ - [wget](https://www.gnu.org/software/wget/)
42
+ - [pywebcopy](https://pypi.org/project/pywebcopy/)
43
+ - [HTTrack](https://www.httrack.com)
44
+
45
+ ## Limitations
46
+
47
+ - only works with HTTP GET
48
+ - does not submit forms (even with GET method)
49
+ - only considers URLs in `src` or `href` attributes
50
+ - only considers URLs with `http` or `https` schemes
51
+ - only downloads what is in the same [netloc](https://docs.python.org/3/library/urllib.parse.html) (same domain, same port) as the start URL
52
+ - only patches URLs in `*.html` files, not in `*.js`, not in `*.css` (watch out for those `url(...)` in your CSS)
@@ -0,0 +1,229 @@
1
+ from typing import Union
2
+ from pathlib import Path
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from urllib.parse import urlparse, urljoin, ParseResult, parse_qs
5
+ from bs4 import BeautifulSoup
6
+ from requests.exceptions import RequestException
7
+ from functools import cache
8
+ import logging
9
+ import requests
10
+ import os
11
+ import json
12
+ import mimetypes
13
+ from time import sleep
14
+ from base64 import urlsafe_b64encode
15
+
16
# Obtain the module logger from the logging registry (instead of instantiating
# ``logging.Logger`` directly) so that handlers and levels configured by the
# host application apply to messages emitted by this module.
logger = logging.getLogger(__name__)
17
+
18
+
19
def url64(s: Union[dict, str]) -> str:
    """Return the URL-safe Base64 encoding of *s* with the ``=`` padding removed.

    A dict is first serialised to JSON with sorted keys, so two equal mappings
    always yield the same digest. A str is encoded to bytes with the default
    codec. Any other value is passed to the encoder unchanged.
    """
    if isinstance(s, dict):
        payload = json.dumps(s, sort_keys=True).encode()
    elif isinstance(s, str):
        payload = s.encode()
    else:
        payload = s
    encoded = urlsafe_b64encode(payload).decode()
    return encoded.rstrip("=")
25
+
26
+
27
@cache
def guess_extension(mime_type: str) -> str:
    """Return the file extension (with leading dot) for *mime_type*.

    Falls back to an empty string when ``mimetypes`` knows no extension for
    the given type. Results are memoised per MIME type.
    """
    return mimetypes.guess_extension(mime_type) or ""
34
+
35
+
36
@cache
def url_to_path(url: str, mime: str = "") -> str:
    """Map *url* to the site-relative path its content is stored under.

    The result is built from the URL path, plus:

    * ``index`` when the path designates a directory (ends with ``/``, or is
      empty as in ``https://acme.tld``);
    * ``-<digest>`` when the URL has a query string, where the digest is the
      URL-safe Base64 of the parsed query, so distinct queries get distinct
      files;
    * the extension guessed from *mime*. When it differs from the extension
      already present in the URL, the original extension is kept in the file
      name and the guessed one is appended (``/a.php`` -> ``/a.php.html``).

    When *mime* maps to no known extension, the URL's own extension is kept
    rather than silently dropped (``/a.bin`` stays ``/a.bin``).

    :param url: absolute URL of the resource.
    :param mime: MIME type of the resource, without parameters.
    :return: path starting at the site root, e.g. ``/dir/index.html``.
    """
    parsed_url = cached_urlparse(url)
    # A bare-domain URL has an empty path; treat it as the site root so it
    # becomes "/index<ext>" instead of a file named just "<ext>".
    url_path = parsed_url.path or "/"
    path_no_ext, ext = os.path.splitext(url_path)
    guessed_ext = guess_extension(mime)

    if not guessed_ext:
        # Unknown MIME type: preserve whatever extension the URL carries.
        stem, suffix = path_no_ext, ext
    elif guessed_ext.lower() != ext:
        # Extension mismatch: keep the original one inside the file name.
        stem, suffix = path_no_ext + ext, guessed_ext
    else:
        stem, suffix = path_no_ext, guessed_ext

    index = "index" if url_path.endswith("/") else ""
    query_digest = ""
    if parsed_url.query:
        query_digest = "-" + url64(dict(parse_qs(parsed_url.query)))
    return f"{stem}{index}{query_digest}{suffix}"
52
+
53
+
54
@cache
def cached_urlparse(url):
    """Parse *url* with ``urllib.parse.urlparse`` and memoise the result.

    The same URLs are parsed many times during a crawl; caching returns the
    previously built parse result for a URL already seen.
    """
    parsed = urlparse(url)
    return parsed
57
+
58
+
59
class Freezer:
    """Crawl a website from a start URL and write a static copy to disk.

    Resources are fetched with HTTP GET on a thread pool. In HTML responses
    every ``href`` / ``src`` value that stays on the start URL's netloc is
    queued for download. Once the queue is drained, the links inside the saved
    ``*.html`` files are rewritten to point at the saved files.

    Bookkeeping attributes:

    * ``links_done``: maps each visited URL (without fragment) to the
      site-relative path it was saved under; the value is ``""`` while the
      download is pending or when nothing was saved.
    * ``links_to_do``: URLs handed to the thread pool that are not finished.
    """

    def __init__(
        self,
        start_from: str,
        destination: Union[Path, str],
        skip_extensions=None,
        http_get_timeout=30,
        thread_pool_size=10,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
    ):
        """
        :param start_from: URL the crawl starts from; a fragment is ignored.
        :param destination: directory receiving the files (created if missing).
        :param skip_extensions: URL path extensions (lower case, with leading
            dot) that must not be downloaded; defaults to none.
        :param http_get_timeout: timeout in seconds passed to ``requests.get``.
        :param thread_pool_size: number of worker threads.
        :param user_agent: value of the ``User-Agent`` request header.
        """
        self.destination = Path(destination)
        if not self.destination.exists():
            os.makedirs(self.destination, exist_ok=True)
        self.start_from_url = start_from.split("#")[0]  # ignore anchors
        self.start_from_domain = cached_urlparse(self.start_from_url).netloc
        self.get_timeout = http_get_timeout
        self.max_workers = thread_pool_size
        self.html_mimes = ["text/html", "application/xhtml+xml", "application/xml"]
        self.schemes = ["http", "https"]
        self.scrap_executor_pool = ThreadPoolExecutor(max_workers=self.max_workers)
        self.links_done = {}
        self.links_to_do = []
        self.user_agent = user_agent
        self.skip_extensions = [] if skip_extensions is None else skip_extensions

    # NOTE: deliberately not wrapped in functools.cache: this method has side
    # effects (it schedules downloads) and a cache on a method keeps every
    # instance alive for the lifetime of the cache.
    def validate_candidate_url(
        self, url: str, parent_url: str
    ) -> Union[ParseResult, bool]:
        """Decide whether *url*, found in the page *parent_url*, is in scope.

        An in-scope URL that has not been seen yet is scheduled for download.

        :param url: raw attribute value found in the page.
        :param parent_url: URL of the page, used to resolve relative links.
        :return: the parsed absolute URL (fragment removed) when the link
            belongs to the site, ``False`` otherwise.
        """
        if not url.strip():
            return False
        # exclude absolute links not in the site
        if url.startswith(("http://", "https://")) and not url.startswith(
            self.start_from_url
        ):
            return False  # Skip foreign URL
        url = urljoin(parent_url, url)  # make link absolute
        url = url.split("#")[0]  # ignore anchors
        parsed_url = cached_urlparse(url)
        if parsed_url.scheme not in self.schemes:  # excludes non http protocols
            return False
        if parsed_url.netloc != self.start_from_domain:
            return False
        _, ext = os.path.splitext(parsed_url.path)
        if ext.lower() in self.skip_extensions:  # exclude extensions
            return False
        if url not in self.links_done and url not in self.links_to_do:
            # Register the URL *before* submitting it: a task that finishes
            # before the append would otherwise leave a stale entry behind and
            # freeze() would wait on it forever.
            self.links_to_do.append(url)
            self.scrap_executor_pool.submit(self.scrap_item, url)
        return parsed_url

    def _rebuild_link(self, parsed_url: ParseResult) -> str:
        """Return the site-relative form (path, query, fragment) of *parsed_url*."""
        return (
            parsed_url.path
            + ("?" + parsed_url.query if parsed_url.query else "")
            + ("#" + parsed_url.fragment if parsed_url.fragment else "")
        )

    def _resolve_local_path(self, site_path: str) -> Union[Path, None]:
        """Return the absolute file path for *site_path* inside the destination.

        Returns ``None`` when the path would not be located strictly inside
        the destination directory (e.g. because of ``..`` segments coming from
        a remote URL), so callers never read or write outside of it.
        """
        root = self.destination.resolve()
        candidate = (root / site_path.lstrip("/")).resolve()
        return candidate if root in candidate.parents else None

    def scrap_item(self, item_url):
        """Download *item_url*, queue the in-site links it contains, save it.

        HTML documents have their in-site ``href`` / ``src`` values rewritten
        to site-relative links (original fragment preserved) before saving.
        Any other content is saved as received. Request errors are logged and
        the item is skipped.
        """
        item_url_no_anchor = item_url.split("#")[0]

        # add to links in done list early, avoid race conditions
        if item_url_no_anchor in self.links_done:
            return
        self.links_done[item_url_no_anchor] = ""
        try:
            # The context manager releases the streamed connection on every
            # exit path, including the early return on error statuses.
            with requests.get(
                item_url_no_anchor,
                allow_redirects=True,
                timeout=self.get_timeout,
                headers={"User-Agent": self.user_agent},
                stream=True,
            ) as response:
                if not response.ok:
                    return
                # The header may be absent; treat that as an unknown type.
                content_type = (
                    (response.headers.get("Content-Type") or "").split(";")[0].strip()
                )
                self.links_done[item_url_no_anchor] = url_to_path(
                    item_url_no_anchor, content_type
                )
                if content_type in self.html_mimes:
                    soup = BeautifulSoup(response.content, "html.parser")
                    for attrib in ("href", "src"):
                        for link in soup.select(f"[{attrib}]"):
                            raw_value = link.get(attrib)
                            validated_link = self.validate_candidate_url(
                                raw_value, item_url_no_anchor
                            )
                            if validated_link:
                                # Validation strips the fragment; put the
                                # original one back so anchors keep working.
                                _, sep, fragment = raw_value.partition("#")
                                link[attrib] = (
                                    self._rebuild_link(validated_link) + sep + fragment
                                )
                    self._save_item_to_disk(item_url_no_anchor, content_type, soup)
                else:
                    self._save_item_to_disk(item_url_no_anchor, content_type, response)
        except RequestException as e:
            logger.warning(f"RequestException: {e} on URL {item_url_no_anchor}")
        # whatever happens, remove item from to do list
        finally:
            while item_url_no_anchor in self.links_to_do:
                self.links_to_do.remove(item_url_no_anchor)

    def _save_item_to_disk(
        self,
        item_url: str,
        content_type: str,
        data: "Union[requests.Response, BeautifulSoup]",
    ):
        """Write *data* under the destination and record where it went.

        :param item_url: URL the data was fetched from.
        :param content_type: MIME type, used to pick the file extension.
        :param data: parsed HTML (written as UTF-8 text) or a raw response
            (its body is written as bytes).
        """
        convert_url_to_path = url_to_path(item_url, content_type)
        local_path = self._resolve_local_path(convert_url_to_path)
        if local_path is None:
            logger.warning(
                f"Refusing to write outside of destination for URL {item_url}"
            )
            # Mark as "nothing saved" so the link is left alone when patching.
            self.links_done[item_url] = ""
            return
        os.makedirs(local_path.parent, exist_ok=True)
        if isinstance(data, BeautifulSoup):
            # Explicit encoding: the platform default may not be able to
            # represent the document.
            with open(local_path, "w", encoding="utf-8") as f:
                f.write(str(data))
        elif isinstance(data, requests.Response):
            with open(local_path, "wb") as f:
                f.write(data.content)

        # update done list with new URL value
        self.links_done[item_url] = convert_url_to_path

    def _patch_links(self):
        """Rewrite links in every saved ``*.html`` file to the saved paths."""
        netloc = cached_urlparse(self.start_from_url).netloc
        trans_table = {}
        for original_url, local_path in self.links_done.items():
            if not local_path:
                # Nothing was saved for this URL: leave links to it untouched
                # instead of rewriting them to an empty string.
                continue
            # maxsplit=1: the netloc may appear again later in the URL.
            url_no_netloc = original_url.split(netloc, 1)[1]
            trans_table[url_no_netloc] = local_path
        # dict.fromkeys() drops duplicate paths while keeping their order.
        for local_path in dict.fromkeys(trans_table.values()):
            if local_path.endswith(".html"):
                try:
                    self._patch_html_file(local_path, trans_table)
                except (OSError, UnicodeError) as e:
                    # One unreadable file must not abort patching the others.
                    logger.warning(f"Could not patch links in {local_path}: {e}")

    def _patch_html_file(self, local_path: Union[Path, str], trans_table: dict) -> bool:
        """Rewrite ``href`` / ``src`` values of one saved HTML file in place.

        :param local_path: site-relative path of the file, as stored in
            ``links_done``.
        :param trans_table: maps site-relative URLs to saved paths.
        :return: ``True`` when the file was rewritten, ``False`` when the path
            is not inside the destination directory.
        """
        f_path = self._resolve_local_path(str(local_path))
        if f_path is None:
            return False
        with open(f_path, "r+", encoding="utf-8") as fp:
            soup = BeautifulSoup(fp, "html.parser")
            for attrib in ("href", "src"):
                for link in soup.select(f"[{attrib}]"):
                    # Look the target up without its fragment, then put the
                    # fragment back on the translated link.
                    target, sep, fragment = link.get(attrib).partition("#")
                    if target in trans_table:
                        link[attrib] = trans_table[target] + sep + fragment
            fp.seek(0)
            fp.write(str(soup))
            fp.truncate()
        return True

    def freeze(self):
        """Run the crawl from the start URL, wait for it, then patch links."""
        self.scrap_item(self.start_from_url)
        # Workers register the links they discover before removing their own
        # URL, so an empty list means no download is pending.
        while self.links_to_do:
            sleep(1)
        self.scrap_executor_pool.shutdown(wait=True)
        self._patch_links()
226
+
227
+
228
def freeze(start_from_url: str, save_to_path: Union[Path, str]):
    """Freeze the site found at *start_from_url* into *save_to_path*.

    Convenience wrapper that builds a ``Freezer`` with its default settings
    and runs it.
    """
    freezer = Freezer(start_from_url, save_to_path)
    freezer.freeze()
@@ -0,0 +1,21 @@
1
+ import argh
2
+ from geler import Freezer
3
+
4
+
5
def freeze(start_url: str, destination_path: str):
    """
    Freeze given site at start-url to directory at destination-path
    """
    # NOTE(review): the docstring above presumably doubles as the command
    # description shown by `geler --help`, so its wording is user-facing.
    Freezer(start_url, destination_path).freeze()
11
+
12
+
13
def run_cli():
    """Console entry point: expose ``freeze`` as a single-command CLI."""
    argh.dispatch_command(freeze)
18
+
19
+
20
+ if __name__ == "__main__":
21
+ run_cli()
@@ -0,0 +1,24 @@
1
+ [tool.poetry]
2
+ name = "geler-CERTIC"
3
+ version = "0.1.0"
4
+ description = "Help convert dynamic websites to static ones."
5
+ authors = ["Mickaël Desfrênes <mickael.desfrenes@unicaen.fr>"]
6
+ license = "CECILL-C"
7
+ readme = "README.md"
8
+ packages = [{include = "geler"}]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = "^3.9"
12
+ argh = "^0.31.3"
13
+ beautifulsoup4 = "^4.12.3"
14
+ requests = "^2.32.3"
15
+
16
+ [tool.poetry.group.dev.dependencies]
17
+ black = "^24.10.0"
18
+
19
+ [tool.poetry.scripts]
20
+ geler = "geler.__main__:run_cli"
21
+
22
+ [build-system]
23
+ requires = ["poetry-core"]
24
+ build-backend = "poetry.core.masonry.api"