geler-CERTIC 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geler_certic-0.1.0/PKG-INFO +71 -0
- geler_certic-0.1.0/README.md +52 -0
- geler_certic-0.1.0/geler/__init__.py +229 -0
- geler_certic-0.1.0/geler/__main__.py +21 -0
- geler_certic-0.1.0/pyproject.toml +24 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: geler-CERTIC
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Help convert dynamic websites to static ones.
|
|
5
|
+
License: CECILL-C
|
|
6
|
+
Author: Mickaël Desfrênes
|
|
7
|
+
Author-email: mickael.desfrenes@unicaen.fr
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: License :: CeCILL-C Free Software License Agreement (CECILL-C)
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Dist: argh (>=0.31.3,<0.32.0)
|
|
16
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
|
17
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Geler
|
|
21
|
+
|
|
22
|
+
Help convert dynamic websites to static ones.
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
pip install geler-CERTIC
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
As a library in your own program:
|
|
31
|
+
|
|
32
|
+
from geler import freeze
|
|
33
|
+
freeze("https://acme.tld/", "/path/to/local/dir/")
|
|
34
|
+
|
|
35
|
+
As a CLI tool:
|
|
36
|
+
|
|
37
|
+
$> geler --help
|
|
38
|
+
usage: geler [-h] start-url destination-path
|
|
39
|
+
|
|
40
|
+
Freeze given site at start_url to directory at destination_path
|
|
41
|
+
|
|
42
|
+
positional arguments:
|
|
43
|
+
start-url -
|
|
44
|
+
destination-path -
|
|
45
|
+
|
|
46
|
+
optional arguments:
|
|
47
|
+
-h, --help show this help message and exit
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
## Why ?
|
|
51
|
+
|
|
52
|
+
For [MaX](https://git.unicaen.fr/pdn-certic/MaX) and associated tools,
|
|
53
|
+
we needed a lightweight, portable, pure Python solution to convert
|
|
54
|
+
small dynamic websites to static ones.
|
|
55
|
+
|
|
56
|
+
## Alternatives
|
|
57
|
+
|
|
58
|
+
This tool has a narrow scope, on purpose. Please turn to these solutions if you need more:
|
|
59
|
+
|
|
60
|
+
- [wget](https://www.gnu.org/software/wget/)
|
|
61
|
+
- [pywebcopy](https://pypi.org/project/pywebcopy/)
|
|
62
|
+
- [HTTrack](https://www.httrack.com)
|
|
63
|
+
|
|
64
|
+
## Limitations
|
|
65
|
+
|
|
66
|
+
- only works with HTTP GET
|
|
67
|
+
- does not submit forms (even with GET method)
|
|
68
|
+
- only considers URLs in `src` or `href` attributes
|
|
69
|
+
- only considers URLs with `http` or `https` schemes
|
|
70
|
+
- only downloads what is in the same [netloc](https://docs.python.org/3/library/urllib.parse.html) (same domain, same port) as the start URL
|
|
71
|
+
- only patches URLs in `*.html` files, not in `*.js`, not in `*.css` (watch out for those `url(...)` in your CSS)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Geler
|
|
2
|
+
|
|
3
|
+
Help convert dynamic websites to static ones.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
pip install geler-CERTIC
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
As a library in your own program:
|
|
12
|
+
|
|
13
|
+
from geler import freeze
|
|
14
|
+
freeze("https://acme.tld/", "/path/to/local/dir/")
|
|
15
|
+
|
|
16
|
+
As a CLI tool:
|
|
17
|
+
|
|
18
|
+
$> geler --help
|
|
19
|
+
usage: geler [-h] start-url destination-path
|
|
20
|
+
|
|
21
|
+
Freeze given site at start_url to directory at destination_path
|
|
22
|
+
|
|
23
|
+
positional arguments:
|
|
24
|
+
start-url -
|
|
25
|
+
destination-path -
|
|
26
|
+
|
|
27
|
+
optional arguments:
|
|
28
|
+
-h, --help show this help message and exit
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
## Why ?
|
|
32
|
+
|
|
33
|
+
For [MaX](https://git.unicaen.fr/pdn-certic/MaX) and associated tools,
|
|
34
|
+
we needed a lightweight, portable, pure Python solution to convert
|
|
35
|
+
small dynamic websites to static ones.
|
|
36
|
+
|
|
37
|
+
## Alternatives
|
|
38
|
+
|
|
39
|
+
This tool has a narrow scope, on purpose. Please turn to these solutions if you need more:
|
|
40
|
+
|
|
41
|
+
- [wget](https://www.gnu.org/software/wget/)
|
|
42
|
+
- [pywebcopy](https://pypi.org/project/pywebcopy/)
|
|
43
|
+
- [HTTrack](https://www.httrack.com)
|
|
44
|
+
|
|
45
|
+
## Limitations
|
|
46
|
+
|
|
47
|
+
- only works with HTTP GET
|
|
48
|
+
- does not submit forms (even with GET method)
|
|
49
|
+
- only considers URLs in `src` or `href` attributes
|
|
50
|
+
- only considers URLs with `http` or `https` schemes
|
|
51
|
+
- only downloads what is in the same [netloc](https://docs.python.org/3/library/urllib.parse.html) (same domain, same port) as the start URL
|
|
52
|
+
- only patches URLs in `*.html` files, not in `*.js`, not in `*.css` (watch out for those `url(...)` in your CSS)
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
4
|
+
from urllib.parse import urlparse, urljoin, ParseResult, parse_qs
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from requests.exceptions import RequestException
|
|
7
|
+
from functools import cache
|
|
8
|
+
import logging
|
|
9
|
+
import requests
|
|
10
|
+
import os
|
|
11
|
+
import json
|
|
12
|
+
import mimetypes
|
|
13
|
+
from time import sleep
|
|
14
|
+
from base64 import urlsafe_b64encode
|
|
15
|
+
|
|
16
|
+
# Module-level logger. It is obtained through logging.getLogger() rather than
# by instantiating logging.Logger directly, so that it is registered in the
# logging hierarchy and honours the handlers/levels configured by the
# application (e.g. via logging.basicConfig()).
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def url64(s: Union[dict, str]) -> str:
    """Return the unpadded URL-safe base64 encoding of *s*.

    A dict is first serialised to JSON with sorted keys (so equal dicts
    always give the same digest); a str is encoded to bytes with the default
    codec. Any other value is handed to the base64 encoder unchanged.
    """
    if isinstance(s, dict):
        payload = json.dumps(s, sort_keys=True).encode()
    elif isinstance(s, str):
        payload = s.encode()
    else:
        payload = s
    encoded = urlsafe_b64encode(payload).decode()
    # Padding carries no information and "=" is awkward in file names.
    return encoded.rstrip("=")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@cache
def guess_extension(mime_type: str) -> str:
    """Return the file extension (with leading dot) for *mime_type*.

    Returns an empty string when the standard ``mimetypes`` registry has no
    extension for the given type. Results are memoised.
    """
    return mimetypes.guess_extension(mime_type) or ""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@cache
def url_to_path(url: str, mime: str = "") -> str:
    """Map *url* to the site-relative path used to store it locally.

    The result is built from the URL path, plus:

    - ``index`` when the URL path ends with ``/``;
    - ``-<digest>`` when the URL has a query string, where the digest is the
      ``url64`` encoding of the parsed query parameters;
    - a file extension: the one guessed from *mime* when there is one,
      otherwise the extension already present in the URL path.

    When the guessed extension differs from the URL's own extension, the
    original extension is kept inside the name (``/a.php`` served as
    ``text/html`` becomes ``/a.php.html``).

    :param url: absolute URL of the resource.
    :param mime: MIME type of the resource (without parameters).
    :return: path starting with the URL path; not yet joined to any
        destination directory.
    """
    parsed_url = cached_urlparse(url)
    path_no_ext, ext = os.path.splitext(parsed_url.path)
    guessed_ext = guess_extension(mime)

    if not guessed_ext:
        # Unknown MIME type: keep the URL's own extension instead of dropping
        # it, otherwise "/a.xyz" and "/a.abc" would both be stored as "/a".
        def_path, final_ext = path_no_ext, ext
    elif guessed_ext.lower() != ext:
        # Extension disagrees with the content type: keep both.
        def_path, final_ext = path_no_ext + ext, guessed_ext
    else:
        def_path, final_ext = path_no_ext, guessed_ext

    index = "index" if parsed_url.path.endswith("/") else ""

    query_digest = ""
    if parsed_url.query:
        query_digest = "-" + url64(dict(parse_qs(parsed_url.query)))

    return f"{def_path}{index}{query_digest}{final_ext}"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@cache
def cached_urlparse(url):
    """Memoised wrapper around :func:`urllib.parse.urlparse`."""
    parsed = urlparse(url)
    return parsed
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class Freezer:
    """Crawl a website over HTTP GET and store a static copy on disk.

    Starting from one URL, every ``href``/``src`` found in HTML documents
    that points to the same netloc is downloaded (in a thread pool) below
    ``destination``. Once the crawl is finished, links inside the saved
    ``*.html`` files are rewritten to the local file paths.
    """

    def __init__(
        self,
        start_from: str,
        destination: Union[Path, str],
        skip_extensions=None,
        http_get_timeout=30,
        thread_pool_size=10,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
    ):
        """
        :param start_from: URL the crawl starts from; any ``#fragment`` is
            ignored.
        :param destination: directory receiving the files (created if it
            does not exist).
        :param skip_extensions: URL path extensions (lower case, with the
            leading dot) that must not be downloaded.
        :param http_get_timeout: timeout passed to ``requests.get``.
        :param thread_pool_size: number of worker threads.
        :param user_agent: value of the ``User-Agent`` request header.
        """
        self.destination = Path(destination)
        if not self.destination.exists():
            os.makedirs(self.destination, exist_ok=True)
        self.start_from_url = start_from.split("#")[0]  # ignore anchors
        self.start_from_domain = cached_urlparse(self.start_from_url).netloc
        self.get_timeout = http_get_timeout
        self.max_workers = thread_pool_size
        self.html_mimes = ["text/html", "application/xhtml+xml", "application/xml"]
        self.schemes = ["http", "https"]
        self.scrap_executor_pool = ThreadPoolExecutor(max_workers=self.max_workers)
        # Maps fragment-less URL -> local path ("" until/unless downloaded).
        self.links_done = {}
        # Fragment-less URLs submitted to the pool and not finished yet.
        self.links_to_do = []
        self.user_agent = user_agent
        self.skip_extensions = []
        if skip_extensions is not None:
            self.skip_extensions = skip_extensions

    # NOTE: deliberately not wrapped in functools.cache: caching a method
    # keys on ``self`` and keeps every Freezer (and every URL pair) alive for
    # the life of the process. The queue check below already prevents a URL
    # from being submitted twice.
    def validate_candidate_url(
        self, url: str, parent_url: str
    ) -> Union[ParseResult, bool]:
        """Decide whether *url* belongs to the frozen site and queue it.

        :param url: raw attribute value found in a document.
        :param parent_url: URL of the document containing the link, used to
            resolve relative links.
        :return: ``False`` when the link must be left untouched (empty,
            foreign site, unsupported scheme, skipped extension); otherwise
            the parsed absolute URL, fragment included. As a side effect the
            fragment-less URL is submitted to the thread pool if it is not
            already done or queued.
        """
        if not url.strip():
            return False
        # exclude absolute links not in the site
        if url.startswith(("http://", "https://")) and not url.startswith(
            self.start_from_url
        ):
            return False  # Skip foreign URL
        absolute_url = urljoin(parent_url, url)  # make link absolute
        # Parse before dropping the fragment so that the rewritten link keeps
        # its "#anchor"; the queue itself only deals in fragment-less URLs.
        parsed_url = cached_urlparse(absolute_url)
        url = absolute_url.split("#")[0]
        if parsed_url.scheme not in self.schemes:  # excludes non http protocols
            return False
        if parsed_url.netloc != self.start_from_domain:
            return False
        _, ext = os.path.splitext(parsed_url.path)
        if ext.lower() in self.skip_extensions:  # exclude extensions
            return False
        if url not in self.links_done and url not in self.links_to_do:
            # Record the URL *before* handing it to the pool: if the worker
            # finished first, its cleanup would find nothing to remove and
            # the entry appended afterwards would keep freeze() waiting
            # forever.
            self.links_to_do.append(url)
            self.scrap_executor_pool.submit(self.scrap_item, url)
        return parsed_url

    def _rebuild_link(self, parsed_url: ParseResult) -> str:
        """Return the site-relative form (path, query, fragment) of a URL."""
        return (
            parsed_url.path
            + ("?" + parsed_url.query if parsed_url.query else "")
            + ("#" + parsed_url.fragment if parsed_url.fragment else "")
        )

    def scrap_item(self, item_url):
        """Download *item_url*, save it and queue the links it contains.

        HTML documents have their in-site ``href``/``src`` values replaced by
        site-relative links before being written. Request errors are logged
        and the item is left with an empty local path.
        """
        item_url_no_anchor = item_url.split("#")[0]

        # add to links in done list early, avoid race conditions
        if item_url_no_anchor in self.links_done:
            return
        self.links_done[item_url_no_anchor] = ""
        try:
            response = requests.get(
                item_url_no_anchor,
                allow_redirects=True,
                timeout=self.get_timeout,
                headers={"User-Agent": self.user_agent},
                stream=True,
            )
            if not response.ok:
                return
            # The header may be absent; treat that as an unknown type instead
            # of failing on None.
            content_type = (
                response.headers.get("Content-Type", "").split(";")[0].strip()
            )
            self.links_done[item_url_no_anchor] = url_to_path(
                item_url_no_anchor, content_type
            )
            if content_type in self.html_mimes:
                soup = BeautifulSoup(response.content, "html.parser")
                for attrib in ["href", "src"]:
                    for link in soup.select(f"[{attrib}]"):
                        validated_link = self.validate_candidate_url(
                            link.get(attrib), item_url_no_anchor
                        )
                        if validated_link:
                            link[attrib] = self._rebuild_link(validated_link)
                self._save_item_to_disk(item_url_no_anchor, content_type, soup)
            else:
                self._save_item_to_disk(item_url_no_anchor, content_type, response)
        except RequestException as e:
            logger.warning(f"RequestException: {e} on URL {item_url_no_anchor}")
        # whatever happens, remove item from to do list
        finally:
            while item_url_no_anchor in self.links_to_do:
                self.links_to_do.remove(item_url_no_anchor)

    def _save_item_to_disk(
        self,
        item_url: str,
        content_type: str,
        data: Union[requests.Response, BeautifulSoup],
    ):
        """Write *data* below the destination directory.

        A parsed document is written as text, a raw response as bytes. The
        local path is then recorded in ``links_done``.
        """
        convert_url_to_path = url_to_path(item_url, content_type)
        local_path = Path(self.destination, convert_url_to_path.lstrip("/"))
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        if isinstance(data, BeautifulSoup):
            # Beautiful Soup's string output targets UTF-8, so write it as
            # UTF-8 rather than in the platform's default encoding.
            with open(local_path, "w", encoding="utf-8") as f:
                f.write(str(data))
        elif isinstance(data, requests.Response):
            with open(local_path, "wb") as f:
                f.write(data.content)

        # update done list with new URL value
        self.links_done[item_url] = convert_url_to_path

    def _patch_links(self):
        """Rewrite links in every saved ``*.html`` file to local paths."""
        netloc = cached_urlparse(self.start_from_url).netloc
        trans_table = {}
        for original_url, local_path in self.links_done.items():
            if not local_path:
                # Nothing was saved for this URL (failed request); leave the
                # links pointing at it untouched rather than blanking them.
                continue
            # maxsplit=1: the netloc text may appear again later in the URL.
            url_no_netloc = original_url.split(netloc, 1)[1]
            trans_table[url_no_netloc] = local_path
        for local_path in self.links_done.values():
            if local_path.endswith(".html"):
                self._patch_html_file(local_path, trans_table)

    def _patch_html_file(self, local_path: Union[Path, str], trans_table: dict) -> bool:
        """Replace ``href``/``src`` values of one saved file using *trans_table*.

        :param local_path: site-relative path of the file, as stored in
            ``links_done``.
        :param trans_table: maps site-relative URLs (without fragment) to
            local paths.
        :return: ``True`` when the file was rewritten, ``False`` when it does
            not exist on disk.
        """
        f_path = Path(self.destination, local_path.lstrip("/"))
        if not f_path.is_file():
            return False
        with open(f_path, "r+", encoding="utf-8") as fp:
            soup = BeautifulSoup(fp, "html.parser")
            for attrib in ("href", "src"):
                for link in soup.select(f"[{attrib}]"):
                    # Look the link up without its fragment, then put the
                    # fragment back on the translated path.
                    target, sep, anchor = link.get(attrib).partition("#")
                    if target in trans_table:
                        link[attrib] = trans_table[target] + sep + anchor
            fp.seek(0)
            fp.write(str(soup))
            fp.truncate()
        return True

    def freeze(self):
        """Run the whole crawl, then patch links in the saved HTML files."""
        # The start URL is scraped synchronously; the links it contains are
        # fanned out to the thread pool.
        self.scrap_item(self.start_from_url)
        while self.links_to_do:
            sleep(1)
        self.scrap_executor_pool.shutdown(wait=True)
        self._patch_links()
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def freeze(start_from_url: str, save_to_path: Union[Path, str]):
    """Freeze the site found at *start_from_url* into *save_to_path*.

    Convenience wrapper: builds a ``Freezer`` with its default settings and
    runs it.
    """
    freezer = Freezer(start_from_url, save_to_path)
    freezer.freeze()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import argh
|
|
2
|
+
from geler import Freezer
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def freeze(start_url: str, destination_path: str):
    """
    Freeze given site at start-url to directory at destination-path
    """
    # The docstring above is kept verbatim: argh shows it as the command's
    # help text.
    Freezer(start_url, destination_path).freeze()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_cli():
    """Console-script entry point: expose ``freeze`` as a single argh command."""
    argh.dispatch_command(freeze)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Allow "python -m geler": dispatch the CLI only when run as a script.
if __name__ == "__main__":
    run_cli()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "geler-CERTIC"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Help convert dynamic websites to static ones."
|
|
5
|
+
authors = ["Mickaël Desfrênes <mickael.desfrenes@unicaen.fr>"]
|
|
6
|
+
license = "CECILL-C"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
packages = [{include = "geler"}]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = "^3.9"
|
|
12
|
+
argh = "^0.31.3"
|
|
13
|
+
beautifulsoup4 = "^4.12.3"
|
|
14
|
+
requests = "^2.32.3"
|
|
15
|
+
|
|
16
|
+
[tool.poetry.group.dev.dependencies]
|
|
17
|
+
black = "^24.10.0"
|
|
18
|
+
|
|
19
|
+
[tool.poetry.scripts]
|
|
20
|
+
geler = "geler.__main__:run_cli"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["poetry-core"]
|
|
24
|
+
build-backend = "poetry.core.masonry.api"
|