ex-cd 1.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ex_cd/__init__.py +24 -0
- ex_cd/__main__.py +5 -0
- ex_cd/collect.py +69 -0
- ex_cd/common.py +87 -0
- ex_cd/config.py +92 -0
- ex_cd/deprecate.py +44 -0
- ex_cd/download.py +74 -0
- ex_cd/gallery_dl_exec.py +21 -0
- ex_cd/history.py +76 -0
- ex_cd/meta.py +123 -0
- ex_cd/output.py +33 -0
- ex_cd/tools/__init__.py +29 -0
- ex_cd/tools/__main__.py +5 -0
- ex_cd/tools/config.py +48 -0
- ex_cd/tools/delete.py +119 -0
- ex_cd/tools/latest_meta.py +24 -0
- ex_cd/validate.py +80 -0
- ex_cd-1.17.0.dist-info/METADATA +99 -0
- ex_cd-1.17.0.dist-info/RECORD +22 -0
- ex_cd-1.17.0.dist-info/WHEEL +5 -0
- ex_cd-1.17.0.dist-info/licenses/LICENSE +21 -0
- ex_cd-1.17.0.dist-info/top_level.txt +1 -0
ex_cd/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .config import build_parser, read_config
|
|
2
|
+
from .history import get_latest_url, put_history_placeholder
|
|
3
|
+
from .output import initialize_logging
|
|
4
|
+
from .download import download_gallery_history
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main():
    """Command-line entry point: download a gallery together with its history.

    Resolves the latest child of the requested URL, tags its parent
    galleries, then tries the download ``config['retry']`` times. If every
    retry fails, one last attempt is made whose exception is not caught.
    """
    args = build_parser().parse_args()
    logger = initialize_logging(args.loglevel)
    config = read_config(args, logger)
    logger.info(f"Parsed config: {args}")
    latest_url, gallery_dir = get_latest_url(args.url, config, logger)
    logger.info(f"Downloading: {latest_url} -> {gallery_dir}")
    put_history_placeholder(latest_url, gallery_dir, config, logger)
    for _attempt in range(config['retry']):
        try:
            download_gallery_history(latest_url, gallery_dir, config, logger)
        except Exception as e:
            logger.error(f"download_gallery_history failed, retry: {e}")
        else:
            # Successfully downloaded
            return
    # Final attempt, may raise an exception
    return download_gallery_history(latest_url, gallery_dir, config, logger)
|
ex_cd/__main__.py
ADDED
ex_cd/collect.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import hashlib
|
|
4
|
+
from .common import META_FOLDER, _get_gallery_metadata_filenames
|
|
5
|
+
from .meta import _download_gallery_meta
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _iter_metadata(gallery_dir, logger):
    """Yield ``(name, metadata)`` for every loadable metadata JSON of a gallery.

    ``name`` is the metadata file name with its ``.json`` extension removed.
    Files that cannot be opened or parsed are logged as warnings and skipped.
    """
    metadata_dir = os.path.join(gallery_dir, META_FOLDER)
    for filename in _get_gallery_metadata_filenames(gallery_dir):
        stem, extension = os.path.splitext(filename)
        if extension != '.json':
            continue
        json_path = os.path.join(metadata_dir, filename)
        try:
            with open(json_path, encoding='utf8') as fp:
                yield stem, json.load(fp)
        except Exception as e:
            logger.warning(f"Cannot load exist json file {filename}: {e}")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _iter_imgfile_metadata(gallery_dir, logger):
    """Yield ``(image_path, metadata)`` for metadata whose image file exists."""
    for stem, metadata in _iter_metadata(gallery_dir, logger):
        image_path = os.path.join(gallery_dir, stem)
        if os.path.isfile(image_path):
            yield image_path, metadata
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _check_img(imgfile, meta, logger):
|
|
31
|
+
if 'image_token' not in meta:
|
|
32
|
+
return False
|
|
33
|
+
image_token = meta['image_token']
|
|
34
|
+
sha1_token = None
|
|
35
|
+
if not os.path.isfile(imgfile):
|
|
36
|
+
return False
|
|
37
|
+
try:
|
|
38
|
+
with open(imgfile, mode="rb") as fp:
|
|
39
|
+
sha1_token = hashlib.sha1(fp.read()).hexdigest()
|
|
40
|
+
except Exception as e:
|
|
41
|
+
logger.error(f"Invalid {imgfile}: cannot compute token, {e}")
|
|
42
|
+
return False
|
|
43
|
+
return image_token == sha1_token[0:10]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _collect_gallery_history(gallery_dir, config, logger):
    """Map the image token of every valid image in ``gallery_dir`` to its path."""
    return {
        meta['image_token']: imgfile
        for imgfile, meta in _iter_imgfile_metadata(gallery_dir, logger)
        if _check_img(imgfile, meta, logger)
    }
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _load_gallery_history(url, gallery_dir, config, logger, history):
    """Fill this gallery with already-downloaded images listed in ``history``.

    ``history`` maps image tokens to existing image paths. For each metadata
    entry of this gallery whose token is in ``history`` and whose own image
    is not already valid, the recorded file is renamed into this gallery
    and the token is removed from ``history``.
    """
    _download_gallery_meta(url, gallery_dir, config, logger)
    for stem, meta in _iter_metadata(gallery_dir, logger):
        if 'image_token' not in meta or meta['image_token'] not in history:
            continue
        token = meta['image_token']
        target = os.path.join(gallery_dir, stem)
        if _check_img(target, meta, logger):
            # this gallery already has a valid copy
            continue
        if os.path.exists(target):
            os.remove(target)
        os.rename(history[token], target)
        del history[token]
|
ex_cd/common.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
import json
|
|
4
|
+
import random
|
|
5
|
+
import re
|
|
6
|
+
import ex_cd.gallery_dl_exec as gallery_dl
|
|
7
|
+
|
|
8
|
+
# Matches the scheme and host of an e-hentai / exhentai URL (literal dot).
replace_site_re = re.compile(r"https://e[-x]hentai\.org")


def replace_site(url, config):
    """Replace the e-hentai/exhentai site prefix in ``url`` by a configured one.

    Args:
        url: URL to rewrite.
        config: Mapping whose ``"replace-site"`` entry is the replacement
            prefix, or a falsy value to leave the URL untouched.

    Returns:
        The rewritten URL, or ``url`` unchanged when no replacement is set.
    """
    site = config["replace-site"]
    if not site:
        return url
    # A function replacement inserts `site` literally; passing the string
    # itself would make `re` interpret backslashes in it as escapes.
    return replace_site_re.sub(lambda _match: site, url)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Name of the sub-folder inside each gallery directory that holds the
# metadata JSON files.
META_FOLDER = '.metadata'
|
|
17
|
+
|
|
18
|
+
# Command-line arguments passed to gallery-dl: write metadata, with the
# postprocessor "directory" option set to META_FOLDER.
metadata_args = ["--write-metadata", "--postprocessor-option", f"directory={META_FOLDER}"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _download_gallery_metadata_and_extract_gallery_dir(url, config, logger):
    '''Run gallery-dl on the first item of a gallery and return its directory.

    gallery-dl is invoked without downloading images; an ``--exec-after``
    command echoes the gallery directory into a temporary file, which is
    then read back. Raises ValueError when gallery-dl exits non-zero or the
    file was not produced.
    '''
    with TemporaryDirectory() as tmpdir:
        out_file = os.path.join(tmpdir, 'temp.txt')
        executable = config["gallery-dl-exec"]
        extra_meta_args = config["gallery-dl-meta-args"]
        command = [
            *executable, "--no-download", "--no-skip", "--range", "1",
            "--exec-after", "echo {_directory} > %s" % out_file,
            *metadata_args, *extra_meta_args, replace_site(url, config),
        ]
        logger.debug(f"Exec: {command}")
        exit_code = gallery_dl.main(*command)
        if exit_code != 0 or not os.path.isfile(out_file):
            raise ValueError("Cannot get gallery by gallery-dl")
        with open(out_file, encoding='utf8') as fp:
            gallery_dir = os.path.join(fp.read().strip())
        # Windows-specific cleanup: drop a surrounding pair of double quotes
        # and a leading backslashes-?-backslashes prefix.
        for pattern in (r'^"', r'"$', r'^\\+\?\\+'):
            gallery_dir = re.sub(pattern, "", gallery_dir)
        return os.path.join(gallery_dir)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _get_gallery_metadata_filenames(gallery_dir):
    """List the ``.json`` file names in the gallery's metadata folder.

    The metadata folder is created if it does not exist yet.
    """
    metadata_dir = os.path.join(gallery_dir, META_FOLDER)
    os.makedirs(metadata_dir, exist_ok=True)
    return [
        entry for entry in os.listdir(metadata_dir)
        if os.path.splitext(entry)[1] == '.json'
    ]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _get_gallery_metadata_files_path(gallery_dir):
    """Return the paths of every metadata JSON file of the gallery."""
    metadata_dir = os.path.join(gallery_dir, META_FOLDER)
    paths = []
    for filename in _get_gallery_metadata_filenames(gallery_dir):
        paths.append(os.path.join(metadata_dir, filename))
    return paths
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _try_get_gallery_one_metadata_from_dir(gallery_dir, logger):
    '''Load one randomly chosen metadata JSON file of the gallery.

    Returns None when ``gallery_dir`` is None, when there are no metadata
    files, or when the chosen file cannot be loaded (logged as an error).
    '''
    if gallery_dir is None:
        return None
    candidates = _get_gallery_metadata_files_path(gallery_dir)
    if len(candidates) == 0:
        return None
    metafile = random.choice(candidates)
    try:
        with open(metafile, encoding='utf8') as fp:
            return json.load(fp)
    except Exception as e:
        logger.error(f"Cannot load exist json file {metafile}: {e}")
    return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_gallery_one_metadata(url, gallery_dir, config, logger):
    '''Return one metadata dict of a gallery, downloading it if necessary.

    Metadata already present in ``gallery_dir`` is used first. Otherwise
    gallery-dl fetches one metadata file; the directory it reports must
    equal ``gallery_dir`` (when given) or ValueError is raised. ValueError
    is also raised if no metadata can be loaded afterwards.
    '''
    meta = _try_get_gallery_one_metadata_from_dir(gallery_dir, logger)
    if meta:
        return meta
    downloaded_dir = _download_gallery_metadata_and_extract_gallery_dir(url, config, logger)
    if gallery_dir is None:
        gallery_dir = downloaded_dir
    if os.path.abspath(gallery_dir) != os.path.abspath(downloaded_dir):
        raise ValueError(f"gallery_dir not match: {gallery_dir} != {downloaded_dir}")
    meta = _try_get_gallery_one_metadata_from_dir(gallery_dir, logger)
    if meta:
        return meta
    raise ValueError(f"Cannot get metadata: {url, gallery_dir}")
|
ex_cd/config.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_parser():
    """Build and configure the ArgumentParser of the downloader.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``-c/--config``,
        ``-q/--quiet``, ``-v/--verbose`` and exactly one positional ``url``.
    """
    # Exactly one URL is accepted, so the usage line must not advertise
    # "URL..." (one or more).
    parser = argparse.ArgumentParser(usage="%(prog)s [OPTION]... URL")

    general = parser.add_argument_group("General Options")
    general.add_argument(
        "-c", "--config",
        dest="config", type=str,
        help="Path to config json file or a json string",
        default=None,
    )
    output = parser.add_argument_group("Output Options")
    # -q and -v share one destination; the last one given wins.
    output.add_argument(
        "-q", "--quiet",
        dest="loglevel", default=logging.INFO,
        action="store_const", const=logging.ERROR,
        help="Activate quiet mode",
    )
    output.add_argument(
        "-v", "--verbose",
        dest="loglevel",
        action="store_const", const=logging.DEBUG,
        help="Print various debugging information",
    )
    parser.add_argument(
        "url", type=str,
        help="URL of the gallery to download",
    )

    return parser
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Default configuration; read_config() overrides these entries in place.
config = {
    # Replace https://e-hentai.org or https://exhentai.org by this before start download
    "replace-site": None,
    # Root for the gallery
    "gallery-root": None,
    # Regular expression to extract path of the file from URL
    "path-re": "^https://e[-x]hentai.org/g/([0-9]+)/[0-9a-z]+/*$",
    # If "gallery-root" is specified and "path-re" is valid,
    # _get_gallery_dir_by_re is used to compute the gallery path.

    # Retry times on failure
    "retry": 3,

    # Maximum depth of the gallery history
    "depth": 8,

    # Executable gallery-dl commandline program
    "gallery-dl-exec": [sys.executable, "-m", "gallery_dl"],
    # Args for running gallery-dl commandline program
    "gallery-dl-args": [],
    # Args for running gallery-dl commandline program for meta extraction
    "gallery-dl-meta-args": [],
}


def _apply_config_override(override, source, logger):
    """Copy the known keys of ``override`` into the module-level ``config``."""
    if not isinstance(override, dict):
        logger.warning("Ignoring config %s: expected a JSON object" % source)
        return
    for k in config:
        if k in override:
            config[k] = override[k]


def read_config(args, logger):
    """Merge user configuration into the module-level ``config`` and return it.

    Sources are applied in order, later ones winning: the JSON file named by
    the ``EXCD_CONFIG_FILE`` environment variable, then ``args.config``,
    which is either a path to a JSON file or a JSON string. Only keys that
    already exist in ``config`` are taken over. A source that cannot be read
    or parsed is reported as a warning and skipped.

    Args:
        args: Parsed arguments providing a ``config`` attribute (str or None).
        logger: Logger used for warnings and the final configuration dump.

    Returns:
        The updated module-level ``config`` dict.
    """
    if "EXCD_CONFIG_FILE" in os.environ:
        env_path = os.environ["EXCD_CONFIG_FILE"]
        try:
            with open(env_path, encoding='utf8') as f:
                env_override = json.load(f)
        except Exception as e:
            logger.warning("Cannot read EXCD_CONFIG_FILE %s: %s" % (env_path, e))
        else:
            _apply_config_override(env_override, env_path, logger)

    if args.config is not None:
        is_file = os.path.isfile(args.config)
        try:
            if is_file:
                with open(args.config, encoding='utf8') as f:
                    override = json.load(f)
            else:
                override = json.loads(args.config)
        except Exception as e:
            verb = "read" if is_file else "parse"
            logger.warning("Cannot %s config %s: %s" % (verb, args.config, e))
        else:
            # Applied only after a successful load, so a failed -c can never
            # hit an unbound or stale `override`.
            _apply_config_override(override, args.config, logger)

    logger.info("Config: %s" % config)
    return config
|
ex_cd/deprecate.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from .common import META_FOLDER, _get_gallery_metadata_filenames
|
|
4
|
+
from .validate import _validate_gallery
|
|
5
|
+
from .meta import _download_gallery_meta
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Marker file created in the parent gallery's metadata folder once
# _deprecate_gallery_history has finished for that gallery.
DEPRECAT_COMPLETED_FILE = 'DeprecateCompleted'
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _deprecate_gallery_history(parent_url, parent_gallery_dir, child_url, child_gallery_dir, config, logger):
    """Move images shared with the child from the parent gallery into the child.

    Does nothing if the parent's completion marker already exists. The parent
    must validate, otherwise RuntimeError is raised. Every parent image whose
    token also appears in the child's metadata is moved onto the child's
    path, then the completion marker is written.
    """
    done_marker = os.path.join(parent_gallery_dir, META_FOLDER, DEPRECAT_COMPLETED_FILE)
    if os.path.isfile(done_marker):
        return
    if not _validate_gallery(parent_url, parent_gallery_dir, config, logger):
        raise RuntimeError(f"Cannot deprecate from a invalid gallery {parent_gallery_dir}")
    _download_gallery_meta(child_url, child_gallery_dir, config, logger)

    def image_paths_by_token(directory):
        # token -> image path, the image name being the metadata file name
        # without its 5-character ".json" suffix
        mapping = {}
        for metafile in _get_gallery_metadata_filenames(directory):
            metapath = os.path.join(directory, META_FOLDER, metafile)
            with open(metapath, 'r', encoding='utf8') as fp:
                meta = json.load(fp)
            mapping[meta['image_token']] = os.path.join(directory, metafile[0:-5])
        return mapping

    sources = image_paths_by_token(parent_gallery_dir)
    targets = image_paths_by_token(child_gallery_dir)
    for token, source_path in sources.items():
        if token in targets:
            os.replace(source_path, targets[token])
    # create the empty marker recording that this gallery has been processed
    with open(done_marker, "w", encoding='utf8'):
        return
|
ex_cd/download.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import ex_cd.gallery_dl_exec as gallery_dl
|
|
3
|
+
from .validate import _validate_gallery, DOWNLOAD_RESUME_FILE, VALIDATE_COMPLETED_FILE
|
|
4
|
+
from .collect import _load_gallery_history
|
|
5
|
+
from .common import metadata_args, META_FOLDER, replace_site
|
|
6
|
+
from .history import _get_gallery_parent_url, _get_gallery_dir
|
|
7
|
+
from .deprecate import _deprecate_gallery_history
|
|
8
|
+
from .collect import _collect_gallery_history
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _gather_gallery_history(url, gallery_dir, config, logger):
    """Collect token -> image path for this gallery and all of its ancestors.

    A gallery that already has its validation marker contributes nothing.
    On duplicate tokens the entry from the older gallery wins.
    """
    marker = os.path.join(gallery_dir, META_FOLDER, VALIDATE_COMPLETED_FILE)
    if os.path.isfile(marker):
        collected = {}
    else:
        # not complete: collect existing history
        collected = _collect_gallery_history(gallery_dir, config, logger)
    parent_url = _get_gallery_parent_url(url, gallery_dir, config, logger)
    if parent_url == '':
        # no parent: this gallery's own history is all there is
        return collected
    parent_dir = _get_gallery_dir(parent_url, config, logger)
    older = _gather_gallery_history(parent_url, parent_dir, config, logger)
    return {**collected, **older}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def download_gallery_history(url, gallery_dir, config, logger, history=None, depth=0):
    """Download a gallery together with all of its parent galleries.

    Args:
        url: URL of the gallery.
        gallery_dir: Directory of the gallery.
        config: Configuration mapping; ``config["depth"]`` limits how many
            parents are downloaded recursively.
        logger: Logger passed through to the helpers.
        history: Mapping of image token -> existing image path that may be
            reused instead of downloading. Defaults to an empty mapping.
        depth: Current recursion depth (0 for the requested gallery).

    Returns:
        The result of ``_download_gallery`` for this gallery.
    """
    if history is None:
        # Fresh dict per call instead of a shared mutable default argument.
        history = {}
    parent_url = _get_gallery_parent_url(url, gallery_dir, config, logger)
    if parent_url == '':  # if no parent
        return _download_gallery(url, gallery_dir, config, logger, history)  # just download it
    # if has parent
    parent_gallery_dir = _get_gallery_dir(parent_url, config, logger)
    if depth >= config["depth"]:
        # Depth limit reached: do not download the parents, only reuse the
        # images they already have on disk.
        return _download_gallery(url, gallery_dir, config, logger,
                                 {**history, **_gather_gallery_history(parent_url, parent_gallery_dir, config, logger)})
    ok_file = os.path.join(gallery_dir, META_FOLDER, VALIDATE_COMPLETED_FILE)
    if not os.path.isfile(ok_file):  # if not complete
        history = {**history, **_collect_gallery_history(gallery_dir, config, logger)}  # collect existing history
    # In this process VALIDATE_COMPLETED_FILE is placed from old history to new.
    # If VALIDATE_COMPLETED_FILE is already present here, either
    # 1. all the old galleries are downloaded, or
    # 2. this gallery was downloaded by download_gallery_latest and there are
    #    old galleries not downloaded.
    # In both cases this gallery should not be used as history.
    download_gallery_history(parent_url, parent_gallery_dir, config, logger, history, depth+1)  # download parent
    _deprecate_gallery_history(parent_url, parent_gallery_dir, url, gallery_dir,
                               config, logger)  # deprecate from parent
    return _download_gallery(url, gallery_dir, config, logger, history)  # download the rest
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _download_gallery(url, gallery_dir, config, logger, history=None):
    """Download one gallery with gallery-dl and validate the result.

    Args:
        url: URL of the gallery.
        gallery_dir: Directory of the gallery.
        config: Configuration mapping.
        logger: Logger passed through to the helpers.
        history: Mapping of image token -> existing image path; matching
            images are reused before anything is downloaded. Defaults to an
            empty mapping.

    Raises:
        RuntimeError: If the gallery is still invalid after the download.
    """
    if history is None:
        # Fresh dict per call instead of a shared mutable default argument.
        history = {}
    if _validate_gallery(url, gallery_dir, config, logger):  # validate the gallery
        return  # exit
    _load_gallery_history(url, gallery_dir, config, logger, history)  # load existing history
    if _validate_gallery(url, gallery_dir, config, logger):  # validate the gallery
        return  # record that this gallery has been downloaded
    resume_url = url
    resume_file = os.path.join(gallery_dir, META_FOLDER, DOWNLOAD_RESUME_FILE)
    try:
        with open(resume_file, "r", encoding="utf8") as fp:
            # An empty or whitespace-only resume file must not yield an
            # empty URL; fall back to the gallery URL.
            resume_url = fp.readline().strip() or url
    except (OSError, UnicodeError):
        # No usable resume file: start from the gallery URL.
        pass
    gallery_dl_exec = config["gallery-dl-exec"]
    # NOTE(review): this real download uses "gallery-dl-meta-args", and
    # "gallery-dl-args" is not read anywhere in the visible code. Confirm
    # whether "gallery-dl-args" was intended here.
    gallery_dl_meta_args = config["gallery-dl-meta-args"]
    args = [
        *gallery_dl_exec,
        *metadata_args, *gallery_dl_meta_args, replace_site(resume_url, config)
    ]
    logger.debug(f"Exec: {args}")
    returncode = gallery_dl.main(*args)
    if _validate_gallery(url, gallery_dir, config, logger):  # validate the gallery
        return  # record that this gallery has been downloaded
    elif returncode != 0:
        raise RuntimeError(f"Download failed: {url} -> {gallery_dir}")
    else:
        raise RuntimeError(f"Download not valid: {url} -> {gallery_dir}")
|
ex_cd/gallery_dl_exec.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def main(*args):
    """Run ``args`` as a child process, wait for it, and return its exit code."""
    process = subprocess.Popen(args=args)
    exit_code = process.wait()
    return exit_code
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Manual debugging entry point: runs gallery-dl in metadata-only mode
# against one hard-coded gallery URL.
if __name__ == "__main__":
    import sys
    import os
    # Change to the directory two levels above the script path, so the
    # relative '.vscode/...' config path below is resolved from there.
    os.chdir(os.path.dirname(os.path.dirname(sys.argv[0])))
    main(
        sys.executable, "-m", "gallery_dl",
        "--no-download",
        "--write-metadata",
        "--postprocessor-option", 'directory=metadata',
        '-v',
        '--sleep-request', '1',
        '-c', '.vscode/gallery-dl.config.json',
        'https://exhentai.org/g/2752577/8ffd3778cb/'
    )
|
ex_cd/history.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from .common import get_gallery_one_metadata, _download_gallery_metadata_and_extract_gallery_dir
|
|
4
|
+
from .common import META_FOLDER
|
|
5
|
+
|
|
6
|
+
# File inside a gallery's metadata folder that stores the URL of its child
# gallery.
CHILD_NAME = 'child.url'
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_gallery_dir_by_re(url, path_re, root):
|
|
10
|
+
dirname = re.findall(path_re, url)[0]
|
|
11
|
+
return os.path.join(root, dirname)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _get_gallery_dir_by_gdl(url, config, logger):
    """Return the gallery directory as reported by gallery-dl.

    Thin wrapper around _download_gallery_metadata_and_extract_gallery_dir.
    """
    return _download_gallery_metadata_and_extract_gallery_dir(url, config, logger)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_gallery_dir(url, config, logger):
    """Return the directory of the gallery at ``url``.

    When both ``config['gallery-root']`` and ``config['path-re']`` are set,
    the directory is derived from the URL. If they are unset, or deriving
    fails (logged as a warning), gallery-dl is asked for the directory.
    """
    try:
        if config['gallery-root'] and config['path-re']:
            return _get_gallery_dir_by_re(url, config['path-re'], config['gallery-root'])
    except Exception as e:
        # Logger.warn is a deprecated alias; use warning().
        logger.warning("Should parse by path-re and gallery-root, but error: %s" % e)
    return _get_gallery_dir_by_gdl(url, config, logger)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Captures the numeric gallery id of an e-hentai / exhentai gallery URL.
url2gid_re = re.compile(r"^https://e[-x]hentai\.org/g/([0-9]+)/[0-9a-z]+/*$")


def _isparent(url, gallery_dir, child_url, child_gallery_dir, config, logger):
    """Tell whether the gallery at ``url`` is the parent of ``child_url``.

    Compares this gallery's ``gid`` with the gallery id found in the
    ``parent`` URL recorded in the child's metadata.

    Returns:
        True if the ids are equal; False if they differ or if the child
        records no parseable parent URL.
    """
    metadata = get_gallery_one_metadata(url, gallery_dir, config, logger)
    child_metadata = get_gallery_one_metadata(child_url, child_gallery_dir, config, logger)
    # An empty or malformed parent URL means "not the parent" rather than an
    # IndexError from indexing an empty findall result.
    match = url2gid_re.search(child_metadata.get('parent') or '')
    if match is None:
        return False
    return str(metadata['gid']) == match.group(1)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_latest_url(url, config, logger):
    '''Find the URL and directory of the latest child of `url`.

    Follows the child-URL files stored in each gallery's metadata folder
    until a gallery without one is reached.

    Returns:
        A ``(url, gallery_dir)`` tuple for the latest gallery.

    Raises:
        ValueError: If a recorded child does not name this gallery as its
            parent.
    '''
    gallery_dir = _get_gallery_dir(url, config, logger)
    child_path = os.path.join(gallery_dir, META_FOLDER, CHILD_NAME)
    if not os.path.isfile(child_path):
        return url, gallery_dir
    with open(child_path, encoding='utf8') as fp:
        child_url = fp.read().strip()
    # The child's directory is only derived by regex here; when it stays
    # None, get_gallery_one_metadata resolves it through gallery-dl.
    child_gallery_dir = None
    try:
        if config['gallery-root'] and config['path-re']:
            child_gallery_dir = _get_gallery_dir_by_re(child_url, config['path-re'], config['gallery-root'])
    except Exception as e:
        # Logger.warn is a deprecated alias; use warning().
        logger.warning("Should parse by path-re and gallery-root, but error: %s" % e)
    if not _isparent(url, gallery_dir, child_url, child_gallery_dir, config, logger):  # cross-check
        raise ValueError(f"{url} is not the parent of {child_url}")
    return get_latest_url(child_url, config, logger)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _get_gallery_parent_url(url, gallery_dir, config, logger):
    """Return the ``parent`` entry of the gallery's metadata.

    Raises ValueError when the metadata has no ``parent`` entry.
    """
    metadata = get_gallery_one_metadata(url, gallery_dir, config, logger)
    if 'parent' in metadata:
        return metadata['parent']
    raise ValueError(f"No 'parent' in {url}")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def put_history_placeholder(url, gallery_dir, config, logger):
    '''Record, in every ancestor gallery of `url`, the URL of its child.

    Walks up the parent chain. In each parent's metadata folder the child
    file is written with the child's URL unless it already holds exactly
    that URL. Stops at the gallery whose parent is the empty string.
    '''
    while True:
        parent_url = _get_gallery_parent_url(url, gallery_dir, config, logger)
        if parent_url == '':
            return
        parent_gallery_dir = _get_gallery_dir(parent_url, config, logger)
        child_file = os.path.join(parent_gallery_dir, META_FOLDER, CHILD_NAME)
        os.makedirs(os.path.join(parent_gallery_dir, META_FOLDER), exist_ok=True)
        up_to_date = False
        if os.path.isfile(child_file):
            with open(child_file, 'r', encoding='utf8') as fp:
                up_to_date = url == fp.read().strip()
        if not up_to_date:
            with open(child_file, 'w', encoding='utf8') as fp:
                fp.write(url)
        # continue with the parent as the new child
        url, gallery_dir = parent_url, parent_gallery_dir
|
ex_cd/meta.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import json
|
|
4
|
+
import ex_cd.gallery_dl_exec as gallery_dl
|
|
5
|
+
from .common import META_FOLDER, metadata_args, _get_gallery_metadata_filenames, get_gallery_one_metadata, replace_site
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# File inside a gallery's metadata folder holding the URL from which the
# metadata download should resume.
META_DOWNLOAD_RESUME_FILE = 'MetaDownloadResume'
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _download_gallery_meta(url, gallery_dir, config, logger):
    """Download the metadata of a gallery with gallery-dl and validate it.

    Returns immediately when the metadata is already valid. Otherwise
    gallery-dl is run without downloading images, starting from the URL in
    the resume file when there is one.

    Raises:
        RuntimeError: If the metadata is still invalid after the download.
    """
    if _valid_gallery_meta(url, gallery_dir, config, logger):  # validate the gallery
        return  # exit
    resume_url = url
    resume_file = os.path.join(gallery_dir, META_FOLDER, META_DOWNLOAD_RESUME_FILE)
    try:
        with open(resume_file, "r", encoding="utf8") as fp:
            # An empty or whitespace-only resume file must not yield an
            # empty URL; fall back to the gallery URL.
            resume_url = fp.readline().strip() or url
    except (OSError, UnicodeError):
        # No usable resume file: start from the gallery URL.
        pass
    gallery_dl_exec = config["gallery-dl-exec"]
    gallery_dl_meta_args = config["gallery-dl-meta-args"]
    args = [
        *gallery_dl_exec, "--no-download", "--no-skip",
        *metadata_args, *gallery_dl_meta_args, replace_site(resume_url, config)
    ]
    logger.debug(f"Exec: {args}")
    returncode = gallery_dl.main(*args)
    if _valid_gallery_meta(url, gallery_dir, config, logger):  # validate the gallery
        return  # record that this gallery has been downloaded
    elif returncode != 0:
        raise RuntimeError(f"Download gallery meta failed: {url} -> {gallery_dir}")
    else:
        raise RuntimeError(f"Download gallery meta invalid: {url} -> {gallery_dir}")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
url2gid_re = re.compile(r"^https://e[-x]hentai.org/g/([0-9]+)/[0-9a-z]+/*$")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _url2gid_by_re(url):
|
|
42
|
+
return re.findall(url2gid_re, url)[0]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
url2site_re = re.compile(r"(^https://e[-x]hentai.org)/g/[0-9]+/[0-9a-z]+/*$")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _url2site_by_re(url):
|
|
49
|
+
return re.findall(url2site_re, url)[0]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _get_image_tokens(url, gallery_dir, config, logger):
    """Collect the image token and metadata file name of every gallery page.

    Returns ``(site, gid, image_tokens, metafilenames)``. Both lists have
    ``filecount`` entries indexed by ``num - 1``; an entry stays None when no
    metadata file was found for that page.

    Raises ValueError when the metadata has no ``filecount``, or when
    several metadata files claim the same page. In the latter case those
    files and the metadata completion marker are deleted first.
    """
    # check if has enough metadata json files
    meta = get_gallery_one_metadata(url, gallery_dir, config, logger)
    if 'filecount' not in meta:
        raise ValueError(f"'filecount' not in metadata")
    metafilenames = [None] * int(meta['filecount'])
    image_tokens = [None] * int(meta['filecount'])
    # per page: metadata file names that must be removed as duplicates
    should_deletes = [[] for _ in range(int(meta['filecount']))]
    site, gid = _url2site_by_re(url), _url2gid_by_re(url)
    for metafilename in _get_gallery_metadata_filenames(gallery_dir):
        metafile = os.path.join(gallery_dir, META_FOLDER, metafilename)
        try:
            with open(metafile, "r", encoding="utf8") as fp:
                meta = json.load(fp)
            # metadata files belonging to another gallery id are ignored
            if str(meta["gid"]) == gid:
                num = meta["num"] - 1
                # sometimes there are deprecated metafiles
                if metafilenames[num] is not None:
                    # second file for this page: neither can be trusted
                    should_deletes[num].append(metafilenames[num])
                    should_deletes[num].append(metafilename)
                    image_tokens[num] = metafilenames[num] = None
                elif len(should_deletes[num]) > 0:
                    # page already known to be duplicated
                    should_deletes[num].append(metafilename)
                else:
                    image_tokens[num] = meta["image_token"]
                    metafilenames[num] = metafilename
        except Exception as e:
            logger.error(f"Invalid metadata {metafile}: {e}")
    # delete deprecated metafiles
    n = 0
    for should_delete in should_deletes:
        for delete in should_delete:
            metafile = os.path.join(gallery_dir, META_FOLDER, delete)
            if os.path.exists(metafile):
                os.remove(metafile)
                n += 1
    if n > 0:
        # files were removed, so the metadata must be validated again
        completefile = os.path.join(gallery_dir, META_FOLDER, META_VALIDATE_COMPLETED_FILE)
        if os.path.exists(completefile):
            os.remove(completefile)
        raise ValueError(f"There are {n} deperated metafiles! just deleted! should restart meta doanload!")
    return site, gid, image_tokens, metafilenames
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Marker file inside a gallery's metadata folder; its presence means the
# gallery metadata has been validated.
META_VALIDATE_COMPLETED_FILE = 'MetaValidateCompleted'
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _valid_gallery_meta(url, gallery_dir, config, logger):
    """Validate the metadata of a gallery.

    The metadata is valid when every page of the gallery has a metadata
    file. On success an empty marker file is written so later calls return
    at once. When pages are missing, the URL of the last page of the
    leading complete run is written to the resume file.

    Returns:
        True if the metadata is (or was already marked) valid; False if
        pages are missing or the metadata could not be inspected.
    """
    ok_file = os.path.join(gallery_dir, META_FOLDER, META_VALIDATE_COMPLETED_FILE)
    if os.path.isfile(ok_file):  # if valid
        return True  # exit
    # check if has enough metadata json files
    resume_url = url
    try:
        site, gid, image_tokens, _ = _get_image_tokens(url, gallery_dir, config, logger)
        for i, image_token in enumerate(image_tokens):
            if image_token:
                resume_url = f"{site}/s/{image_token}/{gid}-{i+1}"
            else:
                break
        if None in image_tokens:
            resume_file = os.path.join(gallery_dir, META_FOLDER, META_DOWNLOAD_RESUME_FILE)
            with open(resume_file, "w", encoding='utf8') as fp:
                fp.write(resume_url)
            logger.error(f"Invalid {gallery_dir}: no enough metadata files, should resume from {resume_url}")
            return False
    except Exception as e:
        logger.error(f"Invalid {gallery_dir}: {e}")
        # A gallery that could not be inspected must not be marked valid:
        # without this return the marker below was written after an error.
        return False

    with open(ok_file, "w", encoding='utf8'):
        return True  # record that this gallery has been validated
|
ex_cd/output.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Logger(logging.Logger):
    """Logger subclass that copies ``extra`` entries onto each log record."""

    def makeRecord(self, name, level, fn, lno, msg, args, exc_info,
                   func=None, extra=None, sinfo=None,
                   factory=logging._logRecordFactory):
        """Create a log record and merge ``extra`` into its attributes."""
        record = factory(name, level, fn, lno, msg, args, exc_info, func, sinfo)
        if not extra:
            return record
        record.__dict__.update(extra)
        return record
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def initialize_logging(loglevel):
    """Set up logging to stderr and return the "ex-cd" logger.

    Level names are switched to lowercase, the custom Logger class is
    registered, and a stream handler filtering at ``loglevel`` is attached
    to the root logger.
    """
    # convert levelnames to lowercase
    for level in (10, 20, 30, 40, 50):
        logging.addLevelName(level, logging.getLevelName(level).lower())

    # register custom Logging class
    logging.Logger.manager.setLoggerClass(Logger)

    # setup basic logging to stderr
    stderr_handler = logging.StreamHandler()
    stderr_handler.setLevel(loglevel)
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.NOTSET)
    root_logger.addHandler(stderr_handler)

    return logging.getLogger("ex-cd")
|
ex_cd/tools/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from .config import build_parser
|
|
3
|
+
from .delete import delete_gallery_history
|
|
4
|
+
from .latest_meta import get_latest_gallery_metadata
|
|
5
|
+
from ..history import put_history_placeholder
|
|
6
|
+
from ..config import read_config
|
|
7
|
+
from ..output import initialize_logging
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """Command-line entry point of the ex-cd tools.

    ``delete`` removes the history of a gallery. ``latest-meta`` resolves
    the latest version of a gallery, tags its parents, and prints or saves
    its metadata as JSON.
    """
    args = build_parser().parse_args()
    logger = initialize_logging(args.loglevel)
    config = read_config(args, logger)
    logger.info(f"Parsed config: {args}")

    if args.command == "delete":
        delete_gallery_history(args.url, config, logger)
        return
    if args.command != "latest-meta":
        return

    latest_url, gallery_dir, metadata = get_latest_gallery_metadata(args.url, config, logger)
    put_history_placeholder(latest_url, gallery_dir, config, logger)
    output = json.dumps(metadata, ensure_ascii=False, indent=2)
    if not args.output:
        print(output)
        return
    with open(args.output, "w", encoding="utf8") as fp:
        fp.write(output)
    logger.info(f"Metadata saved to {args.output}")
|
ex_cd/tools/__main__.py
ADDED
ex_cd/tools/config.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def build_parser():
    """Create the command-line parser for ``ex_cd.tools``.

    The top-level parser carries the options shared by all subcommands
    (``-c/--config``, ``-q/--quiet``, ``-v/--verbose``). A subcommand is
    mandatory and its name is stored in ``command``:

    * ``delete <url>``
    * ``latest-meta <url> [-o OUTPUT]``

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    root = argparse.ArgumentParser(
        description="ex-cd tools for gallery management",
        prog="ex_cd.tools",
    )

    # Options shared by every subcommand
    root.add_argument(
        "-c", "--config",
        default=None,
        type=str,
        dest="config",
        help="Path to config json file or a json string",
    )
    # -q and -v both store into ``loglevel``. The INFO default is declared on
    # -q, so -q has to be registered before -v for that default to apply.
    root.add_argument(
        "-q", "--quiet",
        action="store_const", const=logging.ERROR,
        default=logging.INFO, dest="loglevel",
        help="Activate quiet mode",
    )
    root.add_argument(
        "-v", "--verbose",
        action="store_const", const=logging.DEBUG,
        dest="loglevel",
        help="Print various debugging information",
    )

    # Subcommands
    commands = root.add_subparsers(help="Available commands", required=True, dest="command")

    # delete
    delete_cmd = commands.add_parser("delete", help="Delete all history of a gallery")
    delete_cmd.add_argument("url", help="URL of the gallery to delete", type=str)

    # latest-meta
    meta_cmd = commands.add_parser("latest-meta", help="Get metadata of the latest version of a gallery")
    meta_cmd.add_argument("url", help="URL of the gallery", type=str)
    meta_cmd.add_argument(
        "-o", "--output",
        default=None, type=str, dest="output",
        help="Output file path for metadata JSON (default: print to stdout)",
    )

    return root
|
ex_cd/tools/delete.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
from ..common import META_FOLDER
|
|
4
|
+
from ..meta import META_VALIDATE_COMPLETED_FILE, META_DOWNLOAD_RESUME_FILE
|
|
5
|
+
from ..validate import VALIDATE_COMPLETED_FILE, DOWNLOAD_RESUME_FILE
|
|
6
|
+
from ..deprecate import DEPRECAT_COMPLETED_FILE
|
|
7
|
+
from ..history import get_latest_url, put_history_placeholder, _get_gallery_dir, _get_gallery_parent_url
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _get_all_history_dirs(url, gallery_dir, config, logger):
    """Get all gallery directories in the history chain, from newest to oldest.

    Starting at ``(url, gallery_dir)``, repeatedly looks up the parent URL of
    the current gallery and resolves its directory until no parent is
    reported or the parent lookup fails.

    Args:
        url: URL of the gallery the walk starts from.
        gallery_dir: directory of that gallery.
        config: configuration dict, passed through to the history helpers.
        logger: logger instance.

    Returns:
        list of ``(url, gallery_dir)`` tuples; the first entry is always the
        starting gallery.
    """
    dirs = [(url, gallery_dir)]
    current_url, current_dir = url, gallery_dir
    while True:
        try:
            parent_url = _get_gallery_parent_url(current_url, current_dir, config, logger)
        except Exception as e:
            # A failed parent lookup ends the chain; record why at every
            # level instead of only for the first gallery.
            logger.debug(f"No parent for {current_url}: {e}")
            break
        if not parent_url:
            break
        parent_gallery_dir = _get_gallery_dir(parent_url, config, logger)
        dirs.append((parent_url, parent_gallery_dir))
        current_url, current_dir = parent_url, parent_gallery_dir
    return dirs
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _delete_gallery_content(gallery_dir, logger):
    """Remove a gallery directory step by step, in a resumable order.

    Order of removal: download/validation/deprecation markers, everything
    outside the metadata folder, the metadata markers, the JSON files in the
    metadata folder, the remaining metadata files, the metadata folder, and
    finally the gallery folder itself.

    Args:
        gallery_dir: path of the gallery directory to remove.
        logger: logger instance; every removal is reported at debug level.
    """
    meta_folder = os.path.join(gallery_dir, META_FOLDER)

    # Step 1: download completion markers
    for name in (VALIDATE_COMPLETED_FILE, DOWNLOAD_RESUME_FILE, DEPRECAT_COMPLETED_FILE):
        marker_path = os.path.join(meta_folder, name)
        if not os.path.isfile(marker_path):
            continue
        os.remove(marker_path)
        logger.debug(f"Deleted marker: {marker_path}")

    # Step 2: everything outside the metadata folder (images)
    if os.path.isdir(gallery_dir):
        for entry in os.listdir(gallery_dir):
            if entry == META_FOLDER:
                continue
            item_path = os.path.join(gallery_dir, entry)
            if os.path.isfile(item_path):
                os.remove(item_path)
                logger.debug(f"Deleted file: {item_path}")
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
                logger.debug(f"Deleted dir: {item_path}")

    # Step 3: meta completion markers
    for name in (META_VALIDATE_COMPLETED_FILE, META_DOWNLOAD_RESUME_FILE):
        marker_path = os.path.join(meta_folder, name)
        if not os.path.isfile(marker_path):
            continue
        os.remove(marker_path)
        logger.debug(f"Deleted meta marker: {marker_path}")

    if os.path.isdir(meta_folder):
        # Step 4: JSON files in the metadata folder
        for entry in os.listdir(meta_folder):
            if not entry.endswith('.json'):
                continue
            item_path = os.path.join(meta_folder, entry)
            os.remove(item_path)
            logger.debug(f"Deleted json: {item_path}")

        # Step 5: whatever files are left in the metadata folder (like child.url)
        for entry in os.listdir(meta_folder):
            item_path = os.path.join(meta_folder, entry)
            if not os.path.isfile(item_path):
                continue
            os.remove(item_path)
            logger.debug(f"Deleted remaining: {item_path}")

        # Step 6: the metadata folder; fall back to rmtree if it is not empty
        try:
            os.rmdir(meta_folder)
        except OSError:
            shutil.rmtree(meta_folder)
        logger.debug(f"Deleted metadata folder: {meta_folder}")

    # Step 7: the gallery folder itself; fall back to rmtree if it is not empty
    if os.path.isdir(gallery_dir):
        try:
            os.rmdir(gallery_dir)
        except OSError:
            shutil.rmtree(gallery_dir)
        logger.debug(f"Deleted gallery folder: {gallery_dir}")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def delete_gallery_history(url, config, logger):
    """Delete every gallery directory in the history chain of a gallery.

    Args:
        url: URL of the gallery (can be any version in the history chain)
        config: Configuration dict
        logger: Logger instance
    """
    # Resolve the newest version first and lay down the placeholder chain
    url, gallery_dir = get_latest_url(url, config, logger)
    logger.info(f"Latest gallery: {url} -> {gallery_dir}")
    put_history_placeholder(url, gallery_dir, config, logger)

    # Walk the chain (newest to oldest) and report what will be removed
    all_dirs = _get_all_history_dirs(url, gallery_dir, config, logger)
    logger.info(f"Found {len(all_dirs)} galleries to delete:")
    for i, (_, dir_path) in enumerate(all_dirs):
        logger.info(f" [{i+1}] {dir_path}")

    # Remove in the same newest-to-oldest order; skip directories already gone
    for _, dir_path in all_dirs:
        if not os.path.isdir(dir_path):
            logger.debug(f"Already deleted: {dir_path}")
            continue
        logger.info(f"Deleting: {dir_path}")
        _delete_gallery_content(dir_path, logger)

    logger.info("Delete completed")
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from ..common import get_gallery_one_metadata
|
|
2
|
+
from ..history import get_latest_url
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_latest_gallery_metadata(url, config, logger):
    """Resolve the latest version of a gallery and fetch its metadata.

    Args:
        url: URL of the gallery (can be any version in the history chain)
        config: Configuration dict
        logger: Logger instance

    Returns:
        tuple: (latest_url, gallery_dir, metadata)
    """
    # Follow the history chain to the newest version
    latest = get_latest_url(url, config, logger)
    latest_url, gallery_dir = latest
    logger.info(f"Latest gallery: {latest_url} -> {gallery_dir}")

    # Fetch one metadata record for that version
    metadata = get_gallery_one_metadata(latest_url, gallery_dir, config, logger)
    logger.info(f"Got metadata for gid={metadata.get('gid')}, title={metadata.get('title')}")

    result = (latest_url, gallery_dir, metadata)
    return result
|
ex_cd/validate.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import hashlib
|
|
4
|
+
from .common import META_FOLDER
|
|
5
|
+
from .meta import _valid_gallery_meta, _get_image_tokens
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
VALIDATE_COMPLETED_FILE = 'ValidateCompleted'
|
|
9
|
+
DOWNLOAD_RESUME_FILE = 'DownloadResume'
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _validate_gallery(url, gallery_dir, config, logger):
    """Validate that every image of a gallery exists and matches its image token.

    A gallery that already carries the ``VALIDATE_COMPLETED_FILE`` marker in
    its metadata folder is accepted without further checks. Otherwise the
    metadata is validated first, then each metadata file is checked for a
    matching image whose SHA1 hex digest starts with the image token. The
    first missing or mismatching image stops the check, a mismatching or
    unreadable image is deleted, and the URL to resume downloading from is
    written to ``DOWNLOAD_RESUME_FILE``.

    If a resume file from an earlier run exists, images before the recorded
    index are only checked for existence, not re-hashed.

    Args:
        url: URL of the gallery.
        gallery_dir: directory of the gallery.
        config: configuration dict, passed through to the metadata helpers.
        logger: logger instance.

    Returns:
        bool: True if the gallery is (or was already marked) valid, False if
        the metadata is incomplete or an image is missing or invalid.
    """
    ok_file = os.path.join(gallery_dir, META_FOLDER, VALIDATE_COMPLETED_FILE)
    if os.path.isfile(ok_file):  # already validated by an earlier run
        return True

    # check if has enough metadata json files
    if not _valid_gallery_meta(url, gallery_dir, config, logger):
        return False
    site, gid, image_tokens, metafiles = _get_image_tokens(url, gallery_dir, config, logger)

    # Everything in the gallery folder except the metadata folder counts as an
    # image; a set keeps the per-image membership test O(1).
    images = {name for name in os.listdir(gallery_dir) if name != META_FOLDER}

    # Read the index recorded by a previous failed validation, if any
    resume_file = os.path.join(gallery_dir, META_FOLDER, DOWNLOAD_RESUME_FILE)
    start_from = None
    try:
        with open(resume_file, "r", encoding="utf8") as fp:
            exist_resume_url = fp.readline()
        start_from_re = re.findall(fr"^https://e[-x]hentai.org/s/[0-9a-z]+/{gid}-([0-9]+)$", exist_resume_url)
        if start_from_re:
            start_from = int(start_from_re[0])
    except Exception as e:
        # Best effort: a missing or unreadable resume file just means a full
        # validation. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit through.
        logger.debug(f"No usable resume record in {resume_file}: {e}")

    resume_url = url
    ok = True
    # check if image content SHA1 match image_token
    for i, (image_token, metafile) in enumerate(zip(image_tokens, metafiles)):
        img = metafile[0:-5]  # drop the 5-character suffix (presumably ".json")
        if img not in images:
            logger.error(f"Invalid {gallery_dir}: no image {img} for {metafile}")
            resume_url = f"{site}/s/{image_token}/{gid}-{i+1}"
            ok = False
            break

        # check resume: images before the recorded index are not re-hashed
        if start_from is not None and i+1 < start_from:
            logger.debug(f"Skip validate {img}: start from {start_from}, current {i+1}")
            continue

        # compare image_token; only the hashing is guarded so that a failing
        # delete below is not misreported as a comparison error
        imgfile = os.path.join(gallery_dir, img)
        try:
            with open(imgfile, mode="rb") as fp:
                sha1 = hashlib.sha1(fp.read()).hexdigest()
        except Exception as e:
            logger.error(f"Invalid {imgfile}: cannot compare token, {e}, delete the image")
        else:
            if image_token == sha1[0:10]:
                continue
            logger.error(f"Invalid {imgfile}: image token not match, {image_token} != {sha1}, delete the image")

        # Reached only for an unreadable or mismatching image
        os.remove(imgfile)
        resume_url = f"{site}/s/{image_token}/{gid}-{i+1}"
        ok = False
        break

    if ok:
        # record that this gallery has been validated
        with open(ok_file, "w", encoding='utf8'):
            pass
        return True

    with open(resume_file, "w", encoding='utf8') as fp:
        fp.write(resume_url)
    logger.error(f"Invalid {gallery_dir}: no enough images, should resume from {resume_url}")
    return False
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ex_cd
|
|
3
|
+
Version: 1.17.0
|
|
4
|
+
Summary: 高效下载E站gallery的所有历史数据
|
|
5
|
+
Author-email: yindaheng98 <yindaheng98@163.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yindaheng98/ex-cd
|
|
8
|
+
Project-URL: Repository, https://github.com/yindaheng98/ex-cd
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.7
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: gallery-dl
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# ex-cd
|
|
19
|
+
|
|
20
|
+
高效下载E站gallery的所有历史数据
|
|
21
|
+
|
|
22
|
+
* 尽量避免集中数据库,能放进文件夹的数据尽量放进文件夹
|
|
23
|
+
* 尽量减少请求操作,能只用读文件的尽量只读文件
|
|
24
|
+
* 尽量减少文件读写操作,能只用读文件列表的尽量只读文件列表
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```sh
|
|
29
|
+
python -m ex_cd -c .vscode/config.json https://exhentai.org/g/2635845/ecbc9d9681/
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
```sh
|
|
33
|
+
python -m ex_cd -c <a json string> https://exhentai.org/g/2635845/ecbc9d9681/
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
You can see the example config file: `.vscode/config.json`
|
|
37
|
+
|
|
38
|
+
You can also set the `EXCD_CONFIG_FILE` environment variable to point to a config file; settings in that file are overridden by the config passed via `-c`:
|
|
39
|
+
|
|
40
|
+
```sh
|
|
41
|
+
export EXCD_CONFIG_FILE=".vscode/config.json"
|
|
42
|
+
python -m ex_cd -c <a json string> https://exhentai.org/g/2635845/ecbc9d9681/
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
You can see the example command line: `.vscode/launch.json`
|
|
46
|
+
|
|
47
|
+
## How does it work?
|
|
48
|
+
|
|
49
|
+
### URL更新
|
|
50
|
+
|
|
51
|
+
```mermaid
|
|
52
|
+
flowchart TD
|
|
53
|
+
|
|
54
|
+
UrlCheck1[输入URL] --> UrlCheck2(从URL中提取目标文件夹路径\ngallery-dl --dump-json '%s' --range 0\n< gallery_path >)
|
|
55
|
+
UrlCheck2 --> UrlCheck3(检查是否是过时内容\n< gallery_path >/metadata/child.url是否存在)
|
|
56
|
+
UrlCheck3 --> UrlCheck4{child.url存在 ?}
|
|
57
|
+
UrlCheck4 -->|是| UrlCheck5(按照child.url更新URL为最新) --> UrlCheck1
|
|
58
|
+
UrlCheck4 -->|否| MetaCheck1[结束\n返回最新URL] --> OldPlacehold[后台执行\n过时元数据占位]
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 过时元数据占位
|
|
62
|
+
|
|
63
|
+
```mermaid
|
|
64
|
+
flowchart TD
|
|
65
|
+
|
|
66
|
+
UrlCheck1[输入URL] --> UrlCheck2(从URL中提取目标文件夹路径\ngallery-dl --dump-json '%s' --range 0\n< gallery_path >) --> MetaCheck1(检查元数据文件存在性\n< gallery_path >/metadata/*.json 文件存在)
|
|
67
|
+
MetaCheck1 --> MetaCheck2{元数据文件存在 ?}
|
|
68
|
+
MetaCheck2 -->|是| MetaCheck3(检查parent存在性\n元数据文件中存在parent字段) --> MetaCheck4{parent字段存在 ?} -->|是| UrlCheck3(按照parent字段更新URL为过时URL) --> UrlCheck1
|
|
69
|
+
UrlCheck3 --> OldPlacehold1(从URL中提取目标文件夹路径) --> OldPlacehold2[在目标文件夹路径下放置child.url]
|
|
70
|
+
MetaCheck2 -->|否| MetaCheck5(下载一个元数据\ngallery-dl -v '%s' --no-download --range 0)
|
|
71
|
+
MetaCheck4 -->|否| MetaCheck5 --> MetaCheck1
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 元数据下载
|
|
75
|
+
|
|
76
|
+
```mermaid
|
|
77
|
+
flowchart TD
|
|
78
|
+
|
|
79
|
+
UrlCheck1[输入URL] --> URL更新 --> UrlCheck2(从URL中提取目标文件夹路径\ngallery-dl --dump-json '%s' --range 0\n< gallery_path >) --> MetaCheck1(检查元数据文件存在性\n< gallery_path >/metadata/*.json 文件存在)
|
|
80
|
+
MetaCheck1 --> MetaCheck2{元数据文件存在 ?}
|
|
81
|
+
MetaCheck2 -->|是| MetaCheck4(检查元数据完整性\n< gallery_path >/metadata/*.json 每个文件都可json解析\n其中 'filecount' 值和 < gallery_path >/metadata/*.json 文件数相等)
|
|
82
|
+
MetaCheck4 --> MetaCheck5{元数据文件完整 ?}
|
|
83
|
+
MetaCheck5 -->|否| MetaCheck3
|
|
84
|
+
MetaCheck2 -->|否| MetaCheck3(下载元数据 gallery-dl -v '%s' --no-download) --> MetaCheck1
|
|
85
|
+
MetaCheck5 -->|是| MetaCheck6[结束]
|
|
86
|
+
MetaCheck3 --> MetaCheck6
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### 图片下载
|
|
90
|
+
|
|
91
|
+
!!!!!!!!! TODO: 确定是最新之后,元数据下载和图片下载同时进行 !!!!!!!!!
|
|
92
|
+
|
|
93
|
+
```mermaid
|
|
94
|
+
flowchart TD
|
|
95
|
+
UrlCheck1[输入URL] --> UrlCheck2[URL更新] --> ImgCheck1(检查图片文件存在性: \n< gallery_path >/metadata/*.json 对应的每一个图片文件都存在) --> ImgCheck2{图片文件均存在 ?} -->|是| ImgCheck3(检查图片文件内容: \n< gallery_path >/metadata/*.json 对应的图片文件的SHA1值都与< image_token >字段值相符) --> ImgCheck4{图片文件内容均符合image_token ?} -->|是| ImgCheck5[结束]
|
|
96
|
+
ImgCheck2 -->|否| Download(调用gallery-dl下载)
|
|
97
|
+
ImgCheck4 -->|否| Download
|
|
98
|
+
Download --> ImgCheck5
|
|
99
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
ex_cd/__init__.py,sha256=v5oliHtuQqbpMTgJM2tpDB6_pRHaiCvU-bKEfwjFd3U,989
|
|
2
|
+
ex_cd/__main__.py,sha256=RFTiY-4W5PRgmFW-vTX36xXpg0zkat1kPOgnuicj8dk,78
|
|
3
|
+
ex_cd/collect.py,sha256=ZMVGiPhi_T05MqSb-aT4RIcjGgsjPD6RknHiQmH3YSI,2375
|
|
4
|
+
ex_cd/common.py,sha256=q2VJAfPyQKkJi_LM-vhmk1MlLbCsCDKY7nVMLR1bD6k,3544
|
|
5
|
+
ex_cd/config.py,sha256=UpbE0WIN81U_llVSFZBuBzssWiQpR46WXMpkTGmHRKU,2903
|
|
6
|
+
ex_cd/deprecate.py,sha256=96PWa88jaa4e6tkb8cLDHZfcfbyFpDBr6YKNIOAEvBU,2023
|
|
7
|
+
ex_cd/download.py,sha256=CVTT6dIdCnflry4BlB-1f-uFRRe41C1AXqY6H4th_7g,4061
|
|
8
|
+
ex_cd/gallery_dl_exec.py,sha256=czIO6ea_9spFdt0i5Vrb_9DKxOh5k36qA-WFh8wLdRw,517
|
|
9
|
+
ex_cd/history.py,sha256=JjJzSRDjuUiOVlP1cqbhrlXuIf2QyAZFvye936w8ChU,3270
|
|
10
|
+
ex_cd/meta.py,sha256=mQKEWBAgsmhiojBPfkwE9xL6sUlW9ach4mbmpX0p1YY,5062
|
|
11
|
+
ex_cd/output.py,sha256=eOoWIsYEypxDcZncf-jrm-9HODRi_CnVWbaJNHOLrB4,1052
|
|
12
|
+
ex_cd/validate.py,sha256=Y2ApFzrSUaQMryV1gRuiBXMECKO5dvxPRrC3O60Rh6Q,3046
|
|
13
|
+
ex_cd/tools/__init__.py,sha256=ARvxJXlhJ0kgV4vhCGNgFpV32s-Xp6sO87Piwr4qjrs,1053
|
|
14
|
+
ex_cd/tools/__main__.py,sha256=yhxacDeHmwg_y9Xn_U3G4rTOlqIKfQ6xREEPdWXMY5k,91
|
|
15
|
+
ex_cd/tools/config.py,sha256=uwr2ngbfkvl6Dcc1GuhXJrSjo3YSXWX9V5sW5uaXklA,1552
|
|
16
|
+
ex_cd/tools/delete.py,sha256=xisoPW7Hcv4SEiHVh8SyI93GIFFlwOHjq5BoPillw14,4692
|
|
17
|
+
ex_cd/tools/latest_meta.py,sha256=-NdmUuCSLrdeS0_tnlPaR0dFCBCXoo76DMbpGMALfqI,833
|
|
18
|
+
ex_cd-1.17.0.dist-info/licenses/LICENSE,sha256=-OPEaaNL-VWb8CSDMMCDftVBnZkSlWIN9zn9BnqKld8,1067
|
|
19
|
+
ex_cd-1.17.0.dist-info/METADATA,sha256=U9Q9dou5fd4t5R40kajcynq6cPnC1pD4Cmd-t7SAuxs,4245
|
|
20
|
+
ex_cd-1.17.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
21
|
+
ex_cd-1.17.0.dist-info/top_level.txt,sha256=MIjfJbCr3rAXXTqGS9pAN1nALoq1hNP4VFivF78A_z0,6
|
|
22
|
+
ex_cd-1.17.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Howard Yin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ex_cd
|