web-novel-scraper 2.0.3__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web_novel_scraper/__main__.py +123 -69
- web_novel_scraper/config_manager.py +12 -12
- web_novel_scraper/custom_processor/__init__.py +1 -1
- web_novel_scraper/custom_processor/sites/fanmtl.py +15 -0
- web_novel_scraper/decode.py +225 -80
- web_novel_scraper/decode_guide/decode_guide.json +28 -0
- web_novel_scraper/file_manager.py +292 -110
- web_novel_scraper/models.py +76 -0
- web_novel_scraper/novel_scraper.py +893 -424
- web_novel_scraper/request_manager.py +50 -17
- web_novel_scraper/utils.py +22 -1
- web_novel_scraper/version.py +1 -1
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/METADATA +1 -1
- web_novel_scraper-2.1.1.dist-info/RECORD +21 -0
- web_novel_scraper-2.0.3.dist-info/RECORD +0 -19
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/WHEEL +0 -0
- {web_novel_scraper-2.0.3.dist-info → web_novel_scraper-2.1.1.dist-info}/entry_points.txt +0 -0
web_novel_scraper/__main__.py
CHANGED
@@ -1,27 +1,41 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from datetime import datetime
|
3
|
-
from typing import Optional
|
4
3
|
|
5
4
|
import click
|
6
5
|
|
7
6
|
from .config_manager import ScraperConfig
|
8
7
|
from .novel_scraper import Novel
|
8
|
+
from .models import Chapter
|
9
|
+
from .utils import ValidationError, ScraperError, NetworkError, DecodeError, FileManagerError
|
9
10
|
from .version import __version__
|
10
11
|
|
11
12
|
CURRENT_DIR = Path(__file__).resolve().parent
|
12
13
|
|
14
|
+
|
13
15
|
def global_options(f):
|
14
|
-
f = click.option('-nb', '--novel-base-dir', type=click.Path(), required=False,
|
16
|
+
f = click.option('-nb', '--novel-base-dir', type=click.Path(), required=False,
|
17
|
+
help="Alternative directory for this novel.")(f)
|
15
18
|
f = click.option('--config-file', type=click.Path(), required=False, help="Path to config file.")(f)
|
16
|
-
f = click.option('--base-novels-dir', type=click.Path(), required=False,
|
17
|
-
|
19
|
+
f = click.option('--base-novels-dir', type=click.Path(), required=False,
|
20
|
+
help="Alternative base directory for all novels.")(f)
|
21
|
+
f = click.option('--decode-guide-file', type=click.Path(), required=False,
|
22
|
+
help="Path to alternative decode guide file.")(f)
|
18
23
|
return f
|
19
24
|
|
25
|
+
|
26
|
+
@click.group()
|
27
|
+
@global_options
|
28
|
+
@click.pass_context
|
29
|
+
def cli(ctx, **kwargs):
|
30
|
+
"""CLI Tool for web novel scraping."""
|
31
|
+
ctx.obj = kwargs
|
32
|
+
|
33
|
+
|
20
34
|
def obtain_novel(title, ctx_opts, allow_missing=False):
|
21
|
-
cfg = ScraperConfig(ctx_opts
|
35
|
+
cfg = ScraperConfig(parameters=ctx_opts)
|
22
36
|
try:
|
23
|
-
return Novel.load(title, cfg, ctx_opts.get("
|
24
|
-
except
|
37
|
+
return Novel.load(title, cfg, ctx_opts.get("novel_base_dir"))
|
38
|
+
except ValidationError:
|
25
39
|
if allow_missing:
|
26
40
|
return None
|
27
41
|
click.echo("Novel not found.", err=True)
|
@@ -45,23 +59,14 @@ def validate_date(ctx, param, value):
|
|
45
59
|
'Date should be a valid date and must be in the format YYYY-MM-DD, YYYY-MM or YYYY') from exc
|
46
60
|
return value
|
47
61
|
|
62
|
+
|
48
63
|
# COMMON ARGUMENTS
|
49
64
|
title_option = click.option(
|
50
|
-
'-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE',
|
65
|
+
'-t', '--title', type=str, required=True, envvar='SCRAPER_NOVEL_TITLE',
|
66
|
+
help='Title of the novel, this server as the identifier.')
|
51
67
|
novel_base_dir_option = click.option(
|
52
68
|
'-nb', '--novel-base-dir', type=str, help='Alternative base directory for the novel files.')
|
53
69
|
|
54
|
-
@click.group()
|
55
|
-
@global_options
|
56
|
-
@click.pass_context
|
57
|
-
def cli(ctx, novel_base_dir, config_file, base_novels_dir, decode_guide_file):
|
58
|
-
"""CLI Tool for web novel scraping."""
|
59
|
-
ctx.ensure_object(dict)
|
60
|
-
ctx.obj['NOVEL_BASE_DIR'] = novel_base_dir
|
61
|
-
ctx.obj['CONFIG_FILE'] = config_file
|
62
|
-
ctx.obj['BASE_NOVELS_DIR'] = base_novels_dir
|
63
|
-
ctx.obj['DECODE_GUIDE_FILE'] = decode_guide_file
|
64
|
-
|
65
70
|
# Metadata:
|
66
71
|
metadata_author_option = click.option(
|
67
72
|
'--author', type=str, help='Name of the novel author.')
|
@@ -70,9 +75,11 @@ metadata_language_option = click.option(
|
|
70
75
|
metadata_description_option = click.option(
|
71
76
|
'--description', type=str, help='Description of the novel.')
|
72
77
|
metadata_start_date_option = click.option(
|
73
|
-
'--start-date', callback=validate_date, type=str,
|
78
|
+
'--start-date', callback=validate_date, type=str,
|
79
|
+
help='Start date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
|
74
80
|
metadata_end_date_option = click.option(
|
75
|
-
'--end-date', callback=validate_date, type=str,
|
81
|
+
'--end-date', callback=validate_date, type=str,
|
82
|
+
help='End date of the novel, should be in the format YYYY-MM-DD, YYYY-MM or YYYY.')
|
76
83
|
|
77
84
|
# TOC options
|
78
85
|
toc_main_url_option = click.option(
|
@@ -80,14 +87,17 @@ toc_main_url_option = click.option(
|
|
80
87
|
sync_toc_option = click.option('--sync-toc', is_flag=True, default=False, show_default=True,
|
81
88
|
help='Reload the TOC before requesting chapters.')
|
82
89
|
|
90
|
+
|
83
91
|
def create_toc_html_option(required: bool = False):
|
84
92
|
return click.option(
|
85
93
|
'--toc-html',
|
86
94
|
type=click.File(encoding='utf-8'),
|
87
95
|
required=required,
|
88
|
-
help=(
|
96
|
+
help=(
|
97
|
+
'Novel TOC HTML loaded from file.' if required else 'Novel TOC HTML loaded from file (required if not loading from URL)')
|
89
98
|
)
|
90
99
|
|
100
|
+
|
91
101
|
host_option = click.option(
|
92
102
|
'--host', type=str, help='Host used for decoding, optional if toc-main-url is provided.')
|
93
103
|
|
@@ -99,6 +109,7 @@ auto_add_host_option = click.option('--auto-add-host', is_flag=True, show_defaul
|
|
99
109
|
force_flaresolver_option = click.option('--force-flaresolver', is_flag=True, show_default=True,
|
100
110
|
default=False, help='Force the use of FlareSolver for requests.')
|
101
111
|
|
112
|
+
|
102
113
|
# Novel creation and data management commands
|
103
114
|
|
104
115
|
@cli.command()
|
@@ -117,7 +128,8 @@ force_flaresolver_option = click.option('--force-flaresolver', is_flag=True, sho
|
|
117
128
|
@save_title_to_content_option
|
118
129
|
@auto_add_host_option
|
119
130
|
@force_flaresolver_option
|
120
|
-
def create_novel(ctx, title, toc_main_url, toc_html, host, author, start_date, end_date, language, description, tags,
|
131
|
+
def create_novel(ctx, title, toc_main_url, toc_html, host, author, start_date, end_date, language, description, tags,
|
132
|
+
cover, save_title_to_content, auto_add_host, force_flaresolver):
|
121
133
|
"""Creates a new novel and saves it."""
|
122
134
|
novel = obtain_novel(title, ctx.obj, allow_missing=True)
|
123
135
|
if novel:
|
@@ -140,29 +152,35 @@ def create_novel(ctx, title, toc_main_url, toc_html, host, author, start_date, e
|
|
140
152
|
toc_html_content = None
|
141
153
|
if toc_html:
|
142
154
|
toc_html_content = toc_html.read()
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
end_date=end_date,
|
155
|
+
config = ScraperConfig(parameters=ctx.obj)
|
156
|
+
|
157
|
+
novel = Novel.new(title=title,
|
158
|
+
cfg=config,
|
159
|
+
host=host,
|
160
|
+
toc_main_url=toc_main_url,
|
161
|
+
toc_html=toc_html_content)
|
162
|
+
novel.set_config(cfg=config,
|
163
|
+
novel_base_dir=ctx.obj.get('novel_base_dir'))
|
164
|
+
novel.set_metadata(author=author,
|
165
|
+
start_date=start_date,
|
166
|
+
end_date=end_date,
|
167
|
+
language=language,
|
168
|
+
description=description)
|
155
169
|
novel.set_scraper_behavior(save_title_to_content=save_title_to_content,
|
156
|
-
|
170
|
+
auto_add_host=auto_add_host,
|
171
|
+
force_flaresolver=force_flaresolver)
|
172
|
+
|
157
173
|
if tags:
|
158
174
|
for tag in tags:
|
159
175
|
novel.add_tag(tag)
|
176
|
+
|
160
177
|
if cover:
|
161
178
|
if not novel.set_cover_image(cover):
|
162
179
|
click.echo('Error saving the novel cover image.', err=True)
|
163
180
|
novel.save_novel()
|
164
181
|
click.echo('Novel saved successfully.')
|
165
182
|
|
183
|
+
|
166
184
|
@cli.command()
|
167
185
|
@click.pass_context
|
168
186
|
@title_option
|
@@ -171,6 +189,7 @@ def show_novel_info(ctx, title):
|
|
171
189
|
novel = obtain_novel(title, ctx.obj)
|
172
190
|
click.echo(novel)
|
173
191
|
|
192
|
+
|
174
193
|
@cli.command()
|
175
194
|
@click.pass_context
|
176
195
|
@title_option
|
@@ -188,6 +207,7 @@ def set_metadata(ctx, title, author, start_date, end_date, language, description
|
|
188
207
|
click.echo('Novel metadata saved successfully.')
|
189
208
|
click.echo(novel.metadata)
|
190
209
|
|
210
|
+
|
191
211
|
@cli.command()
|
192
212
|
@click.pass_context
|
193
213
|
@title_option
|
@@ -196,6 +216,7 @@ def show_metadata(ctx, title):
|
|
196
216
|
novel = obtain_novel(title, ctx.obj)
|
197
217
|
click.echo(novel.metadata)
|
198
218
|
|
219
|
+
|
199
220
|
@cli.command()
|
200
221
|
@click.pass_context
|
201
222
|
@title_option
|
@@ -209,6 +230,7 @@ def add_tags(ctx, title, tags):
|
|
209
230
|
novel.save_novel()
|
210
231
|
click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
|
211
232
|
|
233
|
+
|
212
234
|
@cli.command()
|
213
235
|
@click.pass_context
|
214
236
|
@title_option
|
@@ -222,6 +244,7 @@ def remove_tags(ctx, title, tags):
|
|
222
244
|
novel.save_novel()
|
223
245
|
click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
|
224
246
|
|
247
|
+
|
225
248
|
@cli.command()
|
226
249
|
@click.pass_context
|
227
250
|
@title_option
|
@@ -230,6 +253,7 @@ def show_tags(ctx, title):
|
|
230
253
|
novel = obtain_novel(title, ctx.obj)
|
231
254
|
click.echo(f'Tags: {", ".join(novel.metadata.tags)}')
|
232
255
|
|
256
|
+
|
233
257
|
@cli.command()
|
234
258
|
@click.pass_context
|
235
259
|
@title_option
|
@@ -240,11 +264,14 @@ def set_cover_image(ctx, title, cover_image):
|
|
240
264
|
novel.set_cover_image(cover_image)
|
241
265
|
click.echo(f'Cover image saved successfully.')
|
242
266
|
|
267
|
+
|
243
268
|
@cli.command()
|
244
269
|
@click.pass_context
|
245
270
|
@title_option
|
246
|
-
@click.option('--save-title-to-content', type=bool,
|
247
|
-
|
271
|
+
@click.option('--save-title-to-content', type=bool,
|
272
|
+
help='Toggle the title of the chapter being added to the content (use true or false).')
|
273
|
+
@click.option('--auto-add-host', type=bool,
|
274
|
+
help='Toggle automatic addition of the host to chapter URLs (use true or false).')
|
248
275
|
@click.option('--force-flaresolver', type=bool, help='Toggle forcing the use of FlareSolver (use true or false).')
|
249
276
|
@click.option('--hard-clean', type=bool, help='Toggle using a hard clean when cleaning HTML files (use true or false).')
|
250
277
|
def set_scraper_behavior(ctx, title, save_title_to_content, auto_add_host, force_flaresolver, hard_clean):
|
@@ -259,6 +286,7 @@ def set_scraper_behavior(ctx, title, save_title_to_content, auto_add_host, force
|
|
259
286
|
novel.save_novel()
|
260
287
|
click.echo('New scraper behavior added successfully.')
|
261
288
|
|
289
|
+
|
262
290
|
@cli.command()
|
263
291
|
@click.pass_context
|
264
292
|
@title_option
|
@@ -267,6 +295,7 @@ def show_scraper_behavior(ctx, title):
|
|
267
295
|
novel = obtain_novel(title, ctx.obj)
|
268
296
|
click.echo(novel.scraper_behavior)
|
269
297
|
|
298
|
+
|
270
299
|
@cli.command()
|
271
300
|
@click.pass_context
|
272
301
|
@title_option
|
@@ -278,6 +307,7 @@ def set_host(ctx, title, host):
|
|
278
307
|
novel.save_novel()
|
279
308
|
click.echo('New host set successfully.')
|
280
309
|
|
310
|
+
|
281
311
|
# TOC MANAGEMENT COMMANDS
|
282
312
|
|
283
313
|
@cli.command()
|
@@ -290,6 +320,7 @@ def set_toc_main_url(ctx, title, toc_main_url):
|
|
290
320
|
novel.set_toc_main_url(toc_main_url)
|
291
321
|
novel.save_novel()
|
292
322
|
|
323
|
+
|
293
324
|
@cli.command()
|
294
325
|
@click.pass_context
|
295
326
|
@title_option
|
@@ -302,10 +333,12 @@ def add_toc_html(ctx, title, toc_html, host):
|
|
302
333
|
novel.add_toc_html(html_content, host)
|
303
334
|
novel.save_novel()
|
304
335
|
|
336
|
+
|
305
337
|
@cli.command()
|
306
338
|
@click.pass_context
|
307
339
|
@title_option
|
308
|
-
@click.option('--reload-files', is_flag=True, required=False, default=False, show_default=True,
|
340
|
+
@click.option('--reload-files', is_flag=True, required=False, default=False, show_default=True,
|
341
|
+
help='Reload the TOC files before sync (only works if using a TOC URL).')
|
309
342
|
def sync_toc(ctx, title, reload_files):
|
310
343
|
"""Sync the TOC of a novel."""
|
311
344
|
novel = obtain_novel(title, ctx.obj)
|
@@ -317,6 +350,7 @@ def sync_toc(ctx, title, reload_files):
|
|
317
350
|
'Error with the TOC syncing, please check the TOC files and decoding options.', err=True)
|
318
351
|
novel.save_novel()
|
319
352
|
|
353
|
+
|
320
354
|
@cli.command()
|
321
355
|
@click.pass_context
|
322
356
|
@title_option
|
@@ -329,6 +363,7 @@ def delete_toc(ctx, title, auto_approve):
|
|
329
363
|
novel.delete_toc()
|
330
364
|
novel.save_novel()
|
331
365
|
|
366
|
+
|
332
367
|
@cli.command()
|
333
368
|
@click.pass_context
|
334
369
|
@title_option
|
@@ -337,6 +372,7 @@ def show_toc(ctx, title):
|
|
337
372
|
novel = obtain_novel(title, ctx.obj)
|
338
373
|
click.echo(novel.show_toc())
|
339
374
|
|
375
|
+
|
340
376
|
# CHAPTER MANAGEMENT COMMANDS
|
341
377
|
|
342
378
|
@cli.command()
|
@@ -344,46 +380,55 @@ def show_toc(ctx, title):
|
|
344
380
|
@title_option
|
345
381
|
@click.option('--chapter-url', type=str, required=False, help='Chapter URL to be scrapped.')
|
346
382
|
@click.option('--chapter-num', type=int, required=False, help='Chapter number to be scrapped.')
|
347
|
-
@click.option('--update-html', is_flag=True, default=False, show_default=True,
|
383
|
+
@click.option('--update-html', is_flag=True, default=False, show_default=True,
|
384
|
+
help='If the chapter HTML is saved, it will be updated.')
|
348
385
|
def scrap_chapter(ctx, title, chapter_url, chapter_num, update_html):
|
349
386
|
"""Scrap a chapter of a novel."""
|
350
|
-
if (chapter_url is None and chapter_num is None) or (chapter_url and chapter_num):
|
351
|
-
raise click.UsageError("You must set exactly one: --chapter-url o --chapter-num.")
|
352
|
-
|
353
387
|
novel = obtain_novel(title, ctx.obj)
|
388
|
+
try:
|
389
|
+
if chapter_num is not None:
|
390
|
+
chapter_num = chapter_num - 1
|
391
|
+
chapter = novel.get_chapter(chapter_index=chapter_num,
|
392
|
+
chapter_url=chapter_url)
|
393
|
+
except ValidationError:
|
394
|
+
raise click.UsageError(
|
395
|
+
'You must set exactly one: --chapter-url o --chapter-num.')
|
396
|
+
except ValueError:
|
397
|
+
raise click.UsageError('--chapter-num must be a positive number.')
|
354
398
|
|
355
|
-
if
|
356
|
-
if
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
else:
|
363
|
-
chapter = novel.scrap_chapter(chapter_url=chapter_url,
|
364
|
-
update_html=update_html)
|
365
|
-
|
366
|
-
if not chapter:
|
367
|
-
raise click.ClickException('Chapter not found or scrap failed.')
|
399
|
+
if chapter is None:
|
400
|
+
if chapter_url is not None:
|
401
|
+
click.echo('Chapter not found on novel TOC, will try anyways with chapter url')
|
402
|
+
chapter = Chapter(chapter_url=chapter_url)
|
403
|
+
else:
|
404
|
+
raise click.ClickException('Chapter not found.')
|
368
405
|
|
406
|
+
chapter = novel.scrap_chapter(chapter=chapter,
|
407
|
+
reload_file=update_html)
|
369
408
|
click.echo(chapter)
|
370
409
|
click.echo('Content:')
|
371
410
|
click.echo(chapter.chapter_content)
|
372
411
|
|
412
|
+
|
373
413
|
@cli.command()
|
374
414
|
@click.pass_context
|
375
415
|
@title_option
|
376
416
|
@sync_toc_option
|
377
|
-
@click.option('--update-html', is_flag=True, default=False, show_default=True,
|
378
|
-
|
417
|
+
@click.option('--update-html', is_flag=True, default=False, show_default=True,
|
418
|
+
help='If the chapter HTML is saved, it will be updated.')
|
419
|
+
@click.option('--clean-chapters', is_flag=True, default=False, show_default=True,
|
420
|
+
help='If the chapter HTML should be cleaned upon saving.')
|
379
421
|
def request_all_chapters(ctx, title, sync_toc, update_html, clean_chapters):
|
380
422
|
"""Request all chapters of a novel."""
|
381
423
|
novel = obtain_novel(title, ctx.obj)
|
382
424
|
novel.request_all_chapters(
|
383
|
-
sync_toc=sync_toc,
|
425
|
+
sync_toc=sync_toc,
|
426
|
+
reload_files=update_html,
|
427
|
+
clean_chapters=clean_chapters)
|
384
428
|
novel.save_novel()
|
385
429
|
click.echo('All chapters requested and saved.')
|
386
430
|
|
431
|
+
|
387
432
|
@cli.command()
|
388
433
|
@click.pass_context
|
389
434
|
@title_option
|
@@ -398,9 +443,12 @@ def show_chapters(ctx, title):
|
|
398
443
|
@click.pass_context
|
399
444
|
@title_option
|
400
445
|
@sync_toc_option
|
401
|
-
@click.option('--start-chapter', type=int, default=1, show_default=True,
|
402
|
-
|
403
|
-
@click.option('--
|
446
|
+
@click.option('--start-chapter', type=int, default=1, show_default=True,
|
447
|
+
help='The start chapter for the books (position in the TOC, may differ from the actual number).')
|
448
|
+
@click.option('--end-chapter', type=int, default=None, show_default=True,
|
449
|
+
help='The end chapter for the books (if not defined, every chapter will be saved).')
|
450
|
+
@click.option('--chapters-by-book', type=int, default=100, show_default=True,
|
451
|
+
help='The number of chapters each book will have.')
|
404
452
|
def save_novel_to_epub(ctx, title, sync_toc, start_chapter, end_chapter, chapters_by_book):
|
405
453
|
"""Save the novel to EPUB format."""
|
406
454
|
if start_chapter <= 0:
|
@@ -416,19 +464,22 @@ def save_novel_to_epub(ctx, title, sync_toc, start_chapter, end_chapter, chapter
|
|
416
464
|
'Should be a positive number.', param_hint='--chapters-by-book')
|
417
465
|
|
418
466
|
novel = obtain_novel(title, ctx.obj)
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
467
|
+
novel.save_novel_to_epub(sync_toc=sync_toc, start_chapter=start_chapter, end_chapter=end_chapter,
|
468
|
+
chapters_by_book=chapters_by_book)
|
469
|
+
click.echo('All books saved.')
|
470
|
+
|
471
|
+
|
423
472
|
|
424
473
|
# UTILS
|
425
474
|
|
426
475
|
@cli.command()
|
427
476
|
@click.pass_context
|
428
477
|
@title_option
|
429
|
-
@click.option('--clean-chapters', is_flag=True, default=False, show_default=True,
|
478
|
+
@click.option('--clean-chapters', is_flag=True, default=False, show_default=True,
|
479
|
+
help='If the chapters HTML files are cleaned.')
|
430
480
|
@click.option('--clean-toc', is_flag=True, default=False, show_default=True, help='If the TOC files are cleaned.')
|
431
|
-
@click.option('--hard-clean', is_flag=True, default=False, show_default=True,
|
481
|
+
@click.option('--hard-clean', is_flag=True, default=False, show_default=True,
|
482
|
+
help='If the files are more deeply cleaned.')
|
432
483
|
def clean_files(ctx, title, clean_chapters, clean_toc, hard_clean):
|
433
484
|
"""Clean files of a novel."""
|
434
485
|
if not clean_chapters and not clean_toc:
|
@@ -439,6 +490,7 @@ def clean_files(ctx, title, clean_chapters, clean_toc, hard_clean):
|
|
439
490
|
novel.clean_files(clean_chapters=clean_chapters,
|
440
491
|
clean_toc=clean_toc, hard_clean=hard_clean)
|
441
492
|
|
493
|
+
|
442
494
|
@cli.command()
|
443
495
|
@click.pass_context
|
444
496
|
@title_option
|
@@ -447,10 +499,12 @@ def show_novel_dir(ctx, title):
|
|
447
499
|
novel = obtain_novel(title, ctx.obj)
|
448
500
|
click.echo(novel.show_novel_dir())
|
449
501
|
|
502
|
+
|
450
503
|
@cli.command()
|
451
504
|
def version():
|
452
|
-
"""
|
505
|
+
"""Shows the program version."""
|
453
506
|
click.echo(f'Version {__version__}')
|
454
507
|
|
508
|
+
|
455
509
|
if __name__ == '__main__':
|
456
510
|
cli()
|
@@ -4,10 +4,10 @@ import json
|
|
4
4
|
import platformdirs
|
5
5
|
from dotenv import load_dotenv
|
6
6
|
from pathlib import Path
|
7
|
-
from typing import Optional
|
7
|
+
from typing import Optional, Any
|
8
8
|
|
9
9
|
from .logger_manager import create_logger
|
10
|
-
from .utils import FileOps
|
10
|
+
from .utils import FileOps, ValidationError
|
11
11
|
|
12
12
|
load_dotenv()
|
13
13
|
|
@@ -30,18 +30,18 @@ logger = create_logger("CONFIG MANAGER")
|
|
30
30
|
## 3. CONFIG FILE VALUE
|
31
31
|
## 4. DEFAULT VALUE
|
32
32
|
class ScraperConfig:
|
33
|
-
base_novels_dir:
|
34
|
-
decode_guide_file:
|
33
|
+
base_novels_dir: Path
|
34
|
+
decode_guide_file: Path
|
35
35
|
|
36
36
|
def __init__(self,
|
37
|
-
|
38
|
-
|
39
|
-
|
37
|
+
parameters: dict[str, Any] | None = None):
|
38
|
+
if parameters is None:
|
39
|
+
parameters = {}
|
40
40
|
## LOADING CONFIGURATION
|
41
41
|
config_file = self._get_config(default_value=SCRAPER_CONFIG_FILE,
|
42
42
|
config_file_value=None,
|
43
43
|
env_variable="SCRAPER_CONFIG_FILE",
|
44
|
-
parameter_value=config_file)
|
44
|
+
parameter_value=parameters.get('config_file'))
|
45
45
|
|
46
46
|
config_file = Path(config_file)
|
47
47
|
logger.debug(f'Obtaining configuration from file "{config_file}"')
|
@@ -54,15 +54,15 @@ class ScraperConfig:
|
|
54
54
|
|
55
55
|
## SETTING CONFIGURATION VALUES
|
56
56
|
|
57
|
-
self.base_novels_dir = self._get_config(default_value=SCRAPER_BASE_NOVELS_DIR,
|
57
|
+
self.base_novels_dir = Path(self._get_config(default_value=SCRAPER_BASE_NOVELS_DIR,
|
58
58
|
config_file_value=config.get("base_novels_dir"),
|
59
59
|
env_variable="SCRAPER_BASE_NOVELS_DIR",
|
60
|
-
parameter_value=base_novels_dir)
|
60
|
+
parameter_value=parameters.get('base_novels_dir')))
|
61
61
|
|
62
|
-
self.decode_guide_file = self._get_config(default_value=SCRAPER_DECODE_GUIDE_FILE,
|
62
|
+
self.decode_guide_file = Path(self._get_config(default_value=SCRAPER_DECODE_GUIDE_FILE,
|
63
63
|
config_file_value=config.get("decode_guide_file"),
|
64
64
|
env_variable="SCRAPER_DECODE_GUIDE_FILE",
|
65
|
-
parameter_value=decode_guide_file)
|
65
|
+
parameter_value=parameters.get('decode_guide_file')))
|
66
66
|
|
67
67
|
@staticmethod
|
68
68
|
def _get_config(default_value: str,
|
@@ -1,2 +1,2 @@
|
|
1
1
|
from .custom_processor import CustomProcessor, ProcessorRegistry
|
2
|
-
from .sites import royalroad, genesis
|
2
|
+
from .sites import royalroad, genesis, fanmtl
|
@@ -0,0 +1,15 @@
|
|
1
|
+
import re
|
2
|
+
from typing import List, Optional
|
3
|
+
from ..custom_processor import CustomProcessor, ProcessorRegistry
|
4
|
+
|
5
|
+
class GenesisNextPageProcessor(CustomProcessor):
|
6
|
+
def process(self, html: str) -> Optional[str]:
|
7
|
+
pattern = r'href="([^"]+page=\d+[^"]*)">></a'
|
8
|
+
match = re.search(pattern, html)
|
9
|
+
if match is None:
|
10
|
+
return None
|
11
|
+
next_page = match.group(1)
|
12
|
+
next_page = next_page.replace('&amp;', '&')
|
13
|
+
return f'https://www.fanmtl.com{next_page}'
|
14
|
+
|
15
|
+
ProcessorRegistry.register('fanmtl.com', 'next_page', GenesisNextPageProcessor())
|