txt2ebook 0.1.151__py3-none-any.whl → 0.1.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2ebook/languages/zh_cn.py +1 -0
- txt2ebook/subcommands/massage.py +67 -3
- txt2ebook/subcommands/typ.py +1 -1
- {txt2ebook-0.1.151.dist-info → txt2ebook-0.1.153.dist-info}/METADATA +1 -1
- {txt2ebook-0.1.151.dist-info → txt2ebook-0.1.153.dist-info}/RECORD +8 -8
- {txt2ebook-0.1.151.dist-info → txt2ebook-0.1.153.dist-info}/WHEEL +0 -0
- {txt2ebook-0.1.151.dist-info → txt2ebook-0.1.153.dist-info}/entry_points.txt +0 -0
- {txt2ebook-0.1.151.dist-info → txt2ebook-0.1.153.dist-info}/licenses/LICENSE.md +0 -0
txt2ebook/languages/zh_cn.py
CHANGED
txt2ebook/subcommands/massage.py
CHANGED
@@ -22,9 +22,11 @@ from importlib import import_module
|
|
22
22
|
from pathlib import Path
|
23
23
|
|
24
24
|
import cjkwrap
|
25
|
+
import jieba.analyse
|
25
26
|
import regex as re
|
26
27
|
from bs4 import UnicodeDammit
|
27
28
|
|
29
|
+
from txt2ebook import detect_and_expect_language
|
28
30
|
from txt2ebook.exceptions import EmptyFileError
|
29
31
|
from txt2ebook.models.book import Book
|
30
32
|
from txt2ebook.zh_utils import zh_halfwidth_to_fullwidth, zh_words_to_numbers
|
@@ -70,7 +72,22 @@ def build_subparser(subparsers) -> None:
|
|
70
72
|
default=False,
|
71
73
|
action="store_true",
|
72
74
|
dest="fullwidth",
|
73
|
-
help=
|
75
|
+
help=(
|
76
|
+
"use fullwidth character (only for zh-cn and zh-tw) "
|
77
|
+
"(default: %(default)r)"
|
78
|
+
),
|
79
|
+
)
|
80
|
+
|
81
|
+
massage_parser.add_argument(
|
82
|
+
"-ri",
|
83
|
+
"--reindent",
|
84
|
+
default=False,
|
85
|
+
action="store_true",
|
86
|
+
dest="reindent",
|
87
|
+
help=(
|
88
|
+
"reindent each paragraph (only for zh-cn and zh-tw) "
|
89
|
+
"(default: %(default)r)"
|
90
|
+
),
|
74
91
|
)
|
75
92
|
|
76
93
|
massage_parser.add_argument(
|
@@ -318,12 +335,18 @@ def massage_txt(args: argparse.Namespace) -> str:
|
|
318
335
|
|
319
336
|
content = to_unix_newline(content)
|
320
337
|
|
338
|
+
args.language = detect_and_expect_language(content, args.language)
|
339
|
+
|
321
340
|
(metadata, body) = extract_metadata_and_body(args, content)
|
322
341
|
|
323
342
|
if args.fullwidth and args.language in ("zh-cn", "zh-tw"):
|
324
343
|
logger.info("Convert halfwidth ASCII characters to fullwidth")
|
325
344
|
body = zh_halfwidth_to_fullwidth(body)
|
326
345
|
|
346
|
+
if args.reindent and args.language in ("zh-cn", "zh-tw"):
|
347
|
+
logger.info("Reindent paragraph")
|
348
|
+
body = do_reindent_paragraph(args, body)
|
349
|
+
|
327
350
|
if args.re_delete:
|
328
351
|
body = do_delete_regex(args, body)
|
329
352
|
|
@@ -339,7 +362,7 @@ def massage_txt(args: argparse.Namespace) -> str:
|
|
339
362
|
if args.width:
|
340
363
|
body = do_wrapping(args, body)
|
341
364
|
|
342
|
-
return f"{metadata}
|
365
|
+
return f"{metadata}{body}"
|
343
366
|
|
344
367
|
|
345
368
|
def to_unix_newline(content: str) -> str:
|
@@ -353,6 +376,30 @@ def to_unix_newline(content: str) -> str:
|
|
353
376
|
"""
|
354
377
|
return content.replace("\r\n", "\n").replace("\r", "\n")
|
355
378
|
|
379
|
+
def do_reindent_paragraph(args, content: str) -> str:
|
380
|
+
"""Reindent each paragraph.
|
381
|
+
|
382
|
+
Args:
|
383
|
+
content(str): The formatted book content.
|
384
|
+
|
385
|
+
Returns:
|
386
|
+
str: The formatted book content.
|
387
|
+
"""
|
388
|
+
paragraphs = re.split(r'\n\s*\n+', content)
|
389
|
+
reindented_paragraphs = []
|
390
|
+
for paragraph in paragraphs:
|
391
|
+
lines = paragraph.split('\n')
|
392
|
+
reindented_lines = []
|
393
|
+
for line in lines:
|
394
|
+
stripped_line = line.strip()
|
395
|
+
reindented_lines.append(stripped_line)
|
396
|
+
|
397
|
+
reindented_paragraph = '\n'.join(reindented_lines)
|
398
|
+
reindented_paragraph = " " + reindented_paragraph
|
399
|
+
reindented_paragraphs.append(reindented_paragraph)
|
400
|
+
|
401
|
+
return args.paragraph_separator.join(reindented_paragraphs)
|
402
|
+
|
356
403
|
|
357
404
|
def do_delete_regex(args, content: str) -> str:
|
358
405
|
"""Remove words/phrases based on regex.
|
@@ -425,7 +472,24 @@ def extract_metadata_and_body(_args, content: str) -> tuple:
|
|
425
472
|
metadata = match.group(0).strip()
|
426
473
|
body = content.replace(metadata, "", 1)
|
427
474
|
|
428
|
-
|
475
|
+
|
476
|
+
metadata_block = metadata.split("---")[1]
|
477
|
+
|
478
|
+
metadata_dict = {}
|
479
|
+
for line in metadata_block.strip().splitlines():
|
480
|
+
key, value = line.split(":", 1)
|
481
|
+
metadata_dict[key.strip()] = value.strip()
|
482
|
+
|
483
|
+
tags = jieba.analyse.extract_tags(content, topK=100)
|
484
|
+
metadata_tags = " ".join(tags)
|
485
|
+
logger.info("tags: %s", metadata_tags)
|
486
|
+
metadata_dict["索引"] = metadata_tags
|
487
|
+
|
488
|
+
meta_lines = [f"{key}:{value}" for key, value in metadata_dict.items()]
|
489
|
+
meta_body = "\n".join(meta_lines)
|
490
|
+
meta_str = f"---\n{meta_body}\n---"
|
491
|
+
|
492
|
+
return (meta_str, body)
|
429
493
|
|
430
494
|
|
431
495
|
def do_single_newline(args, content: str) -> str:
|
txt2ebook/subcommands/typ.py
CHANGED
@@ -22,7 +22,7 @@ txt2ebook/formats/templates/epub/noindent.css,sha256=_O5Tv90TKyyPBRdgjuNKFwtKFbd
|
|
22
22
|
txt2ebook/helpers/__init__.py,sha256=c2EItHvPABDORfgfjArfa5XR--43es4D1tKWqaPcBxY,1309
|
23
23
|
txt2ebook/languages/__init__.py,sha256=1AfDn-D0q-dvODGP-9KxPHY_Wtk-ifZdN1FutZMT9-Q,763
|
24
24
|
txt2ebook/languages/en.py,sha256=8qsmbKB69M3SD9nBnSX8rP8hAL_RFkhB-zyH93INgaQ,999
|
25
|
-
txt2ebook/languages/zh_cn.py,sha256=
|
25
|
+
txt2ebook/languages/zh_cn.py,sha256=ryKMeaNgX2J6BGrHl7KZL9S6HwIlTyLk75z3lvVQIi4,1960
|
26
26
|
txt2ebook/languages/zh_tw.py,sha256=_fdXOOSLK0nTMuBe1Om2qjb4zr2PVd6N4xi2rrYkNTI,1515
|
27
27
|
txt2ebook/locales/txt2ebook.pot,sha256=VoXU9LrDzZApUVCCcKZC5Fu5QLx1fwd1lYEkbIdCEgc,641
|
28
28
|
txt2ebook/locales/en/LC_MESSAGES/txt2ebook.mo,sha256=Ym6soeijV3zsv9FUPWlJnu18-CNb5tcOTN5JsMOfV9c,672
|
@@ -39,14 +39,14 @@ txt2ebook/subcommands/__init__.py,sha256=ldhzvsrMsR8lZmhZef77JFz0jValpV3pytFfwJS
|
|
39
39
|
txt2ebook/subcommands/env.py,sha256=gEzra4b6guy7pRZUTCWX1_eiR7JmrtR1Z-J-vxljvMY,1549
|
40
40
|
txt2ebook/subcommands/epub.py,sha256=_obM1_fvVBPHOBXBOCYK8nyJadBX3_gOn9kaXA5HipA,3570
|
41
41
|
txt2ebook/subcommands/gmi.py,sha256=ANnPg-RFsylTo44fUzFOSHN1fC3Ce82gBzrv-sBv5fU,3318
|
42
|
-
txt2ebook/subcommands/massage.py,sha256=
|
42
|
+
txt2ebook/subcommands/massage.py,sha256=cjeee4wJJ6xPV76efnlWRVdeVkEtJYIBDaKhFMPjKTg,14643
|
43
43
|
txt2ebook/subcommands/md.py,sha256=PmIqrqrnzLywvN4qTkle0V9N3FTIJGRWpC0Xbk76B5o,3329
|
44
44
|
txt2ebook/subcommands/parse.py,sha256=xjhW8I9zS5DL3n3m04RyFofgci-6-_L6aF3d4N7c7M4,2938
|
45
45
|
txt2ebook/subcommands/pdf.py,sha256=1JQtpugzAIaho6G3CK1rGYk74hotAexXZxPH9PHpRps,2980
|
46
46
|
txt2ebook/subcommands/tex.py,sha256=59Bzl67VSHMZS9BtU9zZDmKwsd6cQGoFfuGftAz9efc,3438
|
47
|
-
txt2ebook/subcommands/typ.py,sha256=
|
48
|
-
txt2ebook-0.1.
|
49
|
-
txt2ebook-0.1.
|
50
|
-
txt2ebook-0.1.
|
51
|
-
txt2ebook-0.1.
|
52
|
-
txt2ebook-0.1.
|
47
|
+
txt2ebook/subcommands/typ.py,sha256=PoyozIueAvNL3yHHpS9NnMTMVV4ppUBX37SWGXd5Zhg,3680
|
48
|
+
txt2ebook-0.1.153.dist-info/entry_points.txt,sha256=AFikuCV6fqf8_GHwsvWuo9jTGNrCkWY1TJWk5GfMWSk,71
|
49
|
+
txt2ebook-0.1.153.dist-info/licenses/LICENSE.md,sha256=tGtFDwxWTjuR9syrJoSv1Hiffd2u8Tu8cYClfrXS_YU,31956
|
50
|
+
txt2ebook-0.1.153.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
51
|
+
txt2ebook-0.1.153.dist-info/METADATA,sha256=vDQlh4KChG-igGwNAQ4NB8saV4dRRGj-Rz4eWUpqCEc,4700
|
52
|
+
txt2ebook-0.1.153.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|