txt2ebook 0.1.124__py3-none-any.whl → 0.1.125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2ebook/__init__.py +1 -1
- txt2ebook/subcommands/massage.py +71 -6
- txt2ebook/tokenizer.py +2 -0
- {txt2ebook-0.1.124.dist-info → txt2ebook-0.1.125.dist-info}/METADATA +1 -1
- {txt2ebook-0.1.124.dist-info → txt2ebook-0.1.125.dist-info}/RECORD +8 -8
- {txt2ebook-0.1.124.dist-info → txt2ebook-0.1.125.dist-info}/LICENSE.md +0 -0
- {txt2ebook-0.1.124.dist-info → txt2ebook-0.1.125.dist-info}/WHEEL +0 -0
- {txt2ebook-0.1.124.dist-info → txt2ebook-0.1.125.dist-info}/entry_points.txt +0 -0
txt2ebook/__init__.py
CHANGED
txt2ebook/subcommands/massage.py
CHANGED
@@ -66,6 +66,25 @@ def build_subparser(subparsers) -> None:
|
|
66
66
|
help="convert section header from words to numbers",
|
67
67
|
)
|
68
68
|
|
69
|
+
massage_parser.add_argument(
|
70
|
+
"-fw",
|
71
|
+
"--fullwidth",
|
72
|
+
default=False,
|
73
|
+
action="store_true",
|
74
|
+
dest="fullwidth",
|
75
|
+
help="use fullwidth character (only for zh-cn and zh-tw)",
|
76
|
+
)
|
77
|
+
|
78
|
+
massage_parser.add_argument(
|
79
|
+
"-ps",
|
80
|
+
"--paragraph_separator",
|
81
|
+
dest="paragraph_separator",
|
82
|
+
type=lambda value: value.encode("utf-8").decode("unicode_escape"),
|
83
|
+
default="\n\n",
|
84
|
+
help="paragraph separator (default: %(default)r)",
|
85
|
+
metavar="SEPARATOR",
|
86
|
+
)
|
87
|
+
|
69
88
|
massage_parser.add_argument(
|
70
89
|
"-sp",
|
71
90
|
"--split-volume-and-chapter",
|
@@ -137,6 +156,15 @@ def build_subparser(subparsers) -> None:
|
|
137
156
|
help="short volume and chapter",
|
138
157
|
)
|
139
158
|
|
159
|
+
massage_parser.add_argument(
|
160
|
+
"-sn",
|
161
|
+
"--single-newline",
|
162
|
+
default=False,
|
163
|
+
action="store_true",
|
164
|
+
dest="single_newline",
|
165
|
+
help="format paragraph by single newline",
|
166
|
+
)
|
167
|
+
|
140
168
|
massage_parser.add_argument(
|
141
169
|
"-op",
|
142
170
|
"--open",
|
@@ -263,23 +291,28 @@ def massage_txt(args: argparse.Namespace) -> str:
|
|
263
291
|
|
264
292
|
content = to_unix_newline(content)
|
265
293
|
|
294
|
+
(metadata, body) = extract_metadata_and_body(args, content)
|
295
|
+
|
266
296
|
if args.fullwidth and args.language in ("zh-cn", "zh-tw"):
|
267
297
|
logger.info("Convert halfwidth ASCII characters to fullwidth")
|
268
|
-
|
298
|
+
body = zh_halfwidth_to_fullwidth(body)
|
269
299
|
|
270
300
|
if args.re_delete:
|
271
|
-
|
301
|
+
body = do_delete_regex(args, body)
|
272
302
|
|
273
303
|
if args.re_replace:
|
274
|
-
|
304
|
+
body = do_replace_regex(args, body)
|
275
305
|
|
276
306
|
if args.re_delete_line:
|
277
|
-
|
307
|
+
body = do_delete_line_regex(args, body)
|
308
|
+
|
309
|
+
if args.single_newline:
|
310
|
+
body = do_single_newline(args, body)
|
278
311
|
|
279
312
|
if args.width:
|
280
|
-
|
313
|
+
body = do_wrapping(args, body)
|
281
314
|
|
282
|
-
return
|
315
|
+
return f"{metadata}\n\n{body}"
|
283
316
|
|
284
317
|
|
285
318
|
def to_unix_newline(content: str) -> str:
|
@@ -349,6 +382,38 @@ def do_delete_line_regex(args, content: str) -> str:
|
|
349
382
|
return content
|
350
383
|
|
351
384
|
|
385
|
+
def extract_metadata_and_body(_args, content: str) -> tuple:
|
386
|
+
"""Extract the metadata and body.
|
387
|
+
|
388
|
+
Args:
|
389
|
+
content (str): The formatted book content.
|
390
|
+
|
391
|
+
Returns:
|
392
|
+
tuple: The metadata and body content.
|
393
|
+
"""
|
394
|
+
metadata = ""
|
395
|
+
body = ""
|
396
|
+
match = re.search(r"---(.*?)---", content, re.DOTALL)
|
397
|
+
if match:
|
398
|
+
metadata = match.group(0).strip()
|
399
|
+
body = content.replace(metadata, "", 1)
|
400
|
+
|
401
|
+
return (metadata, body)
|
402
|
+
|
403
|
+
|
404
|
+
def do_single_newline(args, content: str) -> str:
|
405
|
+
"""Set single newline.
|
406
|
+
|
407
|
+
Args:
|
408
|
+
content (str): The formatted book content.
|
409
|
+
|
410
|
+
Returns:
|
411
|
+
str: The formatted book content.
|
412
|
+
"""
|
413
|
+
modified_content = re.sub(r"\n+", "\n\n", content)
|
414
|
+
return modified_content
|
415
|
+
|
416
|
+
|
352
417
|
def do_wrapping(args, content: str) -> str:
|
353
418
|
"""Wrap or fill CJK text.
|
354
419
|
|
txt2ebook/tokenizer.py
CHANGED
@@ -169,7 +169,9 @@ class Tokenizer:
|
|
169
169
|
|
170
170
|
return metadata
|
171
171
|
|
172
|
+
|
172
173
|
def _tokenize_content(self) -> None:
|
174
|
+
# TODO: split by metadata and content
|
173
175
|
content = self.raw_content.split(f"{self.metadata_marker}\n\n")[1]
|
174
176
|
content = content.strip(self.config.paragraph_separator)
|
175
177
|
lines = content.split(self.config.paragraph_separator)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
txt2ebook/__init__.py,sha256=
|
1
|
+
txt2ebook/__init__.py,sha256=G_p-8TBwpVBfi-gEVDNwuhkFMbIM9p1g5bTUr8DXTvQ,2706
|
2
2
|
txt2ebook/__main__.py,sha256=L29rlfPSx9XMnVaHBYP2dyYgDmutJvONR3yUejjYwRY,855
|
3
3
|
txt2ebook/cli.py,sha256=D0jseJLlFDjLfX-yiGCC0e98a5IJ1IbRFx_mVGyYIxc,4241
|
4
4
|
txt2ebook/exceptions.py,sha256=oVtVMCqrxWq-CZ5GQYOBioil9i2kJ2mqD08IQ9A636Q,847
|
@@ -37,16 +37,16 @@ txt2ebook/subcommands/__init__.py,sha256=ldhzvsrMsR8lZmhZef77JFz0jValpV3pytFfwJS
|
|
37
37
|
txt2ebook/subcommands/env.py,sha256=Fx2IXNmmlW-6jlMjRPI-nYp90Sbi77Z2SeL4q3cGg2w,1495
|
38
38
|
txt2ebook/subcommands/epub.py,sha256=JDDucrRiiQW1B7ycKz5zS1X5SMQZ82GBtlE2_SBYIdw,3507
|
39
39
|
txt2ebook/subcommands/gmi.py,sha256=zVvP2ZjLtDdqew4Vlab2_R3H2OmQkpMKdfND6qgppiU,3320
|
40
|
-
txt2ebook/subcommands/massage.py,sha256=
|
40
|
+
txt2ebook/subcommands/massage.py,sha256=EuC-C03NMJk9V1_PEUOa-n4SmQCRpj1TJ_GwSJE8_Ss,11809
|
41
41
|
txt2ebook/subcommands/md.py,sha256=P-oFtb2u-v2F_KU8t249-f5Ihjb_TCT_NWMlOYoq5p4,3330
|
42
42
|
txt2ebook/subcommands/parse.py,sha256=FaYTWa2yqkowwPAmHWJC7iCii2Rnus3SUHG10GjjJp4,3022
|
43
43
|
txt2ebook/subcommands/pdf.py,sha256=KS3rzxPJG6ovt8GPJj8u1Bum5ye3zrEI0LPz21EMLZo,2981
|
44
44
|
txt2ebook/subcommands/tex.py,sha256=X6ZBfuAs_mcJe8PNjzoW339ecPynZduVbcCq0henjiA,3131
|
45
45
|
txt2ebook/subcommands/typ.py,sha256=r4Xf7xSINbYfaIKkVzdyQDlUMWPvOIcbvOwC71spu6w,3682
|
46
|
-
txt2ebook/tokenizer.py,sha256=
|
46
|
+
txt2ebook/tokenizer.py,sha256=H9AaZVmNP43L3ONvj58u_5weZAjFG9SzQSeS9upGN1U,9642
|
47
47
|
txt2ebook/zh_utils.py,sha256=0Yq9r-JL4HntW68vFR6TBP9yQim1a07mfsh_sp-XmaE,4887
|
48
|
-
txt2ebook-0.1.
|
49
|
-
txt2ebook-0.1.
|
50
|
-
txt2ebook-0.1.
|
51
|
-
txt2ebook-0.1.
|
52
|
-
txt2ebook-0.1.
|
48
|
+
txt2ebook-0.1.125.dist-info/LICENSE.md,sha256=tGtFDwxWTjuR9syrJoSv1Hiffd2u8Tu8cYClfrXS_YU,31956
|
49
|
+
txt2ebook-0.1.125.dist-info/METADATA,sha256=E8RlY3BV0QFaeYdHbBOlbjA9ebxT_w3RlhIc8KIFgtI,4969
|
50
|
+
txt2ebook-0.1.125.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
51
|
+
txt2ebook-0.1.125.dist-info/entry_points.txt,sha256=q4krNWsYNu4Rcf72nFc66JeR0J9BiFA6-NVEJKBZ_F4,71
|
52
|
+
txt2ebook-0.1.125.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|