txt2ebook 0.1.107__tar.gz → 0.1.108__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/PKG-INFO +1 -1
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/pyproject.toml +2 -2
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/__init__.py +1 -1
- txt2ebook-0.1.108/src/txt2ebook/cli.py +271 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/models/book.py +0 -1
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/parser.py +1 -121
- txt2ebook-0.1.108/src/txt2ebook/subcommands/env.py +52 -0
- txt2ebook-0.1.108/src/txt2ebook/subcommands/massage.py +220 -0
- txt2ebook-0.1.108/src/txt2ebook/subcommands/parse.py +86 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/tokenizer.py +3 -1
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/LICENSE.md +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/README.md +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/__main__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/exceptions.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/__init__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/base.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/epub.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/gmi.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/md.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/pdf.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/__init__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/__init__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/clean.css +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/condense.css +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/noindent.css +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/tex.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/txt.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/formats/typ.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/helpers/__init__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/languages/__init__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/languages/en.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/languages/zh_cn.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/languages/zh_tw.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/en/LC_MESSAGES/txt2ebook.mo +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/en/LC_MESSAGES/txt2ebook.po +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/txt2ebook.pot +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.mo +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.po +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.mo +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.po +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/models/__init__.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/models/chapter.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/models/volume.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/txt2ebook.py +0 -0
- {txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/zh_utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "txt2ebook"
|
3
|
-
version = "0.1.107"
|
3
|
+
version = "0.1.108"
|
4
4
|
description = "CLI tool to convert txt file to ebook format"
|
5
5
|
authors = ["Kian-Meng Ang <kianmeng@cpan.org>"]
|
6
6
|
license = "AGPL-3.0-or-later"
|
@@ -44,7 +44,7 @@ lxml = "^5.2.2"
|
|
44
44
|
|
45
45
|
[tool.poetry.scripts]
|
46
46
|
txt2ebook = 'txt2ebook.txt2ebook:main'
|
47
|
-
tte = 'txt2ebook.
|
47
|
+
tte = 'txt2ebook.cli:main'
|
48
48
|
|
49
49
|
[tool.poetry.group.dev.dependencies]
|
50
50
|
babel = "^2.12.1"
|
@@ -0,0 +1,271 @@
|
|
1
|
+
# Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU Affero General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU Affero General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Affero General Public License
|
14
|
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
"""txt2ebook/tte is a cli tool to convert txt file to ebook format.
|
17
|
+
|
18
|
+
website: https://github.com/kianmeng/txt2ebook
|
19
|
+
changelog: https://github.com/kianmeng/txt2ebook/blob/master/CHANGELOG.md
|
20
|
+
issues: https://github.com/kianmeng/txt2ebook/issues
|
21
|
+
"""
|
22
|
+
|
23
|
+
import argparse
|
24
|
+
import logging
|
25
|
+
import sys
|
26
|
+
from typing import Optional, Sequence
|
27
|
+
|
28
|
+
import txt2ebook.subcommands.env
|
29
|
+
import txt2ebook.subcommands.parse
|
30
|
+
import txt2ebook.subcommands.massage
|
31
|
+
from txt2ebook import __version__, setup_logger
|
32
|
+
|
33
|
+
logger = logging.getLogger(__name__)
|
34
|
+
|
35
|
+
|
36
|
+
def build_parser(
    args: Optional[Sequence[str]] = None,
) -> argparse.ArgumentParser:
    """Build the CLI parser.

    Creates the top-level parser with the options shared by every
    subcommand, then registers the ``parse``, ``massage`` and ``env``
    subcommands on it.

    Args:
        args (Optional[Sequence[str]]): Not read by this function; kept so
            existing callers that pass it keep working. Defaults to None
            rather than a shared mutable list.

    Returns:
        argparse.ArgumentParser: The configured parser.
    """
    parser = argparse.ArgumentParser(
        prog="txt2ebook",
        # The default -h/--help is disabled and re-added explicitly below.
        add_help=False,
        description=__doc__,
        formatter_class=lambda prog: argparse.RawTextHelpFormatter(
            prog, max_help_position=6
        ),
    )

    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default=None,
        help="title of the ebook (default: '%(default)s')",
        metavar="TITLE",
    )

    parser.add_argument(
        "-l",
        "--language",
        dest="language",
        default=None,
        help="language of the ebook (default: '%(default)s')",
        metavar="LANGUAGE",
    )

    parser.add_argument(
        "-a",
        "--author",
        dest="author",
        default=[],
        action="append",
        help="author of the ebook (default: '%(default)s')",
        metavar="AUTHOR",
    )

    parser.add_argument(
        "-c",
        "--cover",
        dest="cover",
        default=None,
        help="cover of the ebook",
        metavar="IMAGE_FILENAME",
    )

    parser.add_argument(
        "-tr",
        "--translator",
        dest="translator",
        default=[],
        action="append",
        help="translator of the ebook (default: '%(default)s')",
        metavar="TRANSLATOR",
    )

    parser.add_argument(
        "-fw",
        "--fullwidth",
        default=False,
        action="store_true",
        dest="fullwidth",
        help="convert ASCII character from halfwidth to fullwidth",
    )

    parser.add_argument(
        "-ra",
        "--regex-author",
        dest="re_author",
        default=[],
        action="append",
        help="regex to parse author of the book (default: by LANGUAGE)",
        metavar="REGEX",
    )

    parser.add_argument(
        "-rc",
        "--regex-chapter",
        dest="re_chapter",
        default=[],
        action="append",
        help="regex to parse chapter header (default: by LANGUAGE)",
        metavar="REGEX",
    )

    parser.add_argument(
        "-rvc",
        "--regex-volume-chapter",
        dest="re_volume_chapter",
        default=[],
        action="append",
        help=(
            "regex to parse volume and chapter header "
            "(default: by LANGUAGE)"
        ),
        metavar="REGEX",
    )

    parser.add_argument(
        "-rv",
        "--regex-volume",
        dest="re_volume",
        default=[],
        action="append",
        help="regex to parse volume header (default: by LANGUAGE)",
        metavar="REGEX",
    )

    parser.add_argument(
        "-hn",
        "--header-number",
        default=False,
        action="store_true",
        dest="header_number",
        help="convert section header from words to numbers",
    )

    parser.add_argument(
        "-ps",
        "--paragraph_separator",
        dest="paragraph_separator",
        # Turn escape sequences typed on the command line (e.g. a literal
        # backslash followed by "n") into the characters they denote.
        type=lambda value: value.encode("utf-8").decode("unicode_escape"),
        default="\n\n",
        help="paragraph separator (default: %(default)r)",
        metavar="SEPARATOR",
    )

    parser.add_argument(
        "-rt",
        "--regex-title",
        dest="re_title",
        default=[],
        action="append",
        help="regex to parse title of the book (default: by LANGUAGE)",
        metavar="REGEX",
    )

    parser.add_argument(
        "-ss",
        "--sort-volume-and-chapter",
        default=False,
        action="store_true",
        dest="sort_volume_and_chapter",
        help="sort volume and chapter",
    )

    parser.add_argument(
        "-rw",
        "--raise-on-warning",
        default=False,
        action="store_true",
        dest="raise_on_warning",
        help="raise exception and stop parsing upon warning",
    )

    parser.add_argument(
        "-q",
        "--quiet",
        default=False,
        action="store_true",
        dest="quiet",
        help="suppress all logging",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        default=0,
        action="count",
        dest="verbose",
        help="show verbosity of debugging log, use -vv, -vvv for more details",
    )

    parser.add_argument(
        "-d",
        "--debug",
        default=False,
        action="store_true",
        dest="debug",
        help="show debugging log and stacktrace",
    )

    parser.add_argument(
        "-h",
        "--help",
        action="help",
        default=argparse.SUPPRESS,
        help="show this help message and exit",
    )

    parser.add_argument(
        "-V",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    # Each subcommand module attaches its own parser and a ``func`` default.
    subparsers = parser.add_subparsers(help="sub-command help")
    txt2ebook.subcommands.parse.build_subparser(subparsers)
    txt2ebook.subcommands.massage.build_subparser(subparsers)
    txt2ebook.subcommands.env.build_subparser(subparsers)

    return parser
|
244
|
+
|
245
|
+
|
246
|
+
def main(args: Optional[Sequence[str]] = None):
    """Set the main entrypoint of the CLI script.

    Args:
        args (Optional[Sequence[str]]): Command line arguments. When None,
            ``sys.argv[1:]`` is used. An explicitly passed empty sequence
            is respected and prints the help text instead of falling back
            to the process arguments.

    Raises:
        SystemExit: With exit code 1 when any exception is raised while
            building the parser, parsing or running the subcommand.
    """
    # Only fall back to the process arguments when nothing was supplied;
    # ``args or ...`` would also discard a deliberately empty list.
    if args is None:
        args = sys.argv[1:]

    try:
        parser = build_parser()
        if not args:
            parser.print_help(sys.stderr)
        else:
            parsed_args = parser.parse_args(args)
            setup_logger(parsed_args)

            # ``func`` is only present when a subcommand was selected.
            if hasattr(parsed_args, "func"):
                logger.debug(parsed_args)
                parsed_args.func(parsed_args)
            else:
                parser.print_help(sys.stderr)

    except Exception as error:
        logger.error(
            "error: %s",
            getattr(error, "message", str(error)),
            # The raw arguments are inspected because parsing itself may
            # be what failed, leaving no parsed ``debug`` flag to read.
            exc_info=("-d" in args or "--debug" in args),
        )

        raise SystemExit(1) from None
|
@@ -40,7 +40,6 @@ class Book:
|
|
40
40
|
language: str = field(default="")
|
41
41
|
cover: str = field(default="", repr=False)
|
42
42
|
raw_content: str = field(default="", repr=False)
|
43
|
-
massaged_content: str = field(default="", repr=False)
|
44
43
|
toc: List[Union[Volume, Chapter]] = field(
|
45
44
|
default_factory=lambda: [], repr=False
|
46
45
|
)
|
@@ -52,8 +52,7 @@ class Parser:
|
|
52
52
|
Returns:
|
53
53
|
txt2ebook.models.Book: The Book model.
|
54
54
|
"""
|
55
|
-
|
56
|
-
tokenizer = Tokenizer(massaged_content, self.config)
|
55
|
+
tokenizer = Tokenizer(self.raw_content, self.config)
|
57
56
|
|
58
57
|
(book_title, authors, translators, tags, index, toc) = (
|
59
58
|
self.parse_tokens(tokenizer)
|
@@ -68,7 +67,6 @@ class Parser:
|
|
68
67
|
index=index,
|
69
68
|
cover=self.config.cover,
|
70
69
|
raw_content=self.raw_content,
|
71
|
-
massaged_content=massaged_content,
|
72
70
|
toc=toc,
|
73
71
|
)
|
74
72
|
|
@@ -252,121 +250,3 @@ class Parser:
|
|
252
250
|
section.chapters.sort(key=lambda x: x.title)
|
253
251
|
|
254
252
|
toc.sort(key=lambda x: x.title if isinstance(x, Volume) else "")
|
255
|
-
|
256
|
-
def massage(self) -> str:
|
257
|
-
"""Massage the txt content.
|
258
|
-
|
259
|
-
Returns:
|
260
|
-
str: The formatted book content
|
261
|
-
"""
|
262
|
-
content = self.raw_content
|
263
|
-
|
264
|
-
content = Parser.to_unix_newline(content)
|
265
|
-
|
266
|
-
if self.config.fullwidth and self.config.language in (
|
267
|
-
"zh-cn",
|
268
|
-
"zh-tw",
|
269
|
-
):
|
270
|
-
logger.info("Convert halfwidth ASCII characters to fullwidth")
|
271
|
-
content = zh_halfwidth_to_fullwidth(content)
|
272
|
-
|
273
|
-
if self.config.re_delete:
|
274
|
-
content = self.do_delete_regex(content)
|
275
|
-
|
276
|
-
if self.config.re_replace:
|
277
|
-
content = self.do_replace_regex(content)
|
278
|
-
|
279
|
-
if self.config.re_delete_line:
|
280
|
-
content = self.do_delete_line_regex(content)
|
281
|
-
|
282
|
-
if self.config.width:
|
283
|
-
content = self.do_wrapping(content)
|
284
|
-
|
285
|
-
return content
|
286
|
-
|
287
|
-
@staticmethod
|
288
|
-
def to_unix_newline(content: str) -> str:
|
289
|
-
"""Convert all other line ends to Unix line end.
|
290
|
-
|
291
|
-
Args:
|
292
|
-
content(str): The formatted book content.
|
293
|
-
|
294
|
-
Returns:
|
295
|
-
str: The formatted book content.
|
296
|
-
"""
|
297
|
-
return content.replace("\r\n", "\n").replace("\r", "\n")
|
298
|
-
|
299
|
-
def do_delete_regex(self, content: str) -> str:
|
300
|
-
"""Remove words/phrases based on regex.
|
301
|
-
|
302
|
-
Args:
|
303
|
-
content(str): The formatted book content.
|
304
|
-
|
305
|
-
Returns:
|
306
|
-
str: The formatted book content.
|
307
|
-
"""
|
308
|
-
for delete_regex in self.config.re_delete:
|
309
|
-
content = re.sub(
|
310
|
-
re.compile(rf"{delete_regex}", re.MULTILINE), "", content
|
311
|
-
)
|
312
|
-
return content
|
313
|
-
|
314
|
-
def do_replace_regex(self, content: str) -> str:
|
315
|
-
"""Replace words/phrases based on regex.
|
316
|
-
|
317
|
-
Args:
|
318
|
-
content(str): The formatted book content.
|
319
|
-
|
320
|
-
Returns:
|
321
|
-
str: The formatted book content.
|
322
|
-
"""
|
323
|
-
regex = self.config.re_replace
|
324
|
-
if isinstance(regex, list):
|
325
|
-
for search, replace in regex:
|
326
|
-
content = re.sub(
|
327
|
-
re.compile(rf"{search}", re.MULTILINE),
|
328
|
-
rf"{replace}",
|
329
|
-
content,
|
330
|
-
)
|
331
|
-
|
332
|
-
return content
|
333
|
-
|
334
|
-
def do_delete_line_regex(self, content: str) -> str:
|
335
|
-
"""Delete whole line based on regex.
|
336
|
-
|
337
|
-
Args:
|
338
|
-
content(str): The formatted book content.
|
339
|
-
|
340
|
-
Returns:
|
341
|
-
str: The formatted book content.
|
342
|
-
"""
|
343
|
-
for delete_line_regex in self.config.re_delete_line:
|
344
|
-
content = re.sub(
|
345
|
-
re.compile(rf"^.*{delete_line_regex}.*$", re.MULTILINE),
|
346
|
-
"",
|
347
|
-
content,
|
348
|
-
)
|
349
|
-
return content
|
350
|
-
|
351
|
-
def do_wrapping(self, content: str) -> str:
|
352
|
-
"""Wrap or fill CJK text.
|
353
|
-
|
354
|
-
Args:
|
355
|
-
content (str): The formatted book content.
|
356
|
-
|
357
|
-
Returns:
|
358
|
-
str: The formatted book content.
|
359
|
-
"""
|
360
|
-
logger.info("Wrapping paragraph to width: %s", self.config.width)
|
361
|
-
|
362
|
-
paragraphs = []
|
363
|
-
# We don't remove empty line and keep all formatting as it.
|
364
|
-
for paragraph in content.split("\n"):
|
365
|
-
paragraph = paragraph.strip()
|
366
|
-
|
367
|
-
lines = cjkwrap.wrap(paragraph, width=self.config.width)
|
368
|
-
paragraph = "\n".join(lines)
|
369
|
-
paragraphs.append(paragraph)
|
370
|
-
|
371
|
-
wrapped_content = "\n".join(paragraphs)
|
372
|
-
return wrapped_content
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU Affero General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU Affero General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Affero General Public License
|
14
|
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
"""Env subcommand."""
|
17
|
+
|
18
|
+
import argparse
|
19
|
+
import logging
|
20
|
+
import platform
|
21
|
+
import sys
|
22
|
+
|
23
|
+
from txt2ebook import __version__
|
24
|
+
|
25
|
+
logger = logging.getLogger(__name__)
|
26
|
+
|
27
|
+
|
28
|
+
def build_subparser(subparsers) -> None:
    """Register the ``env`` subcommand.

    Args:
        subparsers: The object returned by
            ``ArgumentParser.add_subparsers()`` to attach the command to.

    Returns:
        None
    """
    subparsers.add_parser(
        "env", help="print environment information for bug reporting"
    ).set_defaults(func=run)
|
35
|
+
|
36
|
+
|
37
|
+
def run(_args: argparse.Namespace) -> None:
    """Run env subcommand.

    Prints the txt2ebook version, the Python version and the platform
    string, one per line.

    Args:
        _args (argparse.Namespace): Parsed command line arguments (unused)

    Returns:
        None
    """
    # sys.version may span several lines; flatten it to a single one.
    python_build = "".join(sys.version.split("\n"))
    report = [
        f"txt2ebook: {__version__}",
        f"python: {python_build}",
        f"platform: {platform.platform()}",
    ]
    print("\n".join(report))
|
@@ -0,0 +1,220 @@
|
|
1
|
+
# Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU Affero General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU Affero General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Affero General Public License
|
14
|
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
"""Massage subcommand."""
|
17
|
+
|
18
|
+
import argparse
|
19
|
+
import logging
|
20
|
+
import sys
|
21
|
+
|
22
|
+
import cjkwrap
|
23
|
+
import regex as re
|
24
|
+
from bs4 import UnicodeDammit
|
25
|
+
|
26
|
+
from txt2ebook.exceptions import EmptyFileError
|
27
|
+
from txt2ebook.zh_utils import zh_halfwidth_to_fullwidth
|
28
|
+
|
29
|
+
logger = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
|
32
|
+
def build_subparser(subparsers) -> None:
    """Register the ``massage`` subcommand and its options.

    Args:
        subparsers: The object returned by
            ``ArgumentParser.add_subparsers()`` to attach the command to.

    Returns:
        None
    """
    massage_parser = subparsers.add_parser(
        "massage", help="massage the source txt file"
    )

    # When stdin is not a terminal the filename becomes optional and
    # stdin is used as the default input.
    positional_nargs = None if sys.stdin.isatty() else "?"
    fallback_input = None if sys.stdin.isatty() else sys.stdin
    massage_parser.add_argument(
        "input_file",
        metavar="TXT_FILENAME",
        help="source text filename",
        type=argparse.FileType("rb"),
        nargs=positional_nargs,  # type: ignore
        default=fallback_input,
    )

    massage_parser.add_argument(
        "-rd",
        "--regex-delete",
        metavar="REGEX",
        help="regex to delete word or phrase (default: '%(default)s')",
        action="append",
        default=[],
        dest="re_delete",
    )

    # Takes two values per use: the search pattern and its replacement.
    massage_parser.add_argument(
        "-rr",
        "--regex-replace",
        metavar="REGEX",
        help="regex to search and replace (default: '%(default)s')",
        action="append",
        nargs=2,
        default=[],
        dest="re_replace",
    )

    massage_parser.add_argument(
        "-rl",
        "--regex-delete-line",
        metavar="REGEX",
        help="regex to delete whole line (default: '%(default)s')",
        action="append",
        default=[],
        dest="re_delete_line",
    )

    massage_parser.add_argument(
        "-w",
        "--width",
        metavar="WIDTH",
        help="width for line wrapping",
        type=int,
        default=None,
        dest="width",
    )

    massage_parser.set_defaults(func=run)
|
89
|
+
|
90
|
+
|
91
|
+
def run(args: argparse.Namespace) -> str:
    """Run massage subcommand.

    Reads the input file, decodes it with UnicodeDammit, normalises line
    endings, then applies the optional steps in this order: fullwidth
    conversion, regex delete, regex replace, whole-line delete and line
    wrapping.

    Args:
        args (argparse.Namespace): Config from command line arguments

    Returns:
        str: The massaged book content

    Raises:
        EmptyFileError: If the decoded file content is empty
    """
    logger.info("Parsing txt file: %s", args.input_file.name)

    unicode = UnicodeDammit(args.input_file.read())
    logger.info("Detect encoding : %s", unicode.original_encoding)

    content = unicode.unicode_markup
    if not content:
        raise EmptyFileError(
            f"Empty file content in {args.input_file.name}"
        )

    content = to_unix_newline(content)

    # Fullwidth conversion is only applied to the Chinese language codes.
    if args.fullwidth and args.language in ("zh-cn", "zh-tw"):
        logger.info("Convert halfwidth ASCII characters to fullwidth")
        content = zh_halfwidth_to_fullwidth(content)

    if args.re_delete:
        content = do_delete_regex(args, content)

    if args.re_replace:
        content = do_replace_regex(args, content)

    if args.re_delete_line:
        content = do_delete_line_regex(args, content)

    if args.width:
        content = do_wrapping(args, content)

    return content
|
130
|
+
|
131
|
+
|
132
|
+
def to_unix_newline(content: str) -> str:
    """Normalise Windows and old Mac line endings to Unix ones.

    Args:
        content(str): The book content.

    Returns:
        str: The content with every ``\\r\\n`` and lone ``\\r`` replaced
        by ``\\n``.
    """
    # Handle the two-character sequence first so it does not turn into
    # two newlines.
    without_crlf = content.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
|
142
|
+
|
143
|
+
|
144
|
+
def do_delete_regex(args, content: str) -> str:
    """Strip every match of the configured delete patterns.

    Args:
        args: Object whose ``re_delete`` attribute lists the patterns.
        content(str): The book content.

    Returns:
        str: The content with all matches removed.
    """
    cleaned = content
    for pattern in args.re_delete:
        # Patterns are applied one after another in multiline mode.
        matcher = re.compile(f"{pattern}", re.MULTILINE)
        cleaned = matcher.sub("", cleaned)
    return cleaned
|
158
|
+
|
159
|
+
|
160
|
+
def do_replace_regex(args, content: str) -> str:
    """Apply the configured search/replace pattern pairs.

    Args:
        args: Object whose ``re_replace`` attribute holds a list of
            (search, replace) pairs.
        content(str): The book content.

    Returns:
        str: The content after all substitutions; unchanged when
        ``re_replace`` is not a list.
    """
    pairs = args.re_replace
    if not isinstance(pairs, list):
        return content

    for pattern, replacement in pairs:
        matcher = re.compile(f"{pattern}", re.MULTILINE)
        content = matcher.sub(f"{replacement}", content)

    return content
|
179
|
+
|
180
|
+
|
181
|
+
def do_delete_line_regex(args, content: str) -> str:
    """Blank out every line that contains a match.

    Args:
        args: Object whose ``re_delete_line`` attribute lists the patterns.
        content(str): The book content.

    Returns:
        str: The content with the text of each matching line removed; the
        line break itself is kept.
    """
    for fragment in args.re_delete_line:
        # Anchor the pattern so the whole line around the match is taken.
        whole_line = re.compile(rf"^.*{fragment}.*$", re.MULTILINE)
        content = whole_line.sub("", content)
    return content
|
197
|
+
|
198
|
+
|
199
|
+
def do_wrapping(args, content: str) -> str:
    """Wrap each line of the content to the configured width.

    Args:
        args: Object whose ``width`` attribute gives the wrap width.
        content (str): The book content.

    Returns:
        str: The content with every line stripped and re-wrapped by
        ``cjkwrap.wrap``.
    """
    logger.info("Wrapping paragraph to width: %s", args.width)

    # Lines are handled one by one so empty lines stay in place.
    return "\n".join(
        "\n".join(cjkwrap.wrap(line.strip(), width=args.width))
        for line in content.split("\n")
    )
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU Affero General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU Affero General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU Affero General Public License
|
14
|
+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
"""Parse subcommand."""
|
17
|
+
|
18
|
+
import argparse
|
19
|
+
import logging
|
20
|
+
import sys
|
21
|
+
|
22
|
+
from bs4 import UnicodeDammit
|
23
|
+
from langdetect import detect
|
24
|
+
|
25
|
+
from txt2ebook.exceptions import EmptyFileError
|
26
|
+
from txt2ebook.parser import Parser
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
def build_subparser(subparsers) -> None:
    """Register the ``parse`` subcommand and its input file argument.

    Args:
        subparsers: The object returned by
            ``ArgumentParser.add_subparsers()`` to attach the command to.

    Returns:
        None
    """
    parse_parser = subparsers.add_parser(
        "parse", help="parse and validate the txt file"
    )

    # When stdin is not a terminal the filename becomes optional and
    # stdin is used as the default input.
    positional_nargs = None if sys.stdin.isatty() else "?"
    fallback_input = None if sys.stdin.isatty() else sys.stdin
    parse_parser.add_argument(
        "input_file",
        metavar="TXT_FILENAME",
        help="source text filename",
        type=argparse.FileType("rb"),
        nargs=positional_nargs,  # type: ignore
        default=fallback_input,
    )

    parse_parser.set_defaults(func=run)
|
47
|
+
|
48
|
+
|
49
|
+
def run(args: argparse.Namespace) -> None:
    """Run parse subcommand.

    Reads and decodes the input file, settles the book language, then
    parses the content with ``Parser``.

    Args:
        args (argparse.Namespace): Config from command line arguments

    Returns:
        None

    Raises:
        EmptyFileError: If the decoded file content is empty
    """
    source = args.input_file
    logger.info("Parsing txt file: %s", source.name)

    decoded = UnicodeDammit(source.read())
    logger.info("Detect encoding : %s", decoded.original_encoding)

    content = decoded.unicode_markup
    if not content:
        raise EmptyFileError(
            f"Empty file content in {args.input_file.name}"
        )

    requested = args.language
    detected = detect(content)
    # A language given on the command line wins over the detected one.
    args.language = requested if requested else detected
    logger.info("args language: %s", requested)
    logger.info("Detect language: %s", detected)

    if requested:
        if requested != detected:
            logger.warning(
                "args (%s) and detect (%s) language mismatch",
                requested,
                detected,
            )

    book = Parser(content, args).parse()

    if args.debug:
        book.debug(args.verbose)
|
@@ -168,7 +168,9 @@ class Tokenizer:
|
|
168
168
|
return []
|
169
169
|
|
170
170
|
metadata = match[1].split("\n")
|
171
|
-
|
171
|
+
for metadata_field in metadata:
|
172
|
+
logger.info("Metadata: %s", metadata_field)
|
173
|
+
|
172
174
|
return metadata
|
173
175
|
|
174
176
|
def _tokenize_content(self) -> None:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.mo
RENAMED
File without changes
|
{txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.po
RENAMED
File without changes
|
{txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.mo
RENAMED
File without changes
|
{txt2ebook-0.1.107 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.po
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|