txt2ebook 0.1.106__tar.gz → 0.1.108__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/PKG-INFO +6 -2
  2. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/README.md +5 -1
  3. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/pyproject.toml +2 -2
  4. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/__init__.py +1 -1
  5. txt2ebook-0.1.108/src/txt2ebook/cli.py +271 -0
  6. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/tex.py +4 -2
  7. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/models/book.py +0 -1
  8. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/parser.py +1 -121
  9. txt2ebook-0.1.108/src/txt2ebook/subcommands/env.py +52 -0
  10. txt2ebook-0.1.108/src/txt2ebook/subcommands/massage.py +220 -0
  11. txt2ebook-0.1.108/src/txt2ebook/subcommands/parse.py +86 -0
  12. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/tokenizer.py +3 -1
  13. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/txt2ebook.py +10 -0
  14. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/LICENSE.md +0 -0
  15. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/__main__.py +0 -0
  16. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/exceptions.py +0 -0
  17. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/__init__.py +0 -0
  18. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/base.py +0 -0
  19. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/epub.py +0 -0
  20. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/gmi.py +0 -0
  21. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/md.py +0 -0
  22. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/pdf.py +0 -0
  23. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/__init__.py +0 -0
  24. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/__init__.py +0 -0
  25. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/clean.css +0 -0
  26. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/condense.css +0 -0
  27. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/templates/epub/noindent.css +0 -0
  28. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/txt.py +0 -0
  29. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/formats/typ.py +0 -0
  30. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/helpers/__init__.py +0 -0
  31. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/languages/__init__.py +0 -0
  32. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/languages/en.py +0 -0
  33. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/languages/zh_cn.py +0 -0
  34. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/languages/zh_tw.py +0 -0
  35. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/en/LC_MESSAGES/txt2ebook.mo +0 -0
  36. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/en/LC_MESSAGES/txt2ebook.po +0 -0
  37. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/txt2ebook.pot +0 -0
  38. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.mo +0 -0
  39. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.po +0 -0
  40. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.mo +0 -0
  41. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.po +0 -0
  42. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/models/__init__.py +0 -0
  43. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/models/chapter.py +0 -0
  44. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/models/volume.py +0 -0
  45. {txt2ebook-0.1.106 → txt2ebook-0.1.108}/src/txt2ebook/zh_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: txt2ebook
3
- Version: 0.1.106
3
+ Version: 0.1.108
4
4
  Summary: CLI tool to convert txt file to ebook format
5
5
  Home-page: https://github.com/kianmeng/txt2ebook
6
6
  License: AGPL-3.0-or-later
@@ -80,7 +80,7 @@ usage: txt2ebook [-of OUTPUT_FOLDER] [-p] [-f {epub,gmi,md,pdf,tex,txt,typ}]
80
80
  [-tr TRANSLATOR] [-c IMAGE_FILENAME] [-w WIDTH]
81
81
  [-ff FILENAME_FORMAT] [-ps SEPARATOR] [-pz PAGE_SIZE]
82
82
  [-rd REGEX] [-rvc REGEX] [-rv REGEX] [-rc REGEX] [-rt REGEX]
83
- [-ra REGEX] [-rl REGEX] [-rr REGEX REGEX]
83
+ [-ra REGEX] [-rl REGEX] [-rr REGEX REGEX] [-ct]
84
84
  [-et {clean,condense,noindent}] [-vp] [-tp] [-sp] [-ss]
85
85
  [-toc] [-hn] [-fw] [-rw] [-ow] [-op] [-q] [-v] [-y] [-d]
86
86
  [--env] [-h] [-V]
@@ -182,6 +182,10 @@ options:
182
182
  -toc, --table-of-content
183
183
  add table of content
184
184
 
185
+ --format tex:
186
+ -ct, --clean-tex
187
+ purge artifacts generated by TeX (default: 'False')
188
+
185
189
  --language zh-cn / --language zh-tw:
186
190
  -hn, --header-number
187
191
  convert section header from words to numbers
@@ -38,7 +38,7 @@ usage: txt2ebook [-of OUTPUT_FOLDER] [-p] [-f {epub,gmi,md,pdf,tex,txt,typ}]
38
38
  [-tr TRANSLATOR] [-c IMAGE_FILENAME] [-w WIDTH]
39
39
  [-ff FILENAME_FORMAT] [-ps SEPARATOR] [-pz PAGE_SIZE]
40
40
  [-rd REGEX] [-rvc REGEX] [-rv REGEX] [-rc REGEX] [-rt REGEX]
41
- [-ra REGEX] [-rl REGEX] [-rr REGEX REGEX]
41
+ [-ra REGEX] [-rl REGEX] [-rr REGEX REGEX] [-ct]
42
42
  [-et {clean,condense,noindent}] [-vp] [-tp] [-sp] [-ss]
43
43
  [-toc] [-hn] [-fw] [-rw] [-ow] [-op] [-q] [-v] [-y] [-d]
44
44
  [--env] [-h] [-V]
@@ -140,6 +140,10 @@ options:
140
140
  -toc, --table-of-content
141
141
  add table of content
142
142
 
143
+ --format tex:
144
+ -ct, --clean-tex
145
+ purge artifacts generated by TeX (default: 'False')
146
+
143
147
  --language zh-cn / --language zh-tw:
144
148
  -hn, --header-number
145
149
  convert section header from words to numbers
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "txt2ebook"
3
- version = "0.1.106"
3
+ version = "0.1.108"
4
4
  description = "CLI tool to convert txt file to ebook format"
5
5
  authors = ["Kian-Meng Ang <kianmeng@cpan.org>"]
6
6
  license = "AGPL-3.0-or-later"
@@ -44,7 +44,7 @@ lxml = "^5.2.2"
44
44
 
45
45
  [tool.poetry.scripts]
46
46
  txt2ebook = 'txt2ebook.txt2ebook:main'
47
- tte = 'txt2ebook.txt2ebook:main'
47
+ tte = 'txt2ebook.cli:main'
48
48
 
49
49
  [tool.poetry.group.dev.dependencies]
50
50
  babel = "^2.12.1"
@@ -22,7 +22,7 @@ import sys
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24
 
25
- __version__ = "0.1.106"
25
+ __version__ = "0.1.108"
26
26
 
27
27
 
28
28
  def setup_logger(config: argparse.Namespace) -> None:
@@ -0,0 +1,271 @@
1
+ # Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU Affero General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU Affero General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Affero General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """txt2ebook/tte is a cli tool to convert txt file to ebook format.
17
+
18
+ website: https://github.com/kianmeng/txt2ebook
19
+ changelog: https://github.com/kianmeng/txt2ebook/blob/master/CHANGELOG.md
20
+ issues: https://github.com/kianmeng/txt2ebook/issues
21
+ """
22
+
23
+ import argparse
24
+ import logging
25
+ import sys
26
+ from typing import Optional, Sequence
27
+
28
+ import txt2ebook.subcommands.env
29
+ import txt2ebook.subcommands.parse
30
+ import txt2ebook.subcommands.massage
31
+ from txt2ebook import __version__, setup_logger
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ def build_parser(
37
+ args: Optional[Sequence[str]] = [],
38
+ ) -> argparse.ArgumentParser:
39
+ """Build the CLI parser."""
40
+ parser = argparse.ArgumentParser(
41
+ prog="txt2ebook",
42
+ add_help=False,
43
+ description=__doc__,
44
+ formatter_class=lambda prog: argparse.RawTextHelpFormatter(
45
+ prog, max_help_position=6
46
+ ),
47
+ )
48
+
49
+ parser.add_argument(
50
+ "-t",
51
+ "--title",
52
+ dest="title",
53
+ default=None,
54
+ help="title of the ebook (default: '%(default)s')",
55
+ metavar="TITLE",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "-l",
60
+ "--language",
61
+ dest="language",
62
+ default=None,
63
+ help="language of the ebook (default: '%(default)s')",
64
+ metavar="LANGUAGE",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "-a",
69
+ "--author",
70
+ dest="author",
71
+ default=[],
72
+ action="append",
73
+ help="author of the ebook (default: '%(default)s')",
74
+ metavar="AUTHOR",
75
+ )
76
+
77
+ parser.add_argument(
78
+ "-c",
79
+ "--cover",
80
+ dest="cover",
81
+ default=None,
82
+ help="cover of the ebook",
83
+ metavar="IMAGE_FILENAME",
84
+ )
85
+
86
+ parser.add_argument(
87
+ "-tr",
88
+ "--translator",
89
+ dest="translator",
90
+ default=[],
91
+ action="append",
92
+ help="translator of the ebook (default: '%(default)s')",
93
+ metavar="TRANSLATOR",
94
+ )
95
+
96
+ parser.add_argument(
97
+ "-fw",
98
+ "--fullwidth",
99
+ default=False,
100
+ action="store_true",
101
+ dest="fullwidth",
102
+ help="convert ASCII character from halfwidth to fullwidth",
103
+ )
104
+
105
+
106
+ parser.add_argument(
107
+ "-ra",
108
+ "--regex-author",
109
+ dest="re_author",
110
+ default=[],
111
+ action="append",
112
+ help="regex to parse author of the book (default: by LANGUAGE)",
113
+ metavar="REGEX",
114
+ )
115
+
116
+ parser.add_argument(
117
+ "-rc",
118
+ "--regex-chapter",
119
+ dest="re_chapter",
120
+ default=[],
121
+ action="append",
122
+ help="regex to parse chapter header (default: by LANGUAGE)",
123
+ metavar="REGEX",
124
+ )
125
+
126
+ parser.add_argument(
127
+ "-rvc",
128
+ "--regex-volume-chapter",
129
+ dest="re_volume_chapter",
130
+ default=[],
131
+ action="append",
132
+ help=(
133
+ "regex to parse volume and chapter header "
134
+ "(default: by LANGUAGE)"
135
+ ),
136
+ metavar="REGEX",
137
+ )
138
+
139
+ parser.add_argument(
140
+ "-rv",
141
+ "--regex-volume",
142
+ dest="re_volume",
143
+ default=[],
144
+ action="append",
145
+ help="regex to parse volume header (default: by LANGUAGE)",
146
+ metavar="REGEX",
147
+ )
148
+
149
+ parser.add_argument(
150
+ "-hn",
151
+ "--header-number",
152
+ default=False,
153
+ action="store_true",
154
+ dest="header_number",
155
+ help="convert section header from words to numbers",
156
+ )
157
+
158
+ parser.add_argument(
159
+ "-ps",
160
+ "--paragraph_separator",
161
+ dest="paragraph_separator",
162
+ type=lambda value: value.encode("utf-8").decode("unicode_escape"),
163
+ default="\n\n",
164
+ help="paragraph separator (default: %(default)r)",
165
+ metavar="SEPARATOR",
166
+ )
167
+
168
+ parser.add_argument(
169
+ "-rt",
170
+ "--regex-title",
171
+ dest="re_title",
172
+ default=[],
173
+ action="append",
174
+ help="regex to parse title of the book (default: by LANGUAGE)",
175
+ metavar="REGEX",
176
+ )
177
+
178
+ parser.add_argument(
179
+ "-ss",
180
+ "--sort-volume-and-chapter",
181
+ default=False,
182
+ action="store_true",
183
+ dest="sort_volume_and_chapter",
184
+ help="short volume and chapter",
185
+ )
186
+
187
+ parser.add_argument(
188
+ "-rw",
189
+ "--raise-on-warning",
190
+ default=False,
191
+ action="store_true",
192
+ dest="raise_on_warning",
193
+ help="raise exception and stop parsing upon warning",
194
+ )
195
+
196
+ parser.add_argument(
197
+ "-q",
198
+ "--quiet",
199
+ default=False,
200
+ action="store_true",
201
+ dest="quiet",
202
+ help="suppress all logging",
203
+ )
204
+
205
+ parser.add_argument(
206
+ "-v",
207
+ "--verbose",
208
+ default=0,
209
+ action="count",
210
+ dest="verbose",
211
+ help="show verbosity of debugging log, use -vv, -vvv for more details",
212
+ )
213
+
214
+ parser.add_argument(
215
+ "-d",
216
+ "--debug",
217
+ default=False,
218
+ action="store_true",
219
+ dest="debug",
220
+ help="show debugging log and stacktrace",
221
+ )
222
+
223
+ parser.add_argument(
224
+ "-h",
225
+ "--help",
226
+ action="help",
227
+ default=argparse.SUPPRESS,
228
+ help="show this help message and exit",
229
+ )
230
+
231
+ parser.add_argument(
232
+ "-V",
233
+ "--version",
234
+ action="version",
235
+ version=f"%(prog)s {__version__}",
236
+ )
237
+
238
+ subparsers = parser.add_subparsers(help="sub-command help")
239
+ txt2ebook.subcommands.parse.build_subparser(subparsers)
240
+ txt2ebook.subcommands.massage.build_subparser(subparsers)
241
+ txt2ebook.subcommands.env.build_subparser(subparsers)
242
+
243
+ return parser
244
+
245
+
246
+ def main(args: Optional[Sequence[str]] = None):
247
+ """Set the main entrypoint of the CLI script."""
248
+ args = args or sys.argv[1:]
249
+
250
+ try:
251
+ parser = build_parser()
252
+ if len(args) == 0:
253
+ parser.print_help(sys.stderr)
254
+ else:
255
+ parsed_args = parser.parse_args(args)
256
+ setup_logger(parsed_args)
257
+
258
+ if hasattr(parsed_args, "func"):
259
+ logger.debug(parsed_args)
260
+ parsed_args.func(parsed_args)
261
+ else:
262
+ parser.print_help(sys.stderr)
263
+
264
+ except Exception as error:
265
+ logger.error(
266
+ "error: %s",
267
+ getattr(error, "message", str(error)),
268
+ exc_info=("-d" in args or "--debug" in args),
269
+ )
270
+
271
+ raise SystemExit(1) from None
@@ -128,7 +128,9 @@ class TexWriter(BaseWriter):
128
128
 
129
129
  filename = str(new_filename.parent / new_filename.stem)
130
130
  pdf_filename = Path(filename).with_suffix(".pdf")
131
- doc.generate_pdf(filename, compiler="latexmk", clean_tex=False)
131
+ doc.generate_pdf(
132
+ filename, compiler="latexmk", clean_tex=self.config.clean_tex
133
+ )
132
134
  logger.info("Generate PDF file: %s", pdf_filename.resolve())
133
135
 
134
136
  if self.config.open:
@@ -140,7 +142,7 @@ class TexWriter(BaseWriter):
140
142
  for keyword in self.index_keywords:
141
143
  par = par.replace(
142
144
  keyword,
143
- rf"\color{{red}}\index{{{keyword}}}{keyword}\color{{black}}"
145
+ rf"\color{{red}}\index{{{keyword}}}{keyword}\color{{black}}",
144
146
  )
145
147
 
146
148
  return par
@@ -40,7 +40,6 @@ class Book:
40
40
  language: str = field(default="")
41
41
  cover: str = field(default="", repr=False)
42
42
  raw_content: str = field(default="", repr=False)
43
- massaged_content: str = field(default="", repr=False)
44
43
  toc: List[Union[Volume, Chapter]] = field(
45
44
  default_factory=lambda: [], repr=False
46
45
  )
@@ -52,8 +52,7 @@ class Parser:
52
52
  Returns:
53
53
  txt2ebook.models.Book: The Book model.
54
54
  """
55
- massaged_content = self.massage()
56
- tokenizer = Tokenizer(massaged_content, self.config)
55
+ tokenizer = Tokenizer(self.raw_content, self.config)
57
56
 
58
57
  (book_title, authors, translators, tags, index, toc) = (
59
58
  self.parse_tokens(tokenizer)
@@ -68,7 +67,6 @@ class Parser:
68
67
  index=index,
69
68
  cover=self.config.cover,
70
69
  raw_content=self.raw_content,
71
- massaged_content=massaged_content,
72
70
  toc=toc,
73
71
  )
74
72
 
@@ -252,121 +250,3 @@ class Parser:
252
250
  section.chapters.sort(key=lambda x: x.title)
253
251
 
254
252
  toc.sort(key=lambda x: x.title if isinstance(x, Volume) else "")
255
-
256
- def massage(self) -> str:
257
- """Massage the txt content.
258
-
259
- Returns:
260
- str: The formatted book content
261
- """
262
- content = self.raw_content
263
-
264
- content = Parser.to_unix_newline(content)
265
-
266
- if self.config.fullwidth and self.config.language in (
267
- "zh-cn",
268
- "zh-tw",
269
- ):
270
- logger.info("Convert halfwidth ASCII characters to fullwidth")
271
- content = zh_halfwidth_to_fullwidth(content)
272
-
273
- if self.config.re_delete:
274
- content = self.do_delete_regex(content)
275
-
276
- if self.config.re_replace:
277
- content = self.do_replace_regex(content)
278
-
279
- if self.config.re_delete_line:
280
- content = self.do_delete_line_regex(content)
281
-
282
- if self.config.width:
283
- content = self.do_wrapping(content)
284
-
285
- return content
286
-
287
- @staticmethod
288
- def to_unix_newline(content: str) -> str:
289
- """Convert all other line ends to Unix line end.
290
-
291
- Args:
292
- content(str): The formatted book content.
293
-
294
- Returns:
295
- str: The formatted book content.
296
- """
297
- return content.replace("\r\n", "\n").replace("\r", "\n")
298
-
299
- def do_delete_regex(self, content: str) -> str:
300
- """Remove words/phrases based on regex.
301
-
302
- Args:
303
- content(str): The formatted book content.
304
-
305
- Returns:
306
- str: The formatted book content.
307
- """
308
- for delete_regex in self.config.re_delete:
309
- content = re.sub(
310
- re.compile(rf"{delete_regex}", re.MULTILINE), "", content
311
- )
312
- return content
313
-
314
- def do_replace_regex(self, content: str) -> str:
315
- """Replace words/phrases based on regex.
316
-
317
- Args:
318
- content(str): The formatted book content.
319
-
320
- Returns:
321
- str: The formatted book content.
322
- """
323
- regex = self.config.re_replace
324
- if isinstance(regex, list):
325
- for search, replace in regex:
326
- content = re.sub(
327
- re.compile(rf"{search}", re.MULTILINE),
328
- rf"{replace}",
329
- content,
330
- )
331
-
332
- return content
333
-
334
- def do_delete_line_regex(self, content: str) -> str:
335
- """Delete whole line based on regex.
336
-
337
- Args:
338
- content(str): The formatted book content.
339
-
340
- Returns:
341
- str: The formatted book content.
342
- """
343
- for delete_line_regex in self.config.re_delete_line:
344
- content = re.sub(
345
- re.compile(rf"^.*{delete_line_regex}.*$", re.MULTILINE),
346
- "",
347
- content,
348
- )
349
- return content
350
-
351
- def do_wrapping(self, content: str) -> str:
352
- """Wrap or fill CJK text.
353
-
354
- Args:
355
- content (str): The formatted book content.
356
-
357
- Returns:
358
- str: The formatted book content.
359
- """
360
- logger.info("Wrapping paragraph to width: %s", self.config.width)
361
-
362
- paragraphs = []
363
- # We don't remove empty line and keep all formatting as it.
364
- for paragraph in content.split("\n"):
365
- paragraph = paragraph.strip()
366
-
367
- lines = cjkwrap.wrap(paragraph, width=self.config.width)
368
- paragraph = "\n".join(lines)
369
- paragraphs.append(paragraph)
370
-
371
- wrapped_content = "\n".join(paragraphs)
372
- return wrapped_content
@@ -0,0 +1,52 @@
1
+ # Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU Affero General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU Affero General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Affero General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """Env subcommand."""
17
+
18
+ import argparse
19
+ import logging
20
+ import platform
21
+ import sys
22
+
23
+ from txt2ebook import __version__
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ def build_subparser(subparsers) -> None:
29
+ """Build the subparser."""
30
+ env_parser = subparsers.add_parser(
31
+ "env", help="print environment information for bug reporting"
32
+ )
33
+
34
+ env_parser.set_defaults(func=run)
35
+
36
+
37
+ def run(_args: argparse.Namespace) -> None:
38
+ """Run env subcommand.
39
+
40
+ Args:
41
+ _args (argparse.Namespace): Config from command line arguments (unused)
42
+
43
+ Returns:
44
+ None
45
+ """
46
+ sys_version = sys.version.replace("\n", "")
47
+ print(
48
+ f"txt2ebook: {__version__}",
49
+ f"python: {sys_version}",
50
+ f"platform: {platform.platform()}",
51
+ sep="\n",
52
+ )
@@ -0,0 +1,220 @@
1
+ # Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU Affero General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU Affero General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Affero General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """Massage subcommand."""
17
+
18
+ import argparse
19
+ import logging
20
+ import sys
21
+
22
+ import cjkwrap
23
+ import regex as re
24
+ from bs4 import UnicodeDammit
25
+
26
+ from txt2ebook.exceptions import EmptyFileError
27
+ from txt2ebook.zh_utils import zh_halfwidth_to_fullwidth
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ def build_subparser(subparsers) -> None:
33
+ """Build the subparser."""
34
+ massage_parser = subparsers.add_parser(
35
+ "massage", help="massage the source txt file"
36
+ )
37
+
38
+ massage_parser.add_argument(
39
+ "input_file",
40
+ nargs=None if sys.stdin.isatty() else "?", # type: ignore
41
+ type=argparse.FileType("rb"),
42
+ default=None if sys.stdin.isatty() else sys.stdin,
43
+ help="source text filename",
44
+ metavar="TXT_FILENAME",
45
+ )
46
+
47
+ massage_parser.add_argument(
48
+ "-rd",
49
+ "--regex-delete",
50
+ dest="re_delete",
51
+ default=[],
52
+ action="append",
53
+ help="regex to delete word or phrase (default: '%(default)s')",
54
+ metavar="REGEX",
55
+ )
56
+
57
+ massage_parser.add_argument(
58
+ "-rr",
59
+ "--regex-replace",
60
+ dest="re_replace",
61
+ nargs=2,
62
+ default=[],
63
+ action="append",
64
+ help="regex to search and replace (default: '%(default)s')",
65
+ metavar="REGEX",
66
+ )
67
+
68
+ massage_parser.add_argument(
69
+ "-rl",
70
+ "--regex-delete-line",
71
+ dest="re_delete_line",
72
+ default=[],
73
+ action="append",
74
+ help="regex to delete whole line (default: '%(default)s')",
75
+ metavar="REGEX",
76
+ )
77
+
78
+ massage_parser.add_argument(
79
+ "-w",
80
+ "--width",
81
+ dest="width",
82
+ type=int,
83
+ default=None,
84
+ help="width for line wrapping",
85
+ metavar="WIDTH",
86
+ )
87
+
88
+ massage_parser.set_defaults(func=run)
89
+
90
+
91
+ def run(args: argparse.Namespace) -> None:
92
+ """Run massage subcommand.
93
+
94
+ Args:
95
+ args (argparse.Namespace): Config from command line arguments
96
+
97
+ Returns:
98
+ None
99
+ """
100
+ logger.info("Parsing txt file: %s", args.input_file.name)
101
+
102
+ unicode = UnicodeDammit(args.input_file.read())
103
+ logger.info("Detect encoding : %s", unicode.original_encoding)
104
+
105
+ content = unicode.unicode_markup
106
+ if not content:
107
+ raise EmptyFileError(
108
+ f"Empty file content in {args.input_file.name}"
109
+ )
110
+
111
+ content = to_unix_newline(content)
112
+
113
+ if args.fullwidth and args.language in ("zh-cn", "zh-tw"):
114
+ logger.info("Convert halfwidth ASCII characters to fullwidth")
115
+ content = zh_halfwidth_to_fullwidth(content)
116
+
117
+ if args.re_delete:
118
+ content = do_delete_regex(args, content)
119
+
120
+ if args.re_replace:
121
+ content = do_replace_regex(args, content)
122
+
123
+ if args.re_delete_line:
124
+ content = do_delete_line_regex(args, content)
125
+
126
+ if args.width:
127
+ content = do_wrapping(args, content)
128
+
129
+ return content
130
+
131
+
132
+ def to_unix_newline(content: str) -> str:
133
+ """Convert all other line ends to Unix line end.
134
+
135
+ Args:
136
+ content(str): The formatted book content.
137
+
138
+ Returns:
139
+ str: The formatted book content.
140
+ """
141
+ return content.replace("\r\n", "\n").replace("\r", "\n")
142
+
143
+
144
+ def do_delete_regex(args, content: str) -> str:
145
+ """Remove words/phrases based on regex.
146
+
147
+ Args:
148
+ content(str): The formatted book content.
149
+
150
+ Returns:
151
+ str: The formatted book content.
152
+ """
153
+ for delete_regex in args.re_delete:
154
+ content = re.sub(
155
+ re.compile(rf"{delete_regex}", re.MULTILINE), "", content
156
+ )
157
+ return content
158
+
159
+
160
+ def do_replace_regex(args, content: str) -> str:
161
+ """Replace words/phrases based on regex.
162
+
163
+ Args:
164
+ content(str): The formatted book content.
165
+
166
+ Returns:
167
+ str: The formatted book content.
168
+ """
169
+ regex = args.re_replace
170
+ if isinstance(regex, list):
171
+ for search, replace in regex:
172
+ content = re.sub(
173
+ re.compile(rf"{search}", re.MULTILINE),
174
+ rf"{replace}",
175
+ content,
176
+ )
177
+
178
+ return content
179
+
180
+
181
+ def do_delete_line_regex(args, content: str) -> str:
182
+ """Delete whole line based on regex.
183
+
184
+ Args:
185
+ content(str): The formatted book content.
186
+
187
+ Returns:
188
+ str: The formatted book content.
189
+ """
190
+ for delete_line_regex in args.re_delete_line:
191
+ content = re.sub(
192
+ re.compile(rf"^.*{delete_line_regex}.*$", re.MULTILINE),
193
+ "",
194
+ content,
195
+ )
196
+ return content
197
+
198
+
199
+ def do_wrapping(args, content: str) -> str:
200
+ """Wrap or fill CJK text.
201
+
202
+ Args:
203
+ content (str): The formatted book content.
204
+
205
+ Returns:
206
+ str: The formatted book content.
207
+ """
208
+ logger.info("Wrapping paragraph to width: %s", args.width)
209
+
210
+ paragraphs = []
211
+ # We don't remove empty lines and keep all formatting as is.
212
+ for paragraph in content.split("\n"):
213
+ paragraph = paragraph.strip()
214
+
215
+ lines = cjkwrap.wrap(paragraph, width=args.width)
216
+ paragraph = "\n".join(lines)
217
+ paragraphs.append(paragraph)
218
+
219
+ wrapped_content = "\n".join(paragraphs)
220
+ return wrapped_content
@@ -0,0 +1,86 @@
1
+ # Copyright (C) 2021,2022,2023,2024 Kian-Meng Ang
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU Affero General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU Affero General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU Affero General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """Parse subcommand."""
17
+
18
+ import argparse
19
+ import logging
20
+ import sys
21
+
22
+ from bs4 import UnicodeDammit
23
+ from langdetect import detect
24
+
25
+ from txt2ebook.exceptions import EmptyFileError
26
+ from txt2ebook.parser import Parser
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ def build_subparser(subparsers) -> None:
32
+ """Build the subparser."""
33
+ parse_parser = subparsers.add_parser(
34
+ "parse", help="parse and validate the txt file"
35
+ )
36
+
37
+ parse_parser.add_argument(
38
+ "input_file",
39
+ nargs=None if sys.stdin.isatty() else "?", # type: ignore
40
+ type=argparse.FileType("rb"),
41
+ default=None if sys.stdin.isatty() else sys.stdin,
42
+ help="source text filename",
43
+ metavar="TXT_FILENAME",
44
+ )
45
+
46
+ parse_parser.set_defaults(func=run)
47
+
48
+
49
+ def run(args: argparse.Namespace) -> None:
50
+ """Run parse subcommand.
51
+
52
+ Args:
53
+ args (argparse.Namespace): Config from command line arguments
54
+
55
+ Returns:
56
+ None
57
+ """
58
+ logger.info("Parsing txt file: %s", args.input_file.name)
59
+
60
+ unicode = UnicodeDammit(args.input_file.read())
61
+ logger.info("Detect encoding : %s", unicode.original_encoding)
62
+
63
+ content = unicode.unicode_markup
64
+ if not content:
65
+ raise EmptyFileError(
66
+ f"Empty file content in {args.input_file.name}"
67
+ )
68
+
69
+ args_language = args.language
70
+ detect_language = detect(content)
71
+ args.language = args_language or detect_language
72
+ logger.info("args language: %s", args_language)
73
+ logger.info("Detect language: %s", detect_language)
74
+
75
+ if args_language and args_language != detect_language:
76
+ logger.warning(
77
+ "args (%s) and detect (%s) language mismatch",
78
+ args_language,
79
+ detect_language,
80
+ )
81
+
82
+ parser = Parser(content, args)
83
+ book = parser.parse()
84
+
85
+ if args.debug:
86
+ book.debug(args.verbose)
@@ -168,7 +168,9 @@ class Tokenizer:
168
168
  return []
169
169
 
170
170
  metadata = match[1].split("\n")
171
- logger.debug("Metadata: %s", metadata)
171
+ for metadata_field in metadata:
172
+ logger.info("Metadata: %s", metadata_field)
173
+
172
174
  return metadata
173
175
 
174
176
  def _tokenize_content(self) -> None:
@@ -109,6 +109,7 @@ def build_parser(
109
109
  epub = parser.add_argument_group("--format epub")
110
110
  pdf = parser.add_argument_group("--format pdf")
111
111
  txt = parser.add_argument_group("--format txt")
112
+ tex = parser.add_argument_group("--format tex")
112
113
  zhlang = parser.add_argument_group("--language zh-cn / --language zh-tw")
113
114
 
114
115
  if "--env" not in args:
@@ -343,6 +344,15 @@ def build_parser(
343
344
  metavar="REGEX",
344
345
  )
345
346
 
347
+ tex.add_argument(
348
+ "-ct",
349
+ "--clean-tex",
350
+ default=False,
351
+ action="store_true",
352
+ dest="clean_tex",
353
+ help="purge artifacts generated by TeX (default: '%(default)s')",
354
+ )
355
+
346
356
  epub.add_argument(
347
357
  "-et",
348
358
  "--epub-template",
File without changes