txt2ebook 0.1.151__tar.gz → 0.1.152__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/PKG-INFO +1 -1
  2. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/pyproject.toml +1 -1
  3. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/languages/zh_cn.py +1 -0
  4. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/massage.py +67 -3
  5. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/typ.py +1 -1
  6. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/LICENSE.md +0 -0
  7. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/README.md +0 -0
  8. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/__init__.py +0 -0
  9. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/__main__.py +0 -0
  10. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/cli.py +0 -0
  11. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/exceptions.py +0 -0
  12. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/__init__.py +0 -0
  13. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/base.py +0 -0
  14. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/epub.py +0 -0
  15. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/gmi.py +0 -0
  16. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/md.py +0 -0
  17. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/pdf.py +0 -0
  18. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/templates/__init__.py +0 -0
  19. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/templates/epub/__init__.py +0 -0
  20. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/templates/epub/clean.css +0 -0
  21. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/templates/epub/condense.css +0 -0
  22. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/templates/epub/noindent.css +0 -0
  23. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/tex.py +0 -0
  24. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/txt.py +0 -0
  25. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/formats/typ.py +0 -0
  26. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/helpers/__init__.py +0 -0
  27. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/languages/__init__.py +0 -0
  28. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/languages/en.py +0 -0
  29. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/languages/zh_tw.py +0 -0
  30. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/en/LC_MESSAGES/txt2ebook.mo +0 -0
  31. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/en/LC_MESSAGES/txt2ebook.po +0 -0
  32. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/txt2ebook.pot +0 -0
  33. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.mo +0 -0
  34. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/zh-cn/LC_MESSAGES/txt2ebook.po +0 -0
  35. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.mo +0 -0
  36. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/locales/zh-tw/LC_MESSAGES/txt2ebook.po +0 -0
  37. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/models/__init__.py +0 -0
  38. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/models/book.py +0 -0
  39. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/models/chapter.py +0 -0
  40. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/models/volume.py +0 -0
  41. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/parser.py +0 -0
  42. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/__init__.py +0 -0
  43. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/env.py +0 -0
  44. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/epub.py +0 -0
  45. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/gmi.py +0 -0
  46. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/md.py +0 -0
  47. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/parse.py +0 -0
  48. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/pdf.py +0 -0
  49. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/subcommands/tex.py +0 -0
  50. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/tokenizer.py +0 -0
  51. {txt2ebook-0.1.151 → txt2ebook-0.1.152}/src/txt2ebook/zh_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: txt2ebook
3
- Version: 0.1.151
3
+ Version: 0.1.152
4
4
  Summary: CLI tool to convert txt file to ebook format
5
5
  Keywords: cjk,ebook,epub,gmi,latex,md,pdf,txt,typst
6
6
  Author-email: Kian-Meng Ang <kianmeng@cpan.org>
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "txt2ebook"
3
- version = "0.1.151"
3
+ version = "0.1.152"
4
4
  description = "CLI tool to convert txt file to ebook format"
5
5
  authors = [{ name = "Kian-Meng Ang", email = "kianmeng@cpan.org" }]
6
6
  requires-python = "~=3.9"
@@ -45,6 +45,7 @@ RE_CHAPTERS = [
45
45
  "作者[介绍自介].*",
46
46
  "正文",
47
47
  "人物谱",
48
+ "作者按",
48
49
  ]
49
50
 
50
51
  DEFAULT_RE_AUTHOR = r"作者:(.*)"
@@ -22,9 +22,11 @@ from importlib import import_module
22
22
  from pathlib import Path
23
23
 
24
24
  import cjkwrap
25
+ import jieba.analyse
25
26
  import regex as re
26
27
  from bs4 import UnicodeDammit
27
28
 
29
+ from txt2ebook import detect_and_expect_language
28
30
  from txt2ebook.exceptions import EmptyFileError
29
31
  from txt2ebook.models.book import Book
30
32
  from txt2ebook.zh_utils import zh_halfwidth_to_fullwidth, zh_words_to_numbers
@@ -70,7 +72,22 @@ def build_subparser(subparsers) -> None:
70
72
  default=False,
71
73
  action="store_true",
72
74
  dest="fullwidth",
73
- help="use fullwidth character (only for zh-cn and zh-tw)",
75
+ help=(
76
+ "use fullwidth character (only for zh-cn and zh-tw) "
77
+ "(default: %(default)r)"
78
+ ),
79
+ )
80
+
81
+ massage_parser.add_argument(
82
+ "-ri",
83
+ "--reindent",
84
+ default=False,
85
+ action="store_true",
86
+ dest="reindent",
87
+ help=(
88
+ "reindent each paragraph (only for zh-cn and zh-tw) "
89
+ "(default: %(default)r)"
90
+ ),
74
91
  )
75
92
 
76
93
  massage_parser.add_argument(
@@ -318,12 +335,18 @@ def massage_txt(args: argparse.Namespace) -> str:
318
335
 
319
336
  content = to_unix_newline(content)
320
337
 
338
+ args.language = detect_and_expect_language(content, args.language)
339
+
321
340
  (metadata, body) = extract_metadata_and_body(args, content)
322
341
 
323
342
  if args.fullwidth and args.language in ("zh-cn", "zh-tw"):
324
343
  logger.info("Convert halfwidth ASCII characters to fullwidth")
325
344
  body = zh_halfwidth_to_fullwidth(body)
326
345
 
346
+ if args.reindent and args.language in ("zh-cn", "zh-tw"):
347
+ logger.info("Reindent paragraph")
348
+ body = do_reindent_paragraph(args, body)
349
+
327
350
  if args.re_delete:
328
351
  body = do_delete_regex(args, body)
329
352
 
@@ -339,7 +362,7 @@ def massage_txt(args: argparse.Namespace) -> str:
339
362
  if args.width:
340
363
  body = do_wrapping(args, body)
341
364
 
342
- return f"{metadata}\n\n{body}"
365
+ return f"{metadata}{body}"
343
366
 
344
367
 
345
368
  def to_unix_newline(content: str) -> str:
@@ -353,6 +376,30 @@ def to_unix_newline(content: str) -> str:
353
376
  """
354
377
  return content.replace("\r\n", "\n").replace("\r", "\n")
355
378
 
379
+ def do_reindent_paragraph(args, content: str) -> str:
380
+ """Reindent each paragraph.
381
+
382
+ Args:
383
+ content(str): The formatted book content.
384
+
385
+ Returns:
386
+ str: The formatted book content.
387
+ """
388
+ paragraphs = re.split(r'\n\s*\n+', content)
389
+ reindented_paragraphs = []
390
+ for paragraph in paragraphs:
391
+ lines = paragraph.split('\n')
392
+ reindented_lines = []
393
+ for line in lines:
394
+ stripped_line = line.strip()
395
+ reindented_lines.append(stripped_line)
396
+
397
+ reindented_paragraph = '\n'.join(reindented_lines)
398
+ reindented_paragraph = "  " + reindented_paragraph
399
+ reindented_paragraphs.append(reindented_paragraph)
400
+
401
+ return args.paragraph_separator.join(reindented_paragraphs)
402
+
356
403
 
357
404
  def do_delete_regex(args, content: str) -> str:
358
405
  """Remove words/phrases based on regex.
@@ -425,7 +472,24 @@ def extract_metadata_and_body(_args, content: str) -> tuple:
425
472
  metadata = match.group(0).strip()
426
473
  body = content.replace(metadata, "", 1)
427
474
 
428
- return (metadata, body)
475
+
476
+ metadata_block = metadata.split("---")[1]
477
+
478
+ metadata_dict = {}
479
+ for line in metadata_block.strip().splitlines():
480
+ key, value = line.split(":", 1)
481
+ metadata_dict[key.strip()] = value.strip()
482
+
483
+ tags = jieba.analyse.extract_tags(content, topK=100)
484
+ metadata_tags = " ".join(tags)
485
+ logger.info("tags: %s", metadata_tags)
486
+ metadata_dict["索引"] = metadata_tags
487
+
488
+ meta_lines = [f"{key}:{value}" for key, value in metadata_dict.items()]
489
+ meta_body = "\n".join(meta_lines)
490
+ meta_str = f"---\n{meta_body}\n---"
491
+
492
+ return (meta_str, body)
429
493
 
430
494
 
431
495
  def do_single_newline(args, content: str) -> str:
@@ -72,7 +72,7 @@ def build_subparser(subparsers) -> None:
72
72
 
73
73
  typ_parser.add_argument(
74
74
  "--toc",
75
- default=False,
75
+ default=True,
76
76
  action=argparse.BooleanOptionalAction,
77
77
  dest="with_toc",
78
78
  help="add table of content",
File without changes
File without changes