txt2ebook-0.1.141-py3-none-any.whl → txt2ebook-0.1.142-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2ebook/parser.py +7 -4
- txt2ebook/subcommands/parse.py +11 -13
- {txt2ebook-0.1.141.dist-info → txt2ebook-0.1.142.dist-info}/METADATA +1 -1
- {txt2ebook-0.1.141.dist-info → txt2ebook-0.1.142.dist-info}/RECORD +7 -7
- {txt2ebook-0.1.141.dist-info → txt2ebook-0.1.142.dist-info}/WHEEL +0 -0
- {txt2ebook-0.1.141.dist-info → txt2ebook-0.1.142.dist-info}/entry_points.txt +0 -0
- {txt2ebook-0.1.141.dist-info → txt2ebook-0.1.142.dist-info}/licenses/LICENSE.md +0 -0
txt2ebook/parser.py
CHANGED
@@ -19,6 +19,8 @@ import argparse
|
|
19
19
|
import logging
|
20
20
|
from dataclasses import dataclass
|
21
21
|
from importlib import import_module
|
22
|
+
from importlib import import_module
|
23
|
+
from types import ModuleType
|
22
24
|
from typing import List, Tuple, Union
|
23
25
|
|
24
26
|
import regex as re
|
@@ -36,14 +38,15 @@ class Parser:
|
|
36
38
|
|
37
39
|
raw_content: str
|
38
40
|
config: argparse.Namespace
|
41
|
+
langconf: ModuleType
|
39
42
|
|
40
|
-
def __init__(
|
43
|
+
def __init__(
|
44
|
+
self, raw_content: str, config: argparse.Namespace, langconf: ModuleType
|
45
|
+
) -> None:
|
41
46
|
"""Set the constructor for the Parser."""
|
42
47
|
self.raw_content = raw_content
|
43
48
|
self.config = config
|
44
|
-
|
45
|
-
config_lang = config.language.replace("-", "_")
|
46
|
-
self.langconf = import_module(f"txt2ebook.languages.{config_lang}")
|
49
|
+
self.langconf = langconf
|
47
50
|
|
48
51
|
def parse(self) -> Book:
|
49
52
|
"""Parse the content into volumes (optional) and chapters.
|
txt2ebook/subcommands/parse.py
CHANGED
@@ -19,10 +19,14 @@ import argparse
|
|
19
19
|
import logging
|
20
20
|
import sys
|
21
21
|
|
22
|
+
import logging
|
23
|
+
import sys
|
24
|
+
from importlib import import_module
|
25
|
+
|
22
26
|
import jieba.analyse
|
23
27
|
from bs4 import UnicodeDammit
|
24
|
-
from langdetect import detect
|
25
28
|
|
29
|
+
from txt2ebook import detect_and_expect_language
|
26
30
|
from txt2ebook.exceptions import EmptyFileError
|
27
31
|
from txt2ebook.models import Book
|
28
32
|
from txt2ebook.parser import Parser
|
@@ -73,26 +77,20 @@ def run(args: argparse.Namespace) -> Book:
|
|
73
77
|
logger.info("Detect encoding : %s", unicode.original_encoding)
|
74
78
|
|
75
79
|
content = unicode.unicode_markup
|
80
|
+
|
76
81
|
if not content:
|
77
82
|
raise EmptyFileError(f"Empty file content in {args.input_file.name}")
|
78
83
|
|
79
|
-
|
80
|
-
detect_language = detect(content)
|
81
|
-
args.language = args_language or detect_language
|
82
|
-
logger.info("args language: %s", args_language)
|
83
|
-
logger.info("Detect language: %s", detect_language)
|
84
|
+
logger.info("Detect encoding : %s", unicode.original_encoding)
|
84
85
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
args_language,
|
89
|
-
detect_language,
|
90
|
-
)
|
86
|
+
args.language = detect_and_expect_language(content, args.language)
|
87
|
+
config_lang = args.language.replace("-", "_")
|
88
|
+
langconf = import_module(f"txt2ebook.languages.{config_lang}")
|
91
89
|
|
92
90
|
tags = jieba.analyse.extract_tags(content, topK=100)
|
93
91
|
logger.info("tags: %s", " ".join(tags))
|
94
92
|
|
95
|
-
parser = Parser(content, args)
|
93
|
+
parser = Parser(content, args, langconf)
|
96
94
|
book = parser.parse()
|
97
95
|
|
98
96
|
if args.debug:
|
@@ -2,7 +2,7 @@ txt2ebook/__init__.py,sha256=Oq0Yor9IB6LPfAsVVTl-wbh-EFVy8T309BR1UVMC0kw,3055
|
|
2
2
|
txt2ebook/__main__.py,sha256=L29rlfPSx9XMnVaHBYP2dyYgDmutJvONR3yUejjYwRY,855
|
3
3
|
txt2ebook/cli.py,sha256=i8NrYJyC9ckMC5opCGkIcs42p4AFzhE0lTGKSU-S8Zw,4418
|
4
4
|
txt2ebook/exceptions.py,sha256=PT3m85PE5QopHHUfRwEQzp0kJ4AA9yjLO6V6lYC8WhQ,858
|
5
|
-
txt2ebook/parser.py,sha256=
|
5
|
+
txt2ebook/parser.py,sha256=foAO-ezeb_cSPW2U3Ul83xsf4xdOZsBJaqJmcy1_rio,9015
|
6
6
|
txt2ebook/tokenizer.py,sha256=UGyOBGxlKOXJtvP2UFp38EgFym8-PAU3A7Jl9RF3w6Y,10299
|
7
7
|
txt2ebook/zh_utils.py,sha256=0Yq9r-JL4HntW68vFR6TBP9yQim1a07mfsh_sp-XmaE,4887
|
8
8
|
txt2ebook/formats/__init__.py,sha256=CBZSA9zbLL4-4VYH7Xp76HK4kHTyISoNs7gMs7lBIzY,1646
|
@@ -41,12 +41,12 @@ txt2ebook/subcommands/epub.py,sha256=_obM1_fvVBPHOBXBOCYK8nyJadBX3_gOn9kaXA5HipA
|
|
41
41
|
txt2ebook/subcommands/gmi.py,sha256=ANnPg-RFsylTo44fUzFOSHN1fC3Ce82gBzrv-sBv5fU,3318
|
42
42
|
txt2ebook/subcommands/massage.py,sha256=EuC-C03NMJk9V1_PEUOa-n4SmQCRpj1TJ_GwSJE8_Ss,11809
|
43
43
|
txt2ebook/subcommands/md.py,sha256=PmIqrqrnzLywvN4qTkle0V9N3FTIJGRWpC0Xbk76B5o,3329
|
44
|
-
txt2ebook/subcommands/parse.py,sha256=
|
44
|
+
txt2ebook/subcommands/parse.py,sha256=3LP3GGgX5amfde3fpvobZf6Ks1_nA9PqFh2hjYslmaA,2929
|
45
45
|
txt2ebook/subcommands/pdf.py,sha256=1JQtpugzAIaho6G3CK1rGYk74hotAexXZxPH9PHpRps,2980
|
46
46
|
txt2ebook/subcommands/tex.py,sha256=ToYdFXnFLwsXxTsZzCRsURo7TCeOIFJtp5sFJDt0R-E,3131
|
47
47
|
txt2ebook/subcommands/typ.py,sha256=qXpHMmtu_1nAMs264oKUSolWAUBjZpTziTSBcTe2JgA,3681
|
48
|
-
txt2ebook-0.1.
|
49
|
-
txt2ebook-0.1.
|
50
|
-
txt2ebook-0.1.
|
51
|
-
txt2ebook-0.1.
|
52
|
-
txt2ebook-0.1.
|
48
|
+
txt2ebook-0.1.142.dist-info/METADATA,sha256=0-WMp4gWS4JzWBe0UDwU6LFtGgHr-WF4G6JieMtZ78Y,4901
|
49
|
+
txt2ebook-0.1.142.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
50
|
+
txt2ebook-0.1.142.dist-info/entry_points.txt,sha256=3jm5vpUsDRgoM6S3CQVMMiP7tJQqfq1vfV0sh_KaK9s,74
|
51
|
+
txt2ebook-0.1.142.dist-info/licenses/LICENSE.md,sha256=tGtFDwxWTjuR9syrJoSv1Hiffd2u8Tu8cYClfrXS_YU,31956
|
52
|
+
txt2ebook-0.1.142.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|