epub-translator 0.0.6__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +3 -1
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +175 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +205 -168
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +176 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +178 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.0.dist-info/METADATA +283 -0
- epub_translator-0.1.0.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -62
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.6.dist-info/METADATA +0 -170
- epub_translator-0.0.6.dist-info/RECORD +0 -36
- {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.6.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
|
@@ -1,231 +0,0 @@
|
|
|
1
|
-
from math import ceil
|
|
2
|
-
from typing import Callable, Iterator, Generator
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from concurrent.futures import as_completed, ThreadPoolExecutor
|
|
5
|
-
from xml.etree.ElementTree import Element
|
|
6
|
-
|
|
7
|
-
from ..llm import LLM
|
|
8
|
-
from ..xml import encode_friendly
|
|
9
|
-
|
|
10
|
-
from .types import language_chinese_name, Fragment, Language
|
|
11
|
-
from .store import Store
|
|
12
|
-
from .splitter import split_into_chunks
|
|
13
|
-
from .chunk import match_fragments, Chunk
|
|
14
|
-
from .utils import is_empty, clean_spaces
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
ProgressReporter = Callable[[float], None]
|
|
18
|
-
|
|
19
|
-
def translate(
    llm: LLM,
    gen_fragments_iter: Callable[[], Iterator[Fragment]],
    cache_path: Path | None,
    target_language: Language,
    user_prompt: str | None,
    max_chunk_tokens_count: int,
    max_threads_count: int,
    report_progress: ProgressReporter,
) -> Generator[str, None, None]:
    """Translate every fragment and yield the translated texts in source order.

    The fragments are split into chunks, each chunk is translated on a worker
    thread, and the results are re-ordered by chunk index before being yielded.

    Args:
        llm: Client used both for chunk splitting and for translation requests.
        gen_fragments_iter: Factory returning a fresh fragment iterator; it is
            called twice (once for splitting, once for matching).
        cache_path: Location handed to ``Store``; falsy disables caching.
        target_language: Language to translate into.
        user_prompt: Optional extra rules; normalized before use.
        max_chunk_tokens_count: Upper bound passed to ``split_into_chunks``.
        max_threads_count: Size of the worker thread pool.
        report_progress: Callback receiving the translated-token fraction.

    Yields:
        Translated text of each body fragment, in chunk order.
    """
    if user_prompt is not None:
        user_prompt = _normalize_user_input(user_prompt.splitlines())

    store = Store(cache_path) if cache_path else None
    chunk_ranges = list(split_into_chunks(
        llm=llm,
        fragments_iter=gen_fragments_iter(),
        max_chunk_tokens_count=max_chunk_tokens_count,
    ))

    def _translate_one(chunk: Chunk) -> tuple[Chunk, list[str]]:
        # Keep the chunk next to its result so the consumer can restore order.
        return chunk, _translate_chunk(
            llm=llm,
            store=store,
            chunk=chunk,
            target_language=target_language,
            user_prompt=user_prompt,
        )

    with ThreadPoolExecutor(max_workers=max_threads_count) as executor:
        futures = []
        try:
            for chunk in match_fragments(
                llm=llm,
                target_language=target_language,
                chunk_ranges_iter=iter(chunk_ranges),
                fragments_iter=gen_fragments_iter(),
            ):
                futures.append(executor.submit(_translate_one, chunk))

            yield from _sort_translated_texts_by_chunk(
                target=(future.result() for future in as_completed(futures)),
                total_tokens_count=sum(
                    chunk_range.tokens_count for chunk_range in chunk_ranges
                ),
                report_progress=report_progress,
            )
        finally:
            # Runs on success, on any error (including one raised while chunks
            # are still being submitted) and when the consumer abandons this
            # generator, so queued requests are never left to run for nothing.
            # cancel() is a harmless no-op on running or finished futures.
            for future in futures:
                future.cancel()
|
|
70
|
-
|
|
71
|
-
def _sort_translated_texts_by_chunk(
    target: Iterator[tuple[Chunk, list[str]]],
    total_tokens_count: int,
    report_progress: ProgressReporter,
) -> Generator[str, None, None]:
    """Re-order out-of-order chunk results and yield their texts by chunk index.

    Results arrive in completion order. Each one is parked until every chunk
    with a smaller index has been emitted, then flushed together with any
    directly following chunks that are already waiting.

    Args:
        target: Iterator of ``(chunk, translated_texts)`` pairs; only
            ``chunk.index`` and ``chunk.tokens_count`` are read here.
        total_tokens_count: Sum of the token counts of all chunks.
        report_progress: Called once per received chunk with the fraction of
            tokens translated so far.

    Yields:
        The translated texts, chunk by chunk, in ascending index order.
    """
    # Assumes chunk indexes are unique and contiguous from 0 — TODO confirm
    # against match_fragments.
    pending: dict[int, list[str]] = {}
    next_index = 0
    translated_tokens_count = 0

    for chunk, translated_texts in target:
        pending[chunk.index] = translated_texts

        # Flush the contiguous run starting at the index we are waiting for.
        while next_index in pending:
            yield from pending.pop(next_index)
            next_index += 1

        # Credit the chunk that just arrived. The flush above deliberately uses
        # no loop variable named `chunk`, so this cannot pick up another chunk.
        translated_tokens_count += chunk.tokens_count
        if total_tokens_count > 0:
            report_progress(float(translated_tokens_count) / total_tokens_count)
        else:
            # Nothing measurable to translate: report completion, don't divide by 0.
            report_progress(1.0)
|
|
101
|
-
|
|
102
|
-
def _translate_chunk(
    llm: LLM,
    store: Store | None,
    chunk: Chunk,
    target_language: Language,
    user_prompt: str | None,
) -> list[str]:
    """Translate one chunk, consulting the cache, and return only its body part.

    The head and tail texts of the chunk are sent along with the body, but
    their translations are sliced away before returning.
    """
    sources = chunk.head + chunk.body + chunk.tail
    cached: list[str] | None = None

    if store is not None:
        cached = store.get(chunk.hash)
        if cached is not None and len(cached) != len(sources):
            # A cached entry that does not line up with the sources is unusable.
            cached = None
            print(f"Warning: Mismatched lengths in cached translation for chunk: {chunk.hash.hex()}",)

    if cached is None:
        raw_texts = _translate_texts(
            llm=llm,
            texts=sources,
            texts_tokens=chunk.tokens_count,
            target_language=target_language,
            user_prompt=user_prompt,
        )
        cached = [clean_spaces(raw_text) for raw_text in raw_texts]
        if store is not None:
            store.put(chunk.hash, cached)

    body_start = len(chunk.head)
    body_end = body_start + len(chunk.body)
    return cached[body_start:body_end]
|
|
137
|
-
|
|
138
|
-
# Multipliers applied to the source token count to derive `max_tokens` for the
# plain-text translation request and for the XML formatting request.
_PLAIN_TEXT_SCALE = 2.0
_XML_TEXT_SCALE = 2.5


def _translate_texts(
    llm: LLM,
    texts: list[str],
    texts_tokens: int,
    target_language: Language,
    user_prompt: str | None,
) -> list[str]:
    """Translate `texts` with two LLM requests and return one string per input.

    The first request ("translate" template) translates the joined text. The
    second ("format" template) receives the numbered source fragments plus
    that translation, and its XML response is parsed back into a list aligned
    with `texts`. Returns empty strings when the input has no content.
    """
    original_text = _normalize_user_input(texts)
    if original_text is None:
        return [""] * len(texts)

    language_name = language_chinese_name(target_language)
    user_data = (
        original_text
        if user_prompt is None
        else f"<rules>{user_prompt}</rules>\n\n{original_text}"
    )
    translated_text = llm.request_text(
        template_name="translate",
        text_tag="TXT",
        user_data=user_data,
        parser=lambda r: r,
        max_tokens=ceil(texts_tokens * _PLAIN_TEXT_SCALE),
        params={
            "target_language": language_name,
            "user_prompt": user_prompt,
        },
    )

    # Number the source fragments from 1 so the response can refer to them.
    request_element = Element("request")
    for number, source in enumerate(texts, start=1):
        fragment_element = Element("fragment", attrib={"id": str(number)})
        fragment_element.text = clean_spaces(source)
        request_element.append(fragment_element)

    request_xml = encode_friendly(request_element)
    expected_count = len(texts)

    return llm.request_xml(
        template_name="format",
        user_data=f"```XML\n{request_xml}\n```\n\n{translated_text}",
        max_tokens=ceil(texts_tokens * _XML_TEXT_SCALE),
        parser=lambda r: _parse_translated_response(r, expected_count),
        params={
            "target_language": language_name,
        },
    )
|
|
189
|
-
|
|
190
|
-
def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
    """Map the `<fragment id="N">` children of a response onto source slots.

    Args:
        resp_element: Root element whose children carry a 1-based `id`.
        sources_count: Number of source fragments, i.e. length of the result.

    Returns:
        A list of `sources_count` strings; slots without a translation are "".

    Raises:
        ValueError: If an id is not an integer or lies outside 1..sources_count.
    """
    slots: list[str | None] = [None] * sources_count

    for child in resp_element:
        text = child.text
        if text is None:
            continue
        raw_id = child.get("id", None)
        if raw_id is None:
            continue
        position = int(raw_id) - 1
        if not 0 <= position < len(slots):
            raise ValueError(f"invalid fragment id: {raw_id}")
        slots[position] = text.strip()

    # Sometimes the LLM merges several fragments into one. Push translations
    # as far back as possible and leave the empty slots in front, so a large
    # translated paragraph follows the short source fragments it covers.
    for position in range(len(slots) - 1):
        current = slots[position]
        if current is not None and slots[position + 1] is None:
            slots[position] = None
            slots[position + 1] = current

    return [slot or "" for slot in slots]
|
|
214
|
-
|
|
215
|
-
def _normalize_user_input(user_lines: list[str]) -> str | None:
    """Join the non-empty lines, collapsing each run of empty lines.

    A run of one empty line between two content lines is kept as one empty
    line; a run of two or more is kept as exactly two. Empty lines before the
    first and after the last content line are dropped.

    Args:
        user_lines: Raw lines; emptiness is decided by `is_empty` and every
            kept line is passed through `clean_spaces`.

    Returns:
        The normalized text, or None when no line has content.
    """
    empty_lines_count = 0
    lines: list[str] = []

    for line in user_lines:
        if is_empty(line):
            empty_lines_count += 1
            continue
        if lines:
            lines.extend([""] * min(empty_lines_count, 2))
        # Start counting afresh for the next gap. Without this reset one blank
        # line early in the input put blank separators before every later line.
        empty_lines_count = 0
        lines.append(clean_spaces(line))

    if not lines:
        return None
    return "\n".join(lines)
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
from enum import Enum, IntEnum
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class Incision(IntEnum):
    """Integer-ranked marker stored at the start and end of a `Fragment`.

    NOTE(review): presumably ranks how acceptable it is to split the text at
    that boundary (higher = more acceptable) — confirm against the splitter.
    """

    # Declaration order is not sorted by value; it is kept as-is because it
    # is the iteration order of the enum.
    MUST_BE = 3
    MOST_LIKELY = 2
    IMPOSSIBLE = 0
    UNCERTAIN = 1
|
|
10
|
-
|
|
11
|
-
@dataclass
class Fragment:
    """A piece of source text together with the markers of its two boundaries."""

    # The text of the fragment.
    text: str
    # Marker for the boundary before the text.
    start_incision: Incision
    # Marker for the boundary after the text.
    end_incision: Incision
|
|
16
|
-
|
|
17
|
-
class Language(Enum):
    """Supported target languages; each value is a language tag string."""

    # Chinese variants
    SIMPLIFIED_CHINESE = "zh-Hans"
    TRADITIONAL_CHINESE = "zh-Hant"

    ENGLISH = "en"
    FRENCH = "fr"
    GERMAN = "de"
    SPANISH = "es"
    RUSSIAN = "ru"
    ITALIAN = "it"
    PORTUGUESE = "pt"
    JAPANESE = "ja"
    KOREAN = "ko"
|
|
29
|
-
|
|
30
|
-
# Chinese display name of every `Language` member, as passed to the prompt
# templates under "target_language".
_LANGUAGE_NAMES = {
    Language.SIMPLIFIED_CHINESE: "简体中文",
    Language.TRADITIONAL_CHINESE: "繁体中文",
    Language.ENGLISH: "英语",
    Language.FRENCH: "法语",
    Language.GERMAN: "德语",
    Language.SPANISH: "西班牙语",
    Language.RUSSIAN: "俄语",
    Language.ITALIAN: "意大利语",
    Language.PORTUGUESE: "葡萄牙语",
    Language.JAPANESE: "日语",
    Language.KOREAN: "韩语",
}


def language_chinese_name(language: Language) -> str:
    """Return the Chinese name of `language`.

    Raises:
        KeyError: If `language` has no entry in the name table.
    """
    name = _LANGUAGE_NAMES[language]
    return name
|
epub_translator/xml/decoder.py
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
from typing import Generator, Iterable
|
|
2
|
-
from xml.etree.ElementTree import Element
|
|
3
|
-
|
|
4
|
-
from .tag import Tag, TagKind
|
|
5
|
-
from .parser import parse_tags
|
|
6
|
-
from .transform import tag_to_element
|
|
7
|
-
from .utils import clone
|
|
8
|
-
|
|
9
|
-
# Why is XML decoding implemented by hand?
# https://github.com/oomol-lab/pdf-craft/issues/149
def decode_friendly(chars: Iterable[str], tags: Iterable[str] | str = ()) -> Generator[Element, None, None]:
    """Yield a clone of every element found in `chars` whose tag is wanted.

    Args:
        chars: The characters to decode.
        tags: A single tag name or an iterable of names to keep; when empty,
            every collected element is yielded.
    """
    wanted = {tags} if isinstance(tags, str) else set(tags)

    for element in _collect_elements(chars):
        if not wanted or element.tag in wanted:
            yield clone(element)
|
|
20
|
-
|
|
21
|
-
def _collect_elements(chars: Iterable[str]) -> Generator[Element, None, None]:
    """Build elements from the tag/text stream of `parse_tags` and yield each
    one as soon as it is closed (self-closing tags are yielded immediately).

    Closing tags that match no open element are not an error: their raw
    `proto` text is appended to the tail of the last closed element, if any.
    """
    stack: list[Element] = []
    # Element that receives following text as its tail; None while the most
    # recent structural event was an opening tag.
    tail_owner: Element | None = None

    for piece in parse_tags(chars):
        if not isinstance(piece, Tag):
            # Plain text: tail of the last closed element, else the text of
            # the innermost open element.
            if tail_owner is not None:
                _append_to_tail(tail_owner, piece)
            elif stack:
                stack[-1].text = piece
            continue

        node = tag_to_element(piece)

        if piece.kind == TagKind.CLOSING:
            closed = _pop_element(piece.name, stack)
            if closed is not None:
                yield closed
                tail_owner = closed
            elif tail_owner is not None:
                _append_to_tail(tail_owner, piece.proto)
            continue

        if stack:
            stack[-1].append(node)

        if piece.kind == TagKind.SELF_CLOSING:
            yield node
            tail_owner = node
        elif piece.kind == TagKind.OPENING:
            stack.append(node)
            tail_owner = None
|
|
51
|
-
|
|
52
|
-
def _append_to_tail(element: Element, text: str) -> None:
    """Append `text` to `element.tail`, treating a missing tail as empty."""
    element.tail = (element.tail or "") + text
|
|
57
|
-
|
|
58
|
-
def _pop_element(tag_name: str, opening_stack: list[Element]) -> Element | None:
    """Close the innermost open element named `tag_name`.

    The matching element and everything opened after it are removed from
    `opening_stack` in place. Returns the matching element, or None (leaving
    the stack untouched) when no open element has that name.
    """
    for position in range(len(opening_stack) - 1, -1, -1):
        candidate = opening_stack[position]
        if candidate.tag == tag_name:
            # Drop the match together with the unclosed elements above it.
            del opening_stack[position:]
            return candidate
    return None
|
epub_translator/xml/encoder.py
DELETED
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
from io import StringIO
|
|
2
|
-
from typing import Callable
|
|
3
|
-
from html import escape as escape_html
|
|
4
|
-
from xml.etree.ElementTree import Element
|
|
5
|
-
|
|
6
|
-
from .tag import Tag, TagKind
|
|
7
|
-
from .parser import parse_tags
|
|
8
|
-
from .transform import element_to_tag
|
|
9
|
-
|
|
10
|
-
# Why is XML encoding implemented by hand?
# https://github.com/oomol-lab/pdf-craft/issues/149
def encode_friendly(element: Element, indent: int = 2) -> str:
    """Serialize `element` as indented text, escaping only the parts of text
    content that `parse_tags` recognises as tags (see `_escape_text`).

    Args:
        element: Root element to serialize.
        indent: Number of spaces per nesting level.
    """
    output = StringIO()
    _encode_element(
        buffer=output,
        element=element,
        indent=indent,
        depth=0,
        escape=_escape_text,
    )
    encoded = output.getvalue()
    return encoded
|
|
22
|
-
|
|
23
|
-
def _escape_text(text: str) -> str:
    """HTML-escape only the tags that `parse_tags` finds inside `text`;
    everything it returns as plain text is copied through unchanged."""
    return "".join(
        escape_html(str(cell)) if isinstance(cell, Tag) else cell
        for cell in parse_tags(text)
    )
|
|
30
|
-
|
|
31
|
-
def encode(element: Element, indent: int = 2) -> str:
    """Serialize `element` as indented text with all text content HTML-escaped.

    Args:
        element: Root element to serialize.
        indent: Number of spaces per nesting level.
    """
    output = StringIO()
    _encode_element(
        buffer=output,
        element=element,
        indent=indent,
        depth=0,
        escape=escape_html,
    )
    encoded = output.getvalue()
    return encoded
|
|
41
|
-
|
|
42
|
-
# Longest stripped text that is still written on the same line as its tags.
_TINY_TEXT_LEN = 35


def _encode_element(
    buffer: StringIO,
    element: Element,
    indent: int,
    depth: int,
    escape: Callable[[str], str],
) -> None:
    """Write `element` and its subtree to `buffer`, indented for `depth`.

    Elements without children and without text are written as a self-closing
    tag. A childless element whose stripped text is short and single-line
    stays on one line; otherwise text, children and child tails each get
    their own line one level deeper.
    """
    _write_indent(buffer, indent, depth)

    has_children = len(element) > 0
    if not has_children and not element.text:
        buffer.write(str(element_to_tag(element, TagKind.SELF_CLOSING)))
        return

    content = (element.text or "").strip()
    opening_tag = element_to_tag(element, TagKind.OPENING)
    closing_tag = element_to_tag(element, TagKind.CLOSING)
    inner_depth = depth + 1

    buffer.write(str(opening_tag))
    fits_one_line = (
        not has_children
        and len(content) <= _TINY_TEXT_LEN
        and "\n" not in content
    )

    if content:
        if not fits_one_line:
            buffer.write("\n")
            _write_indent(buffer, indent, inner_depth)
        buffer.write(escape(content))

    for child in element:
        buffer.write("\n")
        _encode_element(
            buffer=buffer,
            element=child,
            indent=indent,
            depth=inner_depth,
            escape=escape,
        )
        tail = (child.tail or "").strip()
        if tail:
            buffer.write("\n")
            _write_indent(buffer, indent, inner_depth)
            buffer.write(escape(tail))

    if not fits_one_line:
        # The closing tag goes on its own line, aligned with the opening tag.
        buffer.write("\n")
        _write_indent(buffer, indent, depth)

    buffer.write(str(closing_tag))
|
|
92
|
-
|
|
93
|
-
def _write_indent(buffer: StringIO, indent: int, depth: int) -> None:
    """Write `indent * depth` spaces to `buffer` (nothing if that is <= 0)."""
    width = indent * depth
    if width > 0:
        buffer.write(" " * width)
|
epub_translator/xml/parser.py
DELETED
|
@@ -1,172 +0,0 @@
|
|
|
1
|
-
from io import StringIO
|
|
2
|
-
from typing import Generator, Iterable
|
|
3
|
-
from enum import auto, Enum
|
|
4
|
-
from .tag import is_valid_name_char, is_valid_value_char, Tag, TagKind
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
_SPACES = (" ", "\n")
|
|
8
|
-
|
|
9
|
-
class _Phase(Enum):
    """Position of the tag parser's state machine within the input."""

    # Not inside a tag.
    OUTSIDE = auto()
    # Just consumed "<".
    LEFT_BRACKET = auto()
    # Just consumed "</".
    LEFT_SLASH = auto()
    # Reading the tag name.
    TAG_NAME = auto()
    # Between the tag name / an attribute and whatever comes next.
    TAG_GAP = auto()
    # Reading an attribute name.
    ATTRIBUTE_NAME = auto()
    # Just consumed the "=" after an attribute name.
    ATTRIBUTE_NAME_EQUAL = auto()
    # Inside a double-quoted attribute value.
    ATTRIBUTE_VALUE = auto()
    # Consumed the "/" of a self-closing tag; only ">" may follow.
    MUST_CLOSING_SIGN = auto()
|
|
19
|
-
|
|
20
|
-
class _ParsedResult(Enum):
    """Outcome of feeding one character to the tag parser."""

    # Nothing decided yet; keep feeding characters.
    Continue = auto()
    # A tag was read through to its closing ">".
    Success = auto()
    # The buffered characters do not form a tag.
    Failed = auto()
|
|
24
|
-
|
|
25
|
-
def parse_tags(chars: Iterable[str]) -> Generator[str | Tag, None, None]:
    """Split `chars` into `Tag` objects and the plain-text runs between them.

    A fresh parser is used for each call, so calls do not share state.
    """
    parser = _XMLTagsParser()
    yield from parser.do(chars)
|
|
27
|
-
|
|
28
|
-
class _XMLTagsParser:
    """Character-driven state machine that separates tags from plain text.

    Characters outside a tag accumulate in `_outside_buffer`. From "<" on,
    the raw characters are mirrored into `_tag_buffer` while `_tag` is being
    filled in. If the candidate turns out not to be a tag, the raw characters
    are moved to the outside buffer and treated as text.
    """

    def __init__(self):
        self._outside_buffer: StringIO = StringIO()
        self._tag_buffer: StringIO = StringIO()
        self._tag: Tag | None = None
        self._phase: _Phase = _Phase.OUTSIDE

    def do(self, chars: Iterable[str]) -> Generator[str | Tag, None, None]:
        """Feed every character and yield text runs and tags as they complete."""
        for char in chars:
            yield from self._generate_by_result(self._parse_char(char))

        # End of input: an unfinished tag candidate is plain text after all.
        self._outside_buffer.write(self._tag_buffer.getvalue())
        remaining = self._outside_buffer.getvalue()
        if remaining:
            yield remaining

    def _parse_char(self, char: str) -> _ParsedResult:
        """Advance the state machine by one character."""
        if self._phase == _Phase.OUTSIDE:
            if char == "<":
                self._phase = _Phase.LEFT_BRACKET
                self._tag_buffer.write(char)
                self._tag = Tag(
                    kind=TagKind.OPENING,
                    name="",
                    proto="",
                    attributes=[],
                )
            else:
                self._outside_buffer.write(char)
            return _ParsedResult.Continue

        # Inside a tag candidate: always keep the raw character.
        self._tag_buffer.write(char)
        phase = self._phase
        tag = self._tag

        if phase == _Phase.LEFT_BRACKET:
            if char == "/":
                tag.kind = TagKind.CLOSING
                self._phase = _Phase.LEFT_SLASH
            elif is_valid_name_char(char):
                tag.name += char
                self._phase = _Phase.TAG_NAME
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.LEFT_SLASH:
            if not is_valid_name_char(char):
                return _ParsedResult.Failed
            tag.name += char
            self._phase = _Phase.TAG_NAME

        elif phase == _Phase.TAG_NAME:
            if char in _SPACES:
                self._phase = _Phase.TAG_GAP
            elif is_valid_name_char(char):
                tag.name += char
            elif char == ">":
                return _ParsedResult.Success
            elif char == "/" and tag.kind == TagKind.OPENING:
                tag.kind = TagKind.SELF_CLOSING
                self._phase = _Phase.MUST_CLOSING_SIGN
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.TAG_GAP:
            if char in _SPACES:
                pass
            elif is_valid_name_char(char):
                # First character of a new attribute name.
                tag.attributes.append((char, ""))
                self._phase = _Phase.ATTRIBUTE_NAME
            elif char == ">":
                return _ParsedResult.Success
            elif char == "/" and tag.kind == TagKind.OPENING:
                tag.kind = TagKind.SELF_CLOSING
                self._phase = _Phase.MUST_CLOSING_SIGN
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.ATTRIBUTE_NAME:
            if is_valid_name_char(char):
                attr_name, attr_value = tag.attributes[-1]
                tag.attributes[-1] = (attr_name + char, attr_value)
            elif char == "=":
                self._phase = _Phase.ATTRIBUTE_NAME_EQUAL
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.ATTRIBUTE_NAME_EQUAL:
            # Only double-quoted attribute values are accepted.
            if char != "\"":
                return _ParsedResult.Failed
            self._phase = _Phase.ATTRIBUTE_VALUE

        elif phase == _Phase.ATTRIBUTE_VALUE:
            if is_valid_value_char(char):
                attr_name, attr_value = tag.attributes[-1]
                tag.attributes[-1] = (attr_name, attr_value + char)
            elif char == "\"":
                self._phase = _Phase.TAG_GAP
            else:
                return _ParsedResult.Failed

        elif phase == _Phase.MUST_CLOSING_SIGN:
            if char == ">":
                return _ParsedResult.Success
            return _ParsedResult.Failed

        return _ParsedResult.Continue

    def _generate_by_result(self, parsed_result: _ParsedResult) -> Generator[str | Tag, None, None]:
        """Emit pending output for `parsed_result` and reset the tag state."""
        if parsed_result == _ParsedResult.Failed:
            # Not a tag: hand the raw characters back to the text buffer.
            self._outside_buffer.write(self._tag_buffer.getvalue())
            self._clear_buffer(self._tag_buffer)
            self._phase = _Phase.OUTSIDE
            return

        if parsed_result != _ParsedResult.Success:
            return

        assert self._tag is not None
        if self._is_tag_valid(self._tag):
            # NOTE(review): `proto` is only assigned in the rejected branch
            # below, so tags emitted here keep proto == "" — confirm whether
            # consumers of `Tag.proto` expect the raw tag text.
            pending_text = self._outside_buffer.getvalue()
            self._clear_buffer(self._outside_buffer)
            self._clear_buffer(self._tag_buffer)
            if pending_text:
                yield pending_text
            yield self._tag
        else:
            # Well-formed but rejected: keep its raw text as plain text.
            self._tag.proto = self._tag_buffer.getvalue()
            self._outside_buffer.write(self._tag.proto)
            self._clear_buffer(self._tag_buffer)

        self._tag = None
        self._phase = _Phase.OUTSIDE

    def _is_tag_valid(self, tag: Tag) -> bool:
        """Reject closing tags that carry attributes and tags with an invalid name."""
        closing_with_attributes = (
            tag.kind == TagKind.CLOSING and len(tag.attributes) > 0
        )
        return not closing_with_attributes and tag.find_invalid_name() is None

    def _clear_buffer(self, buffer: StringIO):
        """Empty `buffer` and leave its position at the start."""
        buffer.seek(0)
        buffer.truncate()
|