epub-translator 0.0.7__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. epub_translator/__init__.py +4 -2
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +175 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +205 -178
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +176 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +178 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.0.dist-info/METADATA +283 -0
  56. epub_translator-0.1.0.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -68
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.7.dist-info/METADATA +0 -170
  80. epub_translator-0.0.7.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.7.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.7.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
@@ -1,231 +0,0 @@
1
- from math import ceil
2
- from typing import Callable, Iterator, Generator
3
- from pathlib import Path
4
- from concurrent.futures import as_completed, ThreadPoolExecutor
5
- from xml.etree.ElementTree import Element
6
-
7
- from ..llm import LLM
8
- from ..xml import encode_friendly
9
-
10
- from .types import language_chinese_name, Fragment, Language
11
- from .store import Store
12
- from .splitter import split_into_chunks
13
- from .chunk import match_fragments, Chunk
14
- from .utils import is_empty, clean_spaces
15
-
16
-
17
# Callback invoked with overall translation progress as a float in [0.0, 1.0].
ProgressReporter = Callable[[float], None]

def translate(
  llm: LLM,
  gen_fragments_iter: Callable[[], Iterator[Fragment]],
  cache_path: Path | None,
  target_language: Language,
  user_prompt: str | None,
  max_chunk_tokens_count: int,
  max_threads_count: int,
  report_progress: ProgressReporter,
) -> Generator[str, None, None]:
  """Translate every fragment, yielding translated texts in source order.

  Fragments are split into token-bounded chunks which are translated
  concurrently on a thread pool; completed chunks are re-ordered back into
  their original order before being yielded.

  Args:
    llm: LLM client used for all requests.
    gen_fragments_iter: factory returning a fresh fragment iterator. It is
      called twice (once for chunking, once for matching), so the iteration
      must be restartable.
    cache_path: optional path of the translation cache; None disables caching.
    target_language: language to translate into.
    user_prompt: optional extra user rules; normalized before use.
    max_chunk_tokens_count: upper bound of tokens per chunk.
    max_threads_count: number of worker threads for concurrent translation.
    report_progress: called with progress in [0.0, 1.0] as chunks complete.
  """
  if user_prompt is not None:
    user_prompt = _normalize_user_input(user_prompt.splitlines())

  store = Store(cache_path) if cache_path else None
  chunk_ranges = list(split_into_chunks(
    llm=llm,
    fragments_iter=gen_fragments_iter(),
    max_chunk_tokens_count=max_chunk_tokens_count,
  ))
  with ThreadPoolExecutor(max_workers=max_threads_count) as executor:
    futures = [
      # bind ``chunk`` as a default argument so each lambda captures its own
      # value (late-binding closures would all see the last chunk)
      executor.submit(lambda chunk=chunk: (chunk, _translate_chunk(
        llm=llm,
        store=store,
        chunk=chunk,
        target_language=target_language,
        user_prompt=user_prompt,
      )))
      for chunk in match_fragments(
        llm=llm,
        target_language=target_language,
        chunk_ranges_iter=iter(chunk_ranges),
        fragments_iter=gen_fragments_iter(),
      )
    ]
    def _generate_chunks_from_futures():
      # Yield results in completion order; on the first failure, cancel
      # whatever has not started yet and re-raise.
      try:
        for future in as_completed(futures):
          yield future.result()
      except Exception as err:
        for future in futures:
          if not future.done():
            future.cancel()
        raise err

    yield from _sort_translated_texts_by_chunk(
      target=_generate_chunks_from_futures(),
      total_tokens_count=sum(chunk.tokens_count for chunk in chunk_ranges),
      report_progress=report_progress,
    )
70
-
71
- def _sort_translated_texts_by_chunk(
72
- target: Iterator[tuple[Chunk, list[str]]],
73
- total_tokens_count: int,
74
- report_progress: ProgressReporter,
75
- ) -> Generator[str, None, None]:
76
-
77
- buffer: list[tuple[Chunk, list[str]]] = []
78
- wanna_next_index: int = 0
79
- translated_tokens_count: int = 0
80
-
81
- for chunk, translated_texts in target:
82
- buffer.append((chunk, translated_texts))
83
- if wanna_next_index == chunk.index:
84
- buffer.sort(key=lambda e: e[0].index)
85
- to_clear: list[list[str]] = []
86
-
87
- for chunk, translated_texts in buffer:
88
- if chunk.index > wanna_next_index:
89
- break
90
- to_clear.append(translated_texts)
91
- if chunk.index == wanna_next_index:
92
- wanna_next_index += 1
93
-
94
- if to_clear:
95
- buffer = buffer[len(to_clear):]
96
- for translated_texts in to_clear:
97
- yield from translated_texts
98
-
99
- translated_tokens_count += chunk.tokens_count
100
- report_progress(float(translated_tokens_count) / total_tokens_count)
101
-
102
def _translate_chunk(
  llm: LLM,
  store: Store | None,
  chunk: Chunk,
  target_language: Language,
  user_prompt: str | None,
) -> list[str]:
  """Translate one chunk (head + body + tail), consulting the cache first.

  Head and tail are translated purely as surrounding context; only the body
  translations are returned.
  """
  source_texts = chunk.head + chunk.body + chunk.tail
  cached: list[str] | None = None

  if store is not None:
    cached = store.get(chunk.hash)
    if cached is not None and len(cached) != len(source_texts):
      # stale or corrupt cache entry: discard it and retranslate
      print(f"Warning: Mismatched lengths in cached translation for chunk: {chunk.hash.hex()}")
      cached = None

  if cached is None:
    raw_texts = _translate_texts(
      llm=llm,
      texts=source_texts,
      texts_tokens=chunk.tokens_count,
      target_language=target_language,
      user_prompt=user_prompt,
    )
    cached = [clean_spaces(text) for text in raw_texts]
    if store is not None:
      store.put(chunk.hash, cached)

  # slice out the body translations, dropping head/tail context
  body_begin = len(chunk.head)
  body_end = body_begin + len(chunk.body)
  return cached[body_begin:body_end]
137
-
138
# Output-token budgets, as multiples of the source token count.
_PLAIN_TEXT_SCALE = 2.0
_XML_TEXT_SCALE = 2.5

def _translate_texts(
  llm: LLM,
  texts: list[str],
  texts_tokens: int,
  target_language: Language,
  user_prompt: str | None,
) -> list[str]:
  """Translate ``texts`` in two LLM passes.

  Pass 1 translates the joined text free-form; pass 2 asks the LLM to
  redistribute that translation back into one XML fragment per source text.
  Returns one translated string per input text (empty where untranslated).
  """
  normalized = _normalize_user_input(texts)
  if normalized is None:
    # nothing visible to translate
    return [""] * len(texts)

  user_data = (
    normalized if user_prompt is None
    else f"<rules>{user_prompt}</rules>\n\n{normalized}"
  )
  language_name = language_chinese_name(target_language)

  # Pass 1: free-form translation of the whole chunk.
  translated_text = llm.request_text(
    template_name="translate",
    text_tag="TXT",
    user_data=user_data,
    parser=lambda r: r,
    max_tokens=ceil(texts_tokens * _PLAIN_TEXT_SCALE),
    params={
      "target_language": language_name,
      "user_prompt": user_prompt,
    },
  )

  # Pass 2: present the numbered source fragments plus the translation and
  # let the LLM emit a matching <fragment id="..."> structure.
  request_element = Element("request")
  for index, source in enumerate(texts):
    child = Element("fragment", attrib={
      "id": str(index + 1),
    })
    child.text = clean_spaces(source)
    request_element.append(child)

  encoded = encode_friendly(request_element)
  request_text = f"```XML\n{encoded}\n```\n\n{translated_text}"

  return llm.request_xml(
    template_name="format",
    user_data=request_text,
    max_tokens=ceil(texts_tokens * _XML_TEXT_SCALE),
    parser=lambda r: _parse_translated_response(r, len(texts)),
    params={
      "target_language": language_name,
    },
  )
189
-
190
- def _parse_translated_response(resp_element: Element, sources_count: int) -> list[str]:
191
- fragments: list[str | None] = [None] * sources_count
192
- for fragment_element in resp_element:
193
- if fragment_element.text is None:
194
- continue
195
- id = fragment_element.get("id", None)
196
- if id is None:
197
- continue
198
- index = int(id) - 1
199
- if index < 0 or index >= len(fragments):
200
- raise ValueError(f"invalid fragment id: {id}")
201
- fragments[index] = fragment_element.text.strip()
202
-
203
- # 有时 LLM 会将多段融合在一起,这里尽可能让译文靠后,将空白段留在前面。
204
- # 这样看起来一大段的译文对应若干小段原文,观感更好。
205
- for i in range(len(fragments)):
206
- fragment = fragments[i]
207
- if fragment is not None and i < len(fragments) - 1:
208
- next_fragment = fragments[i + 1]
209
- if next_fragment is None:
210
- fragments[i] = None
211
- fragments[i + 1] = fragment
212
-
213
- return [f or "" for f in fragments]
214
-
215
def _normalize_user_input(user_lines: list[str]) -> str | None:
  """Collapse user-supplied lines into a normalized string.

  Runs of blank lines become separators: one blank line stays a single blank
  separator, two or more become a double one. Each kept line has its internal
  whitespace collapsed. Returns None when no line contains visible text.
  """
  empty_lines_count: int = 0
  lines: list[str] = []
  for line in user_lines:
    if is_empty(line):
      empty_lines_count += 1
    else:
      if lines:
        if empty_lines_count >= 2:
          lines.append("")
          lines.append("")
        elif empty_lines_count == 1:
          lines.append("")
      lines.append(clean_spaces(line))
      # Reset the blank-run counter: the original never did, so once a gap
      # of two blank lines was seen, every later line pair got a spurious
      # double separator inserted between them.
      empty_lines_count = 0
  if not lines:
    return None
  return "\n".join(lines)
@@ -1,45 +0,0 @@
1
- from enum import Enum, IntEnum
2
- from dataclasses import dataclass
3
-
4
-
5
class Incision(IntEnum):
  """How strongly a fragment boundary invites splitting the text there.

  Values are ordered so that a higher number means a safer cut point.
  """
  MUST_BE = 3      # the boundary is definitely a cut point
  MOST_LIKELY = 2  # probably a cut point
  IMPOSSIBLE = 0   # never split here
  UNCERTAIN = 1    # unknown; weakest positive confidence
10
-
11
@dataclass
class Fragment:
  """A piece of source text with a boundary confidence on each end."""
  # the fragment's raw text content
  text: str
  # confidence that a split may occur just before this fragment
  start_incision: Incision
  # confidence that a split may occur just after this fragment
  end_incision: Incision
16
-
17
class Language(Enum):
  """Supported translation target languages, valued by BCP-47 language tag."""
  SIMPLIFIED_CHINESE = "zh-Hans"
  TRADITIONAL_CHINESE = "zh-Hant"
  ENGLISH = "en"
  FRENCH = "fr"
  GERMAN = "de"
  SPANISH = "es"
  RUSSIAN = "ru"
  ITALIAN = "it"
  PORTUGUESE = "pt"
  JAPANESE = "ja"
  KOREAN = "ko"
29
-
30
# Chinese display name for each supported language.
# NOTE(review): presumably the prompt templates are written in Chinese and
# refer to the target language by this name — confirm against the templates.
_LANGUAGE_NAMES = {
  Language.SIMPLIFIED_CHINESE: "简体中文",
  Language.TRADITIONAL_CHINESE: "繁体中文",
  Language.ENGLISH: "英语",
  Language.FRENCH: "法语",
  Language.GERMAN: "德语",
  Language.SPANISH: "西班牙语",
  Language.RUSSIAN: "俄语",
  Language.ITALIAN: "意大利语",
  Language.PORTUGUESE: "葡萄牙语",
  Language.JAPANESE: "日语",
  Language.KOREAN: "韩语",
}

def language_chinese_name(language: Language) -> str:
  """Return the Chinese display name of ``language``.

  Raises KeyError for an unmapped language (every current member is mapped).
  """
  return _LANGUAGE_NAMES[language]
@@ -1,11 +0,0 @@
1
- import re
2
-
3
-
4
# a line consisting only of whitespace (or nothing at all)
_EMPTY_LINE = re.compile(r"^\s*$")
# any run of one or more whitespace characters
_SPACE = re.compile(r"\s+")

def is_empty(text: str) -> bool:
  """Return True when ``text`` is blank (empty or whitespace-only)."""
  return _EMPTY_LINE.match(text) is not None

def clean_spaces(text: str) -> str:
  """Trim ``text`` and collapse every internal whitespace run to one space."""
  stripped = text.strip()
  return _SPACE.sub(" ", stripped)
@@ -1,71 +0,0 @@
1
- from typing import Generator, Iterable
2
- from xml.etree.ElementTree import Element
3
-
4
- from .tag import Tag, TagKind
5
- from .parser import parse_tags
6
- from .transform import tag_to_element
7
- from .utils import clone
8
-
9
# why implement XML decoding?
# https://github.com/oomol-lab/pdf-craft/issues/149
def decode_friendly(chars: Iterable[str], tags: Iterable[str] | str = ()) -> Generator[Element, None, None]:
  """Parse a character stream and yield deep copies of recognized elements.

  ``tags`` may be a single tag name or an iterable of names used as a filter;
  an empty collection means every element is yielded.
  """
  wanted = {tags} if isinstance(tags, str) else set(tags)
  for element in _collect_elements(chars):
    if not wanted or element.tag in wanted:
      yield clone(element)
20
-
21
def _collect_elements(chars: Iterable[str]) -> Generator[Element, None, None]:
  """Assemble Elements from the tag/text stream, tolerating malformed input.

  Every element is yielded at the moment it closes (nested elements are
  yielded as well as appended to their parent; the caller deep-copies).
  A closing tag with no matching opener is demoted to literal text by
  appending its raw ``proto`` to the previously closed element's tail.
  """
  opening_stack: list[Element] = []        # currently open (unclosed) elements
  last_closed_element: Element | None = None  # target for trailing text (tail)

  for cell in parse_tags(chars):
    if isinstance(cell, Tag):
      tag: Tag = cell
      element = tag_to_element(tag)
      if tag.kind == TagKind.CLOSING:
        popped = _pop_element(tag.name, opening_stack)
        if popped is not None:
          yield popped
          last_closed_element = popped
        elif last_closed_element is not None:
          # unmatched closing tag: keep its raw text as tail content
          _append_to_tail(last_closed_element, tag.proto)
      else:
        if opening_stack:
          opening_stack[-1].append(element)
        if tag.kind == TagKind.SELF_CLOSING:
          yield element
          last_closed_element = element
        elif tag.kind == TagKind.OPENING:
          opening_stack.append(element)
          last_closed_element = None

    elif last_closed_element is not None:
      # plain text after a closed element becomes that element's tail
      _append_to_tail(last_closed_element, cell)

    elif opening_stack:
      # NOTE(review): this overwrites rather than appends; it seems to assume
      # the parser never emits two consecutive text cells — confirm.
      opening_stack[-1].text = cell
51
-
52
- def _append_to_tail(element: Element, text: str) -> None:
53
- if element.tail:
54
- element.tail += text
55
- else:
56
- element.tail = text
57
-
58
- def _pop_element(tag_name: str, opening_stack: list[Element]) -> Element | None:
59
- index = -1
60
- for i in range(len(opening_stack) - 1, -1, -1):
61
- opening_element = opening_stack[i]
62
- if tag_name == opening_element.tag:
63
- index = i
64
- break
65
- if index == -1:
66
- return None
67
-
68
- popped: Element | None = None
69
- for _ in range(len(opening_stack) - index):
70
- popped = opening_stack.pop()
71
- return popped
@@ -1,95 +0,0 @@
1
- from io import StringIO
2
- from typing import Callable
3
- from html import escape as escape_html
4
- from xml.etree.ElementTree import Element
5
-
6
- from .tag import Tag, TagKind
7
- from .parser import parse_tags
8
- from .transform import element_to_tag
9
-
10
# why implement XML encoding?
# https://github.com/oomol-lab/pdf-craft/issues/149
def encode_friendly(element: Element, indent: int = 2) -> str:
  """Serialize ``element`` using friendly escaping (only tag-like spans in
  text content are escaped; everything else passes through verbatim)."""
  out = StringIO()
  _encode_element(
    buffer=out,
    element=element,
    indent=indent,
    depth=0,
    escape=_escape_text,
  )
  return out.getvalue()
22
-
23
def _escape_text(text: str) -> str:
  """HTML-escape only the tag-like spans found in ``text``.

  Plain text runs are emitted unchanged; anything the tag parser recognizes
  as markup is escaped so it cannot be re-parsed as a real tag.
  """
  pieces: list[str] = []
  for cell in parse_tags(text):
    pieces.append(escape_html(str(cell)) if isinstance(cell, Tag) else cell)
  return "".join(pieces)
30
-
31
def encode(element: Element, indent: int = 2) -> str:
  """Serialize ``element``, HTML-escaping all of its text content."""
  out = StringIO()
  _encode_element(
    buffer=out,
    element=element,
    indent=indent,
    depth=0,
    escape=escape_html,
  )
  return out.getvalue()
41
-
42
# text this short (and without newlines) is kept on one line with its tags
_TINY_TEXT_LEN = 35

def _encode_element(
  buffer: StringIO,
  element: Element,
  indent: int,
  depth: int,
  escape: Callable[[str], str],
) -> None:
  """Recursively write ``element`` to ``buffer`` with pretty indentation.

  Childless elements without text are emitted self-closing; short childless
  text stays on a single line; everything else is laid out multi-line with
  ``indent`` spaces per ``depth`` level. ``escape`` is applied to all text
  and tail content before writing.
  """
  _write_indent(buffer, indent, depth)
  if len(element) == 0 and not element.text:
    # no children, no text: emit <tag/>
    tag = element_to_tag(element, TagKind.SELF_CLOSING)
    buffer.write(str(tag))
  else:
    text = (element.text or "").strip()
    opening_tag = element_to_tag(element, TagKind.OPENING)
    closing_tag = element_to_tag(element, TagKind.CLOSING)
    buffer.write(str(opening_tag))
    # keep <tag>tiny text</tag> on one line
    is_one_line = (
      len(text) <= _TINY_TEXT_LEN and
      len(element) == 0 and
      "\n" not in text
    )
    if text:
      if not is_one_line:
        buffer.write("\n")
        _write_indent(buffer, indent, depth + 1)
      buffer.write(escape(text))

    for child in element:
      buffer.write("\n")
      _encode_element(
        buffer=buffer,
        element=child,
        indent=indent,
        depth=depth + 1,
        escape=escape,
      )
      # text between this child and the next sibling (the child's tail)
      child_tail = (child.tail or "").strip()
      if child_tail:
        buffer.write("\n")
        _write_indent(buffer, indent, depth + 1)
        buffer.write(escape(child_tail))

    if not is_one_line:
      buffer.write("\n")
      _write_indent(buffer, indent, depth)

    buffer.write(str(closing_tag))
92
-
93
- def _write_indent(buffer: StringIO, indent: int, depth: int) -> None:
94
- for _ in range(indent * depth):
95
- buffer.write(" ")
@@ -1,172 +0,0 @@
1
- from io import StringIO
2
- from typing import Generator, Iterable
3
- from enum import auto, Enum
4
- from .tag import is_valid_name_char, is_valid_value_char, Tag, TagKind
5
-
6
-
7
# characters treated as whitespace inside a tag
_SPACES = (" ", "\n")

class _Phase(Enum):
  """Lexer state while scanning a potential tag."""
  OUTSIDE = auto()               # in plain text, outside any "<...>"
  LEFT_BRACKET = auto()          # just consumed "<"
  LEFT_SLASH = auto()            # consumed "</"
  TAG_NAME = auto()              # reading the tag name
  TAG_GAP = auto()               # whitespace between name and attributes
  ATTRIBUTE_NAME = auto()        # reading an attribute name
  ATTRIBUTE_NAME_EQUAL = auto()  # consumed "=" after an attribute name
  ATTRIBUTE_VALUE = auto()       # inside the double-quoted attribute value
  MUST_CLOSING_SIGN = auto()     # consumed "/"; only ">" may follow

class _ParsedResult(Enum):
  """Outcome of feeding one character to the tag state machine."""
  Continue = auto()  # still accumulating (or still outside a tag)
  Success = auto()   # a complete tag candidate was recognized
  Failed = auto()    # candidate is not a tag; flush it back as plain text
24
-
25
def parse_tags(chars: Iterable[str]) -> Generator[str | Tag, None, None]:
  """Split a character stream into plain-text runs and recognized ``Tag``s."""
  scanner = _XMLTagsParser()
  yield from scanner.do(chars)
27
-
28
class _XMLTagsParser:
  """Character-level scanner that extracts XML-like tags from arbitrary text.

  Anything that fails to parse as a well-formed tag is emitted verbatim as
  plain text, so malformed markup degrades gracefully instead of raising.
  """

  def __init__(self):
    self._outside_buffer: StringIO = StringIO()  # accumulated plain text
    self._tag_buffer: StringIO = StringIO()      # raw chars of the current tag candidate
    self._tag: Tag | None = None                 # tag currently being assembled
    self._phase: _Phase = _Phase.OUTSIDE         # current lexer state

  def do(self, chars: Iterable[str]) -> Generator[str | Tag, None, None]:
    """Drive the state machine over ``chars``, yielding text runs and Tags."""
    for char in chars:
      parsed_result = self._parse_char(char)
      yield from self._generate_by_result(parsed_result)

    # End of input: an unterminated tag candidate is just plain text.
    self._outside_buffer.write(self._tag_buffer.getvalue())
    outside_text = self._outside_buffer.getvalue()
    if outside_text != "":
      yield outside_text

  def _parse_char(self, char: str) -> _ParsedResult:
    """Feed one character into the state machine and report the outcome."""
    parsed_result: _ParsedResult = _ParsedResult.Continue

    if self._phase == _Phase.OUTSIDE:
      if char != "<":
        self._outside_buffer.write(char)
      else:
        # "<" opens a tag candidate; assume OPENING until proven otherwise
        self._phase = _Phase.LEFT_BRACKET
        self._tag_buffer.write(char)
        self._tag = Tag(
          kind=TagKind.OPENING,
          name="",
          proto="",
          attributes=[],
        )
    else:
      self._tag_buffer.write(char)

      if self._phase == _Phase.LEFT_BRACKET:
        if char == "/":
          self._tag.kind = TagKind.CLOSING
          self._phase = _Phase.LEFT_SLASH
        elif is_valid_name_char(char):
          self._tag.name += char
          self._phase = _Phase.TAG_NAME
        else:
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.LEFT_SLASH:
        if is_valid_name_char(char):
          self._tag.name += char
          self._phase = _Phase.TAG_NAME
        else:
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.TAG_NAME:
        if char in _SPACES:
          self._phase = _Phase.TAG_GAP
        elif is_valid_name_char(char):
          self._tag.name += char
        elif char == ">":
          parsed_result = _ParsedResult.Success
        elif char == "/" and self._tag.kind == TagKind.OPENING:
          # "<name/" — self-closing only if the very next char is ">"
          self._tag.kind = TagKind.SELF_CLOSING
          self._phase = _Phase.MUST_CLOSING_SIGN
        else:
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.TAG_GAP:
        if char in _SPACES:
          pass
        elif is_valid_name_char(char):
          # first character of a new attribute name
          self._tag.attributes.append((char, ""))
          self._phase = _Phase.ATTRIBUTE_NAME
        elif char == ">":
          parsed_result = _ParsedResult.Success
        elif char == "/" and self._tag.kind == TagKind.OPENING:
          self._tag.kind = TagKind.SELF_CLOSING
          self._phase = _Phase.MUST_CLOSING_SIGN
        else:
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.ATTRIBUTE_NAME:
        if is_valid_name_char(char):
          attr_name, attr_value = self._tag.attributes[-1]
          attr_name = attr_name + char
          self._tag.attributes[-1] = (attr_name, attr_value)
        elif char == "=":
          self._phase = _Phase.ATTRIBUTE_NAME_EQUAL
        else:
          # bare attributes (without ="...") are not supported
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.ATTRIBUTE_NAME_EQUAL:
        if char == "\"":
          self._phase = _Phase.ATTRIBUTE_VALUE
        else:
          # only double-quoted attribute values are accepted
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.ATTRIBUTE_VALUE:
        if is_valid_value_char(char):
          attr_name, attr_value = self._tag.attributes[-1]
          attr_value = attr_value + char
          self._tag.attributes[-1] = (attr_name, attr_value)
        elif char == "\"":
          self._phase = _Phase.TAG_GAP
        else:
          parsed_result = _ParsedResult.Failed

      elif self._phase == _Phase.MUST_CLOSING_SIGN:
        if char == ">":
          parsed_result = _ParsedResult.Success
        else:
          parsed_result = _ParsedResult.Failed

    return parsed_result

  def _generate_by_result(self, parsed_result: _ParsedResult) -> Generator[str | Tag, None, None]:
    """Emit pending text and/or the finished tag according to the outcome."""
    if parsed_result == _ParsedResult.Success:
      assert self._tag is not None
      if self._is_tag_valid(self._tag):
        # flush accumulated plain text first, then the tag itself
        outside_text = self._outside_buffer.getvalue()
        self._clear_buffer(self._outside_buffer)
        self._clear_buffer(self._tag_buffer)
        if outside_text != "":
          yield outside_text
        yield self._tag
      else:
        # syntactically complete but semantically invalid: demote the raw
        # tag text to plain content (``proto`` keeps the original chars)
        self._tag.proto = self._tag_buffer.getvalue()
        self._outside_buffer.write(self._tag.proto)
        self._clear_buffer(self._tag_buffer)
      self._tag = None
      self._phase = _Phase.OUTSIDE

    elif parsed_result == _ParsedResult.Failed:
      # not a tag after all: the buffered candidate chars are plain text
      self._outside_buffer.write(self._tag_buffer.getvalue())
      self._clear_buffer(self._tag_buffer)
      self._phase = _Phase.OUTSIDE

  def _is_tag_valid(self, tag: Tag) -> bool:
    """Reject closing tags with attributes and tags with invalid names."""
    if tag.kind == TagKind.CLOSING and len(tag.attributes) > 0:
      return False
    if tag.find_invalid_name() is not None:
      return False
    return True

  def _clear_buffer(self, buffer: StringIO):
    # truncate alone keeps the write position; rewind as well
    buffer.truncate(0)
    buffer.seek(0)