epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +4 -2
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +233 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +208 -178
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +231 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +179 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.1.dist-info/METADATA +283 -0
- epub_translator-0.1.1.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -68
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.7.dist-info/METADATA +0 -170
- epub_translator-0.0.7.dist-info/RECORD +0 -36
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
epub_translator/epub/html/file.py
DELETED

@@ -1,80 +0,0 @@
-import re
-
-from typing import Iterable
-from xml.etree.ElementTree import fromstring, tostring, Element
-from .dom_operator import read_texts, write_texts
-from .empty_tags import to_xml, to_html
-
-
-_FILE_HEAD_PATTERN = re.compile(r"^<\?xml.*?\?>[\s]*<!DOCTYPE.*?>")
-_XMLNS_IN_TAG = re.compile(r"\{[^}]+\}")
-_BRACES = re.compile(r"(\{|\})")
-
-class HTMLFile:
-  def __init__(self, file_content: str):
-    match = re.match(_FILE_HEAD_PATTERN, file_content)
-    xml_content = re.sub(_FILE_HEAD_PATTERN, "", to_xml(file_content))
-    self._head: str = match.group() if match else None
-    self._root: Element = fromstring(xml_content)
-    self._xmlns: str | None = self._extract_xmlns(self._root)
-    self._texts_length: int | None = None
-
-  def _extract_xmlns(self, root: Element) -> str | None:
-    root_xmlns: str | None = None
-    for i, element in enumerate(_all_elements(root)):
-      need_clean_xmlns = True
-      match = re.match(_XMLNS_IN_TAG, element.tag)
-
-      if match:
-        xmlns = re.sub(_BRACES, "", match.group())
-        if i == 0:
-          root_xmlns = xmlns
-        elif root_xmlns != xmlns:
-          need_clean_xmlns = False
-      if need_clean_xmlns:
-        element.tag = re.sub(_XMLNS_IN_TAG, "", element.tag)
-
-    return root_xmlns
-
-  def read_texts(self) -> list[str]:
-    texts = list(read_texts(self._root))
-    self._texts_length = len(texts)
-    return texts
-
-  def write_texts(self, texts: Iterable[str], append: bool):
-    write_texts(self._root, texts, append)
-
-  @property
-  def texts_length(self) -> int:
-    if self._texts_length is None:
-      self._texts_length = 0
-      for _ in read_texts(self._root):
-        self._texts_length += 1
-    return self._texts_length
-
-  @property
-  def file_content(self) -> str:
-    file_content: str
-    if self._xmlns is None:
-      file_content = tostring(self._root, encoding="unicode")
-      file_content = to_html(file_content)
-    else:
-      root = Element(
-        self._root.tag,
-        attrib={**self._root.attrib, "xmlns": self._xmlns},
-      )
-      root.extend(self._root)
-      # XHTML disable <tag/> (we need replace them with <tag></tag>)
-      for element in _all_elements(root):
-        if element.text is None:
-          element.text = ""
-      file_content = tostring(root, encoding="unicode")
-
-    if self._head is not None:
-      file_content = self._head + file_content
-    return file_content
-
-def _all_elements(parent: Element):
-  yield parent
-  for child in parent:
-    yield from _all_elements(child)
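ElementTree parses an XHTML document into tags of the form `{namespace-uri}tag`; `HTMLFile._extract_xmlns` stripped that prefix so the rest of the pipeline could match plain tag names, and `file_content` re-attached the `xmlns` attribute on serialization. A minimal, self-contained sketch of the stripping step, using only the standard library (an illustration, not code from the package):

```python
# Illustration only: how ElementTree surfaces the XHTML namespace as a
# "{uri}" prefix on every tag, and how a regex removes it (the approach
# the removed HTMLFile._extract_xmlns took).
import re
from xml.etree.ElementTree import fromstring, tostring

_XMLNS_IN_TAG = re.compile(r"\{[^}]+\}")

xhtml = '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>Hi</p></body></html>'
root = fromstring(xhtml)
print(root.tag)  # {http://www.w3.org/1999/xhtml}html

for element in root.iter():
    element.tag = _XMLNS_IN_TAG.sub("", element.tag)

print(tostring(root, encoding="unicode"))  # <html><body><p>Hi</p></body></html>
```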
epub_translator/epub/html/texts_searcher.py
DELETED

@@ -1,46 +0,0 @@
-from typing import Generator, TypeGuard
-from enum import auto, Enum
-from xml.etree.ElementTree import Element
-
-
-class TextPosition(Enum):
-  WHOLE_DOM = auto()
-  TEXT = auto()
-  TAIL = auto()
-
-# element, position, parent
-TextDescription = tuple[Element, TextPosition, Element | None]
-
-_IGNORE_TAGS = (
-  "title", "link", "style", "css", "img", "script", "metadata",
-  "{http://www.w3.org/1998/Math/MathML}math", # TODO: formulas are part of the body text and should be read in too; ignored for now to avoid disruption.
-)
-
-_TEXT_LEAF_TAGS = (
-  "a", "b", "br", "hr", "span", "em", "strong", "label",
-)
-
-def search_texts(element: Element, parent: Element | None = None) -> Generator[TextDescription, None, None]:
-  if element.tag in _IGNORE_TAGS:
-    return
-
-  if any(c.tag not in _TEXT_LEAF_TAGS for c in element):
-    if _is_not_empty_str(element.text):
-      yield element, TextPosition.TEXT, parent
-    for child in element:
-      if child.tag in _TEXT_LEAF_TAGS:
-        yield child, TextPosition.WHOLE_DOM, element
-      else:
-        yield from search_texts(child, element)
-      if _is_not_empty_str(child.tail):
-        yield child, TextPosition.TAIL, element
-  else:
-    yield element, TextPosition.WHOLE_DOM, parent
-
-def _is_not_empty_str(text: str | None) -> TypeGuard[str]:
-  if text is None:
-    return False
-  for char in text:
-    if char not in (" ", "\n"):
-      return True
-  return False
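The `TextPosition` values correspond to where ElementTree stores character data: `element.text` holds the run before the first child (TEXT), `child.tail` holds the run after a child's closing tag (TAIL), and simple inline tags such as `<b>` were handed over whole (WHOLE_DOM). A small stdlib illustration of that mapping (not code from the package):

```python
# Illustration only: the three places ElementTree keeps text, which
# search_texts classified as TEXT, WHOLE_DOM, and TAIL.
from xml.etree.ElementTree import fromstring

p = fromstring("<p>Lead text <b>bold</b> trailing tail.</p>")

print(repr(p.text))   # 'Lead text '      -> TEXT of <p>
b = p.find("b")
print(repr(b.text))   # 'bold'            -> inside <b>, a WHOLE_DOM leaf
print(repr(b.tail))   # ' trailing tail.' -> TAIL of <b>
```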
epub_translator/llm/node.py
DELETED

@@ -1,201 +0,0 @@
-import datetime
-
-from os import PathLike
-from pathlib import Path
-from typing import cast, Any, TypeVar, Generator, Sequence, Callable
-from importlib.resources import files
-from jinja2 import Environment, Template
-from xml.etree.ElementTree import Element
-from pydantic import SecretStr
-from logging import getLogger, DEBUG, Formatter, Logger, FileHandler
-from tiktoken import get_encoding, Encoding
-from langchain_core.messages import SystemMessage, HumanMessage
-
-from ..template import create_env
-from ..xml import decode_friendly, encode_friendly
-from .increasable import Increasable
-from .executor import LLMExecutor
-
-
-R = TypeVar("R")
-
-class LLM:
-  def __init__(
-    self,
-    key: str,
-    url: str,
-    model: str,
-    token_encoding: str,
-    timeout: float | None = None,
-    top_p: float | tuple[float, float] | None = None,
-    temperature: float | tuple[float, float] | None = None,
-    retry_times: int = 5,
-    retry_interval_seconds: float = 6.0,
-    log_dir_path: PathLike | None = None,
-  ):
-    prompts_path = files("epub_translator") / "data"
-    self._templates: dict[str, Template] = {}
-    self._encoding: Encoding = get_encoding(token_encoding)
-    self._env: Environment = create_env(prompts_path)
-    self._logger_save_path: Path | None = None
-
-    if log_dir_path is not None:
-      self._logger_save_path = Path(log_dir_path)
-      if not self._logger_save_path.exists():
-        self._logger_save_path.mkdir(parents=True, exist_ok=True)
-      elif not self._logger_save_path.is_dir():
-        self._logger_save_path = None
-
-    self._executor = LLMExecutor(
-      url=url,
-      model=model,
-      api_key=cast(SecretStr, key),
-      timeout=timeout,
-      top_p=Increasable(top_p),
-      temperature=Increasable(temperature),
-      retry_times=retry_times,
-      retry_interval_seconds=retry_interval_seconds,
-      create_logger=self._create_logger,
-    )
-
-  def _create_logger(self) -> Logger | None:
-    if self._logger_save_path is None:
-      return None
-
-    now = datetime.datetime.now(datetime.timezone.utc)
-    timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
-    file_path = self._logger_save_path / f"request {timestamp}.log"
-    logger = getLogger(f"LLM Request {timestamp}")
-    logger.setLevel(DEBUG)
-    handler = FileHandler(file_path, encoding="utf-8")
-    handler.setLevel(DEBUG)
-    handler.setFormatter(Formatter("%(asctime)s %(message)s", "%H:%M:%S"))
-    logger.addHandler(handler)
-
-    return logger
-
-  def request_text(
-    self,
-    template_name: str,
-    text_tag: str,
-    user_data: Element | str,
-    parser: Callable[[str], R],
-    max_tokens: int | None = None,
-    params: dict[str, Any] | None = None,
-  ) -> R:
-
-    if params is None:
-      params = {}
-
-    def parse_response(response: str) -> R:
-      text = next(self._search_quotes(text_tag.lower(), response), None)
-      if text is None:
-        raise ValueError(f"No valid {text_tag} response found")
-      return parser(text)
-
-    return self._executor.request(
-      input=self._create_input(template_name, user_data, params),
-      parser=parse_response,
-      max_tokens=max_tokens,
-    )
-
-  def request_xml(
-    self,
-    template_name: str,
-    user_data: Element | str,
-    parser: Callable[[Element], R],
-    max_tokens: int | None = None,
-    params: dict[str, Any] | None = None,
-  ) -> R:
-
-    if params is None:
-      params = {}
-
-    def parse_response(response: str) -> R:
-      element = next(decode_friendly(response, "response"), None)
-      if element is None:
-        raise ValueError("No valid XML response found")
-      return parser(element)
-
-    return self._executor.request(
-      input=self._create_input(template_name, user_data, params),
-      parser=parse_response,
-      max_tokens=max_tokens,
-    )
-
-  def _create_input(self, template_name: str, user_data: Element | str, params: dict[str, Any]):
-    data: str
-    if isinstance(user_data, Element):
-      data = encode_friendly(user_data)
-      data = f"```XML\n{data}\n```"
-    else:
-      data = user_data
-
-    template = self._template(template_name)
-    prompt = template.render(**params)
-    return [
-      SystemMessage(content=prompt),
-      HumanMessage(content=data)
-    ]
-
-  def prompt_tokens_count(self, template_name: str, params: dict[str, Any]) -> int:
-    template = self._template(template_name)
-    prompt = template.render(**params)
-    return len(self._encoding.encode(prompt))
-
-  def encode_tokens(self, text: str) -> list[int]:
-    return self._encoding.encode(text)
-
-  def decode_tokens(self, tokens: Sequence[int]) -> str:
-    return self._encoding.decode(tokens)
-
-  def count_tokens_count(self, text: str) -> int:
-    return len(self._encoding.encode(text))
-
-  def _template(self, template_name: str) -> Template:
-    template = self._templates.get(template_name, None)
-    if template is None:
-      template = self._env.get_template(template_name)
-      self._templates[template_name] = template
-    return template
-
-  def _search_quotes(self, kind: str, response: str) -> Generator[str, None, None]:
-    start_marker = f"```{kind}"
-    end_marker = "```"
-    start_index = 0
-
-    while True:
-      start_index = self._find_ignore_case(
-        raw=response,
-        sub=start_marker,
-        start=start_index,
-      )
-      if start_index == -1:
-        break
-
-      end_index = self._find_ignore_case(
-        raw=response,
-        sub=end_marker,
-        start=start_index + len(start_marker),
-      )
-      if end_index == -1:
-        break
-
-      extracted_text = response[start_index + len(start_marker):end_index].strip()
-      yield extracted_text
-      start_index = end_index + len(end_marker)
-
-  def _find_ignore_case(self, raw: str, sub: str, start: int = 0):
-    if not sub:
-      return 0 if 0 >= start else -1
-
-    raw_len, sub_len = len(raw), len(sub)
-    for i in range(start, raw_len - sub_len + 1):
-      match = True
-      for j in range(sub_len):
-        if raw[i + j].lower() != sub[j].lower():
-          match = False
-          break
-      if match:
-        return i
-    return -1
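`_search_quotes` pulled Markdown-fenced blocks of a given kind (for example, a block opened with three backticks and "xml") out of the model's reply using a hand-rolled case-insensitive scan. For illustration, the same extraction can be sketched with a regex; this is a stand-in, not the removed implementation:

```python
# Illustration only: extracting fenced blocks of a given kind from an LLM
# reply, case-insensitively. The removed code scanned by hand instead of
# using a regex, but the observable behavior is similar for simple replies.
import re

def extract_fenced(kind: str, response: str) -> list[str]:
    pattern = re.compile(rf"```{re.escape(kind)}\s*(.*?)```", re.IGNORECASE | re.DOTALL)
    return [block.strip() for block in pattern.findall(response)]

reply = "Sure, here it is:\n```XML\n<response><p>Hello</p></response>\n```"
print(extract_fenced("xml", reply))  # ['<response><p>Hello</p></response>']
```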
epub_translator/translation/chunk.py
DELETED

@@ -1,118 +0,0 @@
-from dataclasses import dataclass
-from typing import Iterator, Iterable, Generator
-from hashlib import sha512
-from ..llm import LLM
-from .types import Fragment, Language
-
-
-@dataclass
-class Chunk:
-  index: int
-  hash: bytes
-  head: list[str]
-  body: list[str]
-  tail: list[str]
-  tokens_count: int
-
-@dataclass
-class ChunkRange:
-  index: int
-  head_remain_tokens: int
-  tail_remain_tokens: int
-  head_index: int
-  body_index: int
-  tail_index: int
-  fragments_count: int
-  tokens_count: int
-
-  def match(self, index: int) -> bool:
-    return self.head_index <= index < self.head_index + self.fragments_count
-
-def match_fragments(
-  llm: LLM,
-  target_language: Language,
-  chunk_ranges_iter: Iterator[ChunkRange],
-  fragments_iter: Iterator[Fragment],
-) -> Generator[Chunk, None, None]:
-
-  for range, texts in _match_range_and_texts(
-    chunk_range_iter=chunk_ranges_iter,
-    fragments_iter=fragments_iter,
-  ):
-    head_length = range.body_index - range.head_index
-    body_length = range.tail_index - range.body_index
-    head = texts[:head_length]
-    body = texts[head_length:head_length + body_length]
-    tail = texts[head_length + body_length:]
-
-    hash = _hash_texts_list(target_language, (head, body, tail))
-    head = _crop_extra_texts(llm, head, True, range.head_remain_tokens)
-    tail = _crop_extra_texts(llm, tail, False, range.tail_remain_tokens)
-
-    yield Chunk(
-      hash=hash,
-      head=head,
-      body=body,
-      tail=tail,
-      index=range.index,
-      tokens_count=range.tokens_count,
-    )
-
-def _match_range_and_texts(
-  chunk_range_iter: Iterator[ChunkRange],
-  fragments_iter: Iterator[Fragment],
-) -> Generator[tuple[ChunkRange, list[str]], None, None]:
-
-  next_chunk_range: ChunkRange | None = None
-  matched_chunk_ranges: list[tuple[ChunkRange, list[str]]] = []
-
-  for index, fragment in enumerate(fragments_iter):
-    while True:
-      if next_chunk_range is None:
-        next_chunk_range = next(chunk_range_iter, None)
-        if next_chunk_range is None:
-          break
-      if not next_chunk_range.match(index):
-        break
-      matched_chunk_ranges.append((next_chunk_range, []))
-      next_chunk_range = None
-
-    if matched_chunk_ranges:
-      next_matched_chunks: list[tuple[ChunkRange, list[str]]] = []
-      for chunk_range, texts in matched_chunk_ranges:
-        if chunk_range.match(index):
-          texts.append(fragment.text)
-          next_matched_chunks.append((chunk_range, texts))
-        else:
-          yield chunk_range, texts
-      matched_chunk_ranges = next_matched_chunks
-
-  yield from matched_chunk_ranges
-
-def _hash_texts_list(target_language: Language, texts_iterable: Iterable[list[str]]) -> bytes:
-  m = sha512()
-  m.update(target_language.value.encode("utf-8"))
-  for texts in texts_iterable:
-    for text in texts:
-      m.update(b"\x00")
-      m.update(text.encode("utf-8"))
-  return m.digest()
-
-def _crop_extra_texts(llm: LLM, texts: list[str], crop_left: bool, remain_tokens_count: int):
-  tokens_list: list[list[int]] = [llm.encode_tokens(text) for text in texts]
-  remain_texts: list[str] = []
-
-  for tokens in (reversed(tokens_list) if crop_left else tokens_list):
-    tokens_count = len(tokens)
-    if remain_tokens_count >= tokens_count:
-      remain_tokens_count -= tokens_count
-      remain_texts.append(llm.decode_tokens(tokens))
-      if remain_tokens_count == 0:
-        break
-    else:
-      remain_tokens = tokens[-remain_tokens_count:] if crop_left else tokens[:remain_tokens_count]
-      remain_texts.append(llm.decode_tokens(remain_tokens))
-
-  if crop_left:
-    remain_texts.reverse()
-  return remain_texts
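`_hash_texts_list` keyed the on-disk translation cache on a SHA-512 digest of the target language followed by every fragment text, writing a NUL byte before each text so that different splits of the same characters hash to different keys. A self-contained sketch of that scheme (an illustration, not the package's code):

```python
# Illustration only: the cache-key scheme used by _hash_texts_list.
# The NUL prefix before each entry keeps ["ab", "c"] and ["a", "bc"]
# from colliding.
from hashlib import sha512

def chunk_key(target_language: str, texts: list[str]) -> str:
    m = sha512()
    m.update(target_language.encode("utf-8"))
    for text in texts:
        m.update(b"\x00")
        m.update(text.encode("utf-8"))
    return m.hexdigest()

print(chunk_key("zh-Hans", ["Chapter 1", "It was a dark and stormy night."])[:16])
```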
epub_translator/translation/splitter.py
DELETED

@@ -1,78 +0,0 @@
-from typing import Iterator, Generator
-from resource_segmentation import split, Resource, Segment
-
-from ..llm import LLM
-from .types import Fragment, Incision
-from .chunk import ChunkRange
-
-
-def split_into_chunks(llm: LLM, fragments_iter: Iterator[Fragment], max_chunk_tokens_count: int):
-  for index, group in enumerate(split(
-    resources=_gen_resources(llm, fragments_iter),
-    max_segment_count=max_chunk_tokens_count,
-    gap_rate=0.15,
-    tail_rate=0.5,
-    border_incision=Incision.IMPOSSIBLE,
-  )):
-    head_index: int
-    tail_index: int
-    fragments_count: int
-    body_index, body_end_index, body_tokens_count = _group_part(group.body)
-
-    if group.head:
-      head_index, head_end_index, _ = _group_part(group.head)
-      assert head_end_index + 1 == body_index, "Head must be continuous with body"
-    else:
-      head_index = body_index
-
-    if group.tail:
-      tail_index, tail_end_index, _ = _group_part(group.tail)
-      fragments_count = tail_end_index - head_index + 1
-      assert body_end_index + 1 == tail_index, "Body must be continuous with tail"
-    else:
-      tail_index = body_end_index + 1
-      fragments_count = tail_index - head_index
-
-    yield ChunkRange(
-      index=index,
-      head_remain_tokens=group.head_remain_count,
-      tail_remain_tokens=group.tail_remain_count,
-      head_index=head_index,
-      body_index=body_index,
-      tail_index=tail_index,
-      fragments_count=fragments_count,
-      tokens_count=body_tokens_count,
-    )
-
-def _gen_resources(llm: LLM, fragments_iter: Iterator[Fragment]) -> Generator[Resource[int], None, None]:
-  for index, fragment in enumerate(fragments_iter):
-    yield Resource(
-      count=llm.count_tokens_count(fragment.text),
-      start_incision=fragment.start_incision,
-      end_incision=fragment.end_incision,
-      payload=index,
-    )
-
-def _group_part(target: list[Resource[int] | Segment[int]]) -> tuple[int, int, int]:
-  start_index: int | None = None
-  previous_index: int = 0
-  tokens_count: int = 0
-  for resource in _iter_group_part(target):
-    index = resource.payload
-    if start_index is None:
-      start_index = index
-    else:
-      assert index == previous_index + 1, "Resources in group part must be continuous"
-    previous_index = index
-    tokens_count += resource.count
-
-  assert start_index is not None, "Group part must contain at least one resource"
-  return start_index, previous_index, tokens_count
-
-def _iter_group_part(target: list[Resource[int] | Segment[int]]) -> Generator[Resource[int], None, None]:
-  for item in target:
-    if isinstance(item, Resource):
-      yield item
-    elif isinstance(item, Segment):
-      for resource in item.resources:
-        yield resource
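`split_into_chunks` budgets chunks by token count, which the removed `LLM.count_tokens_count` computed with tiktoken. A minimal sketch of that counting; `"cl100k_base"` is only an illustrative encoding name here, since the real encoding is whatever `token_encoding` string the caller configures:

```python
# Illustration only: token counting in the style of LLM.count_tokens_count.
# "cl100k_base" is an example encoding name, not necessarily the configured one.
from tiktoken import get_encoding

encoding = get_encoding("cl100k_base")
text = "Call me Ishmael. Some years ago..."
tokens = encoding.encode(text)

print(len(tokens))                  # tokens this fragment costs against max_chunk_tokens_count
print(encoding.decode(tokens[:5]))  # decoding a token slice, as the chunk cropping step did
```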
epub_translator/translation/store.py
DELETED

@@ -1,36 +0,0 @@
-from shutil import rmtree
-from pathlib import Path
-from typing import Iterable
-
-
-class Store:
-  def __init__(self, directory: Path):
-    self._directory = directory
-
-  def get(self, chunk_hash: bytes) -> list[str] | None:
-    file_path = self._file_path(chunk_hash)
-    if not file_path.exists() or not file_path.is_file():
-      return None
-    with file_path.open("r", encoding="utf-8") as file:
-      return file.read().split("\n")
-
-  def put(self, chunk_hash: bytes, lines: Iterable[str]):
-    file_path = self._file_path(chunk_hash)
-    if file_path.exists():
-      if file_path.is_file():
-        file_path.unlink()
-      else:
-        rmtree(file_path)
-
-    file_path.parent.mkdir(parents=True, exist_ok=True)
-    with file_path.open("w", encoding="utf-8") as file:
-      is_first_line = True
-      for line in lines:
-        if is_first_line:
-          is_first_line = False
-        else:
-          file.write("\n")
-        file.write(line)
-
-  def _file_path(self, chunk_hash: bytes) -> Path:
-    return self._directory / f"{chunk_hash.hex()}.chunk"