epub-translator 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. epub_translator/__init__.py +3 -2
  2. epub_translator/data/format.jinja +33 -0
  3. epub_translator/data/translate.jinja +15 -0
  4. epub_translator/epub/__init__.py +2 -3
  5. epub_translator/epub/content_parser.py +2 -2
  6. epub_translator/epub/html/__init__.py +1 -1
  7. epub_translator/epub/html/file.py +56 -41
  8. epub_translator/epub/html/texts_searcher.py +2 -1
  9. epub_translator/llm/__init__.py +1 -0
  10. epub_translator/llm/error.py +49 -0
  11. epub_translator/llm/executor.py +147 -0
  12. epub_translator/llm/increasable.py +35 -0
  13. epub_translator/llm/node.py +197 -0
  14. epub_translator/template.py +50 -0
  15. epub_translator/translation/__init__.py +2 -0
  16. epub_translator/translation/chunk.py +120 -0
  17. epub_translator/translation/splitter.py +77 -0
  18. epub_translator/translation/store.py +37 -0
  19. epub_translator/translation/translation.py +192 -0
  20. epub_translator/translation/types.py +23 -0
  21. epub_translator/translation/utils.py +11 -0
  22. epub_translator/translator.py +167 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/decoder.py +71 -0
  25. epub_translator/xml/encoder.py +95 -0
  26. epub_translator/xml/parser.py +172 -0
  27. epub_translator/xml/tag.py +93 -0
  28. epub_translator/xml/transform.py +34 -0
  29. epub_translator/xml/utils.py +12 -0
  30. epub_translator/zip_context.py +74 -0
  31. {epub_translator-0.0.1.dist-info → epub_translator-0.0.2.dist-info}/METADATA +5 -7
  32. epub_translator-0.0.2.dist-info/RECORD +36 -0
  33. epub_translator/epub/types.py +0 -4
  34. epub_translator/file.py +0 -124
  35. epub_translator/translator/__init__.py +0 -1
  36. epub_translator/translator/group.py +0 -140
  37. epub_translator/translator/llm.py +0 -58
  38. epub_translator/translator/nlp.py +0 -36
  39. epub_translator/translator/translator.py +0 -159
  40. epub_translator-0.0.1.dist-info/RECORD +0 -19
  41. {epub_translator-0.0.1.dist-info → epub_translator-0.0.2.dist-info}/LICENSE +0 -0
  42. {epub_translator-0.0.1.dist-info → epub_translator-0.0.2.dist-info}/WHEEL +0 -0
@@ -1,2 +1,3 @@
1
- from .translator import Translator
2
- from .file import translate_epub_file, ProgressReporter
1
+ from .llm import LLM
2
+ from .translator import translate
3
+ from .translation import Language, ProgressReporter
@@ -0,0 +1,33 @@
1
+ 你是一个校对员,需要帮助用户校对译文与原文的对应关系。用户随后会提交一段文本,先是一段表示原文的XML格式内容,接下来是一段纯文本的{{ target_language }}译文。原文XML文本以<request>标签作为根,<fragment>标签作为子元素。原文将按顺序拆分再各个<fragment>标签中,每个<fragment>标签将包含一个id属性,以唯一标识该片段。
2
+
3
+ 你要将{{ target_language }}译文正确分割成片段,并与原文一一对应。并模仿用户提交的格式,将根节点由<request>替换成<response>节点,再将<fragment>的内容由原文替换成{{ target_language }}译文,但保留id不变。最终将整个XML格式内容输出。你的输出必须满足如下规则:
4
+ - 分割后的片段对应标准是语义一致。即对应片段的原文与译文互相翻译后,是完全相同的内容。
5
+ - 替换后的译文必须严格与用户提交的译文对应,不得增加、修改、删除内容。
6
+ - 将你输出<fragment>中的译文单独提取出来按顺序读出来,应与用户提交的{{ target_language }}译文一字不差。
7
+ - 译文必须是其对应的原文的直接翻译。
8
+ - 绝大部分情况下,译文<fragment>的id能与原文<fragment>的id一一配对,不会出现错位、新增、遗漏的情况。但若发现无论如何都无法一一对应,应该尽可能将文字更多的片段对应上,跳过那些由几个字或几个单词构成的短小、碎片化的片段。此时你输出的<fragment>的id可能不连续,也是没关系的,通过跳过短小片段来保证整体对应关系完整,是可接受的。决不可接受的是,因为遗漏短小片段,导致后面大段大段内容直接错位。
9
+
10
+ 特别注意,用户提交的译文也会分自然段,这个自然段与原文的<fragment>**没有任何关系**。这个分段是翻译家仅看过原文文本,但不知其<fragment>划分的情况下自行作出的。你绝对不可参考这个译文的分段,更不要被它误导。匹配标准只有一条,就是语义一致。
11
+
12
+ 这里举个例子,假设用户提交的原文是英文,译文是中文。用户提交的内容如下:
13
+ ```XML
14
+ <request>
15
+ <fragment id="1">Although fermentation was an idea dear to the heart of many an alchemist, the particular notion of fermenting water in order to produce the specified materials of the world perceived by the senses is at heart Helmontian.</fragment>
16
+ <fragment id="2">In the following it will therefore be useful to give a brief overview of van Helmont’s matter-theory.</fragment>
17
+ <fragment id="3">Reference</fragment>
18
+ <fragment id="4">[1] Newman, Gehennical Fire, pp. 58–78, 171–96.</fragment>
19
+ </request>
20
+ ```
21
+
22
+ 尽管发酵是许多炼金术士所珍视的理念,但通过发酵水来生成感官所感知的特定物质这一特定概念,其核心却是海尔蒙特式的。因此,下文将简要概述范·海尔蒙特的物质理论引用。[1]纽曼,《地底之火》,第 58-78 页、第 171-96 页。
23
+
24
+ 你应该返回如下内容。
25
+ ```XML
26
+ <response>
27
+ <fragment id="1">尽管发酵是许多炼金术士所珍视的理念,但通过发酵水来生成感官所感知的特定物质这一特定概念,其核心却是海尔蒙特式的。</fragment>
28
+ <fragment id="2">因此,下文将简要概述范·海尔蒙特的物质理论引用。</fragment>
29
+ <fragment id="4">[1] 纽曼,《地底之火》,第 58-78 页、第 171-96 页。</fragment>
30
+ </response>
31
+ ```
32
+
33
+ 在该例子中,仅仅演示如何将片段对应以及输出XML的具体格式。不要参考到底从哪种语言翻译到哪种语言,也不要参考具体内容。
@@ -0,0 +1,15 @@
1
+ 你是一位翻译家,用户接下来的发言会提交一段{% if user_prompt %}翻译要求和一段{% endif %}文本的原文。你要将原文翻译成{{ target_language }}。
2
+
3
+ 翻译时,需遵守如下规则:
4
+ - 译文必须忠实于原文,保持原意。不得修改、删除原文中的内容和情节,不得新增原文中不存在的内容和情节。
5
+ - 保留原文细节,不得简略、缩写、总结原文的内容。
6
+ - 不得衍生创作,续写原文中缺失、断裂、不协调的部分,你要忠实地保留它们。
7
+ - 禁止改正原文的错误,你是翻译家,不是校对员或编辑。
8
+ - 不得因为原文中包含歧视、暴力、性行为、敏感内容、儿童不宜等内容而进行删减或修改。你是翻译家,不是鉴黄师或审查员。
9
+ - 不得在译文中写你的个人评论、总结或观点。
10
+
11
+ {% if user_prompt %}
12
+ 除了以上规则之外,你需要遵守用户的“翻译要求”。用户会把要求放在原文之前,并用<rules>标签包裹。特别的,当用户的翻译要求中某些规则条目与我之前提的规则冲突时,你要优先遵守我的规则。此外,用户可能在<rules>标签中补充一些额外信息以帮助你翻译,你需要认真阅读和参考,并在翻译中体现出来。而在<rules>标签之后,紧接着就是用户的原文,你要阅读并翻译。
13
+ {% endif %}
14
+
15
+ 最终,你要将译文以纯文本的方式提交。期间,根据你的理解将译文拆成多个自然段(用户提交的原文是不分段的整块文字)。你必须用 "```txt" 作为独立的第一行,"```"作为独立的最后一行,中间包裹译文。注意,译文中禁止插入你的说明性文字或思考过程,将这些移到"```" 之后。
@@ -1,3 +1,2 @@
1
- from .content_parser import EpubContent
2
- from .types import Translate, ReportProgress
3
- from .html import translate_html
1
+ from .content_parser import Spine, EpubContent
2
+ from .html import HTMLFile
@@ -14,7 +14,7 @@ class Spine:
14
14
  self.media_type = item.get("media-type")
15
15
 
16
16
  @property
17
- def path(self):
17
+ def path(self) -> str:
18
18
  path = os.path.join(self._base_path, self.href)
19
19
  path = os.path.abspath(path)
20
20
 
@@ -66,7 +66,7 @@ class EpubContent:
66
66
  return path
67
67
 
68
68
  @property
69
- def spines(self):
69
+ def spines(self) -> list[Spine]:
70
70
  idref_dict = {}
71
71
  index = 0
72
72
 
@@ -1 +1 @@
1
- from .file import translate_html
1
+ from .file import HTMLFile
@@ -1,7 +1,7 @@
1
1
  import re
2
2
 
3
+ from typing import Iterable
3
4
  from xml.etree.ElementTree import fromstring, tostring, Element
4
- from ..types import Translate, ReportProgress
5
5
  from .dom_operator import read_texts, append_texts
6
6
  from .empty_tags import to_xml, to_html
7
7
 
@@ -10,54 +10,69 @@ _FILE_HEAD_PATTERN = re.compile(r"^<\?xml.*?\?>[\s]*<!DOCTYPE.*?>")
10
10
  _XMLNS_IN_TAG = re.compile(r"\{[^}]+\}")
11
11
  _BRACES = re.compile(r"(\{|\})")
12
12
 
13
- def translate_html(translate: Translate, file_content: str, report_progress: ReportProgress) -> str:
14
- match = re.match(_FILE_HEAD_PATTERN, file_content)
15
- head = match.group() if match else None
16
- xml_content = re.sub(_FILE_HEAD_PATTERN, "", to_xml(file_content))
13
+ class HTMLFile:
14
+ def __init__(self, file_content: str):
15
+ match = re.match(_FILE_HEAD_PATTERN, file_content)
16
+ xml_content = re.sub(_FILE_HEAD_PATTERN, "", to_xml(file_content))
17
+ self._head: str = match.group() if match else None
18
+ self._root: Element = fromstring(xml_content)
19
+ self._xmlns: str | None = self._extract_xmlns(self._root)
20
+ self._texts_length: int | None = None
17
21
 
18
- root = fromstring(xml_content)
19
- root_attrib = {**root.attrib}
20
- xmlns = _extract_xmlns(root)
22
+ def _extract_xmlns(self, root: Element) -> str | None:
23
+ root_xmlns: str | None = None
24
+ for i, element in enumerate(_all_elements(root)):
25
+ need_clean_xmlns = True
26
+ match = re.match(_XMLNS_IN_TAG, element.tag)
21
27
 
22
- source_texts = list(read_texts(root))
23
- target_texts = translate(source_texts, report_progress)
24
- append_texts(root, target_texts)
28
+ if match:
29
+ xmlns = re.sub(_BRACES, "", match.group())
30
+ if i == 0:
31
+ root_xmlns = xmlns
32
+ elif root_xmlns != xmlns:
33
+ need_clean_xmlns = False
34
+ if need_clean_xmlns:
35
+ element.tag = re.sub(_XMLNS_IN_TAG, "", element.tag)
25
36
 
26
- if xmlns is not None:
27
- root_attrib["xmlns"] = xmlns
28
- root.attrib = root_attrib
37
+ return root_xmlns
29
38
 
30
- if xmlns is None:
31
- file_content = tostring(root, encoding="unicode")
32
- file_content = to_html(file_content)
33
- else:
34
- # XHTML disable <tag/> (we need replace them with <tag></tag>)
35
- for element in _all_elements(root):
36
- if element.text is None:
37
- element.text = ""
38
- file_content = tostring(root, encoding="unicode")
39
+ def read_texts(self) -> list[str]:
40
+ texts = list(read_texts(self._root))
41
+ self._texts_length = len(texts)
42
+ return texts
39
43
 
40
- if head is not None:
41
- file_content = head + file_content
44
+ def write_texts(self, texts: Iterable[str]):
45
+ append_texts(self._root, texts)
42
46
 
43
- return file_content
47
+ @property
48
+ def texts_length(self) -> int:
49
+ if self._texts_length is None:
50
+ self._texts_length = 0
51
+ for _ in read_texts(self._root):
52
+ self._texts_length += 1
53
+ return self._texts_length
44
54
 
45
- def _extract_xmlns(root: Element) -> str | None:
46
- root_xmlns: str | None = None
47
- for i, element in enumerate(_all_elements(root)):
48
- need_clean_xmlns = True
49
- match = re.match(_XMLNS_IN_TAG, element.tag)
55
+ @property
56
+ def file_content(self) -> str:
57
+ file_content: str
58
+ if self._xmlns is None:
59
+ file_content = tostring(self._root, encoding="unicode")
60
+ file_content = to_html(file_content)
61
+ else:
62
+ root = Element(
63
+ self._root.tag,
64
+ attrib={**self._root.attrib, "xmlns": self._xmlns},
65
+ )
66
+ root.extend(self._root)
67
+ # XHTML disable <tag/> (we need replace them with <tag></tag>)
68
+ for element in _all_elements(root):
69
+ if element.text is None:
70
+ element.text = ""
71
+ file_content = tostring(root, encoding="unicode")
50
72
 
51
- if match:
52
- xmlns = re.sub(_BRACES, "", match.group())
53
- if i == 0:
54
- root_xmlns = xmlns
55
- elif root_xmlns != xmlns:
56
- need_clean_xmlns = False
57
- if need_clean_xmlns:
58
- element.tag = re.sub(_XMLNS_IN_TAG, "", element.tag)
59
-
60
- return root_xmlns
73
+ if self._head is not None:
74
+ file_content = self._head + file_content
75
+ return file_content
61
76
 
62
77
  def _all_elements(parent: Element):
63
78
  yield parent
@@ -12,7 +12,8 @@ class TextPosition(Enum):
12
12
  TextDescription = tuple[Element, TextPosition, Element | None]
13
13
 
14
14
  _IGNORE_TAGS = (
15
- "title", "link", "style", "css", "img", "script", "metadata"
15
+ "title", "link", "style", "css", "img", "script", "metadata",
16
+ "{http://www.w3.org/1998/Math/MathML}math", # TODO: 公式是正文,也要读进去,暂时忽略避免扰乱得了。
16
17
  )
17
18
 
18
19
  _TEXT_LEAF_TAGS = (
@@ -0,0 +1 @@
1
+ from .node import LLM
@@ -0,0 +1,49 @@
1
+ import openai
2
+ import httpx
3
+ import requests
4
+
5
+
6
def is_retry_error(err: Exception) -> bool:
    """Return True when ``err`` is a transient failure worth retrying."""
    return (
        _is_openai_retry_error(err)
        or _is_httpx_retry_error(err)
        or _is_request_retry_error(err)
    )


# https://help.openai.com/en/articles/6897213-openai-library-error-types-guidance
def _is_openai_retry_error(err: Exception) -> bool:
    # Timeouts and dropped connections are always retryable; internal server
    # errors only for the gateway-style status codes.
    if isinstance(err, (openai.Timeout, openai.APIConnectionError)):
        return True
    if isinstance(err, openai.InternalServerError):
        return err.status_code in (502, 503, 504)
    return False


# https://www.python-httpx.org/exceptions/
def _is_httpx_retry_error(err: Exception) -> bool:
    retryable = (
        httpx.RemoteProtocolError,
        httpx.StreamError,
        httpx.TimeoutException,
        httpx.NetworkError,
        httpx.ProtocolError,
    )
    return isinstance(err, retryable)


# https://requests.readthedocs.io/en/latest/api/#exceptions
def _is_request_retry_error(err: Exception) -> bool:
    retryable = (
        requests.ConnectionError,
        requests.ConnectTimeout,
        requests.ReadTimeout,
        requests.Timeout,
    )
    return isinstance(err, retryable)
@@ -0,0 +1,147 @@
1
+ from typing import cast, Any, Callable
2
+ from io import StringIO
3
+ from time import sleep
4
+ from pydantic import SecretStr
5
+ from logging import Logger
6
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
7
+ from langchain_core.language_models import LanguageModelInput
8
+ from langchain_openai import ChatOpenAI
9
+
10
+ from .increasable import Increasable, Increaser
11
+ from .error import is_retry_error
12
+
13
+
14
class LLMExecutor:
    """Sends requests to an OpenAI-compatible chat model with retry handling.

    Two failure modes are retried independently:
      - transient network errors (as classified by ``is_retry_error``), retried
        unchanged after a pause, and
      - parser failures on the response text, in which case ``top_p`` and
        ``temperature`` are increased to encourage a different completion.
    """

    def __init__(
        self,
        api_key: SecretStr,
        url: str,
        model: str,
        timeout: float | None,
        top_p: Increasable,
        temperature: Increasable,
        retry_times: int,
        retry_interval_seconds: float,
        create_logger: Callable[[], Logger | None],
    ) -> None:
        # ``create_logger`` returns a fresh per-request logger, or None to
        # disable logging; it is called once per request() invocation.
        self._timeout: float | None = timeout
        self._top_p: Increasable = top_p
        self._temperature: Increasable = temperature
        self._retry_times: int = retry_times
        self._retry_interval_seconds: float = retry_interval_seconds
        self._create_logger: Callable[[], Logger | None] = create_logger
        self._model = ChatOpenAI(
            api_key=cast(SecretStr, api_key),
            base_url=url,
            model=model,
            timeout=timeout,
        )

    def request(self, input: LanguageModelInput, parser: Callable[[str], Any]) -> Any:
        """Invoke the model and return ``parser``'s result, retrying on failure.

        Makes at most ``retry_times + 1`` attempts. Network errors rejected by
        ``is_retry_error`` are re-raised immediately; retryable ones wait
        ``retry_interval_seconds`` before the next attempt. When ``parser``
        raises, the sampling parameters are increased before retrying. Raises
        the last seen error when every attempt fails.
        """
        result: Any | None = None
        last_error: Exception | None = None
        # ``result`` may legitimately be None, so success is tracked separately.
        did_success = False
        top_p: Increaser = self._top_p.context()
        temperature: Increaser = self._temperature.context()
        logger = self._create_logger()

        if logger is not None:
            logger.debug(f"[[Request]]:\n{self._input2str(input)}\n")

        try:
            for i in range(self._retry_times + 1):
                try:
                    response = self._invoke_model(
                        input=input,
                        top_p=top_p.current,
                        temperature=temperature.current,
                    )
                    if logger is not None:
                        logger.debug(f"[[Response]]:\n{response}\n")

                except Exception as err:
                    last_error = err
                    # Non-transient errors abort immediately.
                    if not is_retry_error(err):
                        raise err
                    if logger is not None:
                        logger.warning(f"request failed with connection error, retrying... ({i + 1} times)")
                    # No pause after the final attempt.
                    if self._retry_interval_seconds > 0.0 and \
                        i < self._retry_times:
                        sleep(self._retry_interval_seconds)
                    continue

                try:
                    result = parser(response)
                    did_success = True
                    break

                except Exception as err:
                    last_error = err
                    warn_message = f"request failed with parsing error, retrying... ({i + 1} times)"
                    if logger is not None:
                        logger.warning(warn_message)
                    print(warn_message)
                    # Nudge sampling toward the upper bound so the next
                    # completion differs from the unparseable one.
                    top_p.increase()
                    temperature.increase()
                    if self._retry_interval_seconds > 0.0 and \
                        i < self._retry_times:
                        sleep(self._retry_interval_seconds)
                    continue

        except KeyboardInterrupt as err:
            # Record the pending error before surfacing the interrupt.
            if last_error is not None and logger is not None:
                logger.debug(f"[[Error]]:\n{last_error}\n")
            raise err

        if not did_success:
            if last_error is None:
                raise RuntimeError("Request failed with unknown error")
            else:
                raise last_error

        return result

    def _input2str(self, input: LanguageModelInput) -> str:
        """Render a model input as readable text for the request log."""
        if isinstance(input, str):
            return input
        if not isinstance(input, list):
            raise ValueError(f"Unsupported input type: {type(input)}")

        buffer = StringIO()
        is_first = True
        for message in input:
            if not is_first:
                buffer.write("\n\n")
            if isinstance(message, SystemMessage):
                buffer.write("System:\n")
                buffer.write(message.content)
            elif isinstance(message, HumanMessage):
                buffer.write("User:\n")
                buffer.write(message.content)
            elif isinstance(message, AIMessage):
                buffer.write("Assistant:\n")
                buffer.write(message.content)
            else:
                buffer.write(str(message))
            is_first = False

        return buffer.getvalue()

    def _invoke_model(
        self,
        input: LanguageModelInput,
        top_p: float | None,
        temperature: float | None,
    ) -> str:
        """Stream one completion and return the concatenated chunk contents."""
        stream = self._model.stream(
            input=input,
            timeout=self._timeout,
            top_p=top_p,
            temperature=temperature,
        )
        buffer = StringIO()
        for chunk in stream:
            data = str(chunk.content)
            buffer.write(data)
        return buffer.getvalue()
@@ -0,0 +1,35 @@
1
+ class Increaser:
2
+ def __init__(self, value_range: tuple[float, float] | None):
3
+ self._value_range: tuple[float, float] | None = value_range
4
+ self._current: float | None = value_range[0] if value_range is not None else None
5
+
6
+ @property
7
+ def current(self) -> float | None:
8
+ return self._current
9
+
10
+ def increase(self):
11
+ if self._value_range is None:
12
+ return
13
+ _, end_value = self._value_range
14
+ self._current = self._current + 0.5 * (end_value - self._current)
15
+
16
+ class Increasable:
17
+ def __init__(self, param: float | tuple[float, float] | None):
18
+ self._value_range: tuple[float, float] | None = None
19
+
20
+ if isinstance(param, int):
21
+ param = float(param)
22
+ if isinstance(param, float):
23
+ param = (param, param)
24
+ if isinstance(param, tuple):
25
+ if len(param) != 2:
26
+ raise ValueError(f"Expected a tuple of length 2, got {len(param)}")
27
+ begin, end = param
28
+ if isinstance(begin, int):
29
+ begin = float(begin)
30
+ if isinstance(end, int):
31
+ end = float(end)
32
+ self._value_range = (begin, end)
33
+
34
+ def context(self) -> Increaser:
35
+ return Increaser(self._value_range)
@@ -0,0 +1,197 @@
1
+ import datetime
2
+
3
+ from os import PathLike
4
+ from pathlib import Path
5
+ from typing import cast, Any, TypeVar, Generator, Sequence, Callable
6
+ from importlib.resources import files
7
+ from jinja2 import Environment, Template
8
+ from xml.etree.ElementTree import Element
9
+ from pydantic import SecretStr
10
+ from logging import getLogger, DEBUG, Formatter, Logger, FileHandler
11
+ from tiktoken import get_encoding, Encoding
12
+ from langchain_core.messages import SystemMessage, HumanMessage
13
+
14
+ from ..template import create_env
15
+ from ..xml import decode_friendly, encode_friendly
16
+ from .increasable import Increasable
17
+ from .executor import LLMExecutor
18
+
19
+
20
+ R = TypeVar("R")
21
+
22
class LLM:
    """High-level client for prompt-template-driven LLM requests.

    Renders Jinja templates bundled under ``epub_translator/data`` into system
    prompts, sends them through an ``LLMExecutor``, and parses the model output
    either as a fenced text block or as an XML ``<response>`` element. Also
    exposes tiktoken-based token counting helpers.
    """

    def __init__(
        self,
        key: str,
        url: str,
        model: str,
        token_encoding: str,
        timeout: float | None = None,
        top_p: float | tuple[float, float] | None = None,
        temperature: float | tuple[float, float] | None = None,
        retry_times: int = 5,
        retry_interval_seconds: float = 6.0,
        log_dir_path: PathLike | None = None,
    ):
        # Templates are loaded lazily from the packaged "data" directory and
        # cached in self._templates (see _template()).
        prompts_path = files("epub_translator") / "data"
        self._templates: dict[str, Template] = {}
        self._encoding: Encoding = get_encoding(token_encoding)
        self._env: Environment = create_env(prompts_path)
        self._logger_save_path: Path | None = None

        if log_dir_path is not None:
            self._logger_save_path = Path(log_dir_path)
            if not self._logger_save_path.exists():
                self._logger_save_path.mkdir(parents=True, exist_ok=True)
            elif not self._logger_save_path.is_dir():
                # An existing non-directory path silently disables logging.
                self._logger_save_path = None

        self._executor = LLMExecutor(
            url=url,
            model=model,
            api_key=cast(SecretStr, key),
            timeout=timeout,
            top_p=Increasable(top_p),
            temperature=Increasable(temperature),
            retry_times=retry_times,
            retry_interval_seconds=retry_interval_seconds,
            create_logger=self._create_logger,
        )

    def _create_logger(self) -> Logger | None:
        """Create a one-off file logger for a single request, or None when disabled.

        Each call builds a uniquely named logger that writes to
        ``request <timestamp>.log`` inside the configured log directory.
        NOTE(review): the FileHandler is never removed or closed, so loggers
        and open file handles accumulate across requests — confirm intended.
        """
        if self._logger_save_path is None:
            return None

        now = datetime.datetime.now(datetime.timezone.utc)
        timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
        file_path = self._logger_save_path / f"request {timestamp}.log"
        logger = getLogger(f"LLM Request {timestamp}")
        logger.setLevel(DEBUG)
        handler = FileHandler(file_path, encoding="utf-8")
        handler.setLevel(DEBUG)
        handler.setFormatter(Formatter("%(asctime)s %(message)s", "%H:%M:%S"))
        logger.addHandler(handler)

        return logger

    def request_text(
        self,
        template_name: str,
        text_tag: str,
        user_data: Element | str,
        parser: Callable[[str], R],
        params: dict[str, Any] | None = None,
    ) -> R:
        """Send a templated request and parse the first fenced ```<text_tag> block.

        Raises ValueError (inside the executor's retry loop) when the response
        contains no such fenced block.
        """
        if params is None:
            params = {}

        def parse_response(response: str) -> R:
            text = next(self._search_quotes(text_tag.lower(), response), None)
            if text is None:
                raise ValueError(f"No valid {text_tag} response found")
            return parser(text)

        return self._executor.request(
            input=self._create_input(template_name, user_data, params),
            parser=parse_response,
        )

    def request_xml(
        self,
        template_name: str,
        user_data: Element | str,
        parser: Callable[[Element], R],
        params: dict[str, Any] | None = None,
    ) -> R:
        """Send a templated request and parse the first ``<response>`` XML element.

        Raises ValueError (inside the executor's retry loop) when no parseable
        ``<response>`` element is found in the model output.
        """
        if params is None:
            params = {}

        def parse_response(response: str) -> R:
            element = next(decode_friendly(response, "response"), None)
            if element is None:
                raise ValueError("No valid XML response found")
            return parser(element)

        return self._executor.request(
            input=self._create_input(template_name, user_data, params),
            parser=parse_response,
        )

    def _create_input(self, template_name: str, user_data: Element | str, params: dict[str, Any]):
        """Build the [system, human] message pair for one request.

        Element user data is serialized and wrapped in a ```XML fence; plain
        strings are passed through unchanged.
        """
        data: str
        if isinstance(user_data, Element):
            data = encode_friendly(user_data)
            data = f"```XML\n{data}\n```"
        else:
            data = user_data

        template = self._template(template_name)
        prompt = template.render(**params)
        return [
            SystemMessage(content=prompt),
            HumanMessage(content=data)
        ]

    def prompt_tokens_count(self, template_name: str, params: dict[str, Any]) -> int:
        """Return the token count of the rendered template."""
        template = self._template(template_name)
        prompt = template.render(**params)
        return len(self._encoding.encode(prompt))

    def encode_tokens(self, text: str) -> list[int]:
        """Encode ``text`` into token ids using the configured encoding."""
        return self._encoding.encode(text)

    def decode_tokens(self, tokens: Sequence[int]) -> str:
        """Decode token ids back into text."""
        return self._encoding.decode(tokens)

    def count_tokens_count(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to."""
        return len(self._encoding.encode(text))

    def _template(self, template_name: str) -> Template:
        """Fetch a Jinja template, caching it after the first load."""
        template = self._templates.get(template_name, None)
        if template is None:
            template = self._env.get_template(template_name)
            self._templates[template_name] = template
        return template

    def _search_quotes(self, kind: str, response: str) -> Generator[str, None, None]:
        """Yield the stripped contents of every ```<kind> ... ``` fenced block.

        Marker matching is case-insensitive; unterminated fences are ignored.
        """
        start_marker = f"```{kind}"
        end_marker = "```"
        start_index = 0

        while True:
            start_index = self._find_ignore_case(
                raw=response,
                sub=start_marker,
                start=start_index,
            )
            if start_index == -1:
                break

            end_index = self._find_ignore_case(
                raw=response,
                sub=end_marker,
                start=start_index + len(start_marker),
            )
            if end_index == -1:
                break

            extracted_text = response[start_index + len(start_marker):end_index].strip()
            yield extracted_text
            start_index = end_index + len(end_marker)

    def _find_ignore_case(self, raw: str, sub: str, start: int = 0) -> int:
        """Case-insensitive ``str.find``; returns -1 when ``sub`` is absent.

        Compares per-character with ``lower()`` so indices into ``raw`` stay
        valid (lowercasing the whole string could change its length).
        """
        if not sub:
            # Mirror str.find: an empty needle matches at position 0 only when
            # the search starts at or before it.
            return 0 if 0 >= start else -1

        raw_len, sub_len = len(raw), len(sub)
        for i in range(start, raw_len - sub_len + 1):
            match = True
            for j in range(sub_len):
                if raw[i + j].lower() != sub[j].lower():
                    match = False
                    break
            if match:
                return i
        return -1