hamtaa-texttools 0.1.54__tar.gz → 0.1.56__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/PKG-INFO +1 -1
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/hamtaa_texttools.egg-info/PKG-INFO +1 -1
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/pyproject.toml +1 -1
- hamtaa_texttools-0.1.56/texttools/tools/translator/gemma_translator.py +199 -0
- hamtaa_texttools-0.1.54/texttools/tools/translator/gemma_translator.py +0 -195
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/README.md +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/hamtaa_texttools.egg-info/SOURCES.txt +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/hamtaa_texttools.egg-info/requires.txt +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/hamtaa_texttools.egg-info/top_level.txt +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/setup.cfg +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/setup.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/tests/test_vllm_output.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_categorizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_keyword_extractor.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_ner_extractor.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_question_detector.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_question_generator.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_question_merger.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_question_rewriter.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_router.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_summarizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_task_performer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/base/base_translator.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/formatter/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/formatter/base.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/formatter/gemma3_formatter.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/handlers/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/handlers/categorizer/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/handlers/categorizer/categorizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/handlers/handlers.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/categorizer/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/categorizer/encoder_model/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/categorizer/llm/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/categorizer/llm/gemma_categorizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/categorizer/llm/openai_categorizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/keyword_extractor/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/keyword_extractor/gemma_extractor.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/merger/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/merger/gemma_question_merger.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/ner/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/ner/gemma_ner_extractor.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/question_detector/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/question_detector/gemma_detector.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/question_detector/llm_detector.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/question_generator/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/question_generator/gemma_question_generator.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/reranker/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/reranker/reranker.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/reranker/scorer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/reranker/sorter.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/rewriter/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/rewriter/gemma_question_rewriter.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/router/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/router/gemma_router.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/subject_to_question/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/subject_to_question/gemma_question_generator.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/summarizer/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/summarizer/gemma_summarizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/summarizer/llm_summerizer.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/tools/translator/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/utils/batch_manager/__init__.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/utils/batch_manager/batch_manager.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/utils/batch_manager/batch_runner.py +0 -0
- {hamtaa_texttools-0.1.54 → hamtaa_texttools-0.1.56}/texttools/utils/flex_processor.py +0 -0

hamtaa_texttools-0.1.56/texttools/tools/translator/gemma_translator.py (added)
@@ -0,0 +1,199 @@
+import json
+from typing import Any, List, Optional
+
+from openai import OpenAI
+from pydantic import BaseModel, Field
+
+from texttools.base.base_translator import BaseTranslator
+from texttools.formatter.gemma3_formatter import Gemma3Formatter
+
+
+class PreprocessorOutput(BaseModel):
+    """
+    List of proper-name strings extracted from the source text.
+    """
+
+    entities: List[str] = Field(
+        description="All proper names found in the text; return an empty list if none."
+    )
+
+
+class GemmaTranslator(BaseTranslator):
+    """
+    Translator for Gemma-style models using structured JSON prompts.
+    Outputs only the translated text, without any additional structure.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        chat_formatter: Optional[Any] = None,
+        use_reason: bool = False,
+        temperature: float = 0.0,
+        prompt_template: str = None,
+        handlers: list[Any] = None,
+        **client_kwargs: Any,
+    ):
+        super().__init__(handlers)
+        self.client: OpenAI = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+        self.chat_formatter = chat_formatter or Gemma3Formatter()
+        self.use_reason = use_reason
+        self.prompt_template = prompt_template
+
+    def _build_messages(
+        self,
+        text: str,
+        target_language: str,
+        source_language: Optional[str] = None,
+        reason: Optional[str] = None,
+        proper_names: Optional[list[str]] = None,
+    ) -> list[dict[str, str]]:
+        """Constructs a single, comprehensive JSON prompt for the translation task."""
+
+        prompt_data = {
+            "role": "Expert Translator",
+            "task": f"Translate the following text from {source_language or 'the original language'} to {target_language}.",
+            "input_text": text,
+            "rules": {
+                "proper_names": {
+                    "instruction": "These names MUST NOT be translated. Only transliterate them to preserve their phonetic form.",
+                    "list": proper_names if proper_names else "None",
+                }
+            },
+            "output_instructions": [
+                "Provide ONLY the translated text.",
+                "Do not include any explanations, comments, or markdown formatting.",
+            ],
+        }
+
+        if reason:
+            prompt_data["context"] = {
+                "preliminary_analysis": reason,
+                "instruction": "Use this analysis to inform the translation.",
+            }
+
+        # The entire set of instructions is formatted into a single JSON string
+        content = json.dumps(prompt_data, indent=2)
+        messages = [{"role": "user", "content": content}]
+
+        # Optional additional JSON template for more complex rules
+        if self.prompt_template:
+            messages.append({"role": "user", "content": self.prompt_template})
+
+        return self.chat_formatter.format(messages=messages)
+
+    def _reason(self, text: str, target_language: str) -> str:
+        """Internal reasoning step using a JSON prompt to analyze text before translation."""
+
+        prompt_data = {
+            "task": "Analyze the provided text to identify potential translation challenges.",
+            "analysis_points": [
+                "Identify idioms or colloquialisms.",
+                "Note any cultural references.",
+                "Point out complex grammatical structures.",
+                "List all proper nouns that should be transliterated, not translated.",
+            ],
+            "input_text": text,
+            "output_instructions": {
+                "language": target_language,
+                "format": "A concise, bulleted list.",
+                "important_rule": "DO NOT TRANSLATE the original text.",
+                "length": "must be less than 200 words.",
+            },
+        }
+
+        messages = [{"role": "user", "content": json.dumps(prompt_data, indent=2)}]
+
+        restructured = self.chat_formatter.format(messages=messages)
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=restructured,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+
+        return completion.choices[0].message.content.strip()
+
+    def preprocess(self, text: str) -> PreprocessorOutput:
+        """Preprocessor that finds proper names using a structured JSON prompt."""
+
+        prompt_data = {
+            "task_description": "Extract all proper names of people from the provided text.",
+            "input_text": text,
+            "output_format": {
+                "schema": {"entities": ["string"]},
+                "instruction": "Return a JSON object matching this schema. If no names are found, the 'entities' list must be empty.",
+            },
+        }
+
+        messages = [{"role": "user", "content": json.dumps(prompt_data, indent=2)}]
+
+        restructured = self.chat_formatter.format(messages=messages)
+
+        completion = self.client.chat.completions.parse(
+            model=self.model,
+            messages=restructured,
+            response_format=PreprocessorOutput,
+            temperature=self.temperature,
+            extra_body={
+                "guided_decoding_backend": "auto",
+            },
+            **self.client_kwargs,
+        )
+
+        return completion.choices[0].message.parsed
+
+    def translate(
+        self, text: str, target_language: str, source_language: Optional[str] = None
+    ) -> str:
+        """Translates text using a structured JSON-based workflow."""
+
+        # 1. Preprocess: Extract proper names
+        extracted_data = self.preprocess(text)
+        proper_names = extracted_data.entities
+
+        # 2. Reason (optional): Analyze the text for challenges
+        reason_summary = None
+        if self.use_reason:
+            reason_summary = self._reason(text, target_language)
+
+        # 3. Translate: Build the final prompt and get the translation
+        messages = self._build_messages(
+            text, target_language, source_language, reason_summary, proper_names
+        )
+
+        # For debugging purposes, let's see the final prompt
+        print("--- Translation Request ---")
+        print(f"Original: {text}")
+        print(
+            f"Translating to {target_language} from {source_language or 'original'}..."
+        )
+        if reason_summary:
+            print(f"Reasoning Analysis:\n{reason_summary}")
+        print("--- Final JSON Prompt Sent to Model ---")
+        # Pretty-print the JSON content from the message
+        print(json.dumps(json.loads(messages[0]["content"]), indent=2))
+        print("---------------------------")
+
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        response = completion.choices[0].message.content
+
+        self._dispatch(
+            {
+                "original_text": text,
+                "source_language": source_language,
+                "target_language": target_language,
+                "translated_text": response,
+            }
+        )
+        return response
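
For context, here is a minimal usage sketch of the GemmaTranslator added in 0.1.56. It is not part of the package diff: the base URL, API key, model name, and sample sentence are placeholders, the import path simply follows the file location shown above, and it assumes an OpenAI-compatible server (such as vLLM) that supports guided decoding for the structured preprocess step.

# Hypothetical usage sketch -- endpoint, key, and model name are placeholders.
from openai import OpenAI

from texttools.tools.translator.gemma_translator import GemmaTranslator

# Point the OpenAI client at an OpenAI-compatible server (e.g. a local vLLM instance).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

translator = GemmaTranslator(
    client,
    model="google/gemma-3-27b-it",  # placeholder model name
    use_reason=True,                # enables the optional pre-translation analysis step
    temperature=0.0,
)

# preprocess() extracts proper names, _reason() analyzes the text (when use_reason=True),
# and translate() builds the JSON prompt and returns only the translated string.
translated = translator.translate(
    "Imam Ali is a central figure in early Islamic history.",
    target_language="Persian",
    source_language="English",
)
print(translated)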

hamtaa_texttools-0.1.54/texttools/tools/translator/gemma_translator.py (removed)
@@ -1,195 +0,0 @@
-from typing import Any, List, Optional
-
-from openai import OpenAI
-from pydantic import BaseModel, Field
-
-from texttools.base.base_translator import BaseTranslator
-from texttools.formatter.gemma3_formatter import Gemma3Formatter
-
-
-class PreprocessorOutput(BaseModel):
-    """
-    List of proper-name strings extracted from the source text.
-    """
-
-    entities: List[str] = Field(
-        description="All proper names found in the text; return an empty list if none."
-    )
-
-
-class GemmaTranslator(BaseTranslator):
-    """
-    Translator for Gemma-style models with optional reasoning step.
-    Outputs only the translated text, without any additional structure.
-    """
-
-    def __init__(
-        self,
-        client: OpenAI,
-        *,
-        model: str,
-        chat_formatter: Optional[Any] = None,
-        use_reason: bool = False,
-        temperature: float = 0.0,
-        prompt_template: str = None,
-        handlers: list[Any] = None,
-        **client_kwargs: Any,
-    ):
-        super().__init__(handlers)
-        self.client: OpenAI = client
-        self.model = model
-        self.temperature = temperature
-        self.client_kwargs = client_kwargs
-        self.chat_formatter = chat_formatter or Gemma3Formatter()
-        self.use_reason = use_reason
-        self.prompt_template = prompt_template
-
-    def _build_messages(
-        self,
-        text: str,
-        target_language: str,
-        source_language: Optional[str] = None,
-        reason: Optional[str] = None,
-        proper_names: Optional[list[str]] = None,
-    ) -> list[dict[str, str]]:
-        messages: list[dict[str, str]] = []
-
-        # This prompt gives initial information about translation like languages and proper names
-        enforce_prompt = f"""
-        You are a {source_language}-to-{target_language} translator.
-        Important Rule: The following are proper names and must NOT be translated.
-        They must be only transliterated into {target_language}.
-        That means preserving their phonetic form without changing their meaning.
-        Apply the rule for **ALL** of following proper names.
-        Proper names (do not translate **** of them):
-        {proper_names if proper_names else "None"}
-        If any proper name is found in the text, you MUST only transliterate it.
-        Output only the translated text. No comments, no explanations, no markdown.
-        """
-        messages.append({"role": "user", "content": enforce_prompt})
-
-        clean_text = text
-        if reason:
-            reason_prompt = f"""
-            Based on the analysis conducted, translate the following text {"from" + source_language if source_language else ""} to {target_language}.
-            The text to be translated is: "{clean_text}"
-            The analysis conducted: {reason}
-            """
-            messages.append({"role": "user", "content": reason_prompt})
-        else:
-            regular_prompt = f"""Translate the following text from {source_language or "original"} to {target_language}:
-            {clean_text}"""
-            messages.append({"role": "user", "content": regular_prompt})
-
-        # Optional additional template
-        if self.prompt_template:
-            messages.append({"role": "user", "content": self.prompt_template})
-
-        restructured = self.chat_formatter.format(messages=messages)
-
-        return restructured
-
-    def _reason(self, text: str, target_language: str) -> str:
-        """
-        Internal reasoning step to help the model with translation.
-        """
-
-        reason_step_prompt = f"""
-        Analyze the following text and identify important linguistic considerations for translation.
-        Do not translate the text. Point out any idioms, cultural references, or complex structures that need special attention.
-        Also, list all proper nouns that should not be translated. Write your analysis in the {target_language}.
-        """
-        messages = [
-            {"role": "user", "content": reason_step_prompt},
-            {"role": "user", "content": text},
-        ]
-
-        restructured = self.chat_formatter.format(messages=messages)
-        completion = self.client.chat.completions.create(
-            model=self.model,
-            messages=restructured,
-            temperature=self.temperature,
-            **self.client_kwargs,
-        )
-
-        return completion.choices[0].message.content.strip()
-
-    def preprocess(self, text: str) -> list:
-        """
-        Preprocessor that finds proper names of Islamic figures. The extractions will be given to the
-        LLm in order to know that it shouldn't translate them, but transliterate them.
-        """
-
-        messages: list[dict[str, str]] = []
-
-        main_prompt = """
-        You must detect proper names of people.
-        Your task is to extract a JSON list of entities from the given input. For each entity, include:
-        text: The exact matched string from the original.
-        type: Only include "Proper Name" for actual names of real people.
-        If there is no proper name in the following text, return empty json.
-        """
-        messages.append({"role": "user", "content": main_prompt})
-
-        text_prompt = f"""The text to be extracted is:{text}"""
-        messages.append({"role": "user", "content": text_prompt})
-
-        restructured = self.chat_formatter.format(messages=messages)
-        completion = self.client.chat.completions.parse(
-            model=self.model,
-            messages=restructured,
-            response_format=PreprocessorOutput,
-            temperature=self.temperature,
-            extra_body={
-                "guided_decoding_backend": "auto",
-            },
-            **self.client_kwargs,
-        )
-        message = completion.choices[0].message
-
-        entities = message.parsed
-        return entities
-
-    def translate(
-        self, text: str, target_language: str, source_language: Optional[str] = None
-    ) -> str:
-        """
-        Translates text and returns only the translated string.
-        """
-
-        # Extract proper names to tell the LLM what names not to translate, but to transliterate
-        extracted = self.preprocess(text)
-        proper_names = extracted.entities
-
-        reason_summary = None
-        if self.use_reason:
-            reason_summary = self._reason(text, target_language)
-
-        messages = self._build_messages(
-            text, target_language, source_language, reason_summary, proper_names
-        )
-        print(f"Original: {text}")
-        print(
-            f"Translating to {target_language} from {source_language or 'original'}..."
-        )
-        print(
-            f"Reasoning: {reason_summary}" if reason_summary else "Reasoning not used."
-        )
-
-        completion = self.client.chat.completions.create(
-            model=self.model,
-            messages=messages,
-            temperature=self.temperature,
-            **self.client_kwargs,
-        )
-        response = completion.choices[0].message.content
-
-        self._dispatch(
-            {
-                "original_text": text,
-                "source_language": source_language,
-                "target_language": target_language,
-                "translated_text": response,
-            }
-        )
-        return response