hamtaa-texttools 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- {hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/METADATA +1 -1
- {hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/RECORD +5 -5
- texttools/tools/translator/gemma_translator.py +81 -77
- {hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/top_level.txt +0 -0
{hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/RECORD

@@ -50,12 +50,12 @@ texttools/tools/summarizer/__init__.py,sha256=phrR7qO20CNhO3hjXQBzhTRVumdVdGSufm
 texttools/tools/summarizer/gemma_summarizer.py,sha256=ikhsBv7AiZD1dT_d12AyjXxojzSW92e2y5WjchI_3bE,4474
 texttools/tools/summarizer/llm_summerizer.py,sha256=-0rUKbSnl1aDeBfJ5DCSbIlwd2k-9qIaCKgoQJa0hWc,3412
 texttools/tools/translator/__init__.py,sha256=KO1m08J2BZwRqBGO9ICB4l4cnH1jfHLHL5HbgYFUWM8,72
-texttools/tools/translator/gemma_translator.py,sha256=
+texttools/tools/translator/gemma_translator.py,sha256=KsKbD_hzbOmFt-K0pciZ1IXz66JGm480FdBqWL2mYL0,7272
 texttools/utils/flex_processor.py,sha256=C-lMwMjpIM6uAPFxXdgajxcFV1ccngEfJqq6xe5S1J8,3123
 texttools/utils/batch_manager/__init__.py,sha256=3ZkxA395lRD4gNxJ1vp0fNuz_XuBr50GoP51rrwQ0Ks,87
 texttools/utils/batch_manager/batch_manager.py,sha256=jAmKskL3OTYwwsO1mWsWAB3VxMlOF07c2GW1Ev83ZhY,9283
 texttools/utils/batch_manager/batch_runner.py,sha256=DE6TFz3i_jR-ZiUYbgIdLgjqr3aitw-JM_tKnSvzGL0,7424
-hamtaa_texttools-0.1.
-hamtaa_texttools-0.1.
-hamtaa_texttools-0.1.
-hamtaa_texttools-0.1.
+hamtaa_texttools-0.1.55.dist-info/METADATA,sha256=-WVDAY_TTcDZwiM8YkCsrA_qy8dlO669LM2oEPtYiA4,1481
+hamtaa_texttools-0.1.55.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hamtaa_texttools-0.1.55.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
+hamtaa_texttools-0.1.55.dist-info/RECORD,,
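Each RECORD entry has the form path,sha256=<digest>,<size-in-bytes>, where the digest is the URL-safe base64 encoding of the file's SHA-256 hash with trailing '=' padding stripped. As a minimal sketch (not part of this package), such an entry can be recomputed from an unpacked wheel to verify, for example, the new gemma_translator.py line:

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Recompute a wheel RECORD-style line: path,sha256=<urlsafe-b64 digest>,<size>.
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# Example call; the path assumes the 0.1.55 wheel has been unpacked into the current directory.
print(record_entry("texttools/tools/translator/gemma_translator.py"))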
texttools/tools/translator/gemma_translator.py

@@ -1,3 +1,4 @@
+import json
 from typing import Any, List, Optional
 
 from openai import OpenAI
@@ -19,7 +20,7 @@ class PreprocessorOutput(BaseModel):
 
 class GemmaTranslator(BaseTranslator):
     """
-    Translator for Gemma-style models
+    Translator for Gemma-style models using structured JSON prompts.
     Outputs only the translated text, without any additional structure.
     """
 
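The hunk header above references class PreprocessorOutput(BaseModel), whose definition is outside this diff. Judging from the preprocess() changes further down (the {"entities": ["string"]} schema and the extracted_data.entities access), its shape is presumably close to the sketch below; the exact definition in the package may differ:

from pydantic import BaseModel

class PreprocessorOutput(BaseModel):
    # Field inferred from the JSON schema and attribute access shown in the diff; not confirmed by it.
    entities: list[str]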
@@ -52,57 +53,61 @@ class GemmaTranslator(BaseTranslator):
         reason: Optional[str] = None,
         proper_names: Optional[list[str]] = None,
     ) -> list[dict[str, str]]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        """Constructs a single, comprehensive JSON prompt for the translation task."""
+
+        prompt_data = {
+            "role": "Expert Translator",
+            "task": f"Translate the following text from {source_language or 'the original language'} to {target_language}.",
+            "input_text": text,
+            "rules": {
+                "proper_names": {
+                    "instruction": "These names MUST NOT be translated. Only transliterate them to preserve their phonetic form.",
+                    "list": proper_names if proper_names else "None",
+                }
+            },
+            "output_instructions": [
+                "Provide ONLY the translated text.",
+                "Do not include any explanations, comments, or markdown formatting.",
+            ],
+        }
+
         if reason:
-
-
-
-
-
-
-
-
-
-
-
-        # Optional additional template
+            prompt_data["context"] = {
+                "preliminary_analysis": reason,
+                "instruction": "Use this analysis to inform the translation.",
+            }
+
+        # The entire set of instructions is formatted into a single JSON string
+        content = json.dumps(prompt_data, indent=2)
+        messages = [{"role": "user", "content": content}]
+
+        # Optional additional JSON template for more complex rules
         if self.prompt_template:
             messages.append({"role": "user", "content": self.prompt_template})
 
-
-
-        return restructured
+        return self.chat_formatter.format(messages=messages)
 
     def _reason(self, text: str, target_language: str) -> str:
-        """
-
-
-
-
-
-
-
-
-
-
-
-
+        """Internal reasoning step using a JSON prompt to analyze text before translation."""
+
+        prompt_data = {
+            "task": "Analyze the provided text to identify potential translation challenges.",
+            "analysis_points": [
+                "Identify idioms or colloquialisms.",
+                "Note any cultural references.",
+                "Point out complex grammatical structures.",
+                "List all proper nouns that should be transliterated, not translated.",
+            ],
+            "input_text": text,
+            "output_instructions": {
+                "language": target_language,
+                "format": "A concise, bulleted list.",
+                "important_rule": "DO NOT TRANSLATE the original text.",
+                "length": "must be less than 200 words.",
+            },
+        }
+
+        messages = [{"role": "user", "content": json.dumps(prompt_data, indent=2)}]
 
         restructured = self.chat_formatter.format(messages=messages)
         completion = self.client.chat.completions.create(
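To make the new prompt format concrete, the following standalone sketch mirrors the _build_messages logic added above; the sample text, languages, and proper names are invented for illustration and are not taken from the package:

import json

# Invented sample inputs for illustration only.
text = "Saadi traveled to Baghdad."
source_language = "English"
target_language = "Persian"
proper_names = ["Saadi", "Baghdad"]

prompt_data = {
    "role": "Expert Translator",
    "task": f"Translate the following text from {source_language or 'the original language'} to {target_language}.",
    "input_text": text,
    "rules": {
        "proper_names": {
            "instruction": "These names MUST NOT be translated. Only transliterate them to preserve their phonetic form.",
            "list": proper_names if proper_names else "None",
        }
    },
    "output_instructions": [
        "Provide ONLY the translated text.",
        "Do not include any explanations, comments, or markdown formatting.",
    ],
}

# The single user message sent to the model is the pretty-printed JSON string.
print(json.dumps(prompt_data, indent=2))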
@@ -114,67 +119,66 @@ class GemmaTranslator(BaseTranslator):
 
         return completion.choices[0].message.content.strip()
 
-    def preprocess(self, text: str) ->
-        """
-        Preprocessor that finds proper names of Islamic figures. The extractions will be given to the
-        LLm in order to know that it shouldn't translate them, but transliterate them.
-        """
-
-        messages: list[dict[str, str]] = []
+    def preprocess(self, text: str) -> PreprocessorOutput:
+        """Preprocessor that finds proper names using a structured JSON prompt."""
 
-
-
-
-
-
-
-
-
+        prompt_data = {
+            "task_description": "Extract all proper names of people from the provided text.",
+            "input_text": text,
+            "output_format": {
+                "schema": {"entities": ["string"]},
+                "instruction": "Return a JSON object matching this schema. If no names are found, the 'entities' list must be empty.",
+            },
+        }
 
-
-        messages.append({"role": "user", "content": text_prompt})
+        messages = [{"role": "user", "content": json.dumps(prompt_data, indent=2)}]
 
         restructured = self.chat_formatter.format(messages=messages)
+
         completion = self.client.chat.completions.parse(
             model=self.model,
             messages=restructured,
-
+            response_model=PreprocessorOutput,
             temperature=self.temperature,
             extra_body={
                 "guided_decoding_backend": "auto",
             },
             **self.client_kwargs,
         )
-        message = completion.choices[0].message
 
-
-        return entities
+        return completion.choices[0].message.parsed
 
     def translate(
         self, text: str, target_language: str, source_language: Optional[str] = None
     ) -> str:
-        """
-        Translates text and returns only the translated string.
-        """
+        """Translates text using a structured JSON-based workflow."""
 
-        # Extract proper names
-
-        proper_names =
+        # 1. Preprocess: Extract proper names
+        extracted_data = self.preprocess(text)
+        proper_names = extracted_data.entities
 
+        # 2. Reason (optional): Analyze the text for challenges
         reason_summary = None
         if self.use_reason:
-            reason_summary = self._reason(text, target_language
+            reason_summary = self._reason(text, target_language)
 
+        # 3. Translate: Build the final prompt and get the translation
         messages = self._build_messages(
             text, target_language, source_language, reason_summary, proper_names
         )
+
+        # For debugging purposes, let's see the final prompt
+        print("--- Translation Request ---")
         print(f"Original: {text}")
         print(
             f"Translating to {target_language} from {source_language or 'original'}..."
         )
-
-            f"Reasoning
-        )
+        if reason_summary:
+            print(f"Reasoning Analysis:\n{reason_summary}")
+        print("--- Final JSON Prompt Sent to Model ---")
+        # Pretty-print the JSON content from the message
+        print(json.dumps(json.loads(messages[0]["content"]), indent=2))
+        print("---------------------------")
 
         completion = self.client.chat.completions.create(
             model=self.model,
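The public entry point after this change is still translate(text, target_language, source_language=None) -> str, as shown in the hunk above. A minimal usage sketch, assuming a GemmaTranslator instance has already been constructed (its constructor is not part of this diff, so configuration is omitted); the sample text and languages are invented for illustration:

# `translator` is assumed to be an already-configured GemmaTranslator instance;
# its constructor arguments are not visible in this diff.
extracted = translator.preprocess("Saadi traveled to Baghdad.")  # PreprocessorOutput with .entities
translated = translator.translate(
    text="Saadi traveled to Baghdad.",
    target_language="Persian",
    source_language="English",
)
print(translated)

Note that the new translate() also prints the original text, the optional reasoning analysis, and the final JSON prompt for debugging before calling the model.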
{hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/WHEEL: file without changes
{hamtaa_texttools-0.1.53.dist-info → hamtaa_texttools-0.1.55.dist-info}/top_level.txt: file without changes