hamtaa-texttools 0.1.54__py3-none-any.whl → 0.1.55__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hamtaa-texttools
3
- Version: 0.1.54
3
+ Version: 0.1.55
4
4
  Summary: A set of high-level NLP tools
5
5
  Author: Tohidi, Montazer, Givechi, Mousavinezhad
6
6
  Requires-Python: >=3.8
@@ -50,12 +50,12 @@ texttools/tools/summarizer/__init__.py,sha256=phrR7qO20CNhO3hjXQBzhTRVumdVdGSufm
50
50
  texttools/tools/summarizer/gemma_summarizer.py,sha256=ikhsBv7AiZD1dT_d12AyjXxojzSW92e2y5WjchI_3bE,4474
51
51
  texttools/tools/summarizer/llm_summerizer.py,sha256=-0rUKbSnl1aDeBfJ5DCSbIlwd2k-9qIaCKgoQJa0hWc,3412
52
52
  texttools/tools/translator/__init__.py,sha256=KO1m08J2BZwRqBGO9ICB4l4cnH1jfHLHL5HbgYFUWM8,72
53
- texttools/tools/translator/gemma_translator.py,sha256=4bW9wVIkrlYDhWaOWB2sN7oC0xzeWJ-rfKRnp_lGrp4,7259
53
+ texttools/tools/translator/gemma_translator.py,sha256=KsKbD_hzbOmFt-K0pciZ1IXz66JGm480FdBqWL2mYL0,7272
54
54
  texttools/utils/flex_processor.py,sha256=C-lMwMjpIM6uAPFxXdgajxcFV1ccngEfJqq6xe5S1J8,3123
55
55
  texttools/utils/batch_manager/__init__.py,sha256=3ZkxA395lRD4gNxJ1vp0fNuz_XuBr50GoP51rrwQ0Ks,87
56
56
  texttools/utils/batch_manager/batch_manager.py,sha256=jAmKskL3OTYwwsO1mWsWAB3VxMlOF07c2GW1Ev83ZhY,9283
57
57
  texttools/utils/batch_manager/batch_runner.py,sha256=DE6TFz3i_jR-ZiUYbgIdLgjqr3aitw-JM_tKnSvzGL0,7424
58
- hamtaa_texttools-0.1.54.dist-info/METADATA,sha256=ad_jTTDOoADppaC7jik-hrxEuWc5aOwtz5_XFW1dTp0,1481
59
- hamtaa_texttools-0.1.54.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
60
- hamtaa_texttools-0.1.54.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
61
- hamtaa_texttools-0.1.54.dist-info/RECORD,,
58
+ hamtaa_texttools-0.1.55.dist-info/METADATA,sha256=-WVDAY_TTcDZwiM8YkCsrA_qy8dlO669LM2oEPtYiA4,1481
59
+ hamtaa_texttools-0.1.55.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
60
+ hamtaa_texttools-0.1.55.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
61
+ hamtaa_texttools-0.1.55.dist-info/RECORD,,
@@ -1,3 +1,4 @@
1
+ import json
1
2
  from typing import Any, List, Optional
2
3
 
3
4
  from openai import OpenAI
@@ -19,7 +20,7 @@ class PreprocessorOutput(BaseModel):
19
20
 
20
21
  class GemmaTranslator(BaseTranslator):
21
22
  """
22
- Translator for Gemma-style models with optional reasoning step.
23
+ Translator for Gemma-style models using structured JSON prompts.
23
24
  Outputs only the translated text, without any additional structure.
24
25
  """
25
26
 
@@ -52,57 +53,61 @@ class GemmaTranslator(BaseTranslator):
52
53
  reason: Optional[str] = None,
53
54
  proper_names: Optional[list[str]] = None,
54
55
  ) -> list[dict[str, str]]:
55
- messages: list[dict[str, str]] = []
56
-
57
- # This prompt gives initial information about translation like languages and proper names
58
- enforce_prompt = f"""
59
- You are a {source_language}-to-{target_language} translator.
60
- Important Rule: The following are proper names and must NOT be translated.
61
- They must be only transliterated into {target_language}.
62
- That means preserving their phonetic form without changing their meaning.
63
- Apply the rule for **ALL** of following proper names.
64
- Proper names (do not translate **** of them):
65
- {proper_names if proper_names else "None"}
66
- If any proper name is found in the text, you MUST only transliterate it.
67
- Output only the translated text. No comments, no explanations, no markdown.
68
- """
69
- messages.append({"role": "user", "content": enforce_prompt})
70
-
71
- clean_text = text
56
+ """Constructs a single, comprehensive JSON prompt for the translation task."""
57
+
58
+ prompt_data = {
59
+ "role": "Expert Translator",
60
+ "task": f"Translate the following text from {source_language or 'the original language'} to {target_language}.",
61
+ "input_text": text,
62
+ "rules": {
63
+ "proper_names": {
64
+ "instruction": "These names MUST NOT be translated. Only transliterate them to preserve their phonetic form.",
65
+ "list": proper_names if proper_names else "None",
66
+ }
67
+ },
68
+ "output_instructions": [
69
+ "Provide ONLY the translated text.",
70
+ "Do not include any explanations, comments, or markdown formatting.",
71
+ ],
72
+ }
73
+
72
74
  if reason:
73
- reason_prompt = f"""
74
- Based on the analysis conducted, translate the following text {"from" + source_language if source_language else ""} to {target_language}.
75
- The text to be translated is: "{clean_text}"
76
- The analysis conducted: {reason}
77
- """
78
- messages.append({"role": "user", "content": reason_prompt})
79
- else:
80
- regular_prompt = f"""Translate the following text from {source_language or "original"} to {target_language}:
81
- {clean_text}"""
82
- messages.append({"role": "user", "content": regular_prompt})
83
-
84
- # Optional additional template
75
+ prompt_data["context"] = {
76
+ "preliminary_analysis": reason,
77
+ "instruction": "Use this analysis to inform the translation.",
78
+ }
79
+
80
+ # The entire set of instructions is formatted into a single JSON string
81
+ content = json.dumps(prompt_data, indent=2)
82
+ messages = [{"role": "user", "content": content}]
83
+
84
+ # Optional additional JSON template for more complex rules
85
85
  if self.prompt_template:
86
86
  messages.append({"role": "user", "content": self.prompt_template})
87
87
 
88
- restructured = self.chat_formatter.format(messages=messages)
89
-
90
- return restructured
88
+ return self.chat_formatter.format(messages=messages)
91
89
 
92
90
  def _reason(self, text: str, target_language: str) -> str:
93
- """
94
- Internal reasoning step to help the model with translation.
95
- """
96
-
97
- reason_step_prompt = f"""
98
- Analyze the following text and identify important linguistic considerations for translation.
99
- Do not translate the text. Point out any idioms, cultural references, or complex structures that need special attention.
100
- Also, list all proper nouns that should not be translated. Write your analysis in the {target_language}.
101
- """
102
- messages = [
103
- {"role": "user", "content": reason_step_prompt},
104
- {"role": "user", "content": text},
105
- ]
91
+ """Internal reasoning step using a JSON prompt to analyze text before translation."""
92
+
93
+ prompt_data = {
94
+ "task": "Analyze the provided text to identify potential translation challenges.",
95
+ "analysis_points": [
96
+ "Identify idioms or colloquialisms.",
97
+ "Note any cultural references.",
98
+ "Point out complex grammatical structures.",
99
+ "List all proper nouns that should be transliterated, not translated.",
100
+ ],
101
+ "input_text": text,
102
+ "output_instructions": {
103
+ "language": target_language,
104
+ "format": "A concise, bulleted list.",
105
+ "important_rule": "DO NOT TRANSLATE the original text.",
106
+ "length": "must be less than 200 words.",
107
+ },
108
+ }
109
+
110
+ messages = [{"role": "user", "content": json.dumps(prompt_data, indent=2)}]
106
111
 
107
112
  restructured = self.chat_formatter.format(messages=messages)
108
113
  completion = self.client.chat.completions.create(
@@ -114,67 +119,66 @@ class GemmaTranslator(BaseTranslator):
114
119
 
115
120
  return completion.choices[0].message.content.strip()
116
121
 
117
- def preprocess(self, text: str) -> list:
118
- """
119
- Preprocessor that finds proper names of Islamic figures. The extractions will be given to the
120
- LLm in order to know that it shouldn't translate them, but transliterate them.
121
- """
122
-
123
- messages: list[dict[str, str]] = []
122
+ def preprocess(self, text: str) -> PreprocessorOutput:
123
+ """Preprocessor that finds proper names using a structured JSON prompt."""
124
124
 
125
- main_prompt = """
126
- You must detect proper names of people.
127
- Your task is to extract a JSON list of entities from the given input. For each entity, include:
128
- text: The exact matched string from the original.
129
- type: Only include "Proper Name" for actual names of real people.
130
- If there is no proper name in the following text, return empty json.
131
- """
132
- messages.append({"role": "user", "content": main_prompt})
125
+ prompt_data = {
126
+ "task_description": "Extract all proper names of people from the provided text.",
127
+ "input_text": text,
128
+ "output_format": {
129
+ "schema": {"entities": ["string"]},
130
+ "instruction": "Return a JSON object matching this schema. If no names are found, the 'entities' list must be empty.",
131
+ },
132
+ }
133
133
 
134
- text_prompt = f"""The text to be extracted is:{text}"""
135
- messages.append({"role": "user", "content": text_prompt})
134
+ messages = [{"role": "user", "content": json.dumps(prompt_data, indent=2)}]
136
135
 
137
136
  restructured = self.chat_formatter.format(messages=messages)
137
+
138
138
  completion = self.client.chat.completions.parse(
139
139
  model=self.model,
140
140
  messages=restructured,
141
- response_format=PreprocessorOutput,
141
+ response_model=PreprocessorOutput,
142
142
  temperature=self.temperature,
143
143
  extra_body={
144
144
  "guided_decoding_backend": "auto",
145
145
  },
146
146
  **self.client_kwargs,
147
147
  )
148
- message = completion.choices[0].message
149
148
 
150
- entities = message.parsed
151
- return entities
149
+ return completion.choices[0].message.parsed
152
150
 
153
151
  def translate(
154
152
  self, text: str, target_language: str, source_language: Optional[str] = None
155
153
  ) -> str:
156
- """
157
- Translates text and returns only the translated string.
158
- """
154
+ """Translates text using a structured JSON-based workflow."""
159
155
 
160
- # Extract proper names to tell the LLM what names not to translate, but to transliterate
161
- extracted = self.preprocess(text)
162
- proper_names = extracted.entities
156
+ # 1. Preprocess: Extract proper names
157
+ extracted_data = self.preprocess(text)
158
+ proper_names = extracted_data.entities
163
159
 
160
+ # 2. Reason (optional): Analyze the text for challenges
164
161
  reason_summary = None
165
162
  if self.use_reason:
166
163
  reason_summary = self._reason(text, target_language)
167
164
 
165
+ # 3. Translate: Build the final prompt and get the translation
168
166
  messages = self._build_messages(
169
167
  text, target_language, source_language, reason_summary, proper_names
170
168
  )
169
+
170
+ # For debugging purposes, let's see the final prompt
171
+ print("--- Translation Request ---")
171
172
  print(f"Original: {text}")
172
173
  print(
173
174
  f"Translating to {target_language} from {source_language or 'original'}..."
174
175
  )
175
- print(
176
- f"Reasoning: {reason_summary}" if reason_summary else "Reasoning not used."
177
- )
176
+ if reason_summary:
177
+ print(f"Reasoning Analysis:\n{reason_summary}")
178
+ print("--- Final JSON Prompt Sent to Model ---")
179
+ # Pretty-print the JSON content from the message
180
+ print(json.dumps(json.loads(messages[0]["content"]), indent=2))
181
+ print("---------------------------")
178
182
 
179
183
  completion = self.client.chat.completions.create(
180
184
  model=self.model,