hamtaa-texttools 0.1.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hamtaa-texttools might be problematic.
- hamtaa_texttools-0.1.43.dist-info/METADATA +60 -0
- hamtaa_texttools-0.1.43.dist-info/RECORD +60 -0
- hamtaa_texttools-0.1.43.dist-info/WHEEL +5 -0
- hamtaa_texttools-0.1.43.dist-info/top_level.txt +1 -0
- texttools/__init__.py +26 -0
- texttools/base/__init__.py +3 -0
- texttools/base/base_categorizer.py +40 -0
- texttools/base/base_keyword_extractor.py +35 -0
- texttools/base/base_ner_extractor.py +61 -0
- texttools/base/base_question_detector.py +35 -0
- texttools/base/base_question_generator.py +99 -0
- texttools/base/base_question_merger.py +59 -0
- texttools/base/base_question_rewriter.py +61 -0
- texttools/base/base_router.py +33 -0
- texttools/base/base_summarizer.py +55 -0
- texttools/base/base_task_performer.py +53 -0
- texttools/base/base_translator.py +38 -0
- texttools/batch_manager/__init__.py +2 -0
- texttools/batch_manager/batch_manager.py +241 -0
- texttools/batch_manager/batch_runner.py +207 -0
- texttools/formatter/__init__.py +1 -0
- texttools/formatter/base.py +26 -0
- texttools/formatter/gemma3_formatter.py +51 -0
- texttools/handlers/__init__.py +6 -0
- texttools/handlers/categorizer/__init__.py +6 -0
- texttools/handlers/categorizer/categorizer.py +61 -0
- texttools/handlers/handlers.py +88 -0
- texttools/tools/__init__.py +33 -0
- texttools/tools/categorizer/__init__.py +2 -0
- texttools/tools/categorizer/encoder_model/__init__.py +1 -0
- texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +51 -0
- texttools/tools/categorizer/llm/__init__.py +2 -0
- texttools/tools/categorizer/llm/gemma_categorizer.py +169 -0
- texttools/tools/categorizer/llm/openai_categorizer.py +80 -0
- texttools/tools/keyword_extractor/__init__.py +1 -0
- texttools/tools/keyword_extractor/gemma_extractor.py +138 -0
- texttools/tools/merger/__init__.py +2 -0
- texttools/tools/merger/gemma_question_merger.py +214 -0
- texttools/tools/ner/__init__.py +1 -0
- texttools/tools/ner/gemma_ner_extractor.py +157 -0
- texttools/tools/question_detector/__init__.py +2 -0
- texttools/tools/question_detector/gemma_detector.py +130 -0
- texttools/tools/question_detector/llm_detector.py +112 -0
- texttools/tools/question_generator/__init__.py +1 -0
- texttools/tools/question_generator/gemma_question_generator.py +198 -0
- texttools/tools/reranker/__init__.py +3 -0
- texttools/tools/reranker/reranker.py +137 -0
- texttools/tools/reranker/scorer.py +216 -0
- texttools/tools/reranker/sorter.py +278 -0
- texttools/tools/rewriter/__init__.py +2 -0
- texttools/tools/rewriter/gemma_question_rewriter.py +213 -0
- texttools/tools/router/__init__.py +0 -0
- texttools/tools/router/gemma_router.py +169 -0
- texttools/tools/subject_to_question/__init__.py +1 -0
- texttools/tools/subject_to_question/gemma_question_generator.py +224 -0
- texttools/tools/summarizer/__init__.py +2 -0
- texttools/tools/summarizer/gemma_summarizer.py +140 -0
- texttools/tools/summarizer/llm_summerizer.py +108 -0
- texttools/tools/translator/__init__.py +1 -0
- texttools/tools/translator/gemma_translator.py +202 -0
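For orientation: the files above make up the `texttools` package, with individual tools grouped under `texttools/tools/` and re-exported through the various `__init__.py` modules. A minimal import sketch, assuming the layout above (only `LLMSummarizer` at the top level and `GemmaTranslator` in `texttools.tools.translator` are confirmed by this diff; the rest is illustrative):

```python
# Illustrative imports; exact re-export paths beyond the two confirmed
# ones are assumptions based on the file layout above.
from texttools import LLMSummarizer  # confirmed by its usage docstring below
from texttools.tools.translator import GemmaTranslator  # confirmed by the __init__.py diff
from texttools.tools.summarizer.gemma_summarizer import GemmaSummarizer  # direct module path
```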
@@ -0,0 +1,224 @@
+from typing import Any, Optional
+
+from openai import OpenAI
+from pydantic import BaseModel
+
+from texttools.base.base_question_generator import BaseQuestionGeneratorFromSubject
+from texttools.formatter import Gemma3Formatter
+
+
+class QuestionGeneration(BaseModel):
+    """
+    Output structure that the model fills in.
+    """
+
+    reasoning_summary: str
+    questions: list
+
+
+class GemmaQuestionGeneratorFromSubject(BaseQuestionGeneratorFromSubject):
+    """
+    Question generator for Gemma-style models with an optional reasoning step.
+
+    Allows optional extra instructions via `prompt_template`.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        chat_formatter: Optional[Any] = None,
+        use_reason: bool = False,
+        temperature: float = 0.0,
+        prompt_template: Optional[str] = None,
+        handlers: Optional[list[Any]] = None,
+        **client_kwargs: Any,
+    ):
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+
+        self.chat_formatter = chat_formatter or Gemma3Formatter()
+
+        self.use_reason = use_reason
+        self.prompt_template = prompt_template
+
+        # [DEPRECATED] Unnecessary structured outputs are no longer used.
+        # Define the JSON schema for the generated question output
+        # self.json_schema = {"generated_question": "string"}
+
+    def _build_messages(
+        self,
+        subject: str,
+        reason: Optional[str] = None,
+        number_of_questions: int = 5,
+        language: str = "farsi/Persian",
+    ) -> list[dict[str, str]]:
+        """
+        Builds the message list for the LLM API call for question generation.
+        """
+        clean_subject = self.preprocess(subject)
+        messages: list[dict[str, str]] = []
+
+        if self.prompt_template:
+            messages.append({"role": "user", "content": self.prompt_template})
+
+        if reason:
+            messages.append(
+                {
+                    "role": "user",
+                    "content": f"Based on this analysis of the subject: {reason}",
+                }
+            )
+
+        messages.append(
+            {
+                "role": "user",
+                "content": f"""Given the following subject, generate a single,
+                appropriate question that this subject would directly respond to.
+                The generated question should be independently meaningful,
+                and must not contain referring words like "this", "that", "he" or "she".
+                **The generated question must be in this language: {language}**
+                """,
+            }
+        )
+        messages.append(
+            {"role": "user", "content": f"Here is the text: {clean_subject}"}
+        )
+
+        # Ensure the schema is dumped as a valid JSON string for the LLM
+        # schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
+        messages.append(
+            {
+                "role": "user",
+                "content": f"""
+                Respond only with the newly generated questions, without any additional information.
+                **The generated questions must be in this language: {language}**
+                Generate {number_of_questions} questions in the questions list.
+
+                You must return ONLY a single JSON object that matches the schema.
+                Do NOT include any explanation before or after the JSON.
+                End the JSON with a closing brace }} and nothing else.
+                There is a `reasoning_summary` key; fill it with a highly summarized
+                version of your thoughts.
+                The `reasoning_summary` must be less than 20 words.
+                """,
+            }
+        )
+
+        # messages.append(
+        #     {"role": "assistant", "content": "{\n"}
+        # )  # Hint to start JSON output
+        # In this new version we use the parse function of the openai library.
+
+        # This line restructures the messages based on the formatter we provided;
+        # some models require custom settings.
+        restructured = self.chat_formatter.format(messages=messages)
+
+        return restructured
+
+    def _reason(self, subject: str, language: str) -> str:
+        """
+        Internal reasoning step to help the model understand the core information
+        and implications of the subject.
+        """
+        messages = [
+            {
+                "role": "user",
+                "content": """
+                Our goal is to generate questions from the subject the user has provided.
+                The questions must be meaningful; some should be specific and some general.
+                But first, in this step, we want to analyze the subject that the user
+                asked us to generate questions for:
+
+                What is the subject?
+                We need a summarized analysis of the input subject.
+                From what points of view can we look at it and generate questions?
+
+                Questions that real users might have.
+                """,
+            },
+            {
+                "role": "user",
+                "content": f"""
+                Here is the subject:
+
+                {subject}
+
+                Respond only in this language: {language}
+                """,
+            },
+            # {
+            #     "role": "assistant",
+            #     "content": """
+            #     Sure, here is a summarized analysis
+            #     """,
+            # },
+        ]
+
+        restructured = self.chat_formatter.format(messages=messages)
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=restructured,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+
+        reason_summary = resp.choices[0].message.content.strip()
+        return reason_summary
+
+    def generate_question(
+        self, subject: str, number_of_questions: int, language: str
+    ) -> list:
+        """
+        Generates questions for the input `subject`.
+        Optionally uses an internal reasoning step for better accuracy.
+
+        language: the language of the questions
+        """
+        reason_summary = None
+        if self.use_reason:
+            reason_summary = self._reason(subject, language)
+
+        messages = self._build_messages(
+            subject, reason_summary, number_of_questions, language
+        )
+
+        completion = self.client.beta.chat.completions.parse(
+            model=self.model,
+            messages=messages,
+            response_format=QuestionGeneration,
+            temperature=self.temperature,
+            extra_body=dict(
+                guided_decoding_backend="auto",
+            ),
+            **self.client_kwargs,
+        )
+        message = completion.choices[0].message
+        if message.parsed:
+            result = message.parsed.questions
+        else:
+            raise ValueError(
+                f"Failed to parse the response. Raw content: {message.content}"
+            )
+
+        # dispatch and return
+        self._dispatch(
+            {
+                "original_subject": subject,
+                "generated_question": result,
+            }
+        )
+        return result
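A minimal usage sketch for the generator above, assuming an OpenAI-compatible endpoint serving a Gemma model (the `base_url`, API key, and model name below are placeholders, not part of the package):

```python
from openai import OpenAI

from texttools.tools.subject_to_question.gemma_question_generator import (
    GemmaQuestionGeneratorFromSubject,
)

# Hypothetical local endpoint (e.g., a vLLM server); adjust to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

generator = GemmaQuestionGeneratorFromSubject(
    client=client,
    model="google/gemma-3-27b-it",  # placeholder model name
    use_reason=True,  # run the extra analysis pass before generating
)

questions = generator.generate_question(
    "The history of the Persian language",  # example subject
    number_of_questions=5,
    language="farsi/Persian",
)
print(questions)
```

The `extra_body=dict(guided_decoding_backend="auto")` argument suggests the class targets vLLM-style guided decoding through `client.beta.chat.completions.parse`, rather than the hosted OpenAI API.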
@@ -0,0 +1,140 @@
+import json
+from typing import Any, Optional
+
+from openai import OpenAI
+
+from texttools.base.base_summarizer import BaseSummarizer
+from texttools.handlers import ResultHandler
+
+
+class GemmaSummarizer(BaseSummarizer):
+    """
+    Summarizer for Gemma-style models with optional reasoning step.
+    Outputs JSON with a single string field: {"summary": "..."}.
+
+    Allows optional extra instructions via `prompt_template`.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        use_reason: bool = False,
+        temperature: float = 0.0,
+        prompt_template: Optional[str] = None,
+        handlers: Optional[list[ResultHandler]] = None,
+        **client_kwargs: Any,
+    ):
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+
+        self.use_reason = use_reason
+        self.prompt_template = prompt_template
+
+        # Define the JSON schema for the summary output
+        self.json_schema = {"summary": "string"}
+
+    def _build_messages(
+        self, text: str, reason: Optional[str] = None
+    ) -> list[dict[str, str]]:
+        """
+        Builds the message list for the LLM API call.
+        """
+        clean_text = self.preprocess(text)
+
+        messages: list[dict[str, str]] = []
+
+        if self.prompt_template:
+            messages.append({"role": "user", "content": self.prompt_template})
+
+        if reason:  # Include the reason if available
+            messages.append(
+                {"role": "user", "content": f"Based on this analysis: {reason}"}
+            )
+
+        messages.append(
+            {
+                "role": "user",
+                "content": "Please provide a concise summary of the following text.",
+            }
+        )
+        messages.append({"role": "user", "content": clean_text})
+
+        # Ensure the schema is dumped as a valid JSON string
+        schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
+        messages.append({"role": "user", "content": schema_instr})
+
+        messages.append(
+            {"role": "assistant", "content": "{"}
+        )  # Start with '{' to hint JSON
+        return messages
+
+    def _reason(self, text: str) -> str:
+        """
+        Internal reasoning step to help the model better understand the text for summarization.
+        """
+        messages = [
+            {
+                "role": "user",
+                "content": """
+                Read the following text and identify its main points, key arguments, and overall purpose.
+                Provide a brief, summarized analysis that will help in generating an accurate and concise summary.
+                """,
+            },
+            {
+                "role": "user",
+                "content": f"""
+                {text}
+                """,
+            },
+        ]
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+
+        reason_summary = resp.choices[0].message.content.strip()
+        return reason_summary
+
+    def summarize(self, text: str) -> str:
+        """
+        Generates a summary for `text`.
+        Optionally uses an internal reasoning step for better quality.
+        """
+        reason_summary = None
+        if self.use_reason:
+            reason_summary = self._reason(text)
+
+        messages = self._build_messages(text, reason_summary)
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        raw = resp.choices[0].message.content.strip()
+
+        if not raw.startswith("{"):
+            raw = "{" + raw
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Failed to parse JSON: {e}\nRaw output: {raw}")
+
+        result = parsed.get("summary")
+        # Validate that the result is a string
+        if not isinstance(result, str):
+            raise ValueError(
+                f"Invalid response schema, expected a string for 'summary', got: {parsed}"
+            )
+
+        # dispatch and return, passing original_text
+        self._dispatch(summary=result, original_text=text)
+        return result
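A similar sketch for `GemmaSummarizer`; the endpoint and model name are again placeholders:

```python
from openai import OpenAI

from texttools.tools.summarizer.gemma_summarizer import GemmaSummarizer

# Placeholder endpoint; point the client at whatever serves your Gemma model.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

summarizer = GemmaSummarizer(
    client=client,
    model="google/gemma-3-27b-it",  # placeholder model name
    use_reason=True,  # runs the extra _reason() analysis call before summarizing
)

print(summarizer.summarize("Long article text..."))
```

With `use_reason=True` each summary costs two chat-completion calls, and the `{"summary": ...}` contract is enforced only by the prompt plus the `"{"` assistant hint, which is why `summarize` re-prepends the brace and validates the parsed JSON.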
@@ -0,0 +1,108 @@
+from typing import Any, Optional
+
+from openai import OpenAI
+from pydantic import BaseModel, create_model
+
+from texttools.base.base_summarizer import BaseSummarizer
+
+
+class LLMSummarizer(BaseSummarizer):
+    """
+    LLM-based text summarizer that wraps OpenAI's structured output parsing.
+
+    Usage:
+    ```python
+    from openai import OpenAI
+    from texttools import LLMSummarizer
+
+    client = OpenAI()
+    summarizer = LLMSummarizer(
+        client=client,
+        model="gpt-4o-2024-08-06",
+        temperature=0.7,
+        prompt_template=(
+            "You are a helpful assistant that produces concise summaries of the provided text."
+        ),
+        handlers=[my_handler],  # optional callbacks on each summarization
+        max_tokens=150,  # any other OpenAI client kwargs
+    )
+
+    summary = summarizer.summarize("Long article text...")
+    print(summary)
+    ```
+
+    Parameters:
+        client (OpenAI):
+            Instantiated OpenAI client. Ensure your API key is configured.
+        model (str):
+            Model name to use (e.g., "gpt-4").
+        temperature (float, default=0.7):
+            Sampling temperature.
+        prompt_template (str, optional):
+            System-level instructions guiding the summarization.
+        handlers (list[callable], optional):
+            List of callables that receive {"summary": str, "original_text": str}.
+        client_kwargs (Any):
+            Additional parameters passed directly to OpenAI (e.g., max_tokens, top_p).
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        temperature: float = 0.7,
+        prompt_template: Optional[str] = None,
+        handlers: Optional[list[Any]] = None,
+        **client_kwargs: Any,
+    ):
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+
+        self.prompt_template = (
+            prompt_template
+            if prompt_template is not None
+            else (
+                """
+                You are a helpful assistant that produces concise and accurate summaries of the provided text.
+                Do not explain anything; only provide the summarized version.
+                """
+            )
+        )
+
+        self._OutputModel = create_model(
+            "SummarizationOutput",
+            summary=(str, ...),
+        )
+
+    def _build_messages(self, text: str) -> list[dict[str, str]]:
+        preprocessed = self.preprocess(text)
+        return [
+            {"role": "system", "content": self.prompt_template},
+            {"role": "user", "content": preprocessed},
+        ]
+
+    def summarize(self, text: str) -> str:
+        """
+        Generate a summary for the input text.
+
+        :param text: The text to summarize.
+        :return: A summary string.
+        """
+        messages = self._build_messages(text)
+        resp = self.client.responses.parse(
+            model=self.model,
+            input=messages,
+            text_format=self._OutputModel,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        output: BaseModel = resp.output_parsed
+        summary_text: str = output.summary
+
+        self._dispatch(summary=summary_text, original_text=text)
+
+        return summary_text
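The docstring above references `my_handler` without defining it. Per the `handlers` parameter docs it is any callable receiving `{"summary": str, "original_text": str}`; a hypothetical sketch (the actual dispatch mechanism lives in `BaseSummarizer`, which is not shown in this diff, so adjust to the real `ResultHandler` interface if it differs):

```python
# Hypothetical handler matching the documented contract.
def my_handler(result: dict) -> None:
    print(
        f"Summarized {len(result['original_text'])} chars "
        f"down to {len(result['summary'])}."
    )
```

Unlike the Gemma classes, `LLMSummarizer` relies on `client.responses.parse` with a dynamically created Pydantic model, so it requires an endpoint that implements the OpenAI Responses API with structured outputs.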
@@ -0,0 +1 @@
+from texttools.tools.translator.gemma_translator import GemmaTranslator
@@ -0,0 +1,202 @@
+import json
+import re
+from typing import Any, Optional
+
+from openai import OpenAI
+from pydantic import BaseModel
+
+from texttools.base.base_translator import BaseTranslator
+from texttools.formatter.gemma3_formatter import Gemma3Formatter
+
+
+# Pydantic BaseModel that specifies the output format of the preprocessor.
+# The preprocessor's job is to extract proper names.
+class PreprocessorOutput(BaseModel):
+    text: str
+    text_type: str
+
+
+class GemmaTranslator(BaseTranslator):
+    """
+    Translator for Gemma-style models with optional reasoning step.
+    Outputs only the translated text, without any additional structure.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        chat_formatter: Optional[Any] = None,
+        use_reason: bool = False,
+        temperature: float = 0.0,
+        prompt_template: Optional[str] = None,
+        handlers: Optional[list[Any]] = None,
+        **client_kwargs: Any,
+    ):
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+        self.chat_formatter = chat_formatter or Gemma3Formatter()
+        self.use_reason = use_reason
+        self.prompt_template = prompt_template
+
+    def _build_messages(
+        self,
+        text: str,
+        target_language: str,
+        source_language: Optional[str] = None,
+        reason: Optional[str] = None,
+        proper_names: Optional[list[str]] = None,
+    ) -> list[dict[str, str]]:
+        messages: list[dict[str, str]] = []
+
+        # This prompt gives initial information about the translation, such as languages and proper names
+        enforce_prompt = f"""
+        You are a {source_language}-to-{target_language} translator.
+        Important Rule: The following are proper names and must NOT be translated.
+        They must be only transliterated into {target_language}.
+        That means preserving their phonetic form without changing their meaning.
+        Apply the rule to **ALL** of the following proper names.
+        Proper names (do not translate **any** of them):
+        {proper_names if proper_names else "None"}
+        If any proper name is found in the text, you MUST only transliterate it.
+        Output only the translated text. No comments, no explanations, no markdown.
+        """
+        messages.append({"role": "user", "content": enforce_prompt})
+
+        clean_text = text.strip()
+        if reason:
+            reason_prompt = f"""
+            Based on the analysis conducted, translate the following text {("from " + source_language) if source_language else ""} to {target_language}.
+            The text to be translated is: "{clean_text}"
+            The analysis conducted: {reason}
+            """
+            messages.append({"role": "user", "content": reason_prompt})
+        else:
+            regular_prompt = f"""Translate the following text from {source_language or "original"} to {target_language}:
+            {clean_text}"""
+            messages.append({"role": "user", "content": regular_prompt})
+
+        # Optional additional template
+        if self.prompt_template:
+            messages.append({"role": "user", "content": self.prompt_template})
+
+        restructured = self.chat_formatter.format(messages=messages)
+
+        return restructured
+
+    def _reason(
+        self, text: str, target_language: str, source_language: Optional[str] = None
+    ) -> str:
+        """
+        Internal reasoning step to help the model with translation.
+        """
+
+        reason_step_prompt = f"""
+        Analyze the following text and identify important linguistic considerations for translation.
+        Do not translate the text. Point out any idioms, cultural references, or complex structures that need special attention.
+        Also, list all proper nouns that should not be translated. Write your analysis in {target_language}.
+        """
+        messages = [
+            {"role": "user", "content": reason_step_prompt},
+            {"role": "user", "content": text},
+        ]
+
+        restructured = self.chat_formatter.format(messages=messages)
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=restructured,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+
+        return completion.choices[0].message.content.strip()
+
+    def preprocess(self, text: str) -> list:
+        """
+        Preprocessor that finds proper names of Islamic figures. The extractions are given to the
+        LLM so that it knows it should not translate them, but transliterate them.
+        """
+
+        messages: list[dict[str, str]] = []
+
+        main_prompt = """
+        You must detect proper names of people.
+        Your task is to extract a JSON list of entities from the given input. For each entity, include:
+        text: The exact matched string from the original.
+        type: Only include "Proper Name" for actual names of real people.
+        If there is no proper name in the following text, return an empty JSON.
+        """
+        messages.append({"role": "user", "content": main_prompt})
+
+        text_prompt = f"""The text to be extracted is: {text}"""
+        messages.append({"role": "user", "content": text_prompt})
+
+        restructured = self.chat_formatter.format(messages=messages)
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=restructured,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "NER",
+                    "schema": PreprocessorOutput.model_json_schema(),
+                },
+            },
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        response = completion.choices[0].message.content
+
+        # Remove Markdown-style triple backticks and any optional language tag like "json"
+        if response.startswith("```"):
+            response = re.sub(r"^```(?:json)?\s*|```$", "", response.strip())
+
+        entities = json.loads(response)
+
+        return entities
+
+    def translate(
+        self, text: str, target_language: str, source_language: Optional[str] = None
+    ) -> str:
+        """
+        Translates text and returns only the translated string.
+        """
+
+        # Extract proper names to tell the LLM which names not to translate, but to transliterate
+        extracted = self.preprocess(text)
+        proper_names = [e["text"] for e in extracted]
+
+        reason_summary = None
+        if self.use_reason:
+            reason_summary = self._reason(text, target_language, source_language)
+
+        messages = self._build_messages(
+            text, target_language, source_language, reason_summary, proper_names
+        )
+        print(f"Original: {text}")
+        print(
+            f"Translating to {target_language} from {source_language or 'original'}..."
+        )
+        print(
+            f"Reasoning: {reason_summary}" if reason_summary else "Reasoning not used."
+        )
+
+        completion = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        response = completion.choices[0].message.content.strip()
+
+        self._dispatch(
+            {
+                "original_text": text,
+                "source_language": source_language,
+                "target_language": target_language,
+                "translated_text": response,
+            }
+        )
+        return response
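Finally, a usage sketch for the translator (placeholders marked; the example input mirrors the Islamic-figure focus of `preprocess`):

```python
from openai import OpenAI

from texttools.tools.translator import GemmaTranslator

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

translator = GemmaTranslator(
    client=client,
    model="google/gemma-3-27b-it",  # placeholder model name
    use_reason=False,
)

translated = translator.translate(
    "A short passage mentioning Imam Ali.",  # example input
    target_language="farsi/Persian",
    source_language="English",
)
print(translated)
```

Every call to `translate` runs `preprocess` first, so a translation costs at least two model calls (three with `use_reason=True`), and `preprocess` raises `json.JSONDecodeError` if the model's NER output is not valid JSON.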