hamtaa_texttools-0.1.43-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hamtaa-texttools might be problematic. See the registry page for more details.

Files changed (60)
  1. hamtaa_texttools-0.1.43.dist-info/METADATA +60 -0
  2. hamtaa_texttools-0.1.43.dist-info/RECORD +60 -0
  3. hamtaa_texttools-0.1.43.dist-info/WHEEL +5 -0
  4. hamtaa_texttools-0.1.43.dist-info/top_level.txt +1 -0
  5. texttools/__init__.py +26 -0
  6. texttools/base/__init__.py +3 -0
  7. texttools/base/base_categorizer.py +40 -0
  8. texttools/base/base_keyword_extractor.py +35 -0
  9. texttools/base/base_ner_extractor.py +61 -0
  10. texttools/base/base_question_detector.py +35 -0
  11. texttools/base/base_question_generator.py +99 -0
  12. texttools/base/base_question_merger.py +59 -0
  13. texttools/base/base_question_rewriter.py +61 -0
  14. texttools/base/base_router.py +33 -0
  15. texttools/base/base_summarizer.py +55 -0
  16. texttools/base/base_task_performer.py +53 -0
  17. texttools/base/base_translator.py +38 -0
  18. texttools/batch_manager/__init__.py +2 -0
  19. texttools/batch_manager/batch_manager.py +241 -0
  20. texttools/batch_manager/batch_runner.py +207 -0
  21. texttools/formatter/__init__.py +1 -0
  22. texttools/formatter/base.py +26 -0
  23. texttools/formatter/gemma3_formatter.py +51 -0
  24. texttools/handlers/__init__.py +6 -0
  25. texttools/handlers/categorizer/__init__.py +6 -0
  26. texttools/handlers/categorizer/categorizer.py +61 -0
  27. texttools/handlers/handlers.py +88 -0
  28. texttools/tools/__init__.py +33 -0
  29. texttools/tools/categorizer/__init__.py +2 -0
  30. texttools/tools/categorizer/encoder_model/__init__.py +1 -0
  31. texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +51 -0
  32. texttools/tools/categorizer/llm/__init__.py +2 -0
  33. texttools/tools/categorizer/llm/gemma_categorizer.py +169 -0
  34. texttools/tools/categorizer/llm/openai_categorizer.py +80 -0
  35. texttools/tools/keyword_extractor/__init__.py +1 -0
  36. texttools/tools/keyword_extractor/gemma_extractor.py +138 -0
  37. texttools/tools/merger/__init__.py +2 -0
  38. texttools/tools/merger/gemma_question_merger.py +214 -0
  39. texttools/tools/ner/__init__.py +1 -0
  40. texttools/tools/ner/gemma_ner_extractor.py +157 -0
  41. texttools/tools/question_detector/__init__.py +2 -0
  42. texttools/tools/question_detector/gemma_detector.py +130 -0
  43. texttools/tools/question_detector/llm_detector.py +112 -0
  44. texttools/tools/question_generator/__init__.py +1 -0
  45. texttools/tools/question_generator/gemma_question_generator.py +198 -0
  46. texttools/tools/reranker/__init__.py +3 -0
  47. texttools/tools/reranker/reranker.py +137 -0
  48. texttools/tools/reranker/scorer.py +216 -0
  49. texttools/tools/reranker/sorter.py +278 -0
  50. texttools/tools/rewriter/__init__.py +2 -0
  51. texttools/tools/rewriter/gemma_question_rewriter.py +213 -0
  52. texttools/tools/router/__init__.py +0 -0
  53. texttools/tools/router/gemma_router.py +169 -0
  54. texttools/tools/subject_to_question/__init__.py +1 -0
  55. texttools/tools/subject_to_question/gemma_question_generator.py +224 -0
  56. texttools/tools/summarizer/__init__.py +2 -0
  57. texttools/tools/summarizer/gemma_summarizer.py +140 -0
  58. texttools/tools/summarizer/llm_summerizer.py +108 -0
  59. texttools/tools/translator/__init__.py +1 -0
  60. texttools/tools/translator/gemma_translator.py +202 -0

texttools/tools/subject_to_question/gemma_question_generator.py
@@ -0,0 +1,224 @@
+ from typing import Any, Optional
+
+ from openai import OpenAI
+ from pydantic import BaseModel
+
+ from texttools.base.base_question_generator import BaseQuestionGeneratorFromSubject
+ from texttools.formatter import Gemma3Formatter
+
+
+ class QuestionGeneration(BaseModel):
+     """
+     Output structure for question generation; the model fills this in.
+     """
+
+     reasoning_summary: str
+     questions: list
+
+
+ class GemmaQuestionGeneratorFromSubject(BaseQuestionGeneratorFromSubject):
+     """
+     Question generator for Gemma-style models with an optional reasoning step.
+
+     Allows optional extra instructions via `prompt_template`.
+     """
+
+     def __init__(
+         self,
+         client: OpenAI,
+         *,
+         model: str,
+         chat_formatter: Optional[Any] = None,
+         use_reason: bool = False,
+         temperature: float = 0.0,
+         prompt_template: Optional[str] = None,
+         handlers: Optional[list[Any]] = None,
+         **client_kwargs: Any,
+     ):
+         super().__init__(handlers)
+         self.client = client
+         self.model = model
+         self.temperature = temperature
+         self.client_kwargs = client_kwargs
+
+         self.chat_formatter = chat_formatter or Gemma3Formatter()
+
+         self.use_reason = use_reason
+         self.prompt_template = prompt_template
+
+         # [DEPRECATED] We no longer use unnecessary structured outputs here.
+         # Define the JSON schema for the generated question output:
+         # self.json_schema = {"generated_question": "string"}
+
+     def _build_messages(
+         self,
+         subject: str,
+         reason: Optional[str] = None,
+         number_of_questions: int = 5,
+         language: str = "farsi/Persian",
+     ) -> list[dict[str, str]]:
+         """
+         Builds the message list for the LLM API call for question generation.
+         """
+         clean_subject = self.preprocess(subject)
+         messages: list[dict[str, str]] = []
+
+         if self.prompt_template:
+             messages.append({"role": "user", "content": self.prompt_template})
+
+         if reason:
+             messages.append(
+                 {
+                     "role": "user",
+                     "content": f"Based on this analysis of the subject: {reason}",
+                 }
+             )
+
+         messages.append(
+             {
+                 "role": "user",
+                 "content": f"""Given the following subject, generate
+                 appropriate questions that this subject would directly respond to.
+                 Each generated question should be independently meaningful,
+                 and must not rely on pronouns such as "this", "that", "he", or "she".
+                 **The generated questions must be in this language: {language}**
+                 """,
+             }
+         )
+         messages.append(
+             {"role": "user", "content": f"Here is the text: {clean_subject}"}
+         )
+
+         # Ensure the schema is dumped as a valid JSON string for the LLM
+         # schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
+         messages.append(
+             {
+                 "role": "user",
+                 "content": f"""
+                 Respond only with the newly generated questions, without any additional information.
+                 **The generated questions must be in this language: {language}**
+                 Generate {number_of_questions} questions in the questions list.
+
+                 You must return ONLY a single JSON object that matches the schema.
+                 Do NOT include any explanation before or after the JSON.
+                 End the JSON with a closing brace }} and nothing else.
+                 There is a `reasoning_summary` key; fill it with a heavily summarized version
+                 of your thoughts.
+                 The `reasoning_summary` must be less than 20 words.
+                 """,
+             }
+         )
+
+         # messages.append(
+         #     {"role": "assistant", "content": "{\n"}
+         # )  # Hint to start JSON output
+         # In this new version we use the parse() function of the openai library.
+
+         # This call restructures the messages based on the formatter we provided;
+         # some models require custom settings.
+         restructured = self.chat_formatter.format(messages=messages)
+
+         return restructured
+
+     def _reason(self, subject: str, language: str) -> str:
+         """
+         Internal reasoning step to help the model understand the core information
+         and implications of the subject.
+         """
+         messages = [
+             {
+                 "role": "user",
+                 "content": """
+                 Our goal is to generate questions from the subject the user has provided.
+                 The questions must be meaningful; some should be specific and some general.
+                 But first, in this step, we want to analyze the subject the user asked us
+                 to generate questions for:
+
+                 What is the subject?
+                 We need a summarized analysis of the input subject.
+                 From what points of view can we see it and generate questions?
+
+                 Consider questions that real users might have.
+                 """,
+             },
+             {
+                 "role": "user",
+                 "content": f"""
+                 Here is the subject:
+
+                 {subject}
+
+                 Respond only in this language: {language}
+                 """,
+             },
+             # {
+             #     "role": "assistant",
+             #     "content": """
+             #     Sure, here is a summarized analysis
+             #     """,
+             # },
+         ]
+
+         restructured = self.chat_formatter.format(messages=messages)
+
+         resp = self.client.chat.completions.create(
+             model=self.model,
+             messages=restructured,
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+
+         reason_summary = resp.choices[0].message.content.strip()
+         return reason_summary
+
+     def generate_question(
+         self, subject: str, number_of_questions: int, language: str
+     ) -> list:
+         """
+         Generates questions for the input `subject`.
+         Optionally uses an internal reasoning step for better accuracy.
+
+         language: the language of the generated questions
+         """
+         reason_summary = None
+         if self.use_reason:
+             reason_summary = self._reason(subject, language)
+
+         messages = self._build_messages(
+             subject, reason_summary, number_of_questions, language
+         )
+
+         completion = self.client.beta.chat.completions.parse(
+             model=self.model,
+             messages=messages,
+             response_format=QuestionGeneration,
+             temperature=self.temperature,
+             extra_body=dict(
+                 guided_decoding_backend="auto",
+             ),
+             **self.client_kwargs,
+         )
+         message = completion.choices[0].message
+         if message.parsed:
+             result = message.parsed.questions
+         else:
+             raise ValueError(
+                 f"Failed to parse the response. Raw content: {message.content}"
+             )
+
+         # dispatch and return
+         self._dispatch(
+             {
+                 "original_subject": subject,
+                 "generated_question": result,
+             }
+         )
+         return result
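
For orientation, here is a minimal usage sketch of the class above. It is an illustration under stated assumptions, not part of the package: the `base_url`, model name, and example subject are invented, and the `guided_decoding_backend` extra body suggests an OpenAI-compatible server such as vLLM.

```python
from openai import OpenAI

from texttools.tools.subject_to_question.gemma_question_generator import (
    GemmaQuestionGeneratorFromSubject,
)

# Hypothetical OpenAI-compatible endpoint serving a Gemma model (e.g. vLLM).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

generator = GemmaQuestionGeneratorFromSubject(
    client,
    model="google/gemma-3-27b-it",  # assumed model name
    use_reason=True,  # adds the extra _reason() analysis pass before generation
)

# Returns the `questions` list parsed from the structured JSON response.
questions = generator.generate_question(
    "The history of the Persian language",
    number_of_questions=5,
    language="farsi/Persian",
)
for question in questions:
    print(question)
```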

texttools/tools/summarizer/__init__.py
@@ -0,0 +1,2 @@
+ from texttools.tools.summarizer.llm_summerizer import LLMSummarizer
+ from texttools.tools.summarizer.gemma_summarizer import GemmaSummarizer

texttools/tools/summarizer/gemma_summarizer.py
@@ -0,0 +1,140 @@
+ import json
+ from typing import Any, Optional
+
+ from openai import OpenAI
+
+ from texttools.base.base_summarizer import BaseSummarizer
+ from texttools.handlers import ResultHandler
+
+
+ class GemmaSummarizer(BaseSummarizer):
+     """
+     Summarizer for Gemma-style models with optional reasoning step.
+     Outputs JSON with a single string field: {"summary": "..."}.
+
+     Allows optional extra instructions via `prompt_template`.
+     """
+
+     def __init__(
+         self,
+         client: OpenAI,
+         *,
+         model: str,
+         use_reason: bool = False,
+         temperature: float = 0.0,
+         prompt_template: Optional[str] = None,
+         handlers: Optional[list[ResultHandler]] = None,
+         **client_kwargs: Any,
+     ):
+         super().__init__(handlers)
+         self.client = client
+         self.model = model
+         self.temperature = temperature
+         self.client_kwargs = client_kwargs
+
+         self.use_reason = use_reason
+         self.prompt_template = prompt_template
+
+         # Define the JSON schema for the summary output
+         self.json_schema = {"summary": "string"}
+
+     def _build_messages(
+         self, text: str, reason: Optional[str] = None
+     ) -> list[dict[str, str]]:
+         """
+         Builds the message list for the LLM API call.
+         """
+         clean_text = self.preprocess(text)
+
+         messages: list[dict[str, str]] = []
+
+         if self.prompt_template:
+             messages.append({"role": "user", "content": self.prompt_template})
+
+         if reason:  # Include the reason if available
+             messages.append(
+                 {"role": "user", "content": f"Based on this analysis: {reason}"}
+             )
+
+         messages.append(
+             {
+                 "role": "user",
+                 "content": "Please provide a concise summary of the following text.",
+             }
+         )
+         messages.append({"role": "user", "content": clean_text})
+
+         # Ensure the schema is dumped as a valid JSON string
+         schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
+         messages.append({"role": "user", "content": schema_instr})
+
+         messages.append(
+             {"role": "assistant", "content": "{"}
+         )  # Start with '{' to hint JSON output
+         return messages
+
+     def _reason(self, text: str) -> str:
+         """
+         Internal reasoning step to help the model better understand the text for summarization.
+         """
+         messages = [
+             {
+                 "role": "user",
+                 "content": """
+                 Read the following text and identify its main points, key arguments, and overall purpose.
+                 Provide a brief, summarized analysis that will help in generating an accurate and concise summary.
+                 """,
+             },
+             {
+                 "role": "user",
+                 "content": f"""
+                 {text}
+                 """,
+             },
+         ]
+
+         resp = self.client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+
+         reason_summary = resp.choices[0].message.content.strip()
+         return reason_summary
+
+     def summarize(self, text: str) -> str:
+         """
+         Generates a summary for `text`.
+         Optionally uses an internal reasoning step for better quality.
+         """
+         reason_summary = None
+         if self.use_reason:
+             reason_summary = self._reason(text)
+
+         messages = self._build_messages(text, reason_summary)
+         resp = self.client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+         raw = resp.choices[0].message.content.strip()
+
+         if not raw.startswith("{"):
+             raw = "{" + raw
+         try:
+             parsed = json.loads(raw)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Failed to parse JSON: {e}\nRaw output: {raw}")
+
+         result = parsed.get("summary")
+         # Validate that the result is a string
+         if not isinstance(result, str):
+             raise ValueError(
+                 f"Invalid response schema, expected a string for 'summary', got: {parsed}"
+             )
+
+         # dispatch and return, passing original_text
+         self._dispatch(summary=result, original_text=text)
+         return result
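
Likewise, a minimal usage sketch for GemmaSummarizer, under the same assumptions about the endpoint and model name. It traces the path through summarize(): the optional _reason() pass, the JSON-only instruction, the assistant seed of "{", and the final json.loads.

```python
from openai import OpenAI

from texttools.tools.summarizer import GemmaSummarizer

# Hypothetical OpenAI-compatible endpoint; URL and model name are assumptions.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

summarizer = GemmaSummarizer(client, model="google/gemma-3-27b-it", use_reason=True)

# summarize() asks for {"summary": "..."}, seeds the reply with "{",
# re-attaches the brace if the model omits it, then parses the JSON.
print(summarizer.summarize("Long article text..."))
```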

texttools/tools/summarizer/llm_summerizer.py
@@ -0,0 +1,108 @@
+ from typing import Any, Optional
+
+ from openai import OpenAI
+ from pydantic import BaseModel, create_model
+
+ from texttools.base.base_summarizer import BaseSummarizer
+
+
+ class LLMSummarizer(BaseSummarizer):
+     """
+     LLM-based text summarizer that wraps OpenAI's structured output parsing.
+
+     Usage:
+     ```python
+     from openai import OpenAI
+     from texttools import LLMSummarizer
+
+     client = OpenAI()
+     summarizer = LLMSummarizer(
+         client=client,
+         model="gpt-4o-2024-08-06",
+         temperature=0.7,
+         prompt_template=(
+             "You are a helpful assistant that produces concise summaries of the provided text."
+         ),
+         handlers=[my_handler],  # optional callbacks on each summarization
+         max_tokens=150,  # any other OpenAI client kwargs
+     )
+
+     summary = summarizer.summarize("Long article text...")
+     print(summary)
+     ```
+
+     Parameters:
+         client (OpenAI):
+             Instantiated OpenAI client. Ensure your API key is configured.
+         model (str):
+             Model name to use (e.g., "gpt-4").
+         temperature (float, default=0.7):
+             Sampling temperature.
+         prompt_template (str, optional):
+             System-level instructions guiding the summarization.
+         handlers (list[callable], optional):
+             List of callables that receive {"summary": str, "original_text": str}.
+         client_kwargs (Any):
+             Additional parameters passed directly to OpenAI (e.g., max_tokens, top_p).
+     """
+
+     def __init__(
+         self,
+         client: OpenAI,
+         *,
+         model: str,
+         temperature: float = 0.7,
+         prompt_template: Optional[str] = None,
+         handlers: Optional[list[Any]] = None,
+         **client_kwargs: Any,
+     ):
+         super().__init__(handlers)
+         self.client = client
+         self.model = model
+         self.temperature = temperature
+         self.client_kwargs = client_kwargs
+
+         self.prompt_template = (
+             prompt_template
+             if prompt_template is not None
+             else (
+                 """
+                 You are a helpful assistant that produces concise and accurate summaries of the provided text.
+                 Do not explain anything; only provide the summarized version.
+                 """
+             )
+         )
+
+         self._OutputModel = create_model(
+             "SummarizationOutput",
+             summary=(str, ...),
+         )
+
+     def _build_messages(self, text: str) -> list[dict[str, str]]:
+         preprocessed = self.preprocess(text)
+         return [
+             {"role": "system", "content": self.prompt_template},
+             {"role": "user", "content": preprocessed},
+         ]
+
+     def summarize(self, text: str) -> str:
+         """
+         Generate a summary for the input text.
+
+         :param text: The text to summarize.
+         :return: A summary string.
+         """
+         messages = self._build_messages(text)
+         resp = self.client.responses.parse(
+             model=self.model,
+             input=messages,
+             text_format=self._OutputModel,
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+         output: BaseModel = resp.output_parsed
+         summary_text: str = output.summary
+
+         self._dispatch(summary=summary_text, original_text=text)
+
+         return summary_text
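
The docstring above shows a `handlers` argument but not a handler itself. Here is a minimal sketch, assuming (from the `_dispatch(summary=..., original_text=...)` calls in both summarizers) that a handler is a callable taking those keyword arguments; the handler below is hypothetical.

```python
from openai import OpenAI

from texttools import LLMSummarizer


def log_summary(summary: str, original_text: str) -> None:
    # Hypothetical handler: report the compression ratio of each run.
    print(f"{len(original_text)} chars -> {len(summary)} chars")


summarizer = LLMSummarizer(
    client=OpenAI(),  # expects OPENAI_API_KEY in the environment
    model="gpt-4o-2024-08-06",
    handlers=[log_summary],
)
print(summarizer.summarize("Long article text..."))
```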

texttools/tools/translator/__init__.py
@@ -0,0 +1 @@
+ from texttools.tools.translator.gemma_translator import GemmaTranslator

texttools/tools/translator/gemma_translator.py
@@ -0,0 +1,202 @@
+ import json
+ import re
+ from typing import Any, Optional
+
+ from openai import OpenAI
+ from pydantic import BaseModel
+
+ from texttools.base.base_translator import BaseTranslator
+ from texttools.formatter.gemma3_formatter import Gemma3Formatter
+
+
+ # Pydantic models that specify the output format of the preprocessor.
+ # The preprocessor's job is to extract proper names.
+ class PreprocessorEntity(BaseModel):
+     text: str
+     text_type: str
+
+
+ class PreprocessorOutput(BaseModel):
+     entities: list[PreprocessorEntity]
+
+
+ class GemmaTranslator(BaseTranslator):
+     """
+     Translator for Gemma-style models with optional reasoning step.
+     Outputs only the translated text, without any additional structure.
+     """
+
+     def __init__(
+         self,
+         client: OpenAI,
+         *,
+         model: str,
+         chat_formatter: Optional[Any] = None,
+         use_reason: bool = False,
+         temperature: float = 0.0,
+         prompt_template: Optional[str] = None,
+         handlers: Optional[list[Any]] = None,
+         **client_kwargs: Any,
+     ):
+         super().__init__(handlers)
+         self.client = client
+         self.model = model
+         self.temperature = temperature
+         self.client_kwargs = client_kwargs
+         self.chat_formatter = chat_formatter or Gemma3Formatter()
+         self.use_reason = use_reason
+         self.prompt_template = prompt_template
+
+     def _build_messages(
+         self,
+         text: str,
+         target_language: str,
+         source_language: Optional[str] = None,
+         reason: Optional[str] = None,
+         proper_names: Optional[list[str]] = None,
+     ) -> list[dict[str, str]]:
+         messages: list[dict[str, str]] = []
+
+         # This prompt gives initial information about the translation,
+         # such as the languages involved and the proper names.
+         enforce_prompt = f"""
+         You are a {source_language}-to-{target_language} translator.
+         Important rule: the following are proper names and must NOT be translated.
+         They must only be transliterated into {target_language}.
+         That means preserving their phonetic form without changing their meaning.
+         Apply the rule to **ALL** of the following proper names.
+         Proper names (do not translate **any** of them):
+         {proper_names if proper_names else "None"}
+         If any proper name is found in the text, you MUST only transliterate it.
+         Output only the translated text. No comments, no explanations, no markdown.
+         """
+         messages.append({"role": "user", "content": enforce_prompt})
+
+         clean_text = text.strip()
+         if reason:
+             reason_prompt = f"""
+             Based on the analysis conducted, translate the following text {"from " + source_language if source_language else ""} to {target_language}.
+             The text to be translated is: "{clean_text}"
+             The analysis conducted: {reason}
+             """
+             messages.append({"role": "user", "content": reason_prompt})
+         else:
+             regular_prompt = f"""Translate the following text from {source_language or "original"} to {target_language}:
+             {clean_text}"""
+             messages.append({"role": "user", "content": regular_prompt})
+
+         # Optional additional template
+         if self.prompt_template:
+             messages.append({"role": "user", "content": self.prompt_template})
+
+         restructured = self.chat_formatter.format(messages=messages)
+
+         return restructured
+
+     def _reason(self, text: str, target_language: str) -> str:
+         """
+         Internal reasoning step to help the model with translation.
+         """
+
+         reason_step_prompt = f"""
+         Analyze the following text and identify important linguistic considerations for translation.
+         Do not translate the text. Point out any idioms, cultural references, or complex structures that need special attention.
+         Also, list all proper nouns that should not be translated. Write your analysis in {target_language}.
+         """
+         messages = [
+             {"role": "user", "content": reason_step_prompt},
+             {"role": "user", "content": text},
+         ]
+
+         restructured = self.chat_formatter.format(messages=messages)
+         completion = self.client.chat.completions.create(
+             model=self.model,
+             messages=restructured,
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+
+         return completion.choices[0].message.content.strip()
+
+     def preprocess(self, text: str) -> list:
+         """
+         Preprocessor that finds proper names of Islamic figures. The extractions
+         are given to the LLM so that it knows not to translate them, but to
+         transliterate them instead.
+         """
+
+         messages: list[dict[str, str]] = []
+
+         main_prompt = """
+         You must detect proper names of people.
+         Your task is to extract a JSON object with an "entities" list from the given input. For each entity, include:
+         text: The exact matched string from the original.
+         text_type: Only include "Proper Name" for actual names of real people.
+         If there is no proper name in the text, return an empty "entities" list.
+         """
+         messages.append({"role": "user", "content": main_prompt})
+
+         text_prompt = f"""The text to be extracted is: {text}"""
+         messages.append({"role": "user", "content": text_prompt})
+
+         restructured = self.chat_formatter.format(messages=messages)
+         completion = self.client.chat.completions.create(
+             model=self.model,
+             messages=restructured,
+             response_format={
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": "NER",
+                     "schema": PreprocessorOutput.model_json_schema(),
+                 },
+             },
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+         response = completion.choices[0].message.content
+
+         # Remove Markdown-style triple backticks and any optional language tag like "json"
+         if response.startswith("```"):
+             response = re.sub(r"^```(?:json)?\s*|```$", "", response.strip())
+
+         entities = json.loads(response).get("entities", [])
+
+         return entities
+
+     def translate(
+         self, text: str, target_language: str, source_language: Optional[str] = None
+     ) -> str:
+         """
+         Translates text and returns only the translated string.
+         """
+
+         # Extract proper names so the LLM transliterates them instead of translating.
+         extracted = self.preprocess(text)
+         proper_names = [e["text"] for e in extracted]
+
+         reason_summary = None
+         if self.use_reason:
+             reason_summary = self._reason(text, target_language)
+
+         messages = self._build_messages(
+             text, target_language, source_language, reason_summary, proper_names
+         )
+         print(f"Original: {text}")
+         print(
+             f"Translating to {target_language} from {source_language or 'original'}..."
+         )
+         print(
+             f"Reasoning: {reason_summary}" if reason_summary else "Reasoning not used."
+         )
+
+         completion = self.client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             temperature=self.temperature,
+             **self.client_kwargs,
+         )
+         response = completion.choices[0].message.content.strip()
+
+         self._dispatch(
+             {
+                 "original_text": text,
+                 "source_language": source_language,
+                 "target_language": target_language,
+                 "translated_text": response,
+             }
+         )
+         return response
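
Finally, a minimal usage sketch for GemmaTranslator under the same endpoint and model-name assumptions; the example text is invented. translate() chains the three steps defined above: preprocess() extracts proper names so they are transliterated rather than translated, the optional _reason() pass produces a linguistic analysis, and the final completion renders the translation.

```python
from openai import OpenAI

from texttools.tools.translator import GemmaTranslator

# Hypothetical OpenAI-compatible endpoint; URL and model name are assumptions.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

translator = GemmaTranslator(client, model="google/gemma-3-27b-it", use_reason=True)

# "Hazrat Ali" should be transliterated, not translated, per the enforce_prompt.
translated = translator.translate(
    "Hazrat Ali is a central figure in early Islamic history.",
    target_language="Farsi",
    source_language="English",
)
print(translated)
```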