hamtaa-texttools 0.1.44__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic. Click here for more details.
- hamtaa_texttools-1.0.0.dist-info/METADATA +129 -0
- hamtaa_texttools-1.0.0.dist-info/RECORD +17 -0
- hamtaa_texttools-1.0.0.dist-info/licenses/LICENSE +21 -0
- {hamtaa_texttools-0.1.44.dist-info → hamtaa_texttools-1.0.0.dist-info}/top_level.txt +0 -0
- texttools/__init__.py +4 -21
- texttools/formatters/base_formatter.py +33 -0
- texttools/formatters/user_merge_formatter/user_merge_formatter.py +47 -0
- texttools/tools/__init__.py +2 -32
- texttools/tools/operator.py +236 -0
- texttools/tools/output_models.py +54 -0
- texttools/tools/prompt_loader.py +84 -0
- texttools/tools/the_tool.py +291 -0
- texttools/utils/__init__.py +4 -0
- texttools/{batch_manager → utils/batch_manager}/__init__.py +2 -0
- texttools/{batch_manager → utils/batch_manager}/batch_manager.py +11 -12
- texttools/{batch_manager → utils/batch_manager}/batch_runner.py +20 -15
- hamtaa_texttools-0.1.44.dist-info/METADATA +0 -60
- hamtaa_texttools-0.1.44.dist-info/RECORD +0 -60
- texttools/base/__init__.py +0 -3
- texttools/base/base_categorizer.py +0 -40
- texttools/base/base_keyword_extractor.py +0 -35
- texttools/base/base_ner_extractor.py +0 -61
- texttools/base/base_question_detector.py +0 -35
- texttools/base/base_question_generator.py +0 -99
- texttools/base/base_question_merger.py +0 -59
- texttools/base/base_question_rewriter.py +0 -61
- texttools/base/base_router.py +0 -33
- texttools/base/base_summarizer.py +0 -55
- texttools/base/base_task_performer.py +0 -53
- texttools/base/base_translator.py +0 -38
- texttools/formatter/__init__.py +0 -1
- texttools/formatter/base.py +0 -26
- texttools/formatter/gemma3_formatter.py +0 -54
- texttools/handlers/__init__.py +0 -6
- texttools/handlers/categorizer/__init__.py +0 -6
- texttools/handlers/categorizer/categorizer.py +0 -61
- texttools/handlers/handlers.py +0 -88
- texttools/tools/categorizer/__init__.py +0 -2
- texttools/tools/categorizer/encoder_model/__init__.py +0 -1
- texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
- texttools/tools/categorizer/llm/__init__.py +0 -2
- texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
- texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
- texttools/tools/keyword_extractor/__init__.py +0 -1
- texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
- texttools/tools/merger/__init__.py +0 -2
- texttools/tools/merger/gemma_question_merger.py +0 -214
- texttools/tools/ner/__init__.py +0 -1
- texttools/tools/ner/gemma_ner_extractor.py +0 -157
- texttools/tools/question_detector/__init__.py +0 -2
- texttools/tools/question_detector/gemma_detector.py +0 -114
- texttools/tools/question_detector/llm_detector.py +0 -112
- texttools/tools/question_generator/__init__.py +0 -1
- texttools/tools/question_generator/gemma_question_generator.py +0 -198
- texttools/tools/reranker/__init__.py +0 -3
- texttools/tools/reranker/reranker.py +0 -137
- texttools/tools/reranker/scorer.py +0 -216
- texttools/tools/reranker/sorter.py +0 -278
- texttools/tools/rewriter/__init__.py +0 -2
- texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
- texttools/tools/router/__init__.py +0 -0
- texttools/tools/router/gemma_router.py +0 -169
- texttools/tools/subject_to_question/__init__.py +0 -1
- texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
- texttools/tools/summarizer/__init__.py +0 -2
- texttools/tools/summarizer/gemma_summarizer.py +0 -140
- texttools/tools/summarizer/llm_summerizer.py +0 -108
- texttools/tools/translator/__init__.py +0 -1
- texttools/tools/translator/gemma_translator.py +0 -202
- {hamtaa_texttools-0.1.44.dist-info → hamtaa_texttools-1.0.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import yaml
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PromptLoader:
    """
    Utility for loading and formatting YAML prompt templates.

    Each YAML file under the prompts directory must define at least a
    `main_template`, and optionally an `analyze_template`. Each of these is
    either a single string or, when `use_modes=True`, a mapping keyed by
    mode name.

    Responsibilities:
    - Load and parse YAML prompt definitions.
    - Select the right template (by mode, if applicable).
    - Inject variables (`{input}`, plus any extra kwargs) into the templates.
    - Return a dict with:
        {
            "main_template": "...",
            "analyze_template": "..." | None
        }
    """

    MAIN_TEMPLATE: str = "main_template"
    ANALYZE_TEMPLATE: str = "analyze_template"

    def __init__(self, prompts_dir: Optional[str] = None):
        """
        Args:
            prompts_dir: Directory name (relative to the package root, i.e.
                the parent of this module's directory) that holds the YAML
                prompt files. Falls back to "prompts" when omitted or empty.
        """
        self.PROMPTS_DIR = prompts_dir or "prompts"

    def _get_prompt_path(self, prompt_file: str) -> Path:
        """Return the path of `prompt_file` inside the prompts directory."""
        return Path(__file__).parent.parent / self.PROMPTS_DIR / prompt_file

    @staticmethod
    def _pick_template(
        data: dict, key: str, use_modes: bool, mode: str
    ) -> Optional[str]:
        """
        Select the template stored under `key` in the parsed YAML mapping.

        Returns None when the key is absent (or null), so an omitted optional
        template never triggers a lookup on None. When `use_modes` is True the
        entry is indexed by `mode`; an unknown mode raises KeyError.
        """
        entry = data.get(key)
        if entry is None or not use_modes:
            return entry
        return entry[mode]

    @staticmethod
    def _render_templates(
        templates: dict[str, Optional[str]], format_args: dict[str, str]
    ) -> dict[str, Optional[str]]:
        """Apply `str.format` to every present template; keep None as None."""
        return {
            name: None if template is None else template.format(**format_args)
            for name, template in templates.items()
        }

    def _load_templates(
        self, prompt_file: str, use_modes: bool, mode: str
    ) -> dict[str, Optional[str]]:
        """
        Read `prompt_file` and return its raw (unformatted) templates.

        Args:
            prompt_file: File name of the YAML prompt definition.
            use_modes: Whether the templates are mappings keyed by mode.
            mode: Mode name to select when `use_modes` is True.

        Returns:
            A dict with the main template and the analyze template (None
            when the file does not define one).

        Raises:
            FileNotFoundError: The prompt file does not exist.
            ValueError: The file is not valid YAML, is not a mapping, or
                lacks the required main template.
            KeyError: `use_modes` is True and `mode` is not defined.
        """
        prompt_path = self._get_prompt_path(prompt_file)

        if not prompt_path.exists():
            raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

        try:
            data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
        except yaml.YAMLError as e:
            # Chain the parser error so the original location info survives.
            raise ValueError(f"Invalid YAML in {prompt_path}: {e}") from e

        # An empty file parses to None and a scalar/list file is not a
        # mapping; both are reported the same way as a missing main template.
        if not isinstance(data, dict) or data.get(self.MAIN_TEMPLATE) is None:
            raise ValueError(
                f"Missing required '{self.MAIN_TEMPLATE}' in {prompt_file}"
            )

        return {
            self.MAIN_TEMPLATE: self._pick_template(
                data, self.MAIN_TEMPLATE, use_modes, mode
            ),
            self.ANALYZE_TEMPLATE: self._pick_template(
                data, self.ANALYZE_TEMPLATE, use_modes, mode
            ),
        }

    def _build_format_args(self, input_text: str, **extra_kwargs) -> dict[str, str]:
        """Build the substitution mapping: `input` plus any extra variables."""
        format_args = {"input": input_text}
        format_args.update(extra_kwargs)
        return format_args

    def load_prompts(
        self,
        prompt_file: str,
        use_modes: bool,
        mode: str,
        input_text: str,
        **extra_kwargs,
    ) -> dict[str, Optional[str]]:
        """
        Load the templates of `prompt_file` and inject the given variables.

        Args:
            prompt_file: File name of the YAML prompt definition.
            use_modes: Whether the templates are mappings keyed by mode.
            mode: Mode name to select when `use_modes` is True.
            input_text: Value substituted for `{input}` in the templates.
            **extra_kwargs: Additional placeholders to substitute.

        Returns:
            A dict with the formatted main template and the formatted
            analyze template, or None for the latter when the prompt file
            does not define one.
        """
        template_configs = self._load_templates(prompt_file, use_modes, mode)
        format_args = self._build_format_args(input_text, **extra_kwargs)
        return self._render_templates(template_configs, format_args)
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
from typing import Literal, Any
|
|
2
|
+
|
|
3
|
+
from openai import OpenAI
|
|
4
|
+
|
|
5
|
+
from texttools.tools.operator import Operator
|
|
6
|
+
import texttools.tools.output_models as OutputModels
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TheTool:
    """
    High-level facade exposing a set of specialized, LLM-backed text tools.

    Every public method selects a YAML prompt file, an output schema and the
    relevant flags on the shared operator, then delegates execution to
    `operator.run()`. The operator's configuration attributes are
    overwritten on each call.

    Supported capabilities:
    - categorize: assign a text to one of several Islamic categories.
    - extract_keywords: produce a keyword list from text.
    - extract_entities: simple NER (name/type pairs).
    - detect_question: binary check whether input is a question.
    - generate_question_from_text: produce a new question from a text.
    - merge_questions: combine multiple questions (default/reason modes).
    - rewrite_question: rephrase a question (same meaning with different
      wording, or different meaning with similar wording).
    - generate_questions_from_subject: generate several questions on a subject.
    - summarize: produce a concise summary of a text.
    - translate: translate text into a target language.

    Usage pattern:
        client = OpenAI(...)
        tool = TheTool(client, model="gemma-3")
        result = tool.categorize("some input text ...", with_analysis=True)
    """

    def __init__(
        self,
        client: OpenAI,
        *,
        model: str,
        temperature: float = 0.0,
        **client_kwargs: Any,
    ):
        """
        Create the underlying operator.

        Args:
            client: OpenAI client handed to the operator.
            model: Model name handed to the operator.
            temperature: Sampling temperature handed to the operator.
            **client_kwargs: Extra keyword arguments forwarded to the operator.
        """
        self.operator = Operator(
            client=client,
            model=model,
            temperature=temperature,
            **client_kwargs,
        )

    def _dispatch(
        self,
        prompt_file: str,
        output_model: Any,
        payload: str,
        *,
        with_analysis: bool,
        use_modes: bool = False,
        mode: Any = None,
        **prompt_vars: Any,
    ) -> Any:
        """
        Configure the shared operator for one tool invocation and run it.

        The attributes are written in a fixed order (prompt file, output
        model, analysis flag, modes flag, and the mode only when modes are
        in use) before `operator.run()` is called with the payload and any
        extra prompt variables.
        """
        op = self.operator
        op.PROMPT_FILE = prompt_file
        op.OUTPUT_MODEL = output_model
        op.WITH_ANALYSIS = with_analysis
        op.USE_MODES = use_modes
        if use_modes:
            op.MODE = mode
        return op.run(payload, **prompt_vars)

    def categorize(self, text: str, with_analysis: bool = False) -> dict[str, str]:
        """
        Assign the text to a single Islamic-studies domain category.

        Args:
            text: Input string to categorize.
            with_analysis: If True, an LLM "analysis" step runs first and the
                main prompt is conditioned on that analysis.

        Returns:
            {"result": <category string>}
            Example: {"result": "باورهای دینی"}  (i.e. "religious beliefs")
        """
        return self._dispatch(
            "categorizer.yaml",
            OutputModels.CategorizerOutput,
            text,
            with_analysis=with_analysis,
        )

    def extract_keywords(
        self, text: str, with_analysis: bool = False
    ) -> dict[str, list[str]]:
        """
        Pull the salient keywords out of a text.

        Args:
            text: Input string to analyze.
            with_analysis: Whether to run an extra LLM reasoning step.

        Returns:
            {"result": [<keyword1>, <keyword2>, ...]}
        """
        return self._dispatch(
            "keyword_extractor.yaml",
            OutputModels.ListStrOutput,
            text,
            with_analysis=with_analysis,
        )

    def extract_entities(
        self, text: str, with_analysis: bool = False
    ) -> dict[str, list[dict[str, str]]]:
        """
        Run Named Entity Recognition (NER) over the input text.

        Args:
            text: Input string.
            with_analysis: Whether to run an extra LLM reasoning step.

        Returns:
            {"result": [{"text": <entity>, "type": <entity_type>}, ...]}
        """
        return self._dispatch(
            "ner_extractor.yaml",
            OutputModels.ListDictStrStrOutput,
            text,
            with_analysis=with_analysis,
        )

    def detect_question(
        self, question: str, with_analysis: bool = False
    ) -> dict[str, str]:
        """
        Decide whether the input is phrased as a question.

        Args:
            question: Input string to evaluate.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": "true"} or {"result": "false"}
        """
        return self._dispatch(
            "question_detector.yaml",
            OutputModels.StrOutput,
            question,
            with_analysis=with_analysis,
        )

    def generate_question_from_text(
        self, text: str, with_analysis: bool = False
    ) -> dict[str, str]:
        """
        Produce a single question derived from the given text.

        Args:
            text: Source text to derive a question from.
            with_analysis: Whether to use analysis before generation.

        Returns:
            {"result": <generated_question>}
        """
        return self._dispatch(
            "question_generator.yaml",
            OutputModels.StrOutput,
            text,
            with_analysis=with_analysis,
        )

    def merge_questions(
        self,
        questions: list[str],
        mode: Literal["default_mode", "reason_mode"] = "default_mode",
        with_analysis: bool = False,
    ) -> dict[str, str]:
        """
        Combine several questions into one unified question.

        The questions are joined with ", " into a single string before being
        sent to the operator.

        Args:
            questions: List of question strings.
            mode: Merge strategy:
                - "default_mode": simple merging.
                - "reason_mode": merging with reasoning explanation.
            with_analysis: Whether to use an analysis step.

        Returns:
            {"result": <merged_question>}
        """
        joined_questions = ", ".join(questions)
        return self._dispatch(
            "question_merger.yaml",
            OutputModels.StrOutput,
            joined_questions,
            with_analysis=with_analysis,
            use_modes=True,
            mode=mode,
        )

    def rewrite_question(
        self,
        question: str,
        mode: Literal[
            "same_meaning_different_wording_mode",
            "different_meaning_similar_wording_mode",
        ] = "same_meaning_different_wording_mode",
        with_analysis: bool = False,
    ) -> dict[str, str]:
        """
        Rephrase a question, changing either its wording or its meaning.

        Args:
            question: Input question to rewrite.
            mode: Rewrite strategy:
                - "same_meaning_different_wording_mode": keep meaning, change words.
                - "different_meaning_similar_wording_mode": alter meaning,
                  preserve wording style.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": <rewritten_question>}
        """
        return self._dispatch(
            "question_rewriter.yaml",
            OutputModels.StrOutput,
            question,
            with_analysis=with_analysis,
            use_modes=True,
            mode=mode,
        )

    def generate_questions_from_subject(
        self,
        subject: str,
        number_of_questions: int,
        language: str = "English",
        with_analysis: bool = False,
    ) -> dict[str, list[str]]:
        """
        Produce a list of questions about a subject.

        Args:
            subject: Topic of interest.
            number_of_questions: Number of questions to produce.
            language: Target language for generated questions.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": [<question1>, <question2>, ...]}
        """
        return self._dispatch(
            "subject_question_generator.yaml",
            OutputModels.ReasonListStrOutput,
            subject,
            with_analysis=with_analysis,
            number_of_questions=number_of_questions,
            language=language,
        )

    def summarize(self, subject: str, with_analysis: bool = False) -> dict[str, str]:
        """
        Summarize the given text.

        Args:
            subject: Input text to summarize.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": <summary>}
        """
        return self._dispatch(
            "summarizer.yaml",
            OutputModels.StrOutput,
            subject,
            with_analysis=with_analysis,
        )

    def translate(
        self,
        text: str,
        target_language: str,
        with_analysis: bool = False,
    ) -> dict[str, str]:
        """
        Translate text into the target language.

        Args:
            text: Input string to translate.
            target_language: Language code or name to translate into.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": <translated_text>}
        """
        return self._dispatch(
            "translator.yaml",
            OutputModels.StrOutput,
            text,
            with_analysis=with_analysis,
            target_language=target_language,
        )
|
|
@@ -6,10 +6,17 @@ from typing import Any, Optional, Type
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
from openai import OpenAI
|
|
8
8
|
from openai.lib._pydantic import to_strict_json_schema
|
|
9
|
-
# from openai.lib._parsing._completions import type_to_response_format_param
|
|
10
9
|
|
|
11
10
|
|
|
12
11
|
class SimpleBatchManager:
|
|
12
|
+
"""
|
|
13
|
+
Manages batch processing jobs for OpenAI's chat completions with structured outputs.
|
|
14
|
+
|
|
15
|
+
Handles the full lifecycle of a batch job: creating tasks from input texts,
|
|
16
|
+
starting the job, monitoring status, and fetching results. Results are automatically
|
|
17
|
+
parsed into the specified Pydantic output model. Job state is persisted to disk.
|
|
18
|
+
"""
|
|
19
|
+
|
|
13
20
|
def __init__(
|
|
14
21
|
self,
|
|
15
22
|
client: OpenAI,
|
|
@@ -70,18 +77,12 @@ class SimpleBatchManager:
|
|
|
70
77
|
Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
|
|
71
78
|
"""
|
|
72
79
|
response_format_config: dict[str, Any]
|
|
80
|
+
|
|
73
81
|
if self.custom_json_schema_obj_str:
|
|
74
|
-
# try:
|
|
75
|
-
# parsed_custom_schema = json.loads(self.custom_json_schema_obj_str)
|
|
76
82
|
response_format_config = {
|
|
77
83
|
"type": "json_schema",
|
|
78
84
|
"json_schema": self.custom_json_schema_obj_str,
|
|
79
85
|
}
|
|
80
|
-
# except json.JSONDecodeError as e:
|
|
81
|
-
# raise ValueError(
|
|
82
|
-
# "Failed to parse custom_json_schema_obj_str. "
|
|
83
|
-
# "Please ensure it's a valid JSON string."
|
|
84
|
-
# ) from e
|
|
85
86
|
else:
|
|
86
87
|
raw_schema = to_strict_json_schema(self.output_model)
|
|
87
88
|
response_format_config = {
|
|
@@ -167,7 +168,7 @@ class SimpleBatchManager:
|
|
|
167
168
|
Returns a list of dictionaries with 'id' and 'output' keys.
|
|
168
169
|
"""
|
|
169
170
|
modified_result = []
|
|
170
|
-
|
|
171
|
+
|
|
171
172
|
for key, d in result.items():
|
|
172
173
|
if "desired_output" in d:
|
|
173
174
|
new_dict = {"id": key, "output": d["desired_output"]}
|
|
@@ -176,7 +177,6 @@ class SimpleBatchManager:
|
|
|
176
177
|
new_dict = {"id": key, "output": d["error"]}
|
|
177
178
|
modified_result.append(new_dict)
|
|
178
179
|
return modified_result
|
|
179
|
-
# return modified_result , errors
|
|
180
180
|
|
|
181
181
|
def fetch_results(
|
|
182
182
|
self, job_name: str, remove_cache: bool = True
|
|
@@ -236,6 +236,5 @@ class SimpleBatchManager:
|
|
|
236
236
|
handler.handle(results)
|
|
237
237
|
if remove_cache:
|
|
238
238
|
self._clear_state(job_name)
|
|
239
|
-
|
|
240
|
-
# return results
|
|
239
|
+
|
|
241
240
|
return results, log
|
|
@@ -5,18 +5,17 @@ from dataclasses import dataclass
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Any, Callable
|
|
7
7
|
|
|
8
|
-
# from dotenv import load_dotenv
|
|
9
8
|
from openai import OpenAI
|
|
10
9
|
from pydantic import BaseModel
|
|
11
10
|
|
|
12
|
-
from texttools.batch_manager import SimpleBatchManager
|
|
11
|
+
from texttools.utils.batch_manager import SimpleBatchManager
|
|
13
12
|
|
|
14
13
|
|
|
15
|
-
class
|
|
16
|
-
|
|
14
|
+
class Output(BaseModel):
|
|
15
|
+
output: str
|
|
17
16
|
|
|
18
17
|
|
|
19
|
-
def
|
|
18
|
+
def export_data(data):
|
|
20
19
|
"""
|
|
21
20
|
Produces a structure of the following form from an initial data structure:
|
|
22
21
|
[
|
|
@@ -26,7 +25,7 @@ def exporting_data(data):
|
|
|
26
25
|
return data
|
|
27
26
|
|
|
28
27
|
|
|
29
|
-
def
|
|
28
|
+
def import_data(data):
|
|
30
29
|
"""
|
|
31
30
|
Takes the output and adds and aggregates it to the original structure.
|
|
32
31
|
"""
|
|
@@ -49,17 +48,21 @@ class BatchConfig:
|
|
|
49
48
|
CHARS_PER_TOKEN: float = 2.7
|
|
50
49
|
PROMPT_TOKEN_MULTIPLIER: int = 1000
|
|
51
50
|
BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
|
|
52
|
-
import_function: Callable =
|
|
53
|
-
export_function: Callable =
|
|
51
|
+
import_function: Callable = import_data
|
|
52
|
+
export_function: Callable = export_data
|
|
54
53
|
|
|
55
54
|
|
|
56
55
|
class BatchJobRunner:
|
|
57
56
|
"""
|
|
58
|
-
|
|
57
|
+
Orchestrates the execution of batched LLM processing jobs.
|
|
58
|
+
|
|
59
|
+
Handles data loading, partitioning, job execution via SimpleBatchManager,
|
|
60
|
+
and result saving. Manages the complete workflow from input data to processed outputs,
|
|
61
|
+
including retries and progress tracking across multiple batch parts.
|
|
59
62
|
"""
|
|
60
63
|
|
|
61
64
|
def __init__(
|
|
62
|
-
self, config: BatchConfig = BatchConfig(), output_model: type =
|
|
65
|
+
self, config: BatchConfig = BatchConfig(), output_model: type = Output
|
|
63
66
|
):
|
|
64
67
|
self.config = config
|
|
65
68
|
self.system_prompt = config.system_prompt
|
|
@@ -75,7 +78,6 @@ class BatchJobRunner:
|
|
|
75
78
|
Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
|
76
79
|
|
|
77
80
|
def _init_manager(self) -> SimpleBatchManager:
|
|
78
|
-
# load_dotenv()
|
|
79
81
|
api_key = os.getenv("OPENAI_API_KEY")
|
|
80
82
|
client = OpenAI(api_key=api_key)
|
|
81
83
|
return SimpleBatchManager(
|
|
@@ -90,7 +92,7 @@ class BatchJobRunner:
|
|
|
90
92
|
data = json.load(f)
|
|
91
93
|
data = self.config.export_function(data)
|
|
92
94
|
|
|
93
|
-
#
|
|
95
|
+
# Ensure data is a list of dicts with 'id' and 'content' as strings
|
|
94
96
|
if not isinstance(data, list):
|
|
95
97
|
raise ValueError(
|
|
96
98
|
'Exported data must be a list in this form: [ {"id": str, "content": str},...]'
|
|
@@ -159,10 +161,13 @@ class BatchJobRunner:
|
|
|
159
161
|
elif status == "failed":
|
|
160
162
|
print("Job failed. Clearing state, waiting, and retrying...")
|
|
161
163
|
self.manager._clear_state(part_job_name)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
+
# Wait before retrying
|
|
165
|
+
time.sleep(10)
|
|
166
|
+
# Break inner loop to restart the job
|
|
167
|
+
break
|
|
164
168
|
else:
|
|
165
|
-
|
|
169
|
+
# Wait before checking again
|
|
170
|
+
time.sleep(5)
|
|
166
171
|
|
|
167
172
|
def _save_results(
|
|
168
173
|
self, output_data: list[dict[str, Any]], log: list[Any], part_idx: int
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: hamtaa-texttools
|
|
3
|
-
Version: 0.1.44
|
|
4
|
-
Summary: A set of high-level NLP tools
|
|
5
|
-
Author: Tohidi, Montazer, Givechi, Mousavinezhad
|
|
6
|
-
Requires-Python: >=3.8
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
Requires-Dist: openai>=1.97.0
|
|
9
|
-
Requires-Dist: numpy>=1.26.4
|
|
10
|
-
|
|
11
|
-
# Text Tools
|
|
12
|
-
|
|
13
|
-
<p align="center">
|
|
14
|
-
<img src="https://img.shields.io/badge/TextTools-Python%20Text%20Processing-black?style=for-the-badge&logo=python&logoColor=white">
|
|
15
|
-
</p>
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
<p align="center">
|
|
19
|
-
<img src="docs/logo.png" alt="Preview" width="300" height="300">
|
|
20
|
-
</p>
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
## How to Install
|
|
24
|
-
|
|
25
|
-
Install the package using:
|
|
26
|
-
|
|
27
|
-
```bash
|
|
28
|
-
pip install -U hamtaa-texttools
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
---
|
|
33
|
-
|
|
34
|
-
## What This Library Is *Not*
|
|
35
|
-
|
|
36
|
-
This is **not** a collection of low-level utilities.
|
|
37
|
-
|
|
38
|
-
To clarify: this library **does not** include things like:
|
|
39
|
-
- A standard `regex`
|
|
40
|
-
- Word normalization utilities
|
|
41
|
-
|
|
42
|
-
---
|
|
43
|
-
|
|
44
|
-
## What This Library *Provides*
|
|
45
|
-
|
|
46
|
-
This is a set of **high-level natural language processing (NLP)** tools.
|
|
47
|
-
|
|
48
|
-
Some of the features include:
|
|
49
|
-
- `question_detector`: Detecting if an incoming text is a question or not
|
|
50
|
-
- `categorizer`: A categorizer that needs no fine-tuning
|
|
51
|
-
- ... (Tell me what you want!)
|
|
52
|
-
|
|
53
|
-
---
|
|
54
|
-
|
|
55
|
-
## When to Use This Library
|
|
56
|
-
|
|
57
|
-
Use `texttools` when:
|
|
58
|
-
- You need to **process large volumes of data using OpenAI’s GPT models** via the BATCH API.
|
|
59
|
-
- You want to treat an **LLM as a function** in Python that outputs structured JSON or Pydantic models.
|
|
60
|
-
- You need to **categorize large datasets** using vector embeddings, efficiently and at scale.
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
texttools/__init__.py,sha256=cI10Q_zaM9DPUCVOM79gZceuyt6Pjgpj3R-AG7xgUM8,778
|
|
2
|
-
texttools/base/__init__.py,sha256=KUGm-Oe0BxlrRhPS-Jm2q1NCmwX8MdtZtloia7bcLaM,189
|
|
3
|
-
texttools/base/base_categorizer.py,sha256=ojup94iXLxh92TjiJmrFXeRbsWKlon7PPAqez96B1bs,1130
|
|
4
|
-
texttools/base/base_keyword_extractor.py,sha256=uKpxb3xI-sim-vXWe1R4_36QRhSNsWDR4IuVdpkZMME,868
|
|
5
|
-
texttools/base/base_ner_extractor.py,sha256=D0LRNSyq1uIU9Qtepi7zpCWWzYz-AOxpVNjq97S1oUA,1933
|
|
6
|
-
texttools/base/base_question_detector.py,sha256=FR9yDP0Z8aAfGafZy3kcpSDUUYWLJM7saRKdeVN5TiM,829
|
|
7
|
-
texttools/base/base_question_generator.py,sha256=L_2ZwqyV9GxsKiQynWKRJG15OBFgQqiCic5H0i8R5yk,3238
|
|
8
|
-
texttools/base/base_question_merger.py,sha256=TYhsihKaIdyGCVu4AcjxPZ1_HocHt__voV8WWGMRpMs,1945
|
|
9
|
-
texttools/base/base_question_rewriter.py,sha256=K6ZnAjxi2qw4yLxm92zTI1IStCfX6c_6lCfIuBDSx8w,1973
|
|
10
|
-
texttools/base/base_router.py,sha256=pFDjIXFqAhPiS9Onu5py_GxOq8geDGJDQh6k6IhCkvw,933
|
|
11
|
-
texttools/base/base_summarizer.py,sha256=7NAilhUPs6ZUwkBpTtXAj6n2XxQH1w6SOolf3gQX2gc,1627
|
|
12
|
-
texttools/base/base_task_performer.py,sha256=3-6qshkie50S7pRG4WHRNC_RdUbSmHOPKW56CD92-rM,1852
|
|
13
|
-
texttools/base/base_translator.py,sha256=BoOxqaoPoUs8t1O3m2yL9pQa5iwisl097immTVcGZoE,1020
|
|
14
|
-
texttools/batch_manager/__init__.py,sha256=3ZkxA395lRD4gNxJ1vp0fNuz_XuBr50GoP51rrwQ0Ks,87
|
|
15
|
-
texttools/batch_manager/batch_manager.py,sha256=jAmKskL3OTYwwsO1mWsWAB3VxMlOF07c2GW1Ev83ZhY,9283
|
|
16
|
-
texttools/batch_manager/batch_runner.py,sha256=kW0IPauI11xpssApMA7b4XI19FePImywym3V7tBaa-o,7404
|
|
17
|
-
texttools/formatter/__init__.py,sha256=KHz2tFZctbit_HVbQNCTMi46JzmKlg-uB6Ost63IpVU,46
|
|
18
|
-
texttools/formatter/base.py,sha256=0fiM6E7NdJevAVpL6yyPaUZVJGKWxE3fr-Ay1oqgJqQ,879
|
|
19
|
-
texttools/formatter/gemma3_formatter.py,sha256=AmdKBYLj6HMsI2DDX4KHNEEVYJmz_VVNUBOv8ScGjsY,1865
|
|
20
|
-
texttools/handlers/__init__.py,sha256=sv0JloipQ57AI0xo-3w9k6cK5rYjZP3ltR2EbBhkHTA,121
|
|
21
|
-
texttools/handlers/handlers.py,sha256=LtC4FBuzRUDy3Jw-Fp21WR-QS1jOcDhsGaMPFQGjfTw,2381
|
|
22
|
-
texttools/handlers/categorizer/__init__.py,sha256=mE05vt_ma6vcP8pQ37BZ85WVQ8jhcjDS0iZV81_LFCY,127
|
|
23
|
-
texttools/handlers/categorizer/categorizer.py,sha256=HBpdhtCGUPl1TJUOxbgSLmVWD7o9xeIjmSWXvYzGrCA,1592
|
|
24
|
-
texttools/tools/__init__.py,sha256=V3ZjSj_ZI9r02sOmxpxxxKBbBbtuYS1MQqtrdGZHC_A,1121
|
|
25
|
-
texttools/tools/categorizer/__init__.py,sha256=VY0SVdik0et0fwLDj7qn-d5LtVqVBIalvlRVci699i4,48
|
|
26
|
-
texttools/tools/categorizer/encoder_model/__init__.py,sha256=7UwoPlQ09VGN0cqfi5fPQRfsZZ8hoZj6fL6cax1BLSU,53
|
|
27
|
-
texttools/tools/categorizer/encoder_model/encoder_vectorizer.py,sha256=MHPVJQJlvNhZ5xLVXk4FtvrORW2yxPSAnjEhjPbkQts,1476
|
|
28
|
-
texttools/tools/categorizer/llm/__init__.py,sha256=0VbxvInITfNUlOF6bJqcUKKaYWlIe9K3vRmIRuvAGcY,95
|
|
29
|
-
texttools/tools/categorizer/llm/gemma_categorizer.py,sha256=tjwKonTjT5cAhxWQaVyvyooRyOlGACHpnn72PNoLk-8,5636
|
|
30
|
-
texttools/tools/categorizer/llm/openai_categorizer.py,sha256=omRk77Z5ZCIAz17h4wPDP_EcBSsscA-PQJpQjtI6--o,2547
|
|
31
|
-
texttools/tools/keyword_extractor/__init__.py,sha256=eTpujS85MmRRbnNwc2ifKUh60W8OG4RQFmWki3Z7C_0,84
|
|
32
|
-
texttools/tools/keyword_extractor/gemma_extractor.py,sha256=TJ4wMPWRuuzRi_Q0hr7UauKhEg8U_5U5j1D_lTFrn4s,4349
|
|
33
|
-
texttools/tools/merger/__init__.py,sha256=bh2RBpqJvDaqEmDrM9y_GcjRqibagifAxiZVu8nEHc0,115
|
|
34
|
-
texttools/tools/merger/gemma_question_merger.py,sha256=JAC-52kBbabIzEWp0MFi9viiu8nZOAMPaJZALHvNMqo,8035
|
|
35
|
-
texttools/tools/ner/__init__.py,sha256=BW84BcItel6Mc2JlaDL6qvAktVMkti67VXceeCnOB1g,70
|
|
36
|
-
texttools/tools/ner/gemma_ner_extractor.py,sha256=YhyIwX_8bdwkFb4gY8g9mZdYHW_r1jCvbmjjNCK9Wfo,5384
|
|
37
|
-
texttools/tools/question_detector/__init__.py,sha256=ulArGttooSoxEe0vUDQSxUQrnsxr7gH9l-LjSER2dVI,162
|
|
38
|
-
texttools/tools/question_detector/gemma_detector.py,sha256=DhlCAA6Hws_OTuYil6UY4sYlbjdQQU6EqHdoTl3a--w,3772
|
|
39
|
-
texttools/tools/question_detector/llm_detector.py,sha256=zo89eh359hqQGGf83-6M22AaiH7q-m0m91SjTyxZaYs,3862
|
|
40
|
-
texttools/tools/question_generator/__init__.py,sha256=EAElpB_YeyMoBqvFNjbW2a_j18SLtiKQ7sRmdS58Fww,61
|
|
41
|
-
texttools/tools/question_generator/gemma_question_generator.py,sha256=V5QcXmHZ5shTvrThOxUrKJ4FqP0P58NIJbsPdyyy5IM,6744
|
|
42
|
-
texttools/tools/reranker/__init__.py,sha256=70jqJ9cjpPzzvnMYgHYGVZ9PrWrN9N97visqD_PVxwU,100
|
|
43
|
-
texttools/tools/reranker/reranker.py,sha256=2SiTMIxempMuHui2n4GJV_2dLGBeoC7WAn_rVVXlMBA,5518
|
|
44
|
-
texttools/tools/reranker/scorer.py,sha256=fQ3Ya8QmNhrcmb-Rf-72hvhweGvVj6gQ4KOlham2eE8,8176
|
|
45
|
-
texttools/tools/reranker/sorter.py,sha256=_ed5zGz7K60skPFFuEQZ1ObBFA71LAfVT6FyWicA-Pw,11419
|
|
46
|
-
texttools/tools/rewriter/__init__.py,sha256=U_qwGeEOqHAcV4p2CHVb0AIvHKFfdvykRzGyWD54aWA,121
|
|
47
|
-
texttools/tools/rewriter/gemma_question_rewriter.py,sha256=jXtRswfBvHn9QmE90JvxEmLvCTbwZqZhD_A5ONWeCzo,7925
|
|
48
|
-
texttools/tools/router/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
49
|
-
texttools/tools/router/gemma_router.py,sha256=VX-kHphZVZNd0_ajugN08hGkWNUeUriwfonpYy2TIS4,5619
|
|
50
|
-
texttools/tools/subject_to_question/__init__.py,sha256=VJpns16Qe5OL_-4WuGDUNShcJsodB2khGWT3Q1Hc-WU,72
|
|
51
|
-
texttools/tools/subject_to_question/gemma_question_generator.py,sha256=VKXHhYHEvhFLUR87iEh0eFpD_4ueX4np8IjF-NkgWrY,7417
|
|
52
|
-
texttools/tools/summarizer/__init__.py,sha256=phrR7qO20CNhO3hjXQBzhTRVumdVdGSufmH4GEYkhj4,140
|
|
53
|
-
texttools/tools/summarizer/gemma_summarizer.py,sha256=ikhsBv7AiZD1dT_d12AyjXxojzSW92e2y5WjchI_3bE,4474
|
|
54
|
-
texttools/tools/summarizer/llm_summerizer.py,sha256=-0rUKbSnl1aDeBfJ5DCSbIlwd2k-9qIaCKgoQJa0hWc,3412
|
|
55
|
-
texttools/tools/translator/__init__.py,sha256=KO1m08J2BZwRqBGO9ICB4l4cnH1jfHLHL5HbgYFUWM8,72
|
|
56
|
-
texttools/tools/translator/gemma_translator.py,sha256=57NMfJAZHQjZSr_eCBePE_Pnag8pu3O00Jicxhzn6Jc,7572
|
|
57
|
-
hamtaa_texttools-0.1.44.dist-info/METADATA,sha256=OImC1zmuJh7p8SY3s3mhm8poOzYOuuqx6vjOeDy5O3k,1481
|
|
58
|
-
hamtaa_texttools-0.1.44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
59
|
-
hamtaa_texttools-0.1.44.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
|
|
60
|
-
hamtaa_texttools-0.1.44.dist-info/RECORD,,
|
texttools/base/__init__.py
DELETED