hamtaa-texttools 0.1.48__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic. Click here for more details.
- hamtaa_texttools-1.1.7.dist-info/METADATA +228 -0
- hamtaa_texttools-1.1.7.dist-info/RECORD +30 -0
- hamtaa_texttools-1.1.7.dist-info/licenses/LICENSE +21 -0
- texttools/__init__.py +4 -26
- texttools/batch/__init__.py +3 -0
- texttools/{utils/batch_manager → batch}/batch_manager.py +226 -241
- texttools/batch/batch_runner.py +254 -0
- texttools/prompts/README.md +35 -0
- texttools/prompts/categorizer.yaml +28 -0
- texttools/prompts/extract_entities.yaml +20 -0
- texttools/prompts/extract_keywords.yaml +18 -0
- texttools/prompts/is_question.yaml +14 -0
- texttools/prompts/merge_questions.yaml +46 -0
- texttools/prompts/rewrite.yaml +111 -0
- texttools/prompts/run_custom.yaml +7 -0
- texttools/prompts/subject_to_question.yaml +22 -0
- texttools/prompts/summarize.yaml +14 -0
- texttools/prompts/text_to_question.yaml +20 -0
- texttools/prompts/translate.yaml +15 -0
- texttools/tools/__init__.py +4 -33
- texttools/tools/async_the_tool.py +435 -0
- texttools/tools/internals/async_operator.py +242 -0
- texttools/tools/internals/base_operator.py +100 -0
- texttools/tools/internals/formatters.py +24 -0
- texttools/tools/internals/operator.py +242 -0
- texttools/tools/internals/output_models.py +62 -0
- texttools/tools/internals/prompt_loader.py +60 -0
- texttools/tools/the_tool.py +433 -0
- hamtaa_texttools-0.1.48.dist-info/METADATA +0 -60
- hamtaa_texttools-0.1.48.dist-info/RECORD +0 -61
- texttools/base/__init__.py +0 -3
- texttools/base/base_categorizer.py +0 -40
- texttools/base/base_keyword_extractor.py +0 -35
- texttools/base/base_ner_extractor.py +0 -61
- texttools/base/base_question_detector.py +0 -35
- texttools/base/base_question_generator.py +0 -99
- texttools/base/base_question_merger.py +0 -59
- texttools/base/base_question_rewriter.py +0 -61
- texttools/base/base_router.py +0 -33
- texttools/base/base_summarizer.py +0 -55
- texttools/base/base_task_performer.py +0 -53
- texttools/base/base_translator.py +0 -38
- texttools/formatter/__init__.py +0 -1
- texttools/formatter/base.py +0 -26
- texttools/formatter/gemma3_formatter.py +0 -54
- texttools/handlers/__init__.py +0 -6
- texttools/handlers/categorizer/__init__.py +0 -6
- texttools/handlers/categorizer/categorizer.py +0 -61
- texttools/handlers/handlers.py +0 -88
- texttools/tools/categorizer/__init__.py +0 -2
- texttools/tools/categorizer/encoder_model/__init__.py +0 -1
- texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
- texttools/tools/categorizer/llm/__init__.py +0 -2
- texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
- texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
- texttools/tools/keyword_extractor/__init__.py +0 -1
- texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
- texttools/tools/merger/__init__.py +0 -2
- texttools/tools/merger/gemma_question_merger.py +0 -214
- texttools/tools/ner/__init__.py +0 -1
- texttools/tools/ner/gemma_ner_extractor.py +0 -157
- texttools/tools/question_detector/__init__.py +0 -2
- texttools/tools/question_detector/gemma_detector.py +0 -114
- texttools/tools/question_detector/llm_detector.py +0 -112
- texttools/tools/question_generator/__init__.py +0 -1
- texttools/tools/question_generator/gemma_question_generator.py +0 -198
- texttools/tools/reranker/__init__.py +0 -3
- texttools/tools/reranker/reranker.py +0 -137
- texttools/tools/reranker/scorer.py +0 -216
- texttools/tools/reranker/sorter.py +0 -278
- texttools/tools/rewriter/__init__.py +0 -2
- texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
- texttools/tools/router/__init__.py +0 -0
- texttools/tools/router/gemma_router.py +0 -169
- texttools/tools/subject_to_question/__init__.py +0 -1
- texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
- texttools/tools/summarizer/__init__.py +0 -2
- texttools/tools/summarizer/gemma_summarizer.py +0 -140
- texttools/tools/summarizer/llm_summerizer.py +0 -108
- texttools/tools/translator/__init__.py +0 -1
- texttools/tools/translator/gemma_translator.py +0 -189
- texttools/utils/batch_manager/__init__.py +0 -2
- texttools/utils/batch_manager/batch_runner.py +0 -207
- texttools/utils/flex_processor.py +0 -78
- {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
from typing import Literal, Any, Callable
|
|
2
|
+
|
|
3
|
+
from openai import OpenAI
|
|
4
|
+
|
|
5
|
+
from texttools.tools.internals.operator import Operator
|
|
6
|
+
import texttools.tools.internals.output_models as OutputModels
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TheTool:
    """
    High-level facade over synchronous LLM text operations.

    Each method configures the operator with a specific YAML prompt,
    output schema, and flags, then delegates execution to `operator.run()`.

    Parameters shared by most methods:
        text: Input text to process.
        with_analysis: When True, the prompt also requests a reasoning
            analysis, returned in `ToolOutput.analysis`.
        output_lang: Desired language of the result. Methods without this
            parameter always pass None internally.
        user_prompt: Optional extra instructions forwarded to the prompt.
        temperature: Sampling temperature; defaults to 0.0 (deterministic).
        logprobs: When True, token probability data is requested.
        top_logprobs: Number of top alternatives per token (with logprobs).
        validator: Optional callable applied to the parsed result; should
            return True when the result is acceptable.

    Usage:
        client = OpenAI(...)
        tool = TheTool(client, model="model-name")
        result = tool.categorize("text ...", with_analysis=True)
    """

    def __init__(
        self,
        client: OpenAI,
        model: str,
    ):
        # A single Operator instance is shared by all tool methods.
        self.operator = Operator(client=client, model=model)

    def categorize(
        self,
        text: str,
        with_analysis: bool = False,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Categorize a text into a single Islamic studies domain category.

        Returns:
            ToolOutput: Object containing:
                - result (str): The assigned Islamic studies category
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="categorizer.yaml",
            output_model=OutputModels.CategorizerOutput,
            resp_format="parse",
            mode=None,
            output_lang=None,
        )

    def extract_keywords(
        self,
        text: str,
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Extract salient keywords from text.

        Returns:
            ToolOutput: Object containing:
                - result (list[str]): List of extracted keywords
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="extract_keywords.yaml",
            output_model=OutputModels.ListStrOutput,
            resp_format="parse",
            mode=None,
        )

    def extract_entities(
        self,
        text: str,
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Perform Named Entity Recognition (NER) over the input text.

        Returns:
            ToolOutput: Object containing:
                - result (list[dict]): List of entities with 'text' and 'type' keys
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="extract_entities.yaml",
            output_model=OutputModels.ListDictStrStrOutput,
            resp_format="parse",
            mode=None,
        )

    def is_question(
        self,
        text: str,
        with_analysis: bool = False,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Detect if the input is phrased as a question.

        Returns:
            ToolOutput: Object containing:
                - result (bool): True if text is a question, False otherwise
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="is_question.yaml",
            output_model=OutputModels.BoolOutput,
            resp_format="parse",
            mode=None,
            output_lang=None,
        )

    def text_to_question(
        self,
        text: str,
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Generate a single question from the given text.

        Returns:
            ToolOutput: Object containing:
                - result (str): The generated question
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="text_to_question.yaml",
            output_model=OutputModels.StrOutput,
            resp_format="parse",
            mode=None,
        )

    def merge_questions(
        self,
        text: list[str],
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        mode: Literal["default", "reason"] = "default",
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Merge multiple questions into a single unified question.

        Args:
            text: The list of questions to merge; joined with ", " before
                being sent to the model.

        Returns:
            ToolOutput: Object containing:
                - result (str): The merged question
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        # Join into one string without rebinding the list parameter,
        # keeping `text`'s declared type intact.
        joined_text = ", ".join(text)
        return self.operator.run(
            # User parameters
            text=joined_text,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="merge_questions.yaml",
            output_model=OutputModels.StrOutput,
            resp_format="parse",
            mode=mode,
        )

    def rewrite(
        self,
        text: str,
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        mode: Literal["positive", "negative", "hard_negative"] = "positive",
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Rewrite a text with different modes.

        Args:
            mode: Rewrite strategy selected in the prompt
                ("positive", "negative", or "hard_negative").

        Returns:
            ToolOutput: Object containing:
                - result (str): The rewritten text
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="rewrite.yaml",
            output_model=OutputModels.StrOutput,
            resp_format="parse",
            mode=mode,
        )

    def subject_to_question(
        self,
        text: str,
        number_of_questions: int,
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Generate a list of questions about a subject.

        Args:
            number_of_questions: How many questions to generate.

        Returns:
            ToolOutput: Object containing:
                - result (list[str]): List of generated questions
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            number_of_questions=number_of_questions,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="subject_to_question.yaml",
            output_model=OutputModels.ReasonListStrOutput,
            resp_format="parse",
            mode=None,
        )

    def summarize(
        self,
        text: str,
        with_analysis: bool = False,
        output_lang: str | None = None,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Summarize the given subject text.

        Returns:
            ToolOutput: Object containing:
                - result (str): The summary text
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            with_analysis=with_analysis,
            output_lang=output_lang,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="summarize.yaml",
            output_model=OutputModels.StrOutput,
            resp_format="parse",
            mode=None,
        )

    def translate(
        self,
        text: str,
        target_language: str,
        with_analysis: bool = False,
        user_prompt: str | None = None,
        temperature: float | None = 0.0,
        logprobs: bool = False,
        top_logprobs: int | None = None,
        validator: Callable[[Any], bool] | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Translate text between languages.

        Args:
            target_language: The language to translate the text into.

        Returns:
            ToolOutput: Object containing:
                - result (str): The translated text
                - logprobs (list | None): Probability data if logprobs enabled
                - analysis (str | None): Detailed reasoning if with_analysis enabled
        """
        return self.operator.run(
            # User parameters
            text=text,
            target_language=target_language,
            with_analysis=with_analysis,
            user_prompt=user_prompt,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            validator=validator,
            # Internal parameters
            prompt_file="translate.yaml",
            output_model=OutputModels.StrOutput,
            resp_format="parse",
            mode=None,
            output_lang=None,
        )

    def run_custom(
        self,
        prompt: str,
        output_model: Any,
        output_lang: str | None = None,
        temperature: float | None = None,
        logprobs: bool | None = None,
        top_logprobs: int | None = None,
    ) -> OutputModels.ToolOutput:
        """
        Custom tool that can do almost anything!

        Args:
            prompt: Free-form instruction sent as the input text.
            output_model: A Pydantic model class describing the expected
                structured output; its JSON schema is embedded in the prompt.

        Returns:
            ToolOutput: Object with fields:
                - result (str): The output result
        """
        return self.operator.run(
            # User parameters
            text=prompt,
            output_model=output_model,
            output_model_str=output_model.model_json_schema(),
            output_lang=output_lang,
            temperature=temperature,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            # Internal parameters
            prompt_file="run_custom.yaml",
            resp_format="parse",
            user_prompt=None,
            with_analysis=False,
            mode=None,
            validator=None,
        )
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: hamtaa-texttools
|
|
3
|
-
Version: 0.1.48
|
|
4
|
-
Summary: A set of high-level NLP tools
|
|
5
|
-
Author: Tohidi, Montazer, Givechi, Mousavinezhad
|
|
6
|
-
Requires-Python: >=3.8
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
Requires-Dist: openai==1.97.1
|
|
9
|
-
Requires-Dist: numpy==1.26.4
|
|
10
|
-
|
|
11
|
-
# Text Tools
|
|
12
|
-
|
|
13
|
-
<p align="center">
|
|
14
|
-
<img src="https://img.shields.io/badge/TextTools-Python%20Text%20Processing-black?style=for-the-badge&logo=python&logoColor=white">
|
|
15
|
-
</p>
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
<p align="center">
|
|
19
|
-
<img src="docs/logo.png" alt="Preview" width="300" height="300">
|
|
20
|
-
</p>
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
## How to Install
|
|
24
|
-
|
|
25
|
-
Install the package using:
|
|
26
|
-
|
|
27
|
-
```bash
|
|
28
|
-
pip install -U hamtaa-texttools
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
---
|
|
33
|
-
|
|
34
|
-
## What This Library Is *Not*
|
|
35
|
-
|
|
36
|
-
This is **not** a collection of low-level utilities.
|
|
37
|
-
|
|
38
|
-
To clarify: this library **does not** include things like:
|
|
39
|
-
- A standard `regex`
|
|
40
|
-
- Word normalization utilities
|
|
41
|
-
|
|
42
|
-
---
|
|
43
|
-
|
|
44
|
-
## What This Library *Provides*
|
|
45
|
-
|
|
46
|
-
This is a set of **high-level natural language processing (NLP)** tools.
|
|
47
|
-
|
|
48
|
-
Some of the features include:
|
|
49
|
-
- `question_detector`: Detecting if an incoming text is a question or not
|
|
50
|
-
- `categorizer`: A text categorizer that requires no fine-tuning
|
|
51
|
-
- ... (Tell me what you want!)
|
|
52
|
-
|
|
53
|
-
---
|
|
54
|
-
|
|
55
|
-
## When to Use This Library
|
|
56
|
-
|
|
57
|
-
Use `texttools` when:
|
|
58
|
-
- You need to **process large volumes of data using OpenAI’s GPT models** via the BATCH API.
|
|
59
|
-
- You want to treat an **LLM as a function** in Python that outputs structured JSON or Pydantic models.
|
|
60
|
-
- You need to **categorize large datasets** using vector embeddings, efficiently and at scale.
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
texttools/__init__.py,sha256=UEPcm1wKU7Hiqw1K_E2ojZSnyQrGyNLzN0spQ_ZXjJ4,784
|
|
2
|
-
texttools/base/__init__.py,sha256=KUGm-Oe0BxlrRhPS-Jm2q1NCmwX8MdtZtloia7bcLaM,189
|
|
3
|
-
texttools/base/base_categorizer.py,sha256=ojup94iXLxh92TjiJmrFXeRbsWKlon7PPAqez96B1bs,1130
|
|
4
|
-
texttools/base/base_keyword_extractor.py,sha256=uKpxb3xI-sim-vXWe1R4_36QRhSNsWDR4IuVdpkZMME,868
|
|
5
|
-
texttools/base/base_ner_extractor.py,sha256=D0LRNSyq1uIU9Qtepi7zpCWWzYz-AOxpVNjq97S1oUA,1933
|
|
6
|
-
texttools/base/base_question_detector.py,sha256=FR9yDP0Z8aAfGafZy3kcpSDUUYWLJM7saRKdeVN5TiM,829
|
|
7
|
-
texttools/base/base_question_generator.py,sha256=L_2ZwqyV9GxsKiQynWKRJG15OBFgQqiCic5H0i8R5yk,3238
|
|
8
|
-
texttools/base/base_question_merger.py,sha256=TYhsihKaIdyGCVu4AcjxPZ1_HocHt__voV8WWGMRpMs,1945
|
|
9
|
-
texttools/base/base_question_rewriter.py,sha256=K6ZnAjxi2qw4yLxm92zTI1IStCfX6c_6lCfIuBDSx8w,1973
|
|
10
|
-
texttools/base/base_router.py,sha256=pFDjIXFqAhPiS9Onu5py_GxOq8geDGJDQh6k6IhCkvw,933
|
|
11
|
-
texttools/base/base_summarizer.py,sha256=7NAilhUPs6ZUwkBpTtXAj6n2XxQH1w6SOolf3gQX2gc,1627
|
|
12
|
-
texttools/base/base_task_performer.py,sha256=3-6qshkie50S7pRG4WHRNC_RdUbSmHOPKW56CD92-rM,1852
|
|
13
|
-
texttools/base/base_translator.py,sha256=BoOxqaoPoUs8t1O3m2yL9pQa5iwisl097immTVcGZoE,1020
|
|
14
|
-
texttools/formatter/__init__.py,sha256=KHz2tFZctbit_HVbQNCTMi46JzmKlg-uB6Ost63IpVU,46
|
|
15
|
-
texttools/formatter/base.py,sha256=0fiM6E7NdJevAVpL6yyPaUZVJGKWxE3fr-Ay1oqgJqQ,879
|
|
16
|
-
texttools/formatter/gemma3_formatter.py,sha256=AmdKBYLj6HMsI2DDX4KHNEEVYJmz_VVNUBOv8ScGjsY,1865
|
|
17
|
-
texttools/handlers/__init__.py,sha256=sv0JloipQ57AI0xo-3w9k6cK5rYjZP3ltR2EbBhkHTA,121
|
|
18
|
-
texttools/handlers/handlers.py,sha256=LtC4FBuzRUDy3Jw-Fp21WR-QS1jOcDhsGaMPFQGjfTw,2381
|
|
19
|
-
texttools/handlers/categorizer/__init__.py,sha256=mE05vt_ma6vcP8pQ37BZ85WVQ8jhcjDS0iZV81_LFCY,127
|
|
20
|
-
texttools/handlers/categorizer/categorizer.py,sha256=HBpdhtCGUPl1TJUOxbgSLmVWD7o9xeIjmSWXvYzGrCA,1592
|
|
21
|
-
texttools/tools/__init__.py,sha256=V3ZjSj_ZI9r02sOmxpxxxKBbBbtuYS1MQqtrdGZHC_A,1121
|
|
22
|
-
texttools/tools/categorizer/__init__.py,sha256=VY0SVdik0et0fwLDj7qn-d5LtVqVBIalvlRVci699i4,48
|
|
23
|
-
texttools/tools/categorizer/encoder_model/__init__.py,sha256=7UwoPlQ09VGN0cqfi5fPQRfsZZ8hoZj6fL6cax1BLSU,53
|
|
24
|
-
texttools/tools/categorizer/encoder_model/encoder_vectorizer.py,sha256=MHPVJQJlvNhZ5xLVXk4FtvrORW2yxPSAnjEhjPbkQts,1476
|
|
25
|
-
texttools/tools/categorizer/llm/__init__.py,sha256=0VbxvInITfNUlOF6bJqcUKKaYWlIe9K3vRmIRuvAGcY,95
|
|
26
|
-
texttools/tools/categorizer/llm/gemma_categorizer.py,sha256=tjwKonTjT5cAhxWQaVyvyooRyOlGACHpnn72PNoLk-8,5636
|
|
27
|
-
texttools/tools/categorizer/llm/openai_categorizer.py,sha256=omRk77Z5ZCIAz17h4wPDP_EcBSsscA-PQJpQjtI6--o,2547
|
|
28
|
-
texttools/tools/keyword_extractor/__init__.py,sha256=eTpujS85MmRRbnNwc2ifKUh60W8OG4RQFmWki3Z7C_0,84
|
|
29
|
-
texttools/tools/keyword_extractor/gemma_extractor.py,sha256=TJ4wMPWRuuzRi_Q0hr7UauKhEg8U_5U5j1D_lTFrn4s,4349
|
|
30
|
-
texttools/tools/merger/__init__.py,sha256=bh2RBpqJvDaqEmDrM9y_GcjRqibagifAxiZVu8nEHc0,115
|
|
31
|
-
texttools/tools/merger/gemma_question_merger.py,sha256=JAC-52kBbabIzEWp0MFi9viiu8nZOAMPaJZALHvNMqo,8035
|
|
32
|
-
texttools/tools/ner/__init__.py,sha256=BW84BcItel6Mc2JlaDL6qvAktVMkti67VXceeCnOB1g,70
|
|
33
|
-
texttools/tools/ner/gemma_ner_extractor.py,sha256=YhyIwX_8bdwkFb4gY8g9mZdYHW_r1jCvbmjjNCK9Wfo,5384
|
|
34
|
-
texttools/tools/question_detector/__init__.py,sha256=ulArGttooSoxEe0vUDQSxUQrnsxr7gH9l-LjSER2dVI,162
|
|
35
|
-
texttools/tools/question_detector/gemma_detector.py,sha256=DhlCAA6Hws_OTuYil6UY4sYlbjdQQU6EqHdoTl3a--w,3772
|
|
36
|
-
texttools/tools/question_detector/llm_detector.py,sha256=zo89eh359hqQGGf83-6M22AaiH7q-m0m91SjTyxZaYs,3862
|
|
37
|
-
texttools/tools/question_generator/__init__.py,sha256=EAElpB_YeyMoBqvFNjbW2a_j18SLtiKQ7sRmdS58Fww,61
|
|
38
|
-
texttools/tools/question_generator/gemma_question_generator.py,sha256=V5QcXmHZ5shTvrThOxUrKJ4FqP0P58NIJbsPdyyy5IM,6744
|
|
39
|
-
texttools/tools/reranker/__init__.py,sha256=70jqJ9cjpPzzvnMYgHYGVZ9PrWrN9N97visqD_PVxwU,100
|
|
40
|
-
texttools/tools/reranker/reranker.py,sha256=2SiTMIxempMuHui2n4GJV_2dLGBeoC7WAn_rVVXlMBA,5518
|
|
41
|
-
texttools/tools/reranker/scorer.py,sha256=fQ3Ya8QmNhrcmb-Rf-72hvhweGvVj6gQ4KOlham2eE8,8176
|
|
42
|
-
texttools/tools/reranker/sorter.py,sha256=_ed5zGz7K60skPFFuEQZ1ObBFA71LAfVT6FyWicA-Pw,11419
|
|
43
|
-
texttools/tools/rewriter/__init__.py,sha256=U_qwGeEOqHAcV4p2CHVb0AIvHKFfdvykRzGyWD54aWA,121
|
|
44
|
-
texttools/tools/rewriter/gemma_question_rewriter.py,sha256=jXtRswfBvHn9QmE90JvxEmLvCTbwZqZhD_A5ONWeCzo,7925
|
|
45
|
-
texttools/tools/router/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
|
-
texttools/tools/router/gemma_router.py,sha256=VX-kHphZVZNd0_ajugN08hGkWNUeUriwfonpYy2TIS4,5619
|
|
47
|
-
texttools/tools/subject_to_question/__init__.py,sha256=VJpns16Qe5OL_-4WuGDUNShcJsodB2khGWT3Q1Hc-WU,72
|
|
48
|
-
texttools/tools/subject_to_question/gemma_question_generator.py,sha256=VKXHhYHEvhFLUR87iEh0eFpD_4ueX4np8IjF-NkgWrY,7417
|
|
49
|
-
texttools/tools/summarizer/__init__.py,sha256=phrR7qO20CNhO3hjXQBzhTRVumdVdGSufmH4GEYkhj4,140
|
|
50
|
-
texttools/tools/summarizer/gemma_summarizer.py,sha256=ikhsBv7AiZD1dT_d12AyjXxojzSW92e2y5WjchI_3bE,4474
|
|
51
|
-
texttools/tools/summarizer/llm_summerizer.py,sha256=-0rUKbSnl1aDeBfJ5DCSbIlwd2k-9qIaCKgoQJa0hWc,3412
|
|
52
|
-
texttools/tools/translator/__init__.py,sha256=KO1m08J2BZwRqBGO9ICB4l4cnH1jfHLHL5HbgYFUWM8,72
|
|
53
|
-
texttools/tools/translator/gemma_translator.py,sha256=rbP0kgkhOiEPdHWgHQc7Lev7lrAIYqNb6t_OfZLp44E,7180
|
|
54
|
-
texttools/utils/flex_processor.py,sha256=Y44uTracvXUJiUm5hh57Uk0933RU9GTc3dN_1Bo_XQA,3214
|
|
55
|
-
texttools/utils/batch_manager/__init__.py,sha256=3ZkxA395lRD4gNxJ1vp0fNuz_XuBr50GoP51rrwQ0Ks,87
|
|
56
|
-
texttools/utils/batch_manager/batch_manager.py,sha256=jAmKskL3OTYwwsO1mWsWAB3VxMlOF07c2GW1Ev83ZhY,9283
|
|
57
|
-
texttools/utils/batch_manager/batch_runner.py,sha256=kW0IPauI11xpssApMA7b4XI19FePImywym3V7tBaa-o,7404
|
|
58
|
-
hamtaa_texttools-0.1.48.dist-info/METADATA,sha256=cUMn4AfOaYObchCIOQ-xrj0vmF3VTsmHYnob3Xi7R2M,1481
|
|
59
|
-
hamtaa_texttools-0.1.48.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
60
|
-
hamtaa_texttools-0.1.48.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
|
|
61
|
-
hamtaa_texttools-0.1.48.dist-info/RECORD,,
|
texttools/base/__init__.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from abc import ABC, abstractmethod
|
|
3
|
-
from enum import Enum
|
|
4
|
-
from typing import Optional
|
|
5
|
-
|
|
6
|
-
from texttools.handlers import NoOpResultHandler, ResultHandler
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BaseCategorizer(ABC):
    """Abstract base for categorizers that fan results out to handlers."""

    def __init__(
        self,
        handlers: Optional[list[ResultHandler]] = None,
    ):
        """
        handlers: List of ResultHandler objects that will process results after categorization.
        """
        # An empty/None handler list falls back to a single no-op handler.
        if handlers:
            self.handlers = handlers
        else:
            self.handlers = [NoOpResultHandler()]

    @abstractmethod
    def categorize(self, text: str) -> Enum:
        """
        Categorize the input text.
        Must return one of the Enum members defined in self.categories.
        """
        ...

    def preprocess(self, text: str) -> str:
        """
        Optional: Preprocess text before categorization.
        """
        # Default implementation is the identity transform.
        return text

    def _dispatch(self, results: dict) -> None:
        """Pass *results* to every handler; a failing handler is logged, not raised."""
        for handler in self.handlers:
            try:
                handler.handle(results)
            except Exception:
                # One broken handler must not prevent the rest from running.
                logging.error(
                    f"Handler {handler.__class__.__name__} failed", exc_info=True
                )
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import Any, Optional
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class BaseKeywordExtractor(ABC):
    """
    Base class for all detectors that output a list of keywords.
    """

    def __init__(
        self,
        handlers: Optional[list[Any]] = None,
    ):
        # None (or an empty list) normalizes to an empty handler list.
        self.handlers = handlers if handlers else []

    @abstractmethod
    def extract_keywords(self, text: str) -> list[str]:
        """
        Extract keywords from the input text.
        Should return a list of strings, where each string is a keyword.
        """
        ...

    def preprocess(self, text: str) -> str:
        """
        Optional text preprocessing step.
        """
        # Default behavior: trim surrounding whitespace only.
        return text.strip()

    def _dispatch(self, result: dict) -> None:
        """
        Dispatch the result to handlers.
        """
        for h in self.handlers:
            h.handle(result)
|