hamtaa-texttools 1.1.21__py3-none-any.whl → 1.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.1.21.dist-info → hamtaa_texttools-1.1.23.dist-info}/METADATA +46 -87
- hamtaa_texttools-1.1.23.dist-info/RECORD +32 -0
- texttools/__init__.py +3 -3
- texttools/batch/batch_config.py +2 -1
- texttools/batch/batch_manager.py +6 -6
- texttools/batch/batch_runner.py +7 -7
- texttools/internals/async_operator.py +29 -41
- texttools/internals/exceptions.py +0 -6
- texttools/internals/operator_utils.py +24 -5
- texttools/internals/prompt_loader.py +0 -5
- texttools/internals/sync_operator.py +29 -41
- texttools/prompts/categorize.yaml +3 -2
- texttools/prompts/check_fact.yaml +5 -0
- texttools/prompts/extract_entities.yaml +4 -0
- texttools/prompts/extract_keywords.yaml +15 -3
- texttools/prompts/is_question.yaml +4 -0
- texttools/prompts/merge_questions.yaml +8 -1
- texttools/prompts/propositionize.yaml +2 -0
- texttools/prompts/rewrite.yaml +3 -4
- texttools/prompts/subject_to_question.yaml +5 -1
- texttools/prompts/summarize.yaml +4 -0
- texttools/prompts/text_to_question.yaml +4 -0
- texttools/prompts/translate.yaml +5 -0
- texttools/tools/async_tools.py +87 -103
- texttools/tools/sync_tools.py +87 -104
- hamtaa_texttools-1.1.21.dist-info/RECORD +0 -32
- {hamtaa_texttools-1.1.21.dist-info → hamtaa_texttools-1.1.23.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.1.21.dist-info → hamtaa_texttools-1.1.23.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.1.21.dist-info → hamtaa_texttools-1.1.23.dist-info}/top_level.txt +0 -0
{hamtaa_texttools-1.1.21.dist-info → hamtaa_texttools-1.1.23.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hamtaa-texttools
-Version: 1.1.21
+Version: 1.1.23
 Summary: A high-level NLP toolkit built on top of modern LLMs.
 Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
 License: MIT License
@@ -37,61 +37,53 @@ Dynamic: license-file

 ## 📌 Overview

-**TextTools** is a high-level **NLP toolkit** built on top of
+**TextTools** is a high-level **NLP toolkit** built on top of **LLMs**.

 It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.

 It provides ready-to-use utilities for **translation, question detection, keyword extraction, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.

+**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
+
 ---

 ## ✨ Features

 TextTools provides a rich collection of high-level NLP utilities,
-Each tool is designed to work with structured outputs
+Each tool is designed to work with structured outputs.

 - **`categorize()`** - Classifies text into given categories
-- **`extract_keywords()`** - Extracts keywords from text
+- **`extract_keywords()`** - Extracts keywords from the text
 - **`extract_entities()`** - Named Entity Recognition (NER) system
-- **`is_question()`** - Binary detection
+- **`is_question()`** - Binary question detection
 - **`text_to_question()`** - Generates questions from text
-- **`merge_questions()`** - Merges multiple questions
-- **`rewrite()`** - Rewrites text
+- **`merge_questions()`** - Merges multiple questions into one
+- **`rewrite()`** - Rewrites text in a diffrent way
 - **`subject_to_question()`** - Generates questions about a specific subject
 - **`summarize()`** - Text summarization
-- **`translate()`** - Text translation
+- **`translate()`** - Text translation
 - **`propositionize()`** - Convert text to atomic independence meaningful sentences
 - **`check_fact()`** - Check whether a statement is relevant to the source text
 - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel

 ---

+## 🚀 Installation
+
+Install the latest release via PyPI:
+
+```bash
+pip install -U hamtaa-texttools
+```
+
+---
+
 ## 📊 Tool Quality Tiers

-| Status | Meaning | Use in Production? |
-
-| **✅ Production** | Evaluated, tested, stable. | **Yes** - ready for reliable use. |
-| **🧪 Experimental** | Added to the package but **not fully evaluated**. Functional, but quality may vary. | **Use with caution** - outputs not yet validated. |
-
-### Current Status
-**Production Tools:**
-- `categorize()` (list mode)
-- `extract_keywords()`
-- `extract_entities()`
-- `is_question()`
-- `text_to_question()`
-- `merge_questions()`
-- `rewrite()`
-- `subject_to_question()`
-- `summarize()`
-- `run_custom()` (fine in most cases)
-
-**Experimental Tools:**
-- `categorize()` (tree mode)
-- `translate()`
-- `propositionize()`
-- `check_fact()`
-- `run_custom()` (not evaluated in all scenarios)
+| Status | Meaning | Tools | Use in Production? |
+|--------|---------|----------|-------------------|
+| **✅ Production** | Evaluated, tested, stable. | `categorize()` (list mode), `extract_keywords()`, `extract_entities()`, `is_question()`, `text_to_question()`, `merge_questions()`, `rewrite()`, `subject_to_question()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
+| **🧪 Experimental** | Added to the package but **not fully evaluated**. Functional, but quality may vary. | `categorize()` (tree mode), `translate()`, `propositionize()`, `check_fact()` | **Use with caution** - outputs not yet validated. |

 ---

@@ -100,49 +92,37 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
 TextTools provides several optional flags to customize LLM behavior:

 - **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
-**Note:** This doubles token usage per call
+**Note:** This doubles token usage per call.

 - **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
 **Note:** This feature works if it's supported by the model.

-- **`output_lang: str`** → Forces the model to respond in a specific language.
+- **`output_lang: str`** → Forces the model to respond in a specific language.

-- **`user_prompt: str`** → Allows you to inject a custom instruction or
+- **`user_prompt: str`** → Allows you to inject a custom instruction or into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.

 - **`temperature: float`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.

-- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a
+- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a boolean. If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can also specify `max_validation_retries=<N>`.

-- **`priority: int (Experimental)`** → Task execution priority level.
+- **`priority: int (Experimental)`** → Task execution priority level. Affects processing order in queues.
 **Note:** This feature works if it's supported by the model and vLLM.

-**Note:** There might be some tools that don't support some of the parameters above.
-
 ---

 ## 🧩 ToolOutput

 Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
-- **`result: Any`**
-- **`analysis: str`**
-- **`logprobs: list`**
-- **`errors: list[str]`**
+- **`result: Any`**
+- **`analysis: str`**
+- **`logprobs: list`**
+- **`errors: list[str]`**
 - **`ToolOutputMetadata`** →
-- **`tool_name: str`**
-- **`processed_at: datetime`**
-- **`execution_time: float`**
+- **`tool_name: str`**
+- **`processed_at: datetime`**
+- **`execution_time: float`**

-**Note:** You can use `repr(ToolOutput)` to
-
----
-
-## 🚀 Installation
-
-Install the latest release via PyPI:
-
-```bash
-pip install -U hamtaa-texttools
-```
+**Note:** You can use `repr(ToolOutput)` to print your output with all the details.

 ---

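To make the flags and the `ToolOutput` fields described above concrete, here is a minimal sketch. It assumes a per-call keyword interface as suggested by the README text in this diff; the exact signatures are not verified against the 1.1.23 wheel.

```python
# Illustrative sketch only: flag names and ToolOutput fields are taken from the
# README text above; exact signatures are assumptions, not verified against
# hamtaa-texttools 1.1.23.
from openai import OpenAI
from texttools import TheTool

client = OpenAI(base_url="your_url", api_key="your_api_key")
the_tool = TheTool(client=client, model="model_name")

output = the_tool.summarize(
    "TextTools is a high-level NLP toolkit built on top of LLMs.",
    with_analysis=True,   # adds a reasoning step (roughly doubles token usage)
    logprobs=True,        # only works if the model supports logprobs
    top_logprobs=3,
    output_lang="English",
    temperature=0.2,
)

print(output.result)    # the tool's main result
print(output.analysis)  # reasoning text when with_analysis=True
print(output.logprobs)  # token-level probabilities, if supported
print(output.errors)    # errors collected during the call
print(repr(output))     # full dump, as the README's repr(ToolOutput) note suggests
```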
@@ -160,26 +140,13 @@ pip install -U hamtaa-texttools
 from openai import OpenAI
 from texttools import TheTool

-# Create your OpenAI client
 client = OpenAI(base_url = "your_url", API_KEY = "your_api_key")
+model = "model_name"

-# Specify the model
-model = "gpt-4o-mini"
-
-# Create an instance of TheTool
 the_tool = TheTool(client=client, model=model)

-
-detection
-print(detection.result)
-print(detection.logprobs)
-# Output: True + logprobs
-
-# Example: Translation
-translation = the_tool.translate("سلام، حالت چطوره؟" target_language="English", with_analysis=True)
-print(translation.result)
-print(translation.analysis)
-# Output: "Hi! How are you?" + analysis
+detection = the_tool.is_question("Is this project open source?")
+print(repr(detection))
 ```

 ---
@@ -192,22 +159,17 @@ from openai import AsyncOpenAI
 from texttools import AsyncTheTool

 async def main():
-    # Create your AsyncOpenAI client
     async_client = AsyncOpenAI(base_url="your_url", api_key="your_api_key")
+    model = "model_name"

-    # Specify the model
-    model = "gpt-4o-mini"
-
-    # Create an instance of AsyncTheTool
     async_the_tool = AsyncTheTool(client=async_client, model=model)

-    # Example: Async Translation and Keyword Extraction
     translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_language="English")
     keywords_task = async_the_tool.extract_keywords("Tomorrow, we will be dead by the car crash")

     (translation, keywords) = await asyncio.gather(translation_task, keywords_task)
-    print(translation
-    print(keywords
+    print(repr(translation))
+    print(repr(keywords))

 asyncio.run(main())
 ```
@@ -229,13 +191,12 @@ Use **TextTools** when you need to:

 Process large datasets efficiently using OpenAI's batch API.

-## ⚡ Quick Start (Batch)
+## ⚡ Quick Start (Batch Runner)

 ```python
 from pydantic import BaseModel
-from texttools import
+from texttools import BatchRunner, BatchConfig

-# Configure your batch job
 config = BatchConfig(
     system_prompt="Extract entities from the text",
     job_name="entity_extraction",
@@ -244,12 +205,10 @@ config = BatchConfig(
     model="gpt-4o-mini"
 )

-# Define your output schema
 class Output(BaseModel):
     entities: list[str]

-
-runner = BatchJobRunner(config, output_model=Output)
+runner = BatchRunner(config, output_model=Output)
 runner.run()
 ```

hamtaa_texttools-1.1.23.dist-info/RECORD ADDED

@@ -0,0 +1,32 @@
+hamtaa_texttools-1.1.23.dist-info/licenses/LICENSE,sha256=Hb2YOBKy2MJQLnyLrX37B4ZVuac8eaIcE71SvVIMOLg,1082
+texttools/__init__.py,sha256=fqGafzxcnGw0_ivi-vUyLfytWOkjLOumiaB0-I612iY,305
+texttools/batch/batch_config.py,sha256=TEUNC4YCEJDVmu-E1V2buglvQQvms8zw6lPRNsLTzDc,1050
+texttools/batch/batch_manager.py,sha256=Z7QiV-uL8QYbPmr7ifUEuOAeFHGNH_ybo8yyHK9Zca8,8730
+texttools/batch/batch_runner.py,sha256=szdvLS2oJtRwdLF29sefrfK7sHsi9WhqQ8xgSySymZA,9982
+texttools/internals/async_operator.py,sha256=kf2A3HBB7ebN5wg-LPVZwnDs8JoudJDeJbU_US408-I,6577
+texttools/internals/exceptions.py,sha256=6SDjUL1rmd3ngzD3ytF4LyTRj3bQMSFR9ECrLoqXXHw,395
+texttools/internals/models.py,sha256=9uoCAe2TLrSzyS9lMJja5orPAYaCvVL1zoCb6FNdkfs,4541
+texttools/internals/operator_utils.py,sha256=RDrNUNhN9QDgCOgw7JAc9IOqZ-gk1Jq-TrjiTlXEE9Q,2414
+texttools/internals/prompt_loader.py,sha256=cQnmeTLSp6MQb6hhv_FwiqzZI__opT7SU7XmPxS33d0,3143
+texttools/internals/sync_operator.py,sha256=Um3ud0DQ-murM8rkZy_Ud7qgfpDqhRqMzHWEkM13wt8,6482
+texttools/internals/text_to_chunks.py,sha256=vY3odhgCZK4E44k_SGlLoSiKkdN0ib6-lQAsPcplAHA,3843
+texttools/prompts/README.md,sha256=ztajRJcmFLhyrUF0_qmOXaCwGsTGCFabfMjch2LAJG0,1375
+texttools/prompts/categorize.yaml,sha256=42Rp3SgVHaDLKrJ27_uK788LiQud0pOXJthz4r0a40Y,1214
+texttools/prompts/check_fact.yaml,sha256=zWFQDRhEE1ij9wSeeenS9YSTM-bY5zzUaG390zUgmcs,714
+texttools/prompts/extract_entities.yaml,sha256=_zYKHNJDIzVDI_-TnwFCKyMs-XLM5igvmWhvSTc3INQ,637
+texttools/prompts/extract_keywords.yaml,sha256=1o4u3uwzapNtB1BUpNIRL5qtrwjW0Yhvyq0TZJiafdg,3272
+texttools/prompts/is_question.yaml,sha256=jnPARd2ZiulLzHW_r4WAsz3sOryfz6Gy5-yYXp-2hd0,496
+texttools/prompts/merge_questions.yaml,sha256=l9Q2OEjPp3SDkxbq3zZCj2ZmXacWSnmYMpUr3l6r5yE,1816
+texttools/prompts/propositionize.yaml,sha256=nbGAfbm1-2Hoc0JLtqZi-S7VHQfnMmuTKI7dZeBxQW0,1403
+texttools/prompts/rewrite.yaml,sha256=klEm8MqXK-Bo8RsS5R9KLMT0zlD-BKo_G6tz9lpAcEY,5420
+texttools/prompts/run_custom.yaml,sha256=IETY9H0wPGWIIzcnupfbwwKQblwZrbYAxB754W9MhgU,125
+texttools/prompts/subject_to_question.yaml,sha256=AK16pZW9HUppIF8JBSEenbUNOU3aqeVV781_WUXnLqk,1160
+texttools/prompts/summarize.yaml,sha256=rPh060Bx_yI1W2JNg-nr83LUk9itatYLKM8ciH2pOvg,486
+texttools/prompts/text_to_question.yaml,sha256=pUwPgK9l5f8S4E5fCht9JY7PFVK2aY1InPfASr7R5o4,1017
+texttools/prompts/translate.yaml,sha256=Dd5bs3O8SI-FlVSwHMYGeEjMmdOWeRlcfBHkhixCx7c,665
+texttools/tools/async_tools.py,sha256=0RjL7zJ565dIYYHJC7-QEZ4FoslB-yawg8rwcv3FBlE,43434
+texttools/tools/sync_tools.py,sha256=-2GGgnAKDfRFxxrCgFIA1h6wgoxehrrqEa-meay60r0,43227
+hamtaa_texttools-1.1.23.dist-info/METADATA,sha256=EsEgRolMNIv5dLunjtJTTSlYIaO6oE8JHRCsqP8uerk,8718
+hamtaa_texttools-1.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hamtaa_texttools-1.1.23.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
+hamtaa_texttools-1.1.23.dist-info/RECORD,,
texttools/__init__.py CHANGED

@@ -1,7 +1,7 @@
-from .batch.batch_runner import BatchJobRunner
-from .batch.batch_config import BatchConfig
 from .tools.sync_tools import TheTool
 from .tools.async_tools import AsyncTheTool
 from .internals.models import CategoryTree
+from .batch.batch_runner import BatchRunner
+from .batch.batch_config import BatchConfig

-__all__ = ["TheTool", "AsyncTheTool", "
+__all__ = ["TheTool", "AsyncTheTool", "CategoryTree", "BatchRunner", "BatchConfig"]
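The hunk above is the user-visible side of the rename: 1.1.21 imported `BatchJobRunner` at the top level, while 1.1.23 exports `BatchRunner` alongside `BatchConfig` and `CategoryTree`. A minimal sketch of what a downstream import looks like after upgrading (whether `BatchJobRunner` was ever listed in `__all__` on 1.1.21 is not shown here, since the old line is truncated):

```python
# Sketch of the import change implied by this hunk.
# 1.1.21:
# from texttools.batch.batch_runner import BatchJobRunner
# 1.1.23:
from texttools import TheTool, AsyncTheTool, CategoryTree, BatchRunner, BatchConfig
```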
texttools/batch/batch_config.py CHANGED

@@ -1,3 +1,4 @@
+from typing import Any
 from dataclasses import dataclass
 from collections.abc import Callable

@@ -10,7 +11,7 @@ def export_data(data) -> list[dict[str, str]]:
     return data


-def import_data(data) ->
+def import_data(data) -> Any:
     """
     Takes the output and adds and aggregates it to the original structure.
     """
texttools/batch/batch_manager.py CHANGED

@@ -1,7 +1,7 @@
 import json
 import uuid
 from pathlib import Path
-from typing import Type, TypeVar
+from typing import Type, TypeVar, Any
 import logging

 from pydantic import BaseModel
@@ -31,7 +31,7 @@ class BatchManager:
         prompt_template: str,
         state_dir: Path = Path(".batch_jobs"),
         custom_json_schema_obj_str: dict | None = None,
-        **client_kwargs:
+        **client_kwargs: Any,
     ):
         self._client = client
         self._model = model
@@ -51,7 +51,7 @@ class BatchManager:
     def _state_file(self, job_name: str) -> Path:
         return self._state_dir / f"{job_name}.json"

-    def _load_state(self, job_name: str) -> list[dict[str,
+    def _load_state(self, job_name: str) -> list[dict[str, Any]]:
         """
         Loads the state (job information) from the state file for the given job name.
         Returns an empty list if the state file does not exist.
@@ -62,7 +62,7 @@ class BatchManager:
                 return json.load(f)
         return []

-    def _save_state(self, job_name: str, jobs: list[dict[str,
+    def _save_state(self, job_name: str, jobs: list[dict[str, Any]]) -> None:
         """
         Saves the job state to the state file for the given job name.
         """
@@ -77,11 +77,11 @@ class BatchManager:
         if path.exists():
             path.unlink()

-    def _build_task(self, text: str, idx: str) -> dict[str,
+    def _build_task(self, text: str, idx: str) -> dict[str, Any]:
         """
         Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
         """
-        response_format_config: dict[str,
+        response_format_config: dict[str, Any]

         if self._custom_json_schema_obj_str:
             response_format_config = {
texttools/batch/batch_runner.py CHANGED

@@ -2,7 +2,7 @@ import json
 import os
 import time
 from pathlib import Path
-from typing import Type, TypeVar
+from typing import Type, TypeVar, Any
 import logging

 from dotenv import load_dotenv
@@ -12,7 +12,7 @@ from pydantic import BaseModel
 from texttools.batch.batch_manager import BatchManager
 from texttools.batch.batch_config import BatchConfig
 from texttools.internals.models import Str
-from texttools.internals.exceptions import TextToolsError
+from texttools.internals.exceptions import TextToolsError

 # Base Model type for output models
 T = TypeVar("T", bound=BaseModel)
@@ -20,7 +20,7 @@ T = TypeVar("T", bound=BaseModel)
 logger = logging.getLogger("texttools.batch_runner")


-class BatchJobRunner:
+class BatchRunner:
     """
     Handles running batch jobs using a batch manager and configuration.
     """
@@ -38,7 +38,7 @@ class BatchJobRunner:
         self._output_model = output_model
         self._manager = self._init_manager()
         self._data = self._load_data()
-        self._parts: list[list[dict[str,
+        self._parts: list[list[dict[str, Any]]] = []
         # Map part index to job name
         self._part_idx_to_job_name: dict[int, str] = {}
         # Track retry attempts per part
@@ -47,7 +47,7 @@ class BatchJobRunner:
             Path(self._config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

         except Exception as e:
-            raise
+            raise TextToolsError(f"Batch runner initialization failed: {e}")

     def _init_manager(self) -> BatchManager:
         load_dotenv()
@@ -130,8 +130,8 @@ class BatchJobRunner:

     def _save_results(
         self,
-        output_data: list[dict[str,
-        log: list[
+        output_data: list[dict[str, Any]] | dict[str, Any],
+        log: list[Any],
         part_idx: int,
     ):
         part_suffix = f"_part_{part_idx + 1}" if len(self._parts) > 1 else ""
texttools/internals/async_operator.py CHANGED

@@ -1,4 +1,4 @@
-from typing import TypeVar, Type
+from typing import TypeVar, Type, Any
 from collections.abc import Callable

 from openai import AsyncOpenAI
@@ -27,17 +27,11 @@ class AsyncOperator:
         self._client = client
         self._model = model

-    async def _analyze_completion(self,
+    async def _analyze_completion(self, analyze_message: list[dict[str, str]]) -> str:
         try:
-            if not analyze_prompt:
-                raise PromptError("Analyze template is empty")
-
-            analyze_message = OperatorUtils.build_user_message(analyze_prompt)
-
             completion = await self._client.chat.completions.create(
                 model=self._model,
                 messages=analyze_message,
-                temperature=temperature,
             )

             if not completion.choices:
@@ -57,20 +51,18 @@ class AsyncOperator:

     async def _parse_completion(
         self,
-
+        main_message: list[dict[str, str]],
         output_model: Type[T],
         temperature: float,
         logprobs: bool,
         top_logprobs: int,
-        priority: int,
-    ) -> tuple[T,
+        priority: int | None,
+    ) -> tuple[T, Any]:
         """
         Parses a chat completion using OpenAI's structured output format.
-        Returns both the parsed
+        Returns both the parsed Any and the raw completion for logprobs.
         """
         try:
-            main_message = OperatorUtils.build_user_message(main_prompt)
-
             request_kwargs = {
                 "model": self._model,
                 "messages": main_message,
@@ -82,7 +74,7 @@ class AsyncOperator:
                 request_kwargs["logprobs"] = True
                 request_kwargs["top_logprobs"] = top_logprobs

-            if priority:
+            if priority is not None:
                 request_kwargs["extra_body"] = {"priority": priority}

             completion = await self._client.beta.chat.completions.parse(
@@ -114,50 +106,48 @@ class AsyncOperator:
         temperature: float,
         logprobs: bool,
         top_logprobs: int,
-        validator: Callable[[
+        validator: Callable[[Any], bool] | None,
         max_validation_retries: int | None,
-        priority: int,
+        priority: int | None,
         # Internal parameters
-
+        tool_name: str,
         output_model: Type[T],
         mode: str | None,
         **extra_kwargs,
     ) -> OperatorOutput:
         """
-        Execute the LLM pipeline with the given input text.
+        Execute the LLM pipeline with the given input text.
         """
         try:
             prompt_loader = PromptLoader()
-
             prompt_configs = prompt_loader.load(
-                prompt_file=
+                prompt_file=tool_name + ".yaml",
                 text=text.strip(),
                 mode=mode,
                 **extra_kwargs,
             )

-
-            analysis = ""
+            analysis: str | None = None

             if with_analysis:
-
-                prompt_configs["analyze_template"]
+                analyze_message = OperatorUtils.build_message(
+                    prompt_configs["analyze_template"]
                 )
-
-
-            if output_lang:
-                main_prompt += f"Respond only in the {output_lang} language.\n"
+                analysis = await self._analyze_completion(analyze_message)

-
-
-
-
-
-            if logprobs and (not isinstance(top_logprobs, int) or top_logprobs < 2):
-                raise ValueError("top_logprobs should be an integer greater than 1")
+            main_message = OperatorUtils.build_message(
+                OperatorUtils.build_main_prompt(
+                    prompt_configs["main_template"], analysis, output_lang, user_prompt
+                )
+            )

             parsed, completion = await self._parse_completion(
-
+                main_message,
+                output_model,
+                temperature,
+                logprobs,
+                top_logprobs,
+                priority,
             )

             # Retry logic if validation fails
@@ -166,9 +156,7 @@ class AsyncOperator:
                 not isinstance(max_validation_retries, int)
                 or max_validation_retries < 1
             ):
-                raise ValueError(
-                    "max_validation_retries should be a positive integer"
-                )
+                raise ValueError("max_validation_retries should be a positive int")

             succeeded = False
             for _ in range(max_validation_retries):
@@ -177,7 +165,7 @@ class AsyncOperator:

                 try:
                     parsed, completion = await self._parse_completion(
-
+                        main_message,
                         output_model,
                         retry_temperature,
                         logprobs,
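One behavioral detail in the hunk above is the switch from `if priority:` to `if priority is not None:` when building `request_kwargs`. A small standalone sketch of the difference; `old_kwargs` and `new_kwargs` are hypothetical helpers for illustration, not texttools functions:

```python
# Standalone illustration of the truthiness fix seen above.
def old_kwargs(priority):
    kwargs = {}
    if priority:  # 0 is falsy, so priority=0 was silently dropped
        kwargs["extra_body"] = {"priority": priority}
    return kwargs


def new_kwargs(priority):
    kwargs = {}
    if priority is not None:  # only None means "no priority requested"
        kwargs["extra_body"] = {"priority": priority}
    return kwargs


print(old_kwargs(0))  # {}
print(new_kwargs(0))  # {'extra_body': {'priority': 0}}
```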
texttools/internals/operator_utils.py CHANGED

@@ -5,7 +5,29 @@ import random

 class OperatorUtils:
     @staticmethod
-    def
+    def build_main_prompt(
+        main_template: str,
+        analysis: str | None,
+        output_lang: str | None,
+        user_prompt: str | None,
+    ) -> str:
+        main_prompt = ""
+
+        if analysis:
+            main_prompt += f"Based on this analysis:\n{analysis}\n"
+
+        if output_lang:
+            main_prompt += f"Respond only in the {output_lang} language.\n"
+
+        if user_prompt:
+            main_prompt += f"Consider this instruction {user_prompt}\n"
+
+        main_prompt += main_template
+
+        return main_prompt
+
+    @staticmethod
+    def build_message(prompt: str) -> list[dict[str, str]]:
         return [{"role": "user", "content": prompt}]

     @staticmethod
@@ -20,7 +42,7 @@ class OperatorUtils:

         for choice in completion.choices:
             if not getattr(choice, "logprobs", None):
-
+                raise ValueError("Your model does not support logprobs")

             for logprob_item in choice.logprobs.content:
                 if ignore_pattern.match(logprob_item.token):
@@ -45,9 +67,6 @@ class OperatorUtils:

     @staticmethod
     def get_retry_temp(base_temp: float) -> float:
-        """
-        Calculate temperature for retry attempts.
-        """
         delta_temp = random.choice([-1, 1]) * random.uniform(0.1, 0.9)
         new_temp = base_temp + delta_temp

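The new `OperatorUtils.build_main_prompt()` and `build_message()` helpers added above centralize the prompt assembly that the operators previously did inline. A small sketch of how they compose, based only on the code shown in this hunk; the template string is a made-up placeholder and the import path simply mirrors the file being diffed:

```python
# Based on the helpers added above; the template text is a placeholder.
from texttools.internals.operator_utils import OperatorUtils

prompt = OperatorUtils.build_main_prompt(
    main_template="Summarize the following text:\n{text}",  # placeholder template
    analysis=None,                 # skipped: no "Based on this analysis:" prefix
    output_lang="English",         # adds the "Respond only in the English language." line
    user_prompt="Keep it under two sentences.",
)
message = OperatorUtils.build_message(prompt)
print(message)
# [{'role': 'user', 'content': 'Respond only in the English language.\n'
#   'Consider this instruction Keep it under two sentences.\n'
#   'Summarize the following text:\n{text}'}]
```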
texttools/internals/prompt_loader.py CHANGED

@@ -8,11 +8,6 @@ from texttools.internals.exceptions import PromptError
 class PromptLoader:
     """
     Utility for loading and formatting YAML prompt templates.
-
-    Responsibilities:
-    - Load and parse YAML prompt definitions.
-    - Select the right template (by mode, if applicable).
-    - Inject variables (`{text}`, plus any extra kwargs) into the templates.
     """

     MAIN_TEMPLATE = "main_template"
|