hamtaa-texttools 1.1.20__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.1.20.dist-info → hamtaa_texttools-1.1.22.dist-info}/METADATA +49 -109
- hamtaa_texttools-1.1.22.dist-info/RECORD +32 -0
- texttools/__init__.py +3 -3
- texttools/batch/batch_config.py +14 -1
- texttools/batch/batch_runner.py +2 -2
- texttools/internals/async_operator.py +49 -92
- texttools/internals/models.py +74 -105
- texttools/internals/operator_utils.py +25 -27
- texttools/internals/prompt_loader.py +3 -20
- texttools/internals/sync_operator.py +49 -92
- texttools/prompts/README.md +2 -2
- texttools/prompts/categorize.yaml +35 -77
- texttools/prompts/check_fact.yaml +2 -2
- texttools/prompts/extract_entities.yaml +2 -2
- texttools/prompts/extract_keywords.yaml +6 -6
- texttools/prompts/is_question.yaml +2 -2
- texttools/prompts/merge_questions.yaml +4 -4
- texttools/prompts/propositionize.yaml +2 -2
- texttools/prompts/rewrite.yaml +6 -6
- texttools/prompts/run_custom.yaml +1 -1
- texttools/prompts/subject_to_question.yaml +2 -2
- texttools/prompts/summarize.yaml +2 -2
- texttools/prompts/text_to_question.yaml +2 -2
- texttools/prompts/translate.yaml +2 -2
- texttools/tools/async_tools.py +393 -487
- texttools/tools/sync_tools.py +394 -488
- hamtaa_texttools-1.1.20.dist-info/RECORD +0 -33
- texttools/batch/internals/utils.py +0 -13
- {hamtaa_texttools-1.1.20.dist-info → hamtaa_texttools-1.1.22.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.1.20.dist-info → hamtaa_texttools-1.1.22.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.1.20.dist-info → hamtaa_texttools-1.1.22.dist-info}/top_level.txt +0 -0
- /texttools/batch/{internals/batch_manager.py → batch_manager.py} +0 -0
{hamtaa_texttools-1.1.20.dist-info → hamtaa_texttools-1.1.22.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hamtaa-texttools
-Version: 1.1.20
+Version: 1.1.22
 Summary: A high-level NLP toolkit built on top of modern LLMs.
 Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
 License: MIT License
@@ -37,61 +37,53 @@ Dynamic: license-file
 
 ## 📌 Overview
 
-**TextTools** is a high-level **NLP toolkit** built on top of
+**TextTools** is a high-level **NLP toolkit** built on top of **LLMs**.
 
 It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
 
 It provides ready-to-use utilities for **translation, question detection, keyword extraction, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
 
+**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
+
 ---
 
 ## ✨ Features
 
 TextTools provides a rich collection of high-level NLP utilities,
-Each tool is designed to work with structured outputs
+Each tool is designed to work with structured outputs.
 
-- **`categorize()`** - Classifies text into given categories
-- **`extract_keywords()`** - Extracts keywords from text
+- **`categorize()`** - Classifies text into given categories
+- **`extract_keywords()`** - Extracts keywords from the text
 - **`extract_entities()`** - Named Entity Recognition (NER) system
-- **`is_question()`** - Binary detection
+- **`is_question()`** - Binary question detection
 - **`text_to_question()`** - Generates questions from text
-- **`merge_questions()`** - Merges multiple questions
-- **`rewrite()`** - Rewrites text
+- **`merge_questions()`** - Merges multiple questions into one
+- **`rewrite()`** - Rewrites text in a diffrent way
 - **`subject_to_question()`** - Generates questions about a specific subject
 - **`summarize()`** - Text summarization
-- **`translate()`** - Text translation
+- **`translate()`** - Text translation
 - **`propositionize()`** - Convert text to atomic independence meaningful sentences
-- **`check_fact()`** - Check a statement is relevant to source text
+- **`check_fact()`** - Check whether a statement is relevant to the source text
 - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
 
 ---
 
+## 🚀 Installation
+
+Install the latest release via PyPI:
+
+```bash
+pip install -U hamtaa-texttools
+```
+
+---
+
 ## 📊 Tool Quality Tiers
 
-| Status | Meaning | Use in Production? |
-|--------|---------|-------------------|
-| **✅ Production** | Evaluated, tested, stable. | **Yes** - ready for reliable use. |
-| **🧪 Experimental** | Added to the package but **not fully evaluated**. Functional, but quality may vary. | **Use with caution** - outputs not yet validated. |
-
-### Current Status
-**Production Tools:**
-- `categorize()` (list mode)
-- `extract_keywords()`
-- `extract_entities()`
-- `is_question()`
-- `text_to_question()`
-- `merge_questions()`
-- `rewrite()`
-- `subject_to_question()`
-- `summarize()`
-- `run_custom()` (fine in most cases)
-
-**Experimental Tools:**
-- `categorize()` (tree mode)
-- `translate()`
-- `propositionize()`
-- `check_fact()`
-- `run_custom()` (not evaluated in all scenarios)
+| Status | Meaning | Tools | Use in Production? |
+|--------|---------|----------|-------------------|
+| **✅ Production** | Evaluated, tested, stable. | `categorize()` (list mode), `extract_keywords()`, `extract_entities()`, `is_question()`, `text_to_question()`, `merge_questions()`, `rewrite()`, `subject_to_question()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
+| **🧪 Experimental** | Added to the package but **not fully evaluated**. Functional, but quality may vary. | `categorize()` (tree mode), `translate()`, `propositionize()`, `check_fact()` | **Use with caution** - outputs not yet validated. |
 
 ---
 
@@ -100,48 +92,37 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
 TextTools provides several optional flags to customize LLM behavior:
 
 - **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
-**Note:** This doubles token usage per call
+**Note:** This doubles token usage per call.
 
 - **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
 **Note:** This feature works if it's supported by the model.
 
-- **`output_lang: str`** → Forces the model to respond in a specific language.
+- **`output_lang: str`** → Forces the model to respond in a specific language.
 
-- **`user_prompt: str`** → Allows you to inject a custom instruction or
+- **`user_prompt: str`** → Allows you to inject a custom instruction or into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
 
 - **`temperature: float`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.
 
-- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a
+- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a boolean. If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can also specify `max_validation_retries=<N>`.
 
-- **`priority: int (Experimental)`** → Task execution priority level.
+- **`priority: int (Experimental)`** → Task execution priority level. Affects processing order in queues.
 **Note:** This feature works if it's supported by the model and vLLM.
 
-**Note:** There might be some tools that don't support some of the parameters above.
-
 ---
 
 ## 🧩 ToolOutput
 
 Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
-- **`result: Any`**
-- **`analysis: str`**
-- **`logprobs: list`**
-- **`
-- **`
-- **`
-- **`
+- **`result: Any`**
+- **`analysis: str`**
+- **`logprobs: list`**
+- **`errors: list[str]`**
+- **`ToolOutputMetadata`** →
+  - **`tool_name: str`**
+  - **`processed_at: datetime`**
+  - **`execution_time: float`**
 
-**Note:** You can use `repr(ToolOutput)` to
-
----
-
-## 🚀 Installation
-
-Install the latest release via PyPI:
-
-```bash
-pip install -U hamtaa-texttools
-```
+**Note:** You can use `repr(ToolOutput)` to print your output with all the details.
 
 ---
 
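The optional flags and the `ToolOutput` fields described in the hunk above can be combined on a single call. A short sketch of how that looks (the endpoint, key, and model name are placeholders, and exactly which flags each tool accepts follows the notes above, so treat this as illustrative rather than the package's own example):

```python
from openai import OpenAI
from texttools import TheTool

# Placeholders: point these at your own endpoint and model.
client = OpenAI(base_url="your_url", api_key="your_api_key")
the_tool = TheTool(client=client, model="model_name")

# with_analysis adds a reasoning step (and doubles token usage),
# output_lang forces the response language, and logprobs/top_logprobs
# request token-level probabilities when the model supports them.
summary = the_tool.summarize(
    "TextTools is a high-level NLP toolkit built on top of LLMs.",
    with_analysis=True,
    output_lang="English",
    logprobs=True,
    top_logprobs=3,
    temperature=0.2,
)

print(summary.result)    # structured result
print(summary.analysis)  # reasoning text, present because with_analysis=True
print(summary.logprobs)  # token-level probabilities, if supported
print(summary.errors)    # any errors collected during the call
print(repr(summary))     # full ToolOutput with all the details
```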
@@ -159,26 +140,13 @@ pip install -U hamtaa-texttools
 from openai import OpenAI
 from texttools import TheTool
 
-# Create your OpenAI client
 client = OpenAI(base_url = "your_url", API_KEY = "your_api_key")
+model = "model_name"
 
-# Specify the model
-model = "gpt-4o-mini"
-
-# Create an instance of TheTool
 the_tool = TheTool(client=client, model=model)
 
-
-detection
-print(detection.result)
-print(detection.logprobs)
-# Output: True + logprobs
-
-# Example: Translation
-translation = the_tool.translate("سلام، حالت چطوره؟" target_language="English", with_analysis=True)
-print(translation.result)
-print(translation.analysis)
-# Output: "Hi! How are you?" + analysis
+detection = the_tool.is_question("Is this project open source?")
+print(repr(detection))
 ```
 
 ---
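The experimental `validator` flag from the list further up pairs naturally with the quick start above. A minimal sketch, assuming `summarize()` (like the other tools) accepts the flag; the validator is any callable that takes the parsed result and returns a bool, and `max_validation_retries` caps the retry loop:

```python
from openai import OpenAI
from texttools import TheTool

client = OpenAI(base_url="your_url", api_key="your_api_key")
the_tool = TheTool(client=client, model="model_name")

# Reject outputs that come back empty; TheTool retries with an
# adjusted temperature until the validator passes or retries run out.
def non_empty(result) -> bool:
    return bool(result)

summary = the_tool.summarize(
    "TextTools provides translation, keyword extraction, categorization, and more.",
    validator=non_empty,
    max_validation_retries=3,
)
print(summary.result)
```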
@@ -191,22 +159,17 @@ from openai import AsyncOpenAI
 from texttools import AsyncTheTool
 
 async def main():
-    # Create your AsyncOpenAI client
     async_client = AsyncOpenAI(base_url="your_url", api_key="your_api_key")
+    model = "model_name"
 
-    # Specify the model
-    model = "gpt-4o-mini"
-
-    # Create an instance of AsyncTheTool
     async_the_tool = AsyncTheTool(client=async_client, model=model)
 
-    # Example: Async Translation and Keyword Extraction
     translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_language="English")
     keywords_task = async_the_tool.extract_keywords("Tomorrow, we will be dead by the car crash")
 
     (translation, keywords) = await asyncio.gather(translation_task, keywords_task)
-    print(translation
-    print(keywords
+    print(repr(translation))
+    print(repr(keywords))
 
 asyncio.run(main())
 ```
@@ -224,37 +187,16 @@ Use **TextTools** when you need to:
 
 ---
 
-## 🔍 Logging
-
-TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
-
-
-```python
-import logging
-
-# Default: warnings and errors only
-logging.basicConfig(level=logging.WARNING)
-
-# Debug everything (verbose)
-logging.basicConfig(level=logging.DEBUG)
-
-# Complete silence
-logging.basicConfig(level=logging.CRITICAL)
-```
-
----
-
 ## 📚 Batch Processing
 
 Process large datasets efficiently using OpenAI's batch API.
 
-## ⚡ Quick Start (Batch)
+## ⚡ Quick Start (Batch Runner)
 
 ```python
 from pydantic import BaseModel
-from texttools import
+from texttools import BatchRunner, BatchConfig
 
-# Configure your batch job
 config = BatchConfig(
     system_prompt="Extract entities from the text",
     job_name="entity_extraction",
@@ -263,12 +205,10 @@ config = BatchConfig(
     model="gpt-4o-mini"
 )
 
-# Define your output schema
 class Output(BaseModel):
     entities: list[str]
 
-
-runner = BatchJobRunner(config, output_model=Output)
+runner = BatchRunner(config, output_model=Output)
 runner.run()
 ```
 
hamtaa_texttools-1.1.22.dist-info/RECORD ADDED

@@ -0,0 +1,32 @@
+hamtaa_texttools-1.1.22.dist-info/licenses/LICENSE,sha256=Hb2YOBKy2MJQLnyLrX37B4ZVuac8eaIcE71SvVIMOLg,1082
+texttools/__init__.py,sha256=fqGafzxcnGw0_ivi-vUyLfytWOkjLOumiaB0-I612iY,305
+texttools/batch/batch_config.py,sha256=scWYQBDuaTj8-b2x_a33Zu-zxm7eqEf5FFoquD-Sv94,1029
+texttools/batch/batch_manager.py,sha256=6HfsexU0PHGGBH7HKReZ-CQxaQI9DXYKAPsFXxovb_I,8740
+texttools/batch/batch_runner.py,sha256=bpgRnFZiaxqAP6sm3kzb-waeNhIRxXYhttGikGFeXXU,10013
+texttools/internals/async_operator.py,sha256=VHs06Yd_OZqUVyhCOMn7iujEChqhJg8aRS8NXpHBO1w,6719
+texttools/internals/exceptions.py,sha256=h_yp_5i_5IfmqTBQ4S6ZOISrrliJBQ3HTEAjwJXrplk,495
+texttools/internals/models.py,sha256=9uoCAe2TLrSzyS9lMJja5orPAYaCvVL1zoCb6FNdkfs,4541
+texttools/internals/operator_utils.py,sha256=p44-YovUiLefJ-akB3o7Tk1o73ITFxx7E77pod4Aa1Y,2491
+texttools/internals/prompt_loader.py,sha256=yYXDD4YYG2zohGPAmvZwmv5f6xV_RSl5yOrObTh9w7I,3352
+texttools/internals/sync_operator.py,sha256=23mIxk96SOOkYb_7VXjmkNKuWqPTRQVhO4cTKQ_4Mtw,6624
+texttools/internals/text_to_chunks.py,sha256=vY3odhgCZK4E44k_SGlLoSiKkdN0ib6-lQAsPcplAHA,3843
+texttools/prompts/README.md,sha256=ztajRJcmFLhyrUF0_qmOXaCwGsTGCFabfMjch2LAJG0,1375
+texttools/prompts/categorize.yaml,sha256=016b1uGtbKXEwB8_2_bBgVuUelBlu_rgT85XK_c3Yv0,1219
+texttools/prompts/check_fact.yaml,sha256=gQqacCXqUEx3u2FRwhFSZHvhyWGwsYuJd1nIJyhpu7Q,700
+texttools/prompts/extract_entities.yaml,sha256=DN8lZjvzCjotODnHFkWIAxFvmVvoeSs-hDKdN1L6bec,608
+texttools/prompts/extract_keywords.yaml,sha256=GoeApi9SUCLZgs18H2-2BxZiKQ3lHptMPesgq3cluqU,3171
+texttools/prompts/is_question.yaml,sha256=w5qF-z05h62YVs-0x2b2ySlHDKIhukFC9pibnvNM0vc,469
+texttools/prompts/merge_questions.yaml,sha256=f6bHEx54jJ8hnb8iDBUCxXeGdGwRFmuu7vOkVWdaIkM,1788
+texttools/prompts/propositionize.yaml,sha256=agZKQY-NmeJD86DGjmd-paIuazf82bczIGadgzSP5Vs,1378
+texttools/prompts/rewrite.yaml,sha256=h6x8aXcW8oRxEbp466eak0y-LCkUOKf-mJ-vNVp5j5M,5386
+texttools/prompts/run_custom.yaml,sha256=IETY9H0wPGWIIzcnupfbwwKQblwZrbYAxB754W9MhgU,125
+texttools/prompts/subject_to_question.yaml,sha256=TfVmZ6gDgaHRqJWCVkFlKpuJczpMvJTo4XLWPaq5zic,1145
+texttools/prompts/summarize.yaml,sha256=CKx4vjhHbGus1TdjDz_oc0bNEQtq7zfHsZkV2WeYHDU,457
+texttools/prompts/text_to_question.yaml,sha256=mnArBoYu7gpGHriaU2-Aw5SixB2ZIgoHMt99PnTPKD0,1003
+texttools/prompts/translate.yaml,sha256=ew9RERAVSzg0cvxAinNwTSFIaOIjdwIsekbUsgAuNgo,632
+texttools/tools/async_tools.py,sha256=s3g6_8Jmg2KvdItWa3sXGfWI8YaOUPnfIRtWhWRMd1c,44543
+texttools/tools/sync_tools.py,sha256=AcApMy_XvT47rBtqGdAFrKE1QDZq30f0uJsqiWYUWQg,44349
+hamtaa_texttools-1.1.22.dist-info/METADATA,sha256=RF431cau25sLMmynuSHXKssKt3ipFt5M9ZKJJA3C9UI,8718
+hamtaa_texttools-1.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+hamtaa_texttools-1.1.22.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
+hamtaa_texttools-1.1.22.dist-info/RECORD,,
texttools/__init__.py CHANGED

@@ -1,7 +1,7 @@
-from .batch.batch_runner import BatchJobRunner
-from .batch.batch_config import BatchConfig
 from .tools.sync_tools import TheTool
 from .tools.async_tools import AsyncTheTool
 from .internals.models import CategoryTree
+from .batch.batch_runner import BatchRunner
+from .batch.batch_config import BatchConfig
 
-__all__ = ["TheTool", "AsyncTheTool", "
+__all__ = ["TheTool", "AsyncTheTool", "CategoryTree", "BatchRunner", "BatchConfig"]
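On the caller side, the `BatchJobRunner` → `BatchRunner` rename shown above carries through to the public import. A minimal sketch of the migration, reusing the `config` and `Output` objects from the batch quick start earlier in this diff:

```python
# 1.1.20
# from texttools import BatchJobRunner
# runner = BatchJobRunner(config, output_model=Output)

# 1.1.22
from texttools import BatchRunner

runner = BatchRunner(config, output_model=Output)
runner.run()
```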
texttools/batch/batch_config.py CHANGED

@@ -1,7 +1,20 @@
 from dataclasses import dataclass
 from collections.abc import Callable
 
-
+
+def export_data(data) -> list[dict[str, str]]:
+    """
+    Produces a structure of the following form from an initial data structure:
+    [{"id": str, "text": str},...]
+    """
+    return data
+
+
+def import_data(data) -> object:
+    """
+    Takes the output and adds and aggregates it to the original structure.
+    """
+    return data
 
 
 @dataclass
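`batch_config.py` now ships pass-through `export_data` and `import_data` defaults. A sketch of what user-supplied replacements could look like, matching the shapes in the docstrings above; how (or whether) custom callables get wired into `BatchConfig` is not shown in this diff, so treat that wiring as an assumption:

```python
# Hypothetical custom mappers following the documented shapes.

def export_articles(articles: dict[str, str]) -> list[dict[str, str]]:
    # {"a1": "some text", ...}  ->  [{"id": "a1", "text": "some text"}, ...]
    return [{"id": key, "text": value} for key, value in articles.items()]

def import_results(results: list[dict[str, str]]) -> dict[str, str]:
    # [{"id": "a1", "text": "model output"}, ...]  ->  {"a1": "model output", ...}
    return {row["id"]: row["text"] for row in results}
```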
texttools/batch/batch_runner.py CHANGED

@@ -9,7 +9,7 @@ from dotenv import load_dotenv
 from openai import OpenAI
 from pydantic import BaseModel
 
-from texttools.batch.internals.batch_manager import BatchManager
+from texttools.batch.batch_manager import BatchManager
 from texttools.batch.batch_config import BatchConfig
 from texttools.internals.models import Str
 from texttools.internals.exceptions import TextToolsError, ConfigurationError

@@ -20,7 +20,7 @@ T = TypeVar("T", bound=BaseModel)
 logger = logging.getLogger("texttools.batch_runner")
 
 
-class BatchJobRunner:
+class BatchRunner:
     """
     Handles running batch jobs using a batch manager and configuration.
     """
texttools/internals/async_operator.py CHANGED

@@ -1,11 +1,10 @@
 from typing import TypeVar, Type
 from collections.abc import Callable
-import logging
 
 from openai import AsyncOpenAI
 from pydantic import BaseModel
 
-from texttools.internals.models import
+from texttools.internals.models import OperatorOutput
 from texttools.internals.operator_utils import OperatorUtils
 from texttools.internals.prompt_loader import PromptLoader
 from texttools.internals.exceptions import (
@@ -18,39 +17,21 @@ from texttools.internals.exceptions import (
 # Base Model type for output models
 T = TypeVar("T", bound=BaseModel)
 
-logger = logging.getLogger("texttools.async_operator")
-
 
 class AsyncOperator:
     """
-    Core engine for running text-processing operations with an LLM
-
-    It wires together:
-    - `PromptLoader` → loads YAML prompt templates.
-    - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
-    - AsyncOpenAI client → executes completions/parsed completions.
+    Core engine for running text-processing operations with an LLM.
     """
 
     def __init__(self, client: AsyncOpenAI, model: str):
         self._client = client
         self._model = model
 
-    async def
-        """
-        Calls OpenAI API for analysis using the configured prompt template.
-        Returns the analyzed content as a string.
-        """
+    async def _analyze_completion(self, analyze_message: list[dict[str, str]]) -> str:
         try:
-            analyze_prompt = prompt_configs["analyze_template"]
-
-            if not analyze_prompt:
-                raise PromptError("Analyze template is empty")
-
-            analyze_message = [OperatorUtils.build_user_message(analyze_prompt)]
             completion = await self._client.chat.completions.create(
                 model=self._model,
                 messages=analyze_message,
-                temperature=temperature,
             )
 
             if not completion.choices:
@@ -61,7 +42,7 @@ class AsyncOperator:
             if not analysis:
                 raise LLMError("Empty analysis response")
 
-        return analysis
+            return analysis
 
         except Exception as e:
             if isinstance(e, (PromptError, LLMError)):
@@ -70,12 +51,12 @@ class AsyncOperator:
 
     async def _parse_completion(
         self,
-
+        main_message: list[dict[str, str]],
         output_model: Type[T],
         temperature: float,
-        logprobs: bool
-        top_logprobs: int
-        priority: int
+        logprobs: bool,
+        top_logprobs: int,
+        priority: int,
     ) -> tuple[T, object]:
         """
         Parses a chat completion using OpenAI's structured output format.
@@ -84,7 +65,7 @@ class AsyncOperator:
         try:
             request_kwargs = {
                 "model": self._model,
-                "messages":
+                "messages": main_message,
                 "response_format": output_model,
                 "temperature": temperature,
             }
@@ -92,8 +73,10 @@
             if logprobs:
                 request_kwargs["logprobs"] = True
                 request_kwargs["top_logprobs"] = top_logprobs
+
             if priority:
                 request_kwargs["extra_body"] = {"priority": priority}
+
             completion = await self._client.beta.chat.completions.parse(
                 **request_kwargs
             )
@@ -122,24 +105,24 @@
         user_prompt: str | None,
         temperature: float,
         logprobs: bool,
-        top_logprobs: int
+        top_logprobs: int,
         validator: Callable[[object], bool] | None,
         max_validation_retries: int | None,
+        priority: int,
         # Internal parameters
         prompt_file: str,
         output_model: Type[T],
         mode: str | None,
-        priority: int | None = 0,
         **extra_kwargs,
-    ) ->
+    ) -> OperatorOutput:
         """
-        Execute the LLM pipeline with the given input text.
+        Execute the LLM pipeline with the given input text.
         """
         try:
-
-
+            if logprobs and (not isinstance(top_logprobs, int) or top_logprobs < 2):
+                raise ValueError("top_logprobs should be an int greater than 1")
 
-
+            prompt_loader = PromptLoader()
             prompt_configs = prompt_loader.load(
                 prompt_file=prompt_file,
                 text=text.strip(),
@@ -147,67 +130,45 @@
                 **extra_kwargs,
             )
 
-
+            analysis: str | None = None
 
             if with_analysis:
-
-
-                    OperatorUtils.build_user_message(
-                        f"Based on this analysis: {analysis}"
-                    )
-                )
-
-            if output_lang:
-                messages.append(
-                    OperatorUtils.build_user_message(
-                        f"Respond only in the {output_lang} language."
-                    )
+                analyze_message = OperatorUtils.build_message(
+                    prompt_configs["analyze_template"]
                 )
+                analysis = await self._analyze_completion(analyze_message)
 
-
-
-
-                        f"Consider this instruction {user_prompt}"
-                    )
+            main_message = OperatorUtils.build_message(
+                OperatorUtils.build_main_prompt(
+                    prompt_configs["main_template"], analysis, output_lang, user_prompt
                 )
-
-            messages.append(
-                OperatorUtils.build_user_message(prompt_configs["main_template"])
             )
 
-            messages = OperatorUtils.user_merge_format(messages)
-
-            if logprobs and (not isinstance(top_logprobs, int) or top_logprobs < 2):
-                raise ValueError("top_logprobs should be an integer greater than 1")
-
             parsed, completion = await self._parse_completion(
-
+                main_message,
+                output_model,
+                temperature,
+                logprobs,
+                top_logprobs,
+                priority,
             )
 
-            output.result = parsed.result
-
             # Retry logic if validation fails
-            if validator and not validator(
+            if validator and not validator(parsed.result):
                 if (
                     not isinstance(max_validation_retries, int)
                     or max_validation_retries < 1
                 ):
-                    raise ValueError(
-                        "max_validation_retries should be a positive integer"
-                    )
+                    raise ValueError("max_validation_retries should be a positive int")
 
                 succeeded = False
-                for
-
-                        f"Validation failed, retrying for the {attempt + 1} time."
-                    )
-
-                    # Generate new temperature for retry
+                for _ in range(max_validation_retries):
+                    # Generate a new temperature to retry
                     retry_temperature = OperatorUtils.get_retry_temp(temperature)
 
                     try:
                         parsed, completion = await self._parse_completion(
-
+                            main_message,
                             output_model,
                             retry_temperature,
                             logprobs,
@@ -215,30 +176,26 @@
                             priority=priority,
                         )
 
-                        output.result = parsed.result
-
                         # Check if retry was successful
-                        if validator(
+                        if validator(parsed.result):
                             succeeded = True
                             break
 
-                    except LLMError
-
+                    except LLMError:
+                        pass
 
                 if not succeeded:
-                    raise ValidationError(
-
-
-
-
-
-
-
-
-
-            output.process = prompt_file[:-5]
+                    raise ValidationError("Validation failed after all retries")
+
+            operator_output = OperatorOutput(
+                result=parsed.result,
+                analysis=analysis if with_analysis else None,
+                logprobs=OperatorUtils.extract_logprobs(completion)
+                if logprobs
+                else None,
+            )
 
-            return
+            return operator_output
 
         except (PromptError, LLMError, ValidationError):
             raise