hamtaa-texttools 2.1.0.tar.gz → 2.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/PKG-INFO +75 -11
  2. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/README.md +73 -10
  3. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/PKG-INFO +75 -11
  4. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/SOURCES.txt +1 -0
  5. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/requires.txt +1 -0
  6. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/pyproject.toml +2 -1
  7. hamtaa_texttools-2.3.0/texttools/__init__.py +4 -0
  8. hamtaa_texttools-2.3.0/texttools/core/__init__.py +34 -0
  9. hamtaa_texttools-2.3.0/texttools/core/internal_models.py +123 -0
  10. hamtaa_texttools-2.3.0/texttools/core/operators/__init__.py +4 -0
  11. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/core/operators/async_operator.py +11 -3
  12. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/core/operators/sync_operator.py +9 -3
  13. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/core/utils.py +33 -0
  14. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/models.py +4 -0
  15. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/augment.yaml +15 -15
  16. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/to_question.yaml +0 -2
  17. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/translate.yaml +2 -2
  18. hamtaa_texttools-2.3.0/texttools/tools/__init__.py +5 -0
  19. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/tools/async_tools.py +69 -19
  20. hamtaa_texttools-2.3.0/texttools/tools/batch_tools.py +688 -0
  21. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/tools/sync_tools.py +69 -19
  22. hamtaa_texttools-2.1.0/texttools/__init__.py +0 -5
  23. hamtaa_texttools-2.1.0/texttools/core/__init__.py +0 -0
  24. hamtaa_texttools-2.1.0/texttools/core/internal_models.py +0 -71
  25. hamtaa_texttools-2.1.0/texttools/core/operators/__init__.py +0 -0
  26. hamtaa_texttools-2.1.0/texttools/tools/__init__.py +0 -0
  27. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/LICENSE +0 -0
  28. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
  29. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/top_level.txt +0 -0
  30. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/setup.cfg +0 -0
  31. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/tests/test_category_tree.py +0 -0
  32. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/tests/test_to_chunks.py +0 -0
  33. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/core/exceptions.py +0 -0
  34. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/categorize.yaml +0 -0
  35. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/extract_entities.yaml +0 -0
  36. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/extract_keywords.yaml +0 -0
  37. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/is_fact.yaml +0 -0
  38. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/is_question.yaml +0 -0
  39. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/merge_questions.yaml +0 -0
  40. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/propositionize.yaml +0 -0
  41. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/run_custom.yaml +0 -0
  42. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/prompts/summarize.yaml +0 -0
  43. {hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/py.typed +0 -0
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hamtaa-texttools
- Version: 2.1.0
+ Version: 2.3.0
  Summary: A high-level NLP toolkit built on top of modern LLMs.
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
  Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
@@ -17,6 +17,7 @@ License-File: LICENSE
  Requires-Dist: dotenv>=0.9.9
  Requires-Dist: openai>=1.97.1
  Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: pytest>=9.0.2
  Requires-Dist: pyyaml>=6.0
  Dynamic: license-file
 
@@ -29,7 +30,10 @@ Dynamic: license-file
 
  **TextTools** is a high-level **NLP toolkit** built on top of **LLMs**.
 
- It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
+ It provides three API styles for maximum flexibility:
+ - Sync API (`TheTool`) - Simple, sequential operations
+ - Async API (`AsyncTheTool`) - High-performance async operations
+ - Batch API (`BatchTheTool`) - Process multiple texts in parallel with built-in concurrency control
 
  It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
 
@@ -76,8 +80,6 @@ pip install -U hamtaa-texttools
 
  ## ⚙️ Additional Parameters
 
- - **`raise_on_error: bool`** → (`TheTool/AsyncTheTool` parameter) Raise errors (True) or return them in output (False). Default is True.
-
  - **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
  **Note:** This doubles token usage per call.
 
@@ -98,32 +100,49 @@ pip install -U hamtaa-texttools
  - **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
  **Note:** This feature is only available in `AsyncTheTool`.
 
+ - **`raise_on_error: bool`** → (`TheTool/AsyncTheTool`) Raise errors (True) or return them in output (False). Default is True.
+
+ - **`max_concurrency: int`** → (`BatchTheTool` only) Maximum number of concurrent API calls. Default is 5.
 
  ---
 
  ## 🧩 ToolOutput
 
  Every tool of `TextTools` returns a `ToolOutput` object, which is a `BaseModel` with attributes:
+
  - **`result: Any`**
  - **`analysis: str`**
  - **`logprobs: list`**
  - **`errors: list[str]`**
- - **`ToolOutputMetadata`**
+ - **`ToolOutputMetadata`**
    - **`tool_name: str`**
+   - **`processed_by: str`**
    - **`processed_at: datetime`**
    - **`execution_time: float`**
+   - **`token_usage: TokenUsage`**
+     - **`completion_usage: CompletionUsage`**
+       - **`prompt_tokens: int`**
+       - **`completion_tokens: int`**
+       - **`total_tokens: int`**
+     - **`analyze_usage: AnalyzeUsage`**
+       - **`prompt_tokens: int`**
+       - **`completion_tokens: int`**
+       - **`total_tokens: int`**
 
  - Serialize output to JSON using the `to_json()` method.
  - Verify operation success with the `is_successful()` method.
  - Convert output to a dictionary with the `to_dict()` method.
 
+ **Note:** Each `BatchTheTool` method returns a `list[ToolOutput]` containing results for all input texts.
+
  ---
 
- ## 🧨 Sync vs Async
- | Tool | Style | Use case |
- |--------------|---------|---------------------------------------------|
- | `TheTool` | Sync | Simple scripts, sequential workflows |
- | `AsyncTheTool` | Async | High-throughput apps, APIs, concurrent tasks |
+ ## 🧨 Sync vs Async vs Batch
+ | Tool | Style | Use Case | Best For |
+ |------|-------|----------|----------|
+ | `TheTool` | **Sync** | Simple scripts, sequential workflows | • Quick prototyping<br>• Sequential processing<br>• Debugging |
+ | `AsyncTheTool` | **Async** | High-throughput applications, APIs, concurrent tasks | • Web APIs<br>• Concurrent operations<br>• Real-time processing |
+ | `BatchTheTool` | **Batch** | Process multiple texts efficiently with controlled concurrency | • Bulk processing<br>• Large datasets<br>• Parallel execution |
 
  ---
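
For orientation, here is a minimal sketch of how the parameters above and the `ToolOutput` fields fit together. The sync `categorize` signature and the placement of `raise_on_error` (constructor vs. per call) are extrapolated from the Quick Start examples in this README, not confirmed by it:

```python
# Hypothetical sketch - signatures extrapolated from the README's examples.
from openai import OpenAI
from texttools import TheTool

client = OpenAI(base_url="your_url", api_key="your_api_key")
the_tool = TheTool(client=client, model="model_name", raise_on_error=False)

output = the_tool.categorize(
    text="Climate change impacts on agriculture",
    categories=["Science", "Technology", "Economics", "Environment"],
    with_analysis=True,  # adds a reasoning step (doubles token usage per call)
)

if output.is_successful():
    print(output.result)     # e.g. "Environment"
    print(output.analysis)   # the reasoning step requested via with_analysis
    print(output.to_dict())  # full payload, including metadata and token_usage
else:
    print(output.errors)     # with raise_on_error=False, errors land here
```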
 
@@ -168,6 +187,35 @@ async def main():
  asyncio.run(main())
  ```
 
+ ## ⚡ Quick Start (Batch)
+
+ ```python
+ import asyncio
+ from openai import AsyncOpenAI
+ from texttools import BatchTheTool
+
+ async def main():
+     async_client = AsyncOpenAI(base_url="your_url", api_key="your_api_key")
+     model = "model_name"
+
+     batch_tool = BatchTheTool(client=async_client, model=model, max_concurrency=3)
+
+     categories = await batch_tool.categorize(
+         texts=[
+             "Climate change impacts on agriculture",
+             "Artificial intelligence in healthcare",
+             "Economic effects of remote work",
+             "Advancements in quantum computing",
+         ],
+         categories=["Science", "Technology", "Economics", "Environment"],
+     )
+
+     for i, result in enumerate(categories):
+         print(f"Text {i+1}: {result.result}")
+
+ asyncio.run(main())
+ ```
+
  ---
 
  ## ✅ Use Cases
@@ -176,4 +224,20 @@ Use **TextTools** when you need to:
 
  - 🔍 **Classify** large datasets quickly without model training
  - 🧩 **Integrate** LLMs into production pipelines (structured outputs)
- - 📊 **Analyze** large text collections using embeddings and categorization
+ - 📊 **Analyze** large text collections using embeddings and categorization
+
+ ---
+
+ ## 📄 License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ---
+
+ ## 🤝 Contributing
+
+ We welcome contributions from the community - see the [CONTRIBUTING](CONTRIBUTING.md) file for details.
+
+ ## 📚 Documentation
+
+ For detailed documentation, architecture overview, and implementation details, please visit the [docs](docs) directory.
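
The new `texttools/tools/batch_tools.py` (+688 lines) is not reproduced in this diff. As a rough illustration of the "built-in concurrency control" the README describes, a batch method of this shape typically bounds fan-out with an `asyncio.Semaphore`; a hypothetical sketch, not the package's actual implementation:

```python
import asyncio
from typing import Any, Awaitable, Callable

# Hypothetical sketch of bounded concurrent fan-out; not the actual
# batch_tools.py code, which this diff does not show.
async def run_batch(
    items: list[str],
    worker: Callable[[str], Awaitable[Any]],
    max_concurrency: int = 5,
) -> list[Any]:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run_one(item: str) -> Any:
        async with semaphore:  # at most max_concurrency calls in flight
            return await worker(item)

    # gather() preserves input order, so results[i] matches items[i]
    return await asyncio.gather(*(run_one(item) for item in items))
```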
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/README.md
(The README body carries the same changes shown in the PKG-INFO diff above, at README-relative line offsets; the duplicate hunks are omitted here.)
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/PKG-INFO
(Identical to the PKG-INFO diff above; the duplicate hunks are omitted here.)
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/SOURCES.txt
@@ -32,4 +32,5 @@ texttools/prompts/to_question.yaml
  texttools/prompts/translate.yaml
  texttools/tools/__init__.py
  texttools/tools/async_tools.py
+ texttools/tools/batch_tools.py
  texttools/tools/sync_tools.py
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/hamtaa_texttools.egg-info/requires.txt
@@ -1,4 +1,5 @@
  dotenv>=0.9.9
  openai>=1.97.1
  pydantic>=2.0.0
+ pytest>=9.0.2
  pyyaml>=6.0
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "hamtaa-texttools"
- version = "2.1.0"
+ version = "2.3.0"
  authors = [
      {name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
      {name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
@@ -24,6 +24,7 @@ dependencies = [
      "dotenv>=0.9.9",
      "openai>=1.97.1",
      "pydantic>=2.0.0",
+     "pytest>=9.0.2",
      "pyyaml>=6.0",
  ]
  keywords = ["nlp", "llm", "text-processing", "openai"]
hamtaa_texttools-2.3.0/texttools/__init__.py (new file)
@@ -0,0 +1,4 @@
+ from .models import CategoryTree
+ from .tools import AsyncTheTool, BatchTheTool, TheTool
+
+ __all__ = ["CategoryTree", "AsyncTheTool", "BatchTheTool", "TheTool"]
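
With these re-exports, all three tool classes and `CategoryTree` are importable straight from the package root:

```python
from texttools import AsyncTheTool, BatchTheTool, CategoryTree, TheTool
```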
hamtaa_texttools-2.3.0/texttools/core/__init__.py (new file)
@@ -0,0 +1,34 @@
+ from .exceptions import LLMError, PromptError, TextToolsError, ValidationError
+ from .internal_models import (
+     Bool,
+     ListDictStrStr,
+     ListStr,
+     ReasonListStr,
+     Str,
+     TokenUsage,
+     create_dynamic_model,
+ )
+ from .operators import AsyncOperator, Operator
+ from .utils import OperatorUtils, TheToolUtils
+
+ __all__ = [
+     # Exceptions
+     "LLMError",
+     "PromptError",
+     "TextToolsError",
+     "ValidationError",
+     # Internal models
+     "Bool",
+     "ListDictStrStr",
+     "ListStr",
+     "ReasonListStr",
+     "Str",
+     "TokenUsage",
+     "create_dynamic_model",
+     # Operators
+     "AsyncOperator",
+     "Operator",
+     # Utils
+     "OperatorUtils",
+     "TheToolUtils",
+ ]
hamtaa_texttools-2.3.0/texttools/core/internal_models.py (new file)
@@ -0,0 +1,123 @@
+ from __future__ import annotations
+
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field, create_model
+
+
+ class CompletionUsage(BaseModel):
+     prompt_tokens: int = 0
+     completion_tokens: int = 0
+     total_tokens: int = 0
+
+
+ class AnalyzeUsage(BaseModel):
+     prompt_tokens: int = 0
+     completion_tokens: int = 0
+     total_tokens: int = 0
+
+
+ class TokenUsage(BaseModel):
+     completion_usage: CompletionUsage = CompletionUsage()
+     analyze_usage: AnalyzeUsage = AnalyzeUsage()
+     total_tokens: int = 0
+
+     def __add__(self, other: TokenUsage) -> TokenUsage:
+         new_completion_usage = CompletionUsage(
+             prompt_tokens=self.completion_usage.prompt_tokens
+             + other.completion_usage.prompt_tokens,
+             completion_tokens=self.completion_usage.completion_tokens
+             + other.completion_usage.completion_tokens,
+             total_tokens=self.completion_usage.total_tokens
+             + other.completion_usage.total_tokens,
+         )
+         new_analyze_usage = AnalyzeUsage(
+             prompt_tokens=self.analyze_usage.prompt_tokens
+             + other.analyze_usage.prompt_tokens,
+             completion_tokens=self.analyze_usage.completion_tokens
+             + other.analyze_usage.completion_tokens,
+             total_tokens=self.analyze_usage.total_tokens
+             + other.analyze_usage.total_tokens,
+         )
+         total_tokens = (
+             new_completion_usage.total_tokens + new_analyze_usage.total_tokens
+         )
+
+         return TokenUsage(
+             completion_usage=new_completion_usage,
+             analyze_usage=new_analyze_usage,
+             total_tokens=total_tokens,
+         )
+
+
+ class OperatorOutput(BaseModel):
+     result: Any
+     analysis: str | None
+     logprobs: list[dict[str, Any]] | None
+     token_usage: TokenUsage | None = None
+     prompt_tokens: int | None = None
+     completion_tokens: int | None = None
+     analysis_tokens: int | None = None
+     total_tokens: int | None = None
+
+
+ class Str(BaseModel):
+     result: str = Field(
+         ..., description="The output string", json_schema_extra={"example": "text"}
+     )
+
+
+ class Bool(BaseModel):
+     result: bool = Field(
+         ...,
+         description="Boolean indicating the output state",
+         json_schema_extra={"example": True},
+     )
+
+
+ class ListStr(BaseModel):
+     result: list[str] = Field(
+         ...,
+         description="The output list of strings",
+         json_schema_extra={"example": ["text_1", "text_2", "text_3"]},
+     )
+
+
+ class ListDictStrStr(BaseModel):
+     result: list[dict[str, str]] = Field(
+         ...,
+         description="List of dictionaries containing string key-value pairs",
+         json_schema_extra={
+             "example": [
+                 {"text": "Mohammad", "type": "PER"},
+                 {"text": "Iran", "type": "LOC"},
+             ]
+         },
+     )
+
+
+ class ReasonListStr(BaseModel):
+     reason: str = Field(..., description="Thinking process that led to the output")
+     result: list[str] = Field(
+         ...,
+         description="The output list of strings",
+         json_schema_extra={"example": ["text_1", "text_2", "text_3"]},
+     )
+
+
+ # Create CategorizerOutput with dynamic categories
+ def create_dynamic_model(allowed_values: list[str]) -> type[BaseModel]:
+     literal_type = Literal[*allowed_values]
+
+     CategorizerOutput = create_model(
+         "CategorizerOutput",
+         reason=(
+             str,
+             Field(
+                 ..., description="Explanation of why the input belongs to the category"
+             ),
+         ),
+         result=(literal_type, Field(..., description="Predicted category label")),
+     )
+
+     return CategorizerOutput
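
Two of these additions are easy to exercise in isolation: `TokenUsage.__add__` sums usage records field by field, and `create_dynamic_model` builds a Pydantic model whose `result` field only admits the supplied labels. A short sketch against the module above (note that `texttools.core.internal_models` is an internal path and may change):

```python
from pydantic import ValidationError

from texttools.core.internal_models import (
    CompletionUsage,
    TokenUsage,
    create_dynamic_model,
)

# Summing usage across two calls, e.g. when aggregating a batch
a = TokenUsage(
    completion_usage=CompletionUsage(prompt_tokens=10, completion_tokens=5, total_tokens=15),
    total_tokens=15,
)
b = TokenUsage(
    completion_usage=CompletionUsage(prompt_tokens=20, completion_tokens=10, total_tokens=30),
    total_tokens=30,
)
print((a + b).total_tokens)  # 45 (completion totals plus analyze totals)

# A categorizer schema constrained to the allowed labels
CategorizerOutput = create_dynamic_model(["Science", "Technology"])
CategorizerOutput(reason="Mentions quantum computing", result="Technology")  # valid
try:
    CategorizerOutput(reason="...", result="Sports")
except ValidationError:
    print("rejected: 'Sports' is not an allowed category")
```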
hamtaa_texttools-2.3.0/texttools/core/operators/__init__.py (new file)
@@ -0,0 +1,4 @@
+ from .async_operator import AsyncOperator
+ from .sync_operator import Operator
+
+ __all__ = ["AsyncOperator", "Operator"]
{hamtaa_texttools-2.1.0 → hamtaa_texttools-2.3.0}/texttools/core/operators/async_operator.py
@@ -18,7 +18,9 @@ class AsyncOperator:
          self._client = client
          self._model = model
 
-     async def _analyze_completion(self, analyze_message: list[dict[str, str]]) -> str:
+     async def _analyze_completion(
+         self, analyze_message: list[dict[str, str]]
+     ) -> tuple[str, Any]:
          try:
              completion = await self._client.chat.completions.create(
                  model=self._model,
@@ -33,7 +35,7 @@ class AsyncOperator:
              if not analysis:
                  raise LLMError("Empty analysis response")
 
-             return analysis
+             return analysis, completion
 
          except Exception as e:
              if isinstance(e, (PromptError, LLMError)):
@@ -116,12 +118,15 @@ class AsyncOperator:
          )
 
          analysis: str | None = None
+         analyze_completion: Any = None
 
          if with_analysis:
              analyze_message = OperatorUtils.build_message(
                  prompt_configs["analyze_template"]
              )
-             analysis = await self._analyze_completion(analyze_message)
+             analysis, analyze_completion = await self._analyze_completion(
+                 analyze_message
+             )
 
          main_prompt = OperatorUtils.build_main_prompt(
              prompt_configs["main_template"], analysis, output_lang, user_prompt
@@ -176,6 +181,9 @@ class AsyncOperator:
              logprobs=OperatorUtils.extract_logprobs(completion)
              if logprobs
              else None,
+             token_usage=OperatorUtils.extract_token_usage(
+                 completion, analyze_completion
+             ),
          )
 
          return operator_output
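
The matching `OperatorUtils.extract_token_usage` helper lives in the `texttools/core/utils.py` change (+33 lines), which this diff does not show. Consistent with the call site above and the `internal_models.py` types, it plausibly copies the OpenAI usage blocks from the two raw completions into a `TokenUsage`; a hypothetical reconstruction, not the package's actual code:

```python
from typing import Any

from texttools.core.internal_models import AnalyzeUsage, CompletionUsage, TokenUsage


def extract_token_usage(completion: Any, analyze_completion: Any = None) -> TokenUsage:
    # Hypothetical reconstruction: OpenAI chat completions expose
    # .usage.prompt_tokens / .completion_tokens / .total_tokens.
    usage = getattr(completion, "usage", None)
    completion_usage = CompletionUsage(
        prompt_tokens=usage.prompt_tokens if usage else 0,
        completion_tokens=usage.completion_tokens if usage else 0,
        total_tokens=usage.total_tokens if usage else 0,
    )

    # The analysis completion exists only when with_analysis was requested.
    analyze_usage = AnalyzeUsage()
    if analyze_completion is not None and getattr(analyze_completion, "usage", None):
        a = analyze_completion.usage
        analyze_usage = AnalyzeUsage(
            prompt_tokens=a.prompt_tokens,
            completion_tokens=a.completion_tokens,
            total_tokens=a.total_tokens,
        )

    return TokenUsage(
        completion_usage=completion_usage,
        analyze_usage=analyze_usage,
        total_tokens=completion_usage.total_tokens + analyze_usage.total_tokens,
    )
```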