hamtaa-texttools 1.3.2__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/LICENSE +1 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/PKG-INFO +40 -47
- hamtaa_texttools-2.1.0/README.md +157 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/hamtaa_texttools.egg-info/PKG-INFO +40 -47
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/hamtaa_texttools.egg-info/SOURCES.txt +6 -8
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/hamtaa_texttools.egg-info/requires.txt +1 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/pyproject.toml +46 -45
- hamtaa_texttools-2.1.0/tests/test_category_tree.py +48 -0
- hamtaa_texttools-2.1.0/tests/test_to_chunks.py +13 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/__init__.py +1 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/core/internal_models.py +16 -7
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/core/operators/async_operator.py +10 -16
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/core/operators/sync_operator.py +10 -16
- hamtaa_texttools-2.1.0/texttools/core/utils.py +260 -0
- hamtaa_texttools-2.1.0/texttools/models.py +143 -0
- hamtaa_texttools-1.3.2/texttools/prompts/rewrite.yaml → hamtaa_texttools-2.1.0/texttools/prompts/augment.yaml +3 -3
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/categorize.yaml +7 -8
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/extract_entities.yaml +2 -2
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/extract_keywords.yaml +4 -2
- hamtaa_texttools-1.3.2/texttools/prompts/check_fact.yaml → hamtaa_texttools-2.1.0/texttools/prompts/is_fact.yaml +5 -4
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/is_question.yaml +1 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/merge_questions.yaml +8 -6
- hamtaa_texttools-2.1.0/texttools/prompts/propositionize.yaml +28 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/run_custom.yaml +3 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/summarize.yaml +3 -3
- hamtaa_texttools-2.1.0/texttools/prompts/to_question.yaml +60 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/prompts/translate.yaml +4 -4
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/tools/async_tools.py +152 -169
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/tools/sync_tools.py +138 -150
- hamtaa_texttools-1.3.2/README.md +0 -165
- hamtaa_texttools-1.3.2/tests/test_all_async_tools.py +0 -99
- hamtaa_texttools-1.3.2/tests/test_all_tools.py +0 -118
- hamtaa_texttools-1.3.2/tests/test_output_validation.py +0 -31
- hamtaa_texttools-1.3.2/texttools/core/engine.py +0 -262
- hamtaa_texttools-1.3.2/texttools/models.py +0 -88
- hamtaa_texttools-1.3.2/texttools/prompts/propositionize.yaml +0 -24
- hamtaa_texttools-1.3.2/texttools/prompts/subject_to_question.yaml +0 -26
- hamtaa_texttools-1.3.2/texttools/prompts/text_to_question.yaml +0 -26
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/hamtaa_texttools.egg-info/top_level.txt +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/setup.cfg +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/core/__init__.py +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/core/exceptions.py +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/core/operators/__init__.py +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/py.typed +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.1.0}/texttools/tools/__init__.py +0 -0
|
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hamtaa-texttools
|
|
3
|
-
Version: 1.3.2
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: A high-level NLP toolkit built on top of modern LLMs.
|
|
5
5
|
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
|
|
6
6
|
Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
|
|
@@ -11,9 +11,10 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
11
11
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
12
|
Classifier: Topic :: Text Processing
|
|
13
13
|
Classifier: Operating System :: OS Independent
|
|
14
|
-
Requires-Python: >=3.
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
+
Requires-Dist: dotenv>=0.9.9
|
|
17
18
|
Requires-Dist: openai>=1.97.1
|
|
18
19
|
Requires-Dist: pydantic>=2.0.0
|
|
19
20
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -30,30 +31,27 @@ Dynamic: license-file
|
|
|
30
31
|
|
|
31
32
|
It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
|
|
32
33
|
|
|
33
|
-
It provides ready-to-use utilities for **translation, question detection,
|
|
34
|
-
|
|
35
|
-
**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
|
|
34
|
+
It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
|
|
36
35
|
|
|
37
36
|
---
|
|
38
37
|
|
|
39
38
|
## ✨ Features
|
|
40
39
|
|
|
41
|
-
TextTools provides a
|
|
40
|
+
TextTools provides a collection of high-level NLP utilities.
|
|
42
41
|
Each tool is designed to work with structured outputs.
|
|
43
42
|
|
|
44
|
-
- **`categorize()`** -
|
|
45
|
-
- **`extract_keywords()`** -
|
|
46
|
-
- **`extract_entities()`** - Named Entity Recognition (NER)
|
|
47
|
-
- **`is_question()`** -
|
|
48
|
-
- **`
|
|
49
|
-
- **`merge_questions()`** -
|
|
50
|
-
- **`
|
|
51
|
-
- **`
|
|
52
|
-
- **`
|
|
53
|
-
- **`
|
|
54
|
-
- **`
|
|
55
|
-
- **`
|
|
56
|
-
- **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
|
|
43
|
+
- **`categorize()`** - Classify text into given categories
|
|
44
|
+
- **`extract_keywords()`** - Extract keywords from the text
|
|
45
|
+
- **`extract_entities()`** - Perform Named Entity Recognition (NER)
|
|
46
|
+
- **`is_question()`** - Detect if the input is phrased as a question
|
|
47
|
+
- **`to_question()`** - Generate questions from the given text / subject
|
|
48
|
+
- **`merge_questions()`** - Merge multiple questions into one
|
|
49
|
+
- **`augment()`** - Rewrite text in different augmentations
|
|
50
|
+
- **`summarize()`** - Summarize the given text
|
|
51
|
+
- **`translate()`** - Translate text between languages
|
|
52
|
+
- **`propositionize()`** - Convert a text into atomic, independent, meaningful sentences
|
|
53
|
+
- **`is_fact()`** - Check whether a statement is a fact based on the source text
|
|
54
|
+
- **`run_custom()`** - Custom tool that can do almost anything
|
|
57
55
|
|
|
58
56
|
---
|
|
59
57
|
|
|
@@ -71,14 +69,14 @@ pip install -U hamtaa-texttools
|
|
|
71
69
|
|
|
72
70
|
| Status | Meaning | Tools | Safe for Production? |
|
|
73
71
|
|--------|---------|----------|-------------------|
|
|
74
|
-
| **✅ Production** | Evaluated
|
|
75
|
-
| **🧪 Experimental** | Added to the package but **not fully evaluated**.
|
|
72
|
+
| **✅ Production** | Evaluated and tested. | `categorize()`, `extract_keywords()`, `extract_entities()`, `is_question()`, `to_question()`, `merge_questions()`, `augment()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
|
|
73
|
+
| **🧪 Experimental** | Added to the package but **not fully evaluated**. | `translate()`, `propositionize()`, `is_fact()` | **Use with caution** |
|
|
76
74
|
|
|
77
75
|
---
|
|
78
76
|
|
|
79
|
-
## ⚙️
|
|
77
|
+
## ⚙️ Additional Parameters
|
|
80
78
|
|
|
81
|
-
|
|
79
|
+
- **`raise_on_error: bool`** → (`TheTool/AsyncTheTool` parameter) Raise errors (True) or return them in output (False). Default is True.
|
|
82
80
|
|
|
83
81
|
- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
|
|
84
82
|
**Note:** This doubles token usage per call.
|
|
@@ -88,17 +86,17 @@ TextTools provides several optional flags to customize LLM behavior:
|
|
|
88
86
|
|
|
89
87
|
- **`output_lang: str`** → Forces the model to respond in a specific language.
|
|
90
88
|
|
|
91
|
-
- **`user_prompt: str`** → Allows you to inject a custom instruction
|
|
89
|
+
- **`user_prompt: str`** → Allows you to inject a custom instruction into the model alongside the main template.
|
|
92
90
|
|
|
93
|
-
- **`temperature: float`** → Determines how creative the model should respond. Takes a float number
|
|
91
|
+
- **`temperature: float`** → Controls how creative the model's responses are. Takes a float between `0.0` and `2.0`.
|
|
94
92
|
|
|
95
|
-
- **`validator: Callable (Experimental)`** → Forces
|
|
93
|
+
- **`validator: Callable (Experimental)`** → Forces the tool to validate the output result based on your validator function. Validator should return a boolean. If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can also specify `max_validation_retries=<N>`.
|
|
96
94
|
|
|
97
|
-
- **`priority: int (Experimental)`** →
|
|
95
|
+
- **`priority: int (Experimental)`** → Affects processing order in queues.
|
|
98
96
|
**Note:** This feature works if it's supported by the model and vLLM.
|
|
99
97
|
|
|
100
|
-
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error
|
|
101
|
-
**Note:** This feature only
|
|
98
|
+
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
|
|
99
|
+
**Note:** This feature is only available in `AsyncTheTool`.
|
|
102
100
|
|
|
103
101
|
|
|
104
102
|
---
|
|
@@ -110,12 +108,14 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
110
108
|
- **`analysis: str`**
|
|
111
109
|
- **`logprobs: list`**
|
|
112
110
|
- **`errors: list[str]`**
|
|
113
|
-
- **`ToolOutputMetadata`**
|
|
111
|
+
- **`ToolOutputMetadata`**
|
|
114
112
|
- **`tool_name: str`**
|
|
115
113
|
- **`processed_at: datetime`**
|
|
116
114
|
- **`execution_time: float`**
|
|
117
115
|
|
|
118
|
-
|
|
116
|
+
- Serialize output to JSON using the `to_json()` method.
|
|
117
|
+
- Verify operation success with the `is_successful()` method.
|
|
118
|
+
- Convert output to a dictionary with the `to_dict()` method.
|
|
119
119
|
|
|
120
120
|
---
|
|
121
121
|
|
|
@@ -133,13 +133,13 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
133
133
|
from openai import OpenAI
|
|
134
134
|
from texttools import TheTool
|
|
135
135
|
|
|
136
|
-
client = OpenAI(base_url
|
|
136
|
+
client = OpenAI(base_url="your_url", api_key="your_api_key")
|
|
137
137
|
model = "model_name"
|
|
138
138
|
|
|
139
139
|
the_tool = TheTool(client=client, model=model)
|
|
140
140
|
|
|
141
141
|
detection = the_tool.is_question("Is this project open source?")
|
|
142
|
-
print(
|
|
142
|
+
print(detection.to_json())
|
|
143
143
|
```
|
|
144
144
|
|
|
145
145
|
---
|
|
@@ -157,30 +157,23 @@ async def main():
|
|
|
157
157
|
|
|
158
158
|
async_the_tool = AsyncTheTool(client=async_client, model=model)
|
|
159
159
|
|
|
160
|
-
translation_task = async_the_tool.translate("سلام، حالت چطوره؟",
|
|
161
|
-
keywords_task = async_the_tool.extract_keywords("
|
|
160
|
+
translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_lang="English")
|
|
161
|
+
keywords_task = async_the_tool.extract_keywords("This open source project is great for processing large datasets!")
|
|
162
162
|
|
|
163
163
|
(translation, keywords) = await asyncio.gather(translation_task, keywords_task)
|
|
164
|
-
|
|
165
|
-
print(
|
|
164
|
+
|
|
165
|
+
print(translation.to_json())
|
|
166
|
+
print(keywords.to_json())
|
|
166
167
|
|
|
167
168
|
asyncio.run(main())
|
|
168
169
|
```
|
|
169
170
|
|
|
170
171
|
---
|
|
171
172
|
|
|
172
|
-
##
|
|
173
|
+
## ✅ Use Cases
|
|
173
174
|
|
|
174
175
|
Use **TextTools** when you need to:
|
|
175
176
|
|
|
176
|
-
- 🔍 **Classify** large datasets quickly without model training
|
|
177
|
-
- 🌍 **Translate** and process multilingual corpora with ease
|
|
177
|
+
- 🔍 **Classify** large datasets quickly without model training
|
|
178
178
|
- 🧩 **Integrate** LLMs into production pipelines (structured outputs)
|
|
179
179
|
- 📊 **Analyze** large text collections using embeddings and categorization
|
|
180
|
-
|
|
181
|
-
---
|
|
182
|
-
|
|
183
|
-
## 🤝 Contributing
|
|
184
|
-
|
|
185
|
-
Contributions are welcome!
|
|
186
|
-
Feel free to **open issues, suggest new features, or submit pull requests**.
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# TextTools
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+
|
|
6
|
+
## 📌 Overview
|
|
7
|
+
|
|
8
|
+
**TextTools** is a high-level **NLP toolkit** built on top of **LLMs**.
|
|
9
|
+
|
|
10
|
+
It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
|
|
11
|
+
|
|
12
|
+
It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## ✨ Features
|
|
17
|
+
|
|
18
|
+
TextTools provides a collection of high-level NLP utilities.
|
|
19
|
+
Each tool is designed to work with structured outputs.
|
|
20
|
+
|
|
21
|
+
- **`categorize()`** - Classify text into given categories
|
|
22
|
+
- **`extract_keywords()`** - Extract keywords from the text
|
|
23
|
+
- **`extract_entities()`** - Perform Named Entity Recognition (NER)
|
|
24
|
+
- **`is_question()`** - Detect if the input is phrased as a question
|
|
25
|
+
- **`to_question()`** - Generate questions from the given text / subject
|
|
26
|
+
- **`merge_questions()`** - Merge multiple questions into one
|
|
27
|
+
- **`augment()`** - Rewrite text in different augmentations
|
|
28
|
+
- **`summarize()`** - Summarize the given text
|
|
29
|
+
- **`translate()`** - Translate text between languages
|
|
30
|
+
- **`propositionize()`** - Convert a text into atomic, independent, meaningful sentences
|
|
31
|
+
- **`is_fact()`** - Check whether a statement is a fact based on the source text
|
|
32
|
+
- **`run_custom()`** - Custom tool that can do almost anything
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 🚀 Installation
|
|
37
|
+
|
|
38
|
+
Install the latest release via PyPI:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install -U hamtaa-texttools
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## 📊 Tool Quality Tiers
|
|
47
|
+
|
|
48
|
+
| Status | Meaning | Tools | Safe for Production? |
|
|
49
|
+
|--------|---------|----------|-------------------|
|
|
50
|
+
| **✅ Production** | Evaluated and tested. | `categorize()`, `extract_keywords()`, `extract_entities()`, `is_question()`, `to_question()`, `merge_questions()`, `augment()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
|
|
51
|
+
| **🧪 Experimental** | Added to the package but **not fully evaluated**. | `translate()`, `propositionize()`, `is_fact()` | **Use with caution** |
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## ⚙️ Additional Parameters
|
|
56
|
+
|
|
57
|
+
- **`raise_on_error: bool`** → (`TheTool/AsyncTheTool` parameter) Raise errors (True) or return them in output (False). Default is True.
|
|
58
|
+
|
|
59
|
+
- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
|
|
60
|
+
**Note:** This doubles token usage per call.
|
|
61
|
+
|
|
62
|
+
- **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
|
|
63
|
+
**Note:** This feature works if it's supported by the model.
|
|
64
|
+
|
|
65
|
+
- **`output_lang: str`** → Forces the model to respond in a specific language.
|
|
66
|
+
|
|
67
|
+
- **`user_prompt: str`** → Allows you to inject a custom instruction into the model alongside the main template.
|
|
68
|
+
|
|
69
|
+
- **`temperature: float`** → Controls how creative the model's responses are. Takes a float between `0.0` and `2.0`.
|
|
70
|
+
|
|
71
|
+
- **`validator: Callable (Experimental)`** → Forces the tool to validate the output result based on your validator function. Validator should return a boolean. If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can also specify `max_validation_retries=<N>`.
|
|
72
|
+
|
|
73
|
+
- **`priority: int (Experimental)`** → Affects processing order in queues.
|
|
74
|
+
**Note:** This feature works if it's supported by the model and vLLM.
|
|
75
|
+
|
|
76
|
+
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
|
|
77
|
+
**Note:** This feature is only available in `AsyncTheTool`.
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## 🧩 ToolOutput
|
|
83
|
+
|
|
84
|
+
Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
|
|
85
|
+
- **`result: Any`**
|
|
86
|
+
- **`analysis: str`**
|
|
87
|
+
- **`logprobs: list`**
|
|
88
|
+
- **`errors: list[str]`**
|
|
89
|
+
- **`ToolOutputMetadata`**
|
|
90
|
+
- **`tool_name: str`**
|
|
91
|
+
- **`processed_at: datetime`**
|
|
92
|
+
- **`execution_time: float`**
|
|
93
|
+
|
|
94
|
+
- Serialize output to JSON using the `to_json()` method.
|
|
95
|
+
- Verify operation success with the `is_successful()` method.
|
|
96
|
+
- Convert output to a dictionary with the `to_dict()` method.
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## 🧨 Sync vs Async
|
|
101
|
+
| Tool | Style | Use case |
|
|
102
|
+
|--------------|---------|---------------------------------------------|
|
|
103
|
+
| `TheTool` | Sync | Simple scripts, sequential workflows |
|
|
104
|
+
| `AsyncTheTool` | Async | High-throughput apps, APIs, concurrent tasks |
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## ⚡ Quick Start (Sync)
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from openai import OpenAI
|
|
112
|
+
from texttools import TheTool
|
|
113
|
+
|
|
114
|
+
client = OpenAI(base_url="your_url", api_key="your_api_key")
|
|
115
|
+
model = "model_name"
|
|
116
|
+
|
|
117
|
+
the_tool = TheTool(client=client, model=model)
|
|
118
|
+
|
|
119
|
+
detection = the_tool.is_question("Is this project open source?")
|
|
120
|
+
print(detection.to_json())
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## ⚡ Quick Start (Async)
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
import asyncio
|
|
129
|
+
from openai import AsyncOpenAI
|
|
130
|
+
from texttools import AsyncTheTool
|
|
131
|
+
|
|
132
|
+
async def main():
|
|
133
|
+
async_client = AsyncOpenAI(base_url="your_url", api_key="your_api_key")
|
|
134
|
+
model = "model_name"
|
|
135
|
+
|
|
136
|
+
async_the_tool = AsyncTheTool(client=async_client, model=model)
|
|
137
|
+
|
|
138
|
+
translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_lang="English")
|
|
139
|
+
keywords_task = async_the_tool.extract_keywords("This open source project is great for processing large datasets!")
|
|
140
|
+
|
|
141
|
+
(translation, keywords) = await asyncio.gather(translation_task, keywords_task)
|
|
142
|
+
|
|
143
|
+
print(translation.to_json())
|
|
144
|
+
print(keywords.to_json())
|
|
145
|
+
|
|
146
|
+
asyncio.run(main())
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## ✅ Use Cases
|
|
152
|
+
|
|
153
|
+
Use **TextTools** when you need to:
|
|
154
|
+
|
|
155
|
+
- 🔍 **Classify** large datasets quickly without model training
|
|
156
|
+
- 🧩 **Integrate** LLMs into production pipelines (structured outputs)
|
|
157
|
+
- 📊 **Analyze** large text collections using embeddings and categorization
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hamtaa-texttools
|
|
3
|
-
Version: 1.3.2
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: A high-level NLP toolkit built on top of modern LLMs.
|
|
5
5
|
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
|
|
6
6
|
Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
|
|
@@ -11,9 +11,10 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
11
11
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
12
|
Classifier: Topic :: Text Processing
|
|
13
13
|
Classifier: Operating System :: OS Independent
|
|
14
|
-
Requires-Python: >=3.
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
|
+
Requires-Dist: dotenv>=0.9.9
|
|
17
18
|
Requires-Dist: openai>=1.97.1
|
|
18
19
|
Requires-Dist: pydantic>=2.0.0
|
|
19
20
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -30,30 +31,27 @@ Dynamic: license-file
|
|
|
30
31
|
|
|
31
32
|
It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
|
|
32
33
|
|
|
33
|
-
It provides ready-to-use utilities for **translation, question detection,
|
|
34
|
-
|
|
35
|
-
**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
|
|
34
|
+
It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
|
|
36
35
|
|
|
37
36
|
---
|
|
38
37
|
|
|
39
38
|
## ✨ Features
|
|
40
39
|
|
|
41
|
-
TextTools provides a
|
|
40
|
+
TextTools provides a collection of high-level NLP utilities.
|
|
42
41
|
Each tool is designed to work with structured outputs.
|
|
43
42
|
|
|
44
|
-
- **`categorize()`** -
|
|
45
|
-
- **`extract_keywords()`** -
|
|
46
|
-
- **`extract_entities()`** - Named Entity Recognition (NER)
|
|
47
|
-
- **`is_question()`** -
|
|
48
|
-
- **`
|
|
49
|
-
- **`merge_questions()`** -
|
|
50
|
-
- **`
|
|
51
|
-
- **`
|
|
52
|
-
- **`
|
|
53
|
-
- **`
|
|
54
|
-
- **`
|
|
55
|
-
- **`
|
|
56
|
-
- **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
|
|
43
|
+
- **`categorize()`** - Classify text into given categories
|
|
44
|
+
- **`extract_keywords()`** - Extract keywords from the text
|
|
45
|
+
- **`extract_entities()`** - Perform Named Entity Recognition (NER)
|
|
46
|
+
- **`is_question()`** - Detect if the input is phrased as a question
|
|
47
|
+
- **`to_question()`** - Generate questions from the given text / subject
|
|
48
|
+
- **`merge_questions()`** - Merge multiple questions into one
|
|
49
|
+
- **`augment()`** - Rewrite text in different augmentations
|
|
50
|
+
- **`summarize()`** - Summarize the given text
|
|
51
|
+
- **`translate()`** - Translate text between languages
|
|
52
|
+
- **`propositionize()`** - Convert a text into atomic, independent, meaningful sentences
|
|
53
|
+
- **`is_fact()`** - Check whether a statement is a fact based on the source text
|
|
54
|
+
- **`run_custom()`** - Custom tool that can do almost anything
|
|
57
55
|
|
|
58
56
|
---
|
|
59
57
|
|
|
@@ -71,14 +69,14 @@ pip install -U hamtaa-texttools
|
|
|
71
69
|
|
|
72
70
|
| Status | Meaning | Tools | Safe for Production? |
|
|
73
71
|
|--------|---------|----------|-------------------|
|
|
74
|
-
| **✅ Production** | Evaluated
|
|
75
|
-
| **🧪 Experimental** | Added to the package but **not fully evaluated**.
|
|
72
|
+
| **✅ Production** | Evaluated and tested. | `categorize()`, `extract_keywords()`, `extract_entities()`, `is_question()`, `to_question()`, `merge_questions()`, `augment()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
|
|
73
|
+
| **🧪 Experimental** | Added to the package but **not fully evaluated**. | `translate()`, `propositionize()`, `is_fact()` | **Use with caution** |
|
|
76
74
|
|
|
77
75
|
---
|
|
78
76
|
|
|
79
|
-
## ⚙️
|
|
77
|
+
## ⚙️ Additional Parameters
|
|
80
78
|
|
|
81
|
-
|
|
79
|
+
- **`raise_on_error: bool`** → (`TheTool/AsyncTheTool` parameter) Raise errors (True) or return them in output (False). Default is True.
|
|
82
80
|
|
|
83
81
|
- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
|
|
84
82
|
**Note:** This doubles token usage per call.
|
|
@@ -88,17 +86,17 @@ TextTools provides several optional flags to customize LLM behavior:
|
|
|
88
86
|
|
|
89
87
|
- **`output_lang: str`** → Forces the model to respond in a specific language.
|
|
90
88
|
|
|
91
|
-
- **`user_prompt: str`** → Allows you to inject a custom instruction
|
|
89
|
+
- **`user_prompt: str`** → Allows you to inject a custom instruction into the model alongside the main template.
|
|
92
90
|
|
|
93
|
-
- **`temperature: float`** → Determines how creative the model should respond. Takes a float number
|
|
91
|
+
- **`temperature: float`** → Controls how creative the model's responses are. Takes a float between `0.0` and `2.0`.
|
|
94
92
|
|
|
95
|
-
- **`validator: Callable (Experimental)`** → Forces
|
|
93
|
+
- **`validator: Callable (Experimental)`** → Forces the tool to validate the output result based on your validator function. Validator should return a boolean. If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can also specify `max_validation_retries=<N>`.
|
|
96
94
|
|
|
97
|
-
- **`priority: int (Experimental)`** →
|
|
95
|
+
- **`priority: int (Experimental)`** → Affects processing order in queues.
|
|
98
96
|
**Note:** This feature works if it's supported by the model and vLLM.
|
|
99
97
|
|
|
100
|
-
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error
|
|
101
|
-
**Note:** This feature only
|
|
98
|
+
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
|
|
99
|
+
**Note:** This feature is only available in `AsyncTheTool`.
|
|
102
100
|
|
|
103
101
|
|
|
104
102
|
---
|
|
@@ -110,12 +108,14 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
110
108
|
- **`analysis: str`**
|
|
111
109
|
- **`logprobs: list`**
|
|
112
110
|
- **`errors: list[str]`**
|
|
113
|
-
- **`ToolOutputMetadata`**
|
|
111
|
+
- **`ToolOutputMetadata`**
|
|
114
112
|
- **`tool_name: str`**
|
|
115
113
|
- **`processed_at: datetime`**
|
|
116
114
|
- **`execution_time: float`**
|
|
117
115
|
|
|
118
|
-
|
|
116
|
+
- Serialize output to JSON using the `to_json()` method.
|
|
117
|
+
- Verify operation success with the `is_successful()` method.
|
|
118
|
+
- Convert output to a dictionary with the `to_dict()` method.
|
|
119
119
|
|
|
120
120
|
---
|
|
121
121
|
|
|
@@ -133,13 +133,13 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
133
133
|
from openai import OpenAI
|
|
134
134
|
from texttools import TheTool
|
|
135
135
|
|
|
136
|
-
client = OpenAI(base_url
|
|
136
|
+
client = OpenAI(base_url="your_url", api_key="your_api_key")
|
|
137
137
|
model = "model_name"
|
|
138
138
|
|
|
139
139
|
the_tool = TheTool(client=client, model=model)
|
|
140
140
|
|
|
141
141
|
detection = the_tool.is_question("Is this project open source?")
|
|
142
|
-
print(
|
|
142
|
+
print(detection.to_json())
|
|
143
143
|
```
|
|
144
144
|
|
|
145
145
|
---
|
|
@@ -157,30 +157,23 @@ async def main():
|
|
|
157
157
|
|
|
158
158
|
async_the_tool = AsyncTheTool(client=async_client, model=model)
|
|
159
159
|
|
|
160
|
-
translation_task = async_the_tool.translate("سلام، حالت چطوره؟",
|
|
161
|
-
keywords_task = async_the_tool.extract_keywords("
|
|
160
|
+
translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_lang="English")
|
|
161
|
+
keywords_task = async_the_tool.extract_keywords("This open source project is great for processing large datasets!")
|
|
162
162
|
|
|
163
163
|
(translation, keywords) = await asyncio.gather(translation_task, keywords_task)
|
|
164
|
-
|
|
165
|
-
print(
|
|
164
|
+
|
|
165
|
+
print(translation.to_json())
|
|
166
|
+
print(keywords.to_json())
|
|
166
167
|
|
|
167
168
|
asyncio.run(main())
|
|
168
169
|
```
|
|
169
170
|
|
|
170
171
|
---
|
|
171
172
|
|
|
172
|
-
##
|
|
173
|
+
## ✅ Use Cases
|
|
173
174
|
|
|
174
175
|
Use **TextTools** when you need to:
|
|
175
176
|
|
|
176
|
-
- 🔍 **Classify** large datasets quickly without model training
|
|
177
|
-
- 🌍 **Translate** and process multilingual corpora with ease
|
|
177
|
+
- 🔍 **Classify** large datasets quickly without model training
|
|
178
178
|
- 🧩 **Integrate** LLMs into production pipelines (structured outputs)
|
|
179
179
|
- 📊 **Analyze** large text collections using embeddings and categorization
|
|
180
|
-
|
|
181
|
-
---
|
|
182
|
-
|
|
183
|
-
## 🤝 Contributing
|
|
184
|
-
|
|
185
|
-
Contributions are welcome!
|
|
186
|
-
Feel free to **open issues, suggest new features, or submit pull requests**.
|
|
@@ -6,31 +6,29 @@ hamtaa_texttools.egg-info/SOURCES.txt
|
|
|
6
6
|
hamtaa_texttools.egg-info/dependency_links.txt
|
|
7
7
|
hamtaa_texttools.egg-info/requires.txt
|
|
8
8
|
hamtaa_texttools.egg-info/top_level.txt
|
|
9
|
-
tests/
|
|
10
|
-
tests/
|
|
11
|
-
tests/test_output_validation.py
|
|
9
|
+
tests/test_category_tree.py
|
|
10
|
+
tests/test_to_chunks.py
|
|
12
11
|
texttools/__init__.py
|
|
13
12
|
texttools/models.py
|
|
14
13
|
texttools/py.typed
|
|
15
14
|
texttools/core/__init__.py
|
|
16
|
-
texttools/core/engine.py
|
|
17
15
|
texttools/core/exceptions.py
|
|
18
16
|
texttools/core/internal_models.py
|
|
17
|
+
texttools/core/utils.py
|
|
19
18
|
texttools/core/operators/__init__.py
|
|
20
19
|
texttools/core/operators/async_operator.py
|
|
21
20
|
texttools/core/operators/sync_operator.py
|
|
21
|
+
texttools/prompts/augment.yaml
|
|
22
22
|
texttools/prompts/categorize.yaml
|
|
23
|
-
texttools/prompts/check_fact.yaml
|
|
24
23
|
texttools/prompts/extract_entities.yaml
|
|
25
24
|
texttools/prompts/extract_keywords.yaml
|
|
25
|
+
texttools/prompts/is_fact.yaml
|
|
26
26
|
texttools/prompts/is_question.yaml
|
|
27
27
|
texttools/prompts/merge_questions.yaml
|
|
28
28
|
texttools/prompts/propositionize.yaml
|
|
29
|
-
texttools/prompts/rewrite.yaml
|
|
30
29
|
texttools/prompts/run_custom.yaml
|
|
31
|
-
texttools/prompts/subject_to_question.yaml
|
|
32
30
|
texttools/prompts/summarize.yaml
|
|
33
|
-
texttools/prompts/
|
|
31
|
+
texttools/prompts/to_question.yaml
|
|
34
32
|
texttools/prompts/translate.yaml
|
|
35
33
|
texttools/tools/__init__.py
|
|
36
34
|
texttools/tools/async_tools.py
|
|
@@ -1,45 +1,46 @@
|
|
|
1
|
-
[build-system]
|
|
2
|
-
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
-
build-backend = "setuptools.build_meta"
|
|
4
|
-
|
|
5
|
-
[project]
|
|
6
|
-
name = "hamtaa-texttools"
|
|
7
|
-
version = "1.3.2"
|
|
8
|
-
authors = [
|
|
9
|
-
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
10
|
-
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
11
|
-
{name = "Montazer", email = "montazerh82@gmail.com"},
|
|
12
|
-
{name = "Givechi", email = "mohamad.m.givechi@gmail.com"},
|
|
13
|
-
{name = "Zareshahi", email = "a.zareshahi1377@gmail.com"},
|
|
14
|
-
]
|
|
15
|
-
maintainers = [
|
|
16
|
-
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
17
|
-
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
18
|
-
]
|
|
19
|
-
description = "A high-level NLP toolkit built on top of modern LLMs."
|
|
20
|
-
readme = "README.md"
|
|
21
|
-
license = {text = "MIT"}
|
|
22
|
-
requires-python = ">=3.
|
|
23
|
-
dependencies = [
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"Topic ::
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hamtaa-texttools"
|
|
7
|
+
version = "2.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
10
|
+
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
11
|
+
{name = "Montazer", email = "montazerh82@gmail.com"},
|
|
12
|
+
{name = "Givechi", email = "mohamad.m.givechi@gmail.com"},
|
|
13
|
+
{name = "Zareshahi", email = "a.zareshahi1377@gmail.com"},
|
|
14
|
+
]
|
|
15
|
+
maintainers = [
|
|
16
|
+
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
17
|
+
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
18
|
+
]
|
|
19
|
+
description = "A high-level NLP toolkit built on top of modern LLMs."
|
|
20
|
+
readme = "README.md"
|
|
21
|
+
license = {text = "MIT"}
|
|
22
|
+
requires-python = ">=3.11"
|
|
23
|
+
dependencies = [
|
|
24
|
+
"dotenv>=0.9.9",
|
|
25
|
+
"openai>=1.97.1",
|
|
26
|
+
"pydantic>=2.0.0",
|
|
27
|
+
"pyyaml>=6.0",
|
|
28
|
+
]
|
|
29
|
+
keywords = ["nlp", "llm", "text-processing", "openai"]
|
|
30
|
+
classifiers = [
|
|
31
|
+
"Development Status :: 5 - Production/Stable",
|
|
32
|
+
"License :: OSI Approved :: MIT License",
|
|
33
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
34
|
+
"Topic :: Text Processing",
|
|
35
|
+
"Operating System :: OS Independent",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["."]
|
|
40
|
+
include = ["texttools*"]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
include-package-data = true
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.package-data]
|
|
46
|
+
"texttools" = ["prompts/*.yaml", "py.typed"]
|