hamtaa-texttools 1.3.2__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/LICENSE +1 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/PKG-INFO +38 -41
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/README.md +36 -39
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/hamtaa_texttools.egg-info/PKG-INFO +38 -41
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/hamtaa_texttools.egg-info/SOURCES.txt +6 -8
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/pyproject.toml +2 -2
- hamtaa_texttools-2.0.0/tests/test_category_tree.py +48 -0
- hamtaa_texttools-2.0.0/tests/test_to_chunks.py +13 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/__init__.py +1 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/core/internal_models.py +16 -7
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/core/operators/async_operator.py +10 -16
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/core/operators/sync_operator.py +10 -16
- hamtaa_texttools-2.0.0/texttools/core/utils.py +260 -0
- hamtaa_texttools-2.0.0/texttools/models.py +143 -0
- hamtaa_texttools-1.3.2/texttools/prompts/rewrite.yaml → hamtaa_texttools-2.0.0/texttools/prompts/augment.yaml +3 -3
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/categorize.yaml +7 -8
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/extract_entities.yaml +2 -2
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/extract_keywords.yaml +4 -2
- hamtaa_texttools-1.3.2/texttools/prompts/check_fact.yaml → hamtaa_texttools-2.0.0/texttools/prompts/is_fact.yaml +5 -4
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/is_question.yaml +1 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/merge_questions.yaml +8 -6
- hamtaa_texttools-2.0.0/texttools/prompts/propositionize.yaml +28 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/run_custom.yaml +3 -1
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/summarize.yaml +3 -3
- hamtaa_texttools-2.0.0/texttools/prompts/to_question.yaml +60 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/prompts/translate.yaml +4 -4
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/tools/async_tools.py +90 -169
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/tools/sync_tools.py +76 -150
- hamtaa_texttools-1.3.2/tests/test_all_async_tools.py +0 -99
- hamtaa_texttools-1.3.2/tests/test_all_tools.py +0 -118
- hamtaa_texttools-1.3.2/tests/test_output_validation.py +0 -31
- hamtaa_texttools-1.3.2/texttools/core/engine.py +0 -262
- hamtaa_texttools-1.3.2/texttools/models.py +0 -88
- hamtaa_texttools-1.3.2/texttools/prompts/propositionize.yaml +0 -24
- hamtaa_texttools-1.3.2/texttools/prompts/subject_to_question.yaml +0 -26
- hamtaa_texttools-1.3.2/texttools/prompts/text_to_question.yaml +0 -26
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/hamtaa_texttools.egg-info/requires.txt +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/hamtaa_texttools.egg-info/top_level.txt +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/setup.cfg +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/core/__init__.py +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/core/exceptions.py +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/core/operators/__init__.py +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/py.typed +0 -0
- {hamtaa_texttools-1.3.2 → hamtaa_texttools-2.0.0}/texttools/tools/__init__.py +0 -0
|
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hamtaa-texttools
|
|
3
|
-
Version: 1.3.2
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: A high-level NLP toolkit built on top of modern LLMs.
|
|
5
5
|
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
|
|
6
6
|
Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
|
|
@@ -11,7 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
11
11
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
12
|
Classifier: Topic :: Text Processing
|
|
13
13
|
Classifier: Operating System :: OS Independent
|
|
14
|
-
Requires-Python: >=3.
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
17
|
Requires-Dist: openai>=1.97.1
|
|
@@ -30,30 +30,27 @@ Dynamic: license-file
|
|
|
30
30
|
|
|
31
31
|
It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
|
|
32
32
|
|
|
33
|
-
It provides ready-to-use utilities for **translation, question detection,
|
|
34
|
-
|
|
35
|
-
**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
|
|
33
|
+
It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
|
|
36
34
|
|
|
37
35
|
---
|
|
38
36
|
|
|
39
37
|
## ✨ Features
|
|
40
38
|
|
|
41
|
-
TextTools provides a
|
|
39
|
+
TextTools provides a collection of high-level NLP utilities.
|
|
42
40
|
Each tool is designed to work with structured outputs.
|
|
43
41
|
|
|
44
|
-
- **`categorize()`** -
|
|
45
|
-
- **`extract_keywords()`** -
|
|
46
|
-
- **`extract_entities()`** - Named Entity Recognition (NER)
|
|
47
|
-
- **`is_question()`** -
|
|
48
|
-
- **`
|
|
49
|
-
- **`merge_questions()`** -
|
|
50
|
-
- **`
|
|
51
|
-
- **`
|
|
52
|
-
- **`
|
|
53
|
-
- **`
|
|
54
|
-
- **`
|
|
55
|
-
- **`
|
|
56
|
-
- **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
|
|
42
|
+
- **`categorize()`** - Classify text into given categories
|
|
43
|
+
- **`extract_keywords()`** - Extract keywords from the text
|
|
44
|
+
- **`extract_entities()`** - Perform Named Entity Recognition (NER)
|
|
45
|
+
- **`is_question()`** - Detect if the input is phrased as a question
|
|
46
|
+
- **`to_question()`** - Generate questions from the given text / subject
|
|
47
|
+
- **`merge_questions()`** - Merge multiple questions into one
|
|
48
|
+
- **`augment()`** - Rewrite text in different augmentations
|
|
49
|
+
- **`summarize()`** - Summarize the given text
|
|
50
|
+
- **`translate()`** - Translate text between languages
|
|
51
|
+
- **`propositionize()`** - Convert a text into atomic, independent, meaningful sentences
|
|
52
|
+
- **`is_fact()`** - Check whether a statement is a fact based on the source text
|
|
53
|
+
- **`run_custom()`** - Custom tool that can do almost anything
|
|
57
54
|
|
|
58
55
|
---
|
|
59
56
|
|
|
@@ -71,14 +68,12 @@ pip install -U hamtaa-texttools
|
|
|
71
68
|
|
|
72
69
|
| Status | Meaning | Tools | Safe for Production? |
|
|
73
70
|
|--------|---------|----------|-------------------|
|
|
74
|
-
| **✅ Production** | Evaluated
|
|
75
|
-
| **🧪 Experimental** | Added to the package but **not fully evaluated**.
|
|
71
|
+
| **✅ Production** | Evaluated and tested. | `categorize()` (list mode), `extract_keywords()`, `extract_entities()`, `is_question()`, `to_question()`, `merge_questions()`, `augment()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
|
|
72
|
+
| **🧪 Experimental** | Added to the package but **not fully evaluated**. | `categorize()` (tree mode), `translate()`, `propositionize()`, `is_fact()` | **Use with caution** |
|
|
76
73
|
|
|
77
74
|
---
|
|
78
75
|
|
|
79
|
-
## ⚙️
|
|
80
|
-
|
|
81
|
-
TextTools provides several optional flags to customize LLM behavior:
|
|
76
|
+
## ⚙️ Additional Parameters
|
|
82
77
|
|
|
83
78
|
- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
|
|
84
79
|
**Note:** This doubles token usage per call.
|
|
@@ -88,17 +83,17 @@ TextTools provides several optional flags to customize LLM behavior:
|
|
|
88
83
|
|
|
89
84
|
- **`output_lang: str`** → Forces the model to respond in a specific language.
|
|
90
85
|
|
|
91
|
-
- **`user_prompt: str`** → Allows you to inject a custom instruction
|
|
86
|
+
- **`user_prompt: str`** → Allows you to inject a custom instruction into the model alongside the main template.
|
|
92
87
|
|
|
93
|
-
- **`temperature: float`** → Determines how creative the model should respond. Takes a float number
|
|
88
|
+
- **`temperature: float`** → Determines how creatively the model responds. Takes a float between `0.0` and `2.0`.
|
|
94
89
|
|
|
95
|
-
- **`validator: Callable (Experimental)`** → Forces
|
|
90
|
+
- **`validator: Callable (Experimental)`** → Forces the tool to validate the output using your validator function, which should return a boolean. If validation fails, TheTool retries with a modified `temperature` to get another output. You can also specify `max_validation_retries=<N>`.
|
|
96
91
|
|
|
97
|
-
- **`priority: int (Experimental)`** →
|
|
92
|
+
- **`priority: int (Experimental)`** → Affects processing order in queues.
|
|
98
93
|
**Note:** This feature works if it's supported by the model and vLLM.
|
|
99
94
|
|
|
100
|
-
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error
|
|
101
|
-
**Note:** This feature only
|
|
95
|
+
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
|
|
96
|
+
**Note:** This feature is only available in `AsyncTheTool`.
|
|
102
97
|
|
|
103
98
|
|
|
104
99
|
---
|
|
@@ -110,12 +105,14 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
110
105
|
- **`analysis: str`**
|
|
111
106
|
- **`logprobs: list`**
|
|
112
107
|
- **`errors: list[str]`**
|
|
113
|
-
- **`ToolOutputMetadata`**
|
|
108
|
+
- **`ToolOutputMetadata`**
|
|
114
109
|
- **`tool_name: str`**
|
|
115
110
|
- **`processed_at: datetime`**
|
|
116
111
|
- **`execution_time: float`**
|
|
117
112
|
|
|
118
|
-
|
|
113
|
+
- Serialize output to JSON using the `to_json()` method.
|
|
114
|
+
- Verify operation success with the `is_successful()` method.
|
|
115
|
+
- Convert output to a dictionary with the `to_dict()` method.
|
|
119
116
|
|
|
120
117
|
---
|
|
121
118
|
|
|
@@ -133,13 +130,13 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
133
130
|
from openai import OpenAI
|
|
134
131
|
from texttools import TheTool
|
|
135
132
|
|
|
136
|
-
client = OpenAI(base_url
|
|
133
|
+
client = OpenAI(base_url="your_url", api_key="your_api_key")
|
|
137
134
|
model = "model_name"
|
|
138
135
|
|
|
139
136
|
the_tool = TheTool(client=client, model=model)
|
|
140
137
|
|
|
141
138
|
detection = the_tool.is_question("Is this project open source?")
|
|
142
|
-
print(
|
|
139
|
+
print(detection.to_json())
|
|
143
140
|
```
|
|
144
141
|
|
|
145
142
|
---
|
|
@@ -157,24 +154,24 @@ async def main():
|
|
|
157
154
|
|
|
158
155
|
async_the_tool = AsyncTheTool(client=async_client, model=model)
|
|
159
156
|
|
|
160
|
-
translation_task = async_the_tool.translate("سلام، حالت چطوره؟",
|
|
161
|
-
keywords_task = async_the_tool.extract_keywords("
|
|
157
|
+
translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_lang="English")
|
|
158
|
+
keywords_task = async_the_tool.extract_keywords("This open source project is great for processing large datasets!")
|
|
162
159
|
|
|
163
160
|
(translation, keywords) = await asyncio.gather(translation_task, keywords_task)
|
|
164
|
-
|
|
165
|
-
print(
|
|
161
|
+
|
|
162
|
+
print(translation.to_json())
|
|
163
|
+
print(keywords.to_json())
|
|
166
164
|
|
|
167
165
|
asyncio.run(main())
|
|
168
166
|
```
|
|
169
167
|
|
|
170
168
|
---
|
|
171
169
|
|
|
172
|
-
##
|
|
170
|
+
## ✅ Use Cases
|
|
173
171
|
|
|
174
172
|
Use **TextTools** when you need to:
|
|
175
173
|
|
|
176
|
-
- 🔍 **Classify** large datasets quickly without model training
|
|
177
|
-
- 🌍 **Translate** and process multilingual corpora with ease
|
|
174
|
+
- 🔍 **Classify** large datasets quickly without model training
|
|
178
175
|
- 🧩 **Integrate** LLMs into production pipelines (structured outputs)
|
|
179
176
|
- 📊 **Analyze** large text collections using embeddings and categorization
|
|
180
177
|
|
|
@@ -9,30 +9,27 @@
|
|
|
9
9
|
|
|
10
10
|
It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
|
|
11
11
|
|
|
12
|
-
It provides ready-to-use utilities for **translation, question detection,
|
|
13
|
-
|
|
14
|
-
**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
|
|
12
|
+
It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
|
|
15
13
|
|
|
16
14
|
---
|
|
17
15
|
|
|
18
16
|
## ✨ Features
|
|
19
17
|
|
|
20
|
-
TextTools provides a
|
|
18
|
+
TextTools provides a collection of high-level NLP utilities.
|
|
21
19
|
Each tool is designed to work with structured outputs.
|
|
22
20
|
|
|
23
|
-
- **`categorize()`** -
|
|
24
|
-
- **`extract_keywords()`** -
|
|
25
|
-
- **`extract_entities()`** - Named Entity Recognition (NER)
|
|
26
|
-
- **`is_question()`** -
|
|
27
|
-
- **`
|
|
28
|
-
- **`merge_questions()`** -
|
|
29
|
-
- **`
|
|
30
|
-
- **`
|
|
31
|
-
- **`
|
|
32
|
-
- **`
|
|
33
|
-
- **`
|
|
34
|
-
- **`
|
|
35
|
-
- **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
|
|
21
|
+
- **`categorize()`** - Classify text into given categories
|
|
22
|
+
- **`extract_keywords()`** - Extract keywords from the text
|
|
23
|
+
- **`extract_entities()`** - Perform Named Entity Recognition (NER)
|
|
24
|
+
- **`is_question()`** - Detect if the input is phrased as a question
|
|
25
|
+
- **`to_question()`** - Generate questions from the given text / subject
|
|
26
|
+
- **`merge_questions()`** - Merge multiple questions into one
|
|
27
|
+
- **`augment()`** - Rewrite text in different augmentations
|
|
28
|
+
- **`summarize()`** - Summarize the given text
|
|
29
|
+
- **`translate()`** - Translate text between languages
|
|
30
|
+
- **`propositionize()`** - Convert a text into atomic, independent, meaningful sentences
|
|
31
|
+
- **`is_fact()`** - Check whether a statement is a fact based on the source text
|
|
32
|
+
- **`run_custom()`** - Custom tool that can do almost anything
|
|
36
33
|
|
|
37
34
|
---
|
|
38
35
|
|
|
@@ -50,14 +47,12 @@ pip install -U hamtaa-texttools
|
|
|
50
47
|
|
|
51
48
|
| Status | Meaning | Tools | Safe for Production? |
|
|
52
49
|
|--------|---------|----------|-------------------|
|
|
53
|
-
| **✅ Production** | Evaluated
|
|
54
|
-
| **🧪 Experimental** | Added to the package but **not fully evaluated**.
|
|
50
|
+
| **✅ Production** | Evaluated and tested. | `categorize()` (list mode), `extract_keywords()`, `extract_entities()`, `is_question()`, `to_question()`, `merge_questions()`, `augment()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
|
|
51
|
+
| **🧪 Experimental** | Added to the package but **not fully evaluated**. | `categorize()` (tree mode), `translate()`, `propositionize()`, `is_fact()` | **Use with caution** |
|
|
55
52
|
|
|
56
53
|
---
|
|
57
54
|
|
|
58
|
-
## ⚙️
|
|
59
|
-
|
|
60
|
-
TextTools provides several optional flags to customize LLM behavior:
|
|
55
|
+
## ⚙️ Additional Parameters
|
|
61
56
|
|
|
62
57
|
- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
|
|
63
58
|
**Note:** This doubles token usage per call.
|
|
@@ -67,17 +62,17 @@ TextTools provides several optional flags to customize LLM behavior:
|
|
|
67
62
|
|
|
68
63
|
- **`output_lang: str`** → Forces the model to respond in a specific language.
|
|
69
64
|
|
|
70
|
-
- **`user_prompt: str`** → Allows you to inject a custom instruction
|
|
65
|
+
- **`user_prompt: str`** → Allows you to inject a custom instruction into the model alongside the main template.
|
|
71
66
|
|
|
72
|
-
- **`temperature: float`** → Determines how creative the model should respond. Takes a float number
|
|
67
|
+
- **`temperature: float`** → Determines how creatively the model responds. Takes a float between `0.0` and `2.0`.
|
|
73
68
|
|
|
74
|
-
- **`validator: Callable (Experimental)`** → Forces
|
|
69
|
+
- **`validator: Callable (Experimental)`** → Forces the tool to validate the output using your validator function, which should return a boolean. If validation fails, TheTool retries with a modified `temperature` to get another output. You can also specify `max_validation_retries=<N>`.
|
|
75
70
|
|
|
76
|
-
- **`priority: int (Experimental)`** →
|
|
71
|
+
- **`priority: int (Experimental)`** → Affects processing order in queues.
|
|
77
72
|
**Note:** This feature works if it's supported by the model and vLLM.
|
|
78
73
|
|
|
79
|
-
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error
|
|
80
|
-
**Note:** This feature only
|
|
74
|
+
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
|
|
75
|
+
**Note:** This feature is only available in `AsyncTheTool`.
|
|
81
76
|
|
|
82
77
|
|
|
83
78
|
---
|
|
@@ -89,12 +84,14 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
89
84
|
- **`analysis: str`**
|
|
90
85
|
- **`logprobs: list`**
|
|
91
86
|
- **`errors: list[str]`**
|
|
92
|
-
- **`ToolOutputMetadata`**
|
|
87
|
+
- **`ToolOutputMetadata`**
|
|
93
88
|
- **`tool_name: str`**
|
|
94
89
|
- **`processed_at: datetime`**
|
|
95
90
|
- **`execution_time: float`**
|
|
96
91
|
|
|
97
|
-
|
|
92
|
+
- Serialize output to JSON using the `to_json()` method.
|
|
93
|
+
- Verify operation success with the `is_successful()` method.
|
|
94
|
+
- Convert output to a dictionary with the `to_dict()` method.
|
|
98
95
|
|
|
99
96
|
---
|
|
100
97
|
|
|
@@ -112,13 +109,13 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
112
109
|
from openai import OpenAI
|
|
113
110
|
from texttools import TheTool
|
|
114
111
|
|
|
115
|
-
client = OpenAI(base_url
|
|
112
|
+
client = OpenAI(base_url="your_url", api_key="your_api_key")
|
|
116
113
|
model = "model_name"
|
|
117
114
|
|
|
118
115
|
the_tool = TheTool(client=client, model=model)
|
|
119
116
|
|
|
120
117
|
detection = the_tool.is_question("Is this project open source?")
|
|
121
|
-
print(
|
|
118
|
+
print(detection.to_json())
|
|
122
119
|
```
|
|
123
120
|
|
|
124
121
|
---
|
|
@@ -136,24 +133,24 @@ async def main():
|
|
|
136
133
|
|
|
137
134
|
async_the_tool = AsyncTheTool(client=async_client, model=model)
|
|
138
135
|
|
|
139
|
-
translation_task = async_the_tool.translate("سلام، حالت چطوره؟",
|
|
140
|
-
keywords_task = async_the_tool.extract_keywords("
|
|
136
|
+
translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_lang="English")
|
|
137
|
+
keywords_task = async_the_tool.extract_keywords("This open source project is great for processing large datasets!")
|
|
141
138
|
|
|
142
139
|
(translation, keywords) = await asyncio.gather(translation_task, keywords_task)
|
|
143
|
-
|
|
144
|
-
print(
|
|
140
|
+
|
|
141
|
+
print(translation.to_json())
|
|
142
|
+
print(keywords.to_json())
|
|
145
143
|
|
|
146
144
|
asyncio.run(main())
|
|
147
145
|
```
|
|
148
146
|
|
|
149
147
|
---
|
|
150
148
|
|
|
151
|
-
##
|
|
149
|
+
## ✅ Use Cases
|
|
152
150
|
|
|
153
151
|
Use **TextTools** when you need to:
|
|
154
152
|
|
|
155
|
-
- 🔍 **Classify** large datasets quickly without model training
|
|
156
|
-
- 🌍 **Translate** and process multilingual corpora with ease
|
|
153
|
+
- 🔍 **Classify** large datasets quickly without model training
|
|
157
154
|
- 🧩 **Integrate** LLMs into production pipelines (structured outputs)
|
|
158
155
|
- 📊 **Analyze** large text collections using embeddings and categorization
|
|
159
156
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hamtaa-texttools
|
|
3
|
-
Version: 1.3.2
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: A high-level NLP toolkit built on top of modern LLMs.
|
|
5
5
|
Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Erfan Moosavi <erfanmoosavi84@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
|
|
6
6
|
Maintainer-email: Erfan Moosavi <erfanmoosavi84@gmail.com>, Tohidi <the.mohammad.tohidi@gmail.com>
|
|
@@ -11,7 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
11
11
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
12
|
Classifier: Topic :: Text Processing
|
|
13
13
|
Classifier: Operating System :: OS Independent
|
|
14
|
-
Requires-Python: >=3.
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
15
|
Description-Content-Type: text/markdown
|
|
16
16
|
License-File: LICENSE
|
|
17
17
|
Requires-Dist: openai>=1.97.1
|
|
@@ -30,30 +30,27 @@ Dynamic: license-file
|
|
|
30
30
|
|
|
31
31
|
It provides both **sync (`TheTool`)** and **async (`AsyncTheTool`)** APIs for maximum flexibility.
|
|
32
32
|
|
|
33
|
-
It provides ready-to-use utilities for **translation, question detection,
|
|
34
|
-
|
|
35
|
-
**Note:** Most features of `texttools` are reliable when you use `google/gemma-3n-e4b-it` model.
|
|
33
|
+
It provides ready-to-use utilities for **translation, question detection, categorization, NER extraction, and more** - designed to help you integrate AI-powered text processing into your applications with minimal effort.
|
|
36
34
|
|
|
37
35
|
---
|
|
38
36
|
|
|
39
37
|
## ✨ Features
|
|
40
38
|
|
|
41
|
-
TextTools provides a
|
|
39
|
+
TextTools provides a collection of high-level NLP utilities.
|
|
42
40
|
Each tool is designed to work with structured outputs.
|
|
43
41
|
|
|
44
|
-
- **`categorize()`** -
|
|
45
|
-
- **`extract_keywords()`** -
|
|
46
|
-
- **`extract_entities()`** - Named Entity Recognition (NER)
|
|
47
|
-
- **`is_question()`** -
|
|
48
|
-
- **`
|
|
49
|
-
- **`merge_questions()`** -
|
|
50
|
-
- **`
|
|
51
|
-
- **`
|
|
52
|
-
- **`
|
|
53
|
-
- **`
|
|
54
|
-
- **`
|
|
55
|
-
- **`
|
|
56
|
-
- **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
|
|
42
|
+
- **`categorize()`** - Classify text into given categories
|
|
43
|
+
- **`extract_keywords()`** - Extract keywords from the text
|
|
44
|
+
- **`extract_entities()`** - Perform Named Entity Recognition (NER)
|
|
45
|
+
- **`is_question()`** - Detect if the input is phrased as a question
|
|
46
|
+
- **`to_question()`** - Generate questions from the given text / subject
|
|
47
|
+
- **`merge_questions()`** - Merge multiple questions into one
|
|
48
|
+
- **`augment()`** - Rewrite text in different augmentations
|
|
49
|
+
- **`summarize()`** - Summarize the given text
|
|
50
|
+
- **`translate()`** - Translate text between languages
|
|
51
|
+
- **`propositionize()`** - Convert a text into atomic, independent, meaningful sentences
|
|
52
|
+
- **`is_fact()`** - Check whether a statement is a fact based on the source text
|
|
53
|
+
- **`run_custom()`** - Custom tool that can do almost anything
|
|
57
54
|
|
|
58
55
|
---
|
|
59
56
|
|
|
@@ -71,14 +68,12 @@ pip install -U hamtaa-texttools
|
|
|
71
68
|
|
|
72
69
|
| Status | Meaning | Tools | Safe for Production? |
|
|
73
70
|
|--------|---------|----------|-------------------|
|
|
74
|
-
| **✅ Production** | Evaluated
|
|
75
|
-
| **🧪 Experimental** | Added to the package but **not fully evaluated**.
|
|
71
|
+
| **✅ Production** | Evaluated and tested. | `categorize()` (list mode), `extract_keywords()`, `extract_entities()`, `is_question()`, `to_question()`, `merge_questions()`, `augment()`, `summarize()`, `run_custom()` | **Yes** - ready for reliable use. |
|
|
72
|
+
| **🧪 Experimental** | Added to the package but **not fully evaluated**. | `categorize()` (tree mode), `translate()`, `propositionize()`, `is_fact()` | **Use with caution** |
|
|
76
73
|
|
|
77
74
|
---
|
|
78
75
|
|
|
79
|
-
## ⚙️
|
|
80
|
-
|
|
81
|
-
TextTools provides several optional flags to customize LLM behavior:
|
|
76
|
+
## ⚙️ Additional Parameters
|
|
82
77
|
|
|
83
78
|
- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
|
|
84
79
|
**Note:** This doubles token usage per call.
|
|
@@ -88,17 +83,17 @@ TextTools provides several optional flags to customize LLM behavior:
|
|
|
88
83
|
|
|
89
84
|
- **`output_lang: str`** → Forces the model to respond in a specific language.
|
|
90
85
|
|
|
91
|
-
- **`user_prompt: str`** → Allows you to inject a custom instruction
|
|
86
|
+
- **`user_prompt: str`** → Allows you to inject a custom instruction into the model alongside the main template.
|
|
92
87
|
|
|
93
|
-
- **`temperature: float`** → Determines how creative the model should respond. Takes a float number
|
|
88
|
+
- **`temperature: float`** → Determines how creatively the model responds. Takes a float between `0.0` and `2.0`.
|
|
94
89
|
|
|
95
|
-
- **`validator: Callable (Experimental)`** → Forces
|
|
90
|
+
- **`validator: Callable (Experimental)`** → Forces the tool to validate the output using your validator function, which should return a boolean. If validation fails, TheTool retries with a modified `temperature` to get another output. You can also specify `max_validation_retries=<N>`.
|
|
96
91
|
|
|
97
|
-
- **`priority: int (Experimental)`** →
|
|
92
|
+
- **`priority: int (Experimental)`** → Affects processing order in queues.
|
|
98
93
|
**Note:** This feature works if it's supported by the model and vLLM.
|
|
99
94
|
|
|
100
|
-
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error
|
|
101
|
-
**Note:** This feature only
|
|
95
|
+
- **`timeout: float`** → Maximum time in seconds to wait for the response before raising a timeout error.
|
|
96
|
+
**Note:** This feature is only available in `AsyncTheTool`.
|
|
102
97
|
|
|
103
98
|
|
|
104
99
|
---
|
|
@@ -110,12 +105,14 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
110
105
|
- **`analysis: str`**
|
|
111
106
|
- **`logprobs: list`**
|
|
112
107
|
- **`errors: list[str]`**
|
|
113
|
-
- **`ToolOutputMetadata`**
|
|
108
|
+
- **`ToolOutputMetadata`**
|
|
114
109
|
- **`tool_name: str`**
|
|
115
110
|
- **`processed_at: datetime`**
|
|
116
111
|
- **`execution_time: float`**
|
|
117
112
|
|
|
118
|
-
|
|
113
|
+
- Serialize output to JSON using the `to_json()` method.
|
|
114
|
+
- Verify operation success with the `is_successful()` method.
|
|
115
|
+
- Convert output to a dictionary with the `to_dict()` method.
|
|
119
116
|
|
|
120
117
|
---
|
|
121
118
|
|
|
@@ -133,13 +130,13 @@ Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel wit
|
|
|
133
130
|
from openai import OpenAI
|
|
134
131
|
from texttools import TheTool
|
|
135
132
|
|
|
136
|
-
client = OpenAI(base_url
|
|
133
|
+
client = OpenAI(base_url="your_url", api_key="your_api_key")
|
|
137
134
|
model = "model_name"
|
|
138
135
|
|
|
139
136
|
the_tool = TheTool(client=client, model=model)
|
|
140
137
|
|
|
141
138
|
detection = the_tool.is_question("Is this project open source?")
|
|
142
|
-
print(
|
|
139
|
+
print(detection.to_json())
|
|
143
140
|
```
|
|
144
141
|
|
|
145
142
|
---
|
|
@@ -157,24 +154,24 @@ async def main():
|
|
|
157
154
|
|
|
158
155
|
async_the_tool = AsyncTheTool(client=async_client, model=model)
|
|
159
156
|
|
|
160
|
-
translation_task = async_the_tool.translate("سلام، حالت چطوره؟",
|
|
161
|
-
keywords_task = async_the_tool.extract_keywords("
|
|
157
|
+
translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_lang="English")
|
|
158
|
+
keywords_task = async_the_tool.extract_keywords("This open source project is great for processing large datasets!")
|
|
162
159
|
|
|
163
160
|
(translation, keywords) = await asyncio.gather(translation_task, keywords_task)
|
|
164
|
-
|
|
165
|
-
print(
|
|
161
|
+
|
|
162
|
+
print(translation.to_json())
|
|
163
|
+
print(keywords.to_json())
|
|
166
164
|
|
|
167
165
|
asyncio.run(main())
|
|
168
166
|
```
|
|
169
167
|
|
|
170
168
|
---
|
|
171
169
|
|
|
172
|
-
##
|
|
170
|
+
## ✅ Use Cases
|
|
173
171
|
|
|
174
172
|
Use **TextTools** when you need to:
|
|
175
173
|
|
|
176
|
-
- 🔍 **Classify** large datasets quickly without model training
|
|
177
|
-
- 🌍 **Translate** and process multilingual corpora with ease
|
|
174
|
+
- 🔍 **Classify** large datasets quickly without model training
|
|
178
175
|
- 🧩 **Integrate** LLMs into production pipelines (structured outputs)
|
|
179
176
|
- 📊 **Analyze** large text collections using embeddings and categorization
|
|
180
177
|
|
|
@@ -6,31 +6,29 @@ hamtaa_texttools.egg-info/SOURCES.txt
|
|
|
6
6
|
hamtaa_texttools.egg-info/dependency_links.txt
|
|
7
7
|
hamtaa_texttools.egg-info/requires.txt
|
|
8
8
|
hamtaa_texttools.egg-info/top_level.txt
|
|
9
|
-
tests/
|
|
10
|
-
tests/
|
|
11
|
-
tests/test_output_validation.py
|
|
9
|
+
tests/test_category_tree.py
|
|
10
|
+
tests/test_to_chunks.py
|
|
12
11
|
texttools/__init__.py
|
|
13
12
|
texttools/models.py
|
|
14
13
|
texttools/py.typed
|
|
15
14
|
texttools/core/__init__.py
|
|
16
|
-
texttools/core/engine.py
|
|
17
15
|
texttools/core/exceptions.py
|
|
18
16
|
texttools/core/internal_models.py
|
|
17
|
+
texttools/core/utils.py
|
|
19
18
|
texttools/core/operators/__init__.py
|
|
20
19
|
texttools/core/operators/async_operator.py
|
|
21
20
|
texttools/core/operators/sync_operator.py
|
|
21
|
+
texttools/prompts/augment.yaml
|
|
22
22
|
texttools/prompts/categorize.yaml
|
|
23
|
-
texttools/prompts/check_fact.yaml
|
|
24
23
|
texttools/prompts/extract_entities.yaml
|
|
25
24
|
texttools/prompts/extract_keywords.yaml
|
|
25
|
+
texttools/prompts/is_fact.yaml
|
|
26
26
|
texttools/prompts/is_question.yaml
|
|
27
27
|
texttools/prompts/merge_questions.yaml
|
|
28
28
|
texttools/prompts/propositionize.yaml
|
|
29
|
-
texttools/prompts/rewrite.yaml
|
|
30
29
|
texttools/prompts/run_custom.yaml
|
|
31
|
-
texttools/prompts/subject_to_question.yaml
|
|
32
30
|
texttools/prompts/summarize.yaml
|
|
33
|
-
texttools/prompts/
|
|
31
|
+
texttools/prompts/to_question.yaml
|
|
34
32
|
texttools/prompts/translate.yaml
|
|
35
33
|
texttools/tools/__init__.py
|
|
36
34
|
texttools/tools/async_tools.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hamtaa-texttools"
|
|
7
|
-
version = "1.3.2"
|
|
7
|
+
version = "2.0.0"
|
|
8
8
|
authors = [
|
|
9
9
|
{name = "Tohidi", email = "the.mohammad.tohidi@gmail.com"},
|
|
10
10
|
{name = "Erfan Moosavi", email = "erfanmoosavi84@gmail.com"},
|
|
@@ -19,7 +19,7 @@ maintainers = [
|
|
|
19
19
|
description = "A high-level NLP toolkit built on top of modern LLMs."
|
|
20
20
|
readme = "README.md"
|
|
21
21
|
license = {text = "MIT"}
|
|
22
|
-
requires-python = ">=3.
|
|
22
|
+
requires-python = ">=3.11"
|
|
23
23
|
dependencies = [
|
|
24
24
|
"openai>=1.97.1",
|
|
25
25
|
"pydantic>=2.0.0",
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from texttools.models import CategoryTree, Node
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@pytest.fixture
|
|
6
|
+
def tree():
|
|
7
|
+
tree = CategoryTree()
|
|
8
|
+
tree.add_node("اخلاق", "root")
|
|
9
|
+
tree.add_node("معرفت شناسی", "root")
|
|
10
|
+
tree.add_node("متافیزیک", "root")
|
|
11
|
+
tree.add_node("فلسفه ذهن", "root")
|
|
12
|
+
tree.add_node("آگاهی", "فلسفه ذهن")
|
|
13
|
+
tree.add_node("ذهن و بدن", "فلسفه ذهن")
|
|
14
|
+
tree.add_node("امکان و ضرورت", "متافیزیک")
|
|
15
|
+
tree.add_node("مغز و ترشحات", "ذهن و بدن")
|
|
16
|
+
return tree
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_level_count(tree):
|
|
20
|
+
assert tree.get_level_count() == 3
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_none_node(tree):
|
|
24
|
+
assert tree.get_node("سلامت") is None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_get_node(tree):
|
|
28
|
+
assert isinstance(tree.get_node("آگاهی"), Node)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_add_duplicate_node(tree):
|
|
32
|
+
with pytest.raises(ValueError, match="Cannot add آگاهی category twice"):
|
|
33
|
+
tree.add_node("آگاهی", "root")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_wrong_parent(tree):
|
|
37
|
+
with pytest.raises(ValueError, match="Parent category امکان not found"):
|
|
38
|
+
tree.add_node("ضرورت", "امکان")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_remove_root(tree):
|
|
42
|
+
with pytest.raises(ValueError, match="Cannot remove the root node"):
|
|
43
|
+
tree.remove_node("root")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_remove_none(tree):
|
|
47
|
+
with pytest.raises(ValueError, match="Category: ایجاب not found"):
|
|
48
|
+
tree.remove_node("ایجاب")
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from texttools.core.utils import TheToolUtils
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_single_chunk():
|
|
5
|
+
text = "Short text"
|
|
6
|
+
chunks = TheToolUtils.to_chunks(text, size=100, overlap=0)
|
|
7
|
+
assert len(chunks) == 1
|
|
8
|
+
assert chunks[0] == "Short text"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_empty_text():
|
|
12
|
+
chunks = TheToolUtils.to_chunks("", size=10, overlap=0)
|
|
13
|
+
assert len(chunks) == 0
|