hamtaa-texttools 1.1.19__tar.gz → 1.1.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.1.19/hamtaa_texttools.egg-info → hamtaa_texttools-1.1.21}/PKG-INFO +15 -34
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/README.md +14 -33
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21/hamtaa_texttools.egg-info}/PKG-INFO +15 -34
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/SOURCES.txt +2 -3
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/pyproject.toml +1 -1
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_all_async_tools.py +5 -3
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_all_tools.py +19 -13
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/batch/batch_config.py +14 -1
- {hamtaa_texttools-1.1.19/texttools/batch/internals → hamtaa_texttools-1.1.21/texttools/batch}/batch_manager.py +6 -6
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/batch/batch_runner.py +7 -7
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/async_operator.py +48 -84
- hamtaa_texttools-1.1.21/texttools/internals/models.py +150 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/operator_utils.py +2 -2
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/prompt_loader.py +3 -20
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/sync_operator.py +47 -83
- hamtaa_texttools-1.1.21/texttools/internals/text_to_chunks.py +97 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/README.md +2 -2
- hamtaa_texttools-1.1.21/texttools/prompts/categorize.yaml +35 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/check_fact.yaml +2 -2
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/extract_entities.yaml +3 -3
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/extract_keywords.yaml +6 -6
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/is_question.yaml +2 -2
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/merge_questions.yaml +4 -4
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/propositionize.yaml +2 -2
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/rewrite.yaml +6 -6
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/run_custom.yaml +1 -1
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/subject_to_question.yaml +2 -2
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/summarize.yaml +2 -2
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/text_to_question.yaml +8 -6
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/translate.yaml +2 -2
- hamtaa_texttools-1.1.21/texttools/tools/async_tools.py +1106 -0
- hamtaa_texttools-1.1.21/texttools/tools/sync_tools.py +1106 -0
- hamtaa_texttools-1.1.19/texttools/batch/internals/utils.py +0 -16
- hamtaa_texttools-1.1.19/texttools/internals/formatters.py +0 -24
- hamtaa_texttools-1.1.19/texttools/internals/models.py +0 -190
- hamtaa_texttools-1.1.19/texttools/prompts/categorize.yaml +0 -77
- hamtaa_texttools-1.1.19/texttools/tools/async_tools.py +0 -1128
- hamtaa_texttools-1.1.19/texttools/tools/sync_tools.py +0 -1128
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/LICENSE +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/MANIFEST.in +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/requires.txt +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/top_level.txt +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/setup.cfg +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_output_validation.py +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/__init__.py +0 -0
- {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/exceptions.py +0 -0
{hamtaa_texttools-1.1.19/hamtaa_texttools.egg-info → hamtaa_texttools-1.1.21}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hamtaa-texttools
-Version: 1.1.19
+Version: 1.1.21
 Summary: A high-level NLP toolkit built on top of modern LLMs.
 Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
 License: MIT License
@@ -50,7 +50,7 @@ It provides ready-to-use utilities for **translation, question detection, keywor
 TextTools provides a rich collection of high-level NLP utilities,
 Each tool is designed to work with structured outputs (JSON / Pydantic).

-- **`categorize()`** - Classifies text into given categories
+- **`categorize()`** - Classifies text into given categories
 - **`extract_keywords()`** - Extracts keywords from text
 - **`extract_entities()`** - Named Entity Recognition (NER) system
 - **`is_question()`** - Binary detection of whether input is a question
@@ -61,7 +61,7 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
 - **`summarize()`** - Text summarization
 - **`translate()`** - Text translation between languages
 - **`propositionize()`** - Convert text to atomic independence meaningful sentences
-- **`check_fact()`** - Check a statement is relevant to source text
+- **`check_fact()`** - Check whether a statement is relevant to the source text
 - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel

 ---
@@ -99,21 +99,21 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).

 TextTools provides several optional flags to customize LLM behavior:

-- **`with_analysis
+- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
 **Note:** This doubles token usage per call because it triggers an additional LLM request.

-- **`logprobs
+- **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
 **Note:** This feature works if it's supported by the model.

-- **`output_lang
+- **`output_lang: str`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.

-- **`user_prompt
+- **`user_prompt: str`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.

-- **`temperature
+- **`temperature: float`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.

-- **`validator (
+- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.

-- **`priority (
+- **`priority: int (Experimental)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
 **Note:** This feature works if it's supported by the model and vLLM.

 **Note:** There might be some tools that don't support some of the parameters above.
@@ -125,11 +125,12 @@ TextTools provides several optional flags to customize LLM behavior:
 Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
 - **`result: Any`** → The output of LLM
 - **`analysis: str`** → The reasoning step before generating the final output
-- **`logprobs: list`** → Token-level probabilities for the generated output
-- **`process: str`** → The tool name which processed the input
-- **`processed_at: datetime`** → The process time
-- **`execution_time: float`** → The execution time (seconds)
+- **`logprobs: list`** → Token-level probabilities for the generated output
 - **`errors: list[str]`** → Any error that have occured during calling LLM
+- **`ToolOutputMetadata`** →
+- **`tool_name: str`** → The tool name which processed the input
+- **`processed_at: datetime`** → The process time
+- **`execution_time: float`** → The execution time (seconds)

 **Note:** You can use `repr(ToolOutput)` to see details of your ToolOutput.

@@ -224,26 +225,6 @@ Use **TextTools** when you need to:

 ---

-## 🔍 Logging
-
-TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
-
-
-```python
-import logging
-
-# Default: warnings and errors only
-logging.basicConfig(level=logging.WARNING)
-
-# Debug everything (verbose)
-logging.basicConfig(level=logging.DEBUG)
-
-# Complete silence
-logging.basicConfig(level=logging.CRITICAL)
-```
-
----
-
 ## 📚 Batch Processing

 Process large datasets efficiently using OpenAI's batch API.
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/README.md

@@ -15,7 +15,7 @@ It provides ready-to-use utilities for **translation, question detection, keywor
 TextTools provides a rich collection of high-level NLP utilities,
 Each tool is designed to work with structured outputs (JSON / Pydantic).

-- **`categorize()`** - Classifies text into given categories
+- **`categorize()`** - Classifies text into given categories
 - **`extract_keywords()`** - Extracts keywords from text
 - **`extract_entities()`** - Named Entity Recognition (NER) system
 - **`is_question()`** - Binary detection of whether input is a question
@@ -26,7 +26,7 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
 - **`summarize()`** - Text summarization
 - **`translate()`** - Text translation between languages
 - **`propositionize()`** - Convert text to atomic independence meaningful sentences
-- **`check_fact()`** - Check a statement is relevant to source text
+- **`check_fact()`** - Check whether a statement is relevant to the source text
 - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel

 ---
@@ -64,21 +64,21 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).

 TextTools provides several optional flags to customize LLM behavior:

-- **`with_analysis
+- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
 **Note:** This doubles token usage per call because it triggers an additional LLM request.

-- **`logprobs
+- **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
 **Note:** This feature works if it's supported by the model.

-- **`output_lang
+- **`output_lang: str`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.

-- **`user_prompt
+- **`user_prompt: str`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.

-- **`temperature
+- **`temperature: float`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.

-- **`validator (
+- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.

-- **`priority (
+- **`priority: int (Experimental)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
 **Note:** This feature works if it's supported by the model and vLLM.

 **Note:** There might be some tools that don't support some of the parameters above.
@@ -90,11 +90,12 @@ TextTools provides several optional flags to customize LLM behavior:
 Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
 - **`result: Any`** → The output of LLM
 - **`analysis: str`** → The reasoning step before generating the final output
-- **`logprobs: list`** → Token-level probabilities for the generated output
-- **`process: str`** → The tool name which processed the input
-- **`processed_at: datetime`** → The process time
-- **`execution_time: float`** → The execution time (seconds)
+- **`logprobs: list`** → Token-level probabilities for the generated output
 - **`errors: list[str]`** → Any error that have occured during calling LLM
+- **`ToolOutputMetadata`** →
+- **`tool_name: str`** → The tool name which processed the input
+- **`processed_at: datetime`** → The process time
+- **`execution_time: float`** → The execution time (seconds)

 **Note:** You can use `repr(ToolOutput)` to see details of your ToolOutput.

@@ -189,26 +190,6 @@ Use **TextTools** when you need to:

 ---

-## 🔍 Logging
-
-TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
-
-
-```python
-import logging
-
-# Default: warnings and errors only
-logging.basicConfig(level=logging.WARNING)
-
-# Debug everything (verbose)
-logging.basicConfig(level=logging.DEBUG)
-
-# Complete silence
-logging.basicConfig(level=logging.CRITICAL)
-```
-
----
-
 ## 📚 Batch Processing

 Process large datasets efficiently using OpenAI's batch API.
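
The README hunks above document the optional flags and the reshaped `ToolOutput`. The sketch below is illustrative only and not taken from the package: it assumes `t` is an initialized TextTools tool object as used in the package's own tests (its construction is not part of this diff), and it assumes the new `ToolOutputMetadata` fields are exposed on an attribute named `metadata`, which the diff does not show.

```python
# Illustrative sketch of the 1.1.21 flags and ToolOutput fields described above.
# Assumptions: `t` is an initialized TextTools tool object (construction not shown
# in this diff); the metadata attribute name (`metadata`) is a guess.

output = t.categorize(
    "Some input text",
    categories=["news", "sports", "other"],
    with_analysis=True,   # extra reasoning step (doubles token usage per call)
    logprobs=True,        # token-level probabilities, if the model supports them
    output_lang="English",
)

print(repr(output))        # repr() shows the full ToolOutput, per the README
print(output.result)       # the LLM output
print(output.analysis)     # reasoning step, present because with_analysis=True
print(output.errors)       # any errors that occurred while calling the LLM

# 1.1.21 groups tool_name / processed_at / execution_time under ToolOutputMetadata;
# the attribute exposing it is assumed to be `metadata` here.
print(output.metadata.tool_name, output.metadata.execution_time)
```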
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21/hamtaa_texttools.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hamtaa-texttools
-Version: 1.1.19
+Version: 1.1.21
 Summary: A high-level NLP toolkit built on top of modern LLMs.
 Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
 License: MIT License
@@ -50,7 +50,7 @@ It provides ready-to-use utilities for **translation, question detection, keywor
 TextTools provides a rich collection of high-level NLP utilities,
 Each tool is designed to work with structured outputs (JSON / Pydantic).

-- **`categorize()`** - Classifies text into given categories
+- **`categorize()`** - Classifies text into given categories
 - **`extract_keywords()`** - Extracts keywords from text
 - **`extract_entities()`** - Named Entity Recognition (NER) system
 - **`is_question()`** - Binary detection of whether input is a question
@@ -61,7 +61,7 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
 - **`summarize()`** - Text summarization
 - **`translate()`** - Text translation between languages
 - **`propositionize()`** - Convert text to atomic independence meaningful sentences
-- **`check_fact()`** - Check a statement is relevant to source text
+- **`check_fact()`** - Check whether a statement is relevant to the source text
 - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel

 ---
@@ -99,21 +99,21 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).

 TextTools provides several optional flags to customize LLM behavior:

-- **`with_analysis
+- **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
 **Note:** This doubles token usage per call because it triggers an additional LLM request.

-- **`logprobs
+- **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
 **Note:** This feature works if it's supported by the model.

-- **`output_lang
+- **`output_lang: str`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.

-- **`user_prompt
+- **`user_prompt: str`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.

-- **`temperature
+- **`temperature: float`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.

-- **`validator (
+- **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.

-- **`priority (
+- **`priority: int (Experimental)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
 **Note:** This feature works if it's supported by the model and vLLM.

 **Note:** There might be some tools that don't support some of the parameters above.
@@ -125,11 +125,12 @@ TextTools provides several optional flags to customize LLM behavior:
 Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
 - **`result: Any`** → The output of LLM
 - **`analysis: str`** → The reasoning step before generating the final output
-- **`logprobs: list`** → Token-level probabilities for the generated output
-- **`process: str`** → The tool name which processed the input
-- **`processed_at: datetime`** → The process time
-- **`execution_time: float`** → The execution time (seconds)
+- **`logprobs: list`** → Token-level probabilities for the generated output
 - **`errors: list[str]`** → Any error that have occured during calling LLM
+- **`ToolOutputMetadata`** →
+- **`tool_name: str`** → The tool name which processed the input
+- **`processed_at: datetime`** → The process time
+- **`execution_time: float`** → The execution time (seconds)

 **Note:** You can use `repr(ToolOutput)` to see details of your ToolOutput.

@@ -224,26 +225,6 @@ Use **TextTools** when you need to:

 ---

-## 🔍 Logging
-
-TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
-
-
-```python
-import logging
-
-# Default: warnings and errors only
-logging.basicConfig(level=logging.WARNING)
-
-# Debug everything (verbose)
-logging.basicConfig(level=logging.DEBUG)
-
-# Complete silence
-logging.basicConfig(level=logging.CRITICAL)
-```
-
----
-
 ## 📚 Batch Processing

 Process large datasets efficiently using OpenAI's batch API.
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/SOURCES.txt

@@ -12,16 +12,15 @@ tests/test_all_tools.py
 tests/test_output_validation.py
 texttools/__init__.py
 texttools/batch/batch_config.py
+texttools/batch/batch_manager.py
 texttools/batch/batch_runner.py
-texttools/batch/internals/batch_manager.py
-texttools/batch/internals/utils.py
 texttools/internals/async_operator.py
 texttools/internals/exceptions.py
-texttools/internals/formatters.py
 texttools/internals/models.py
 texttools/internals/operator_utils.py
 texttools/internals/prompt_loader.py
 texttools/internals/sync_operator.py
+texttools/internals/text_to_chunks.py
 texttools/prompts/README.md
 texttools/prompts/categorize.yaml
 texttools/prompts/check_fact.yaml
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_all_async_tools.py

@@ -24,12 +24,11 @@ async def main():
 "سلام حالت چطوره؟",
 categories=["هیچکدام", "دینی", "فلسفه"],
 logprobs=True,
-top_logprobs=-1,
 )
 keywords_task = t.extract_keywords("Tomorrow, we will be dead by the car crash")
 entities_task = t.extract_entities("We will be dead by the car crash")
 detection_task = t.is_question("We will be dead by the car crash")
-question_task = t.text_to_question("We will be dead by the car crash")
+question_task = t.text_to_question("We will be dead by the car crash", 2)
 merged_task = t.merge_questions(
 ["چرا ما موجوداتی اجتماعی هستیم؟", "چرا باید در کنار هم زندگی کنیم؟"],
 mode="default",
@@ -47,7 +46,10 @@ async def main():
 "جنگ جهانی دوم در سال ۱۹۳۹ آغاز شد و آلمان به لهستان حمله کرد.",
 output_lang="Persian",
 )
-check_fact_task = t.check_fact(
+check_fact_task = t.check_fact(
+text="امام نهم در ایران به خاک سپرده شد",
+source_text="حرم مطهر امام رضا علیه السلام در مشهد مقدس هست",
+)
 (
 category,
 keywords,
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_all_tools.py

@@ -28,19 +28,20 @@ category = t.categorize(
 print(repr(category))

 # Categorizer: tree mode
-tree = CategoryTree(
-tree.add_node("اخلاق")
-tree.add_node("معرفت شناسی")
-tree.add_node("متافیزیک", description="اراده قدرت در حیطه متافیزیک است")
-tree.add_node(
-
+tree = CategoryTree()
+tree.add_node("اخلاق", "root")
+tree.add_node("معرفت شناسی", "root")
+tree.add_node("متافیزیک", "root", description="اراده قدرت در حیطه متافیزیک است")
+tree.add_node(
+"فلسفه ذهن", "root", description="فلسفه ذهن به چگونگی درک ما از جهان می پردازد"
+)
+tree.add_node("آگاهی", "فلسفه ذهن")
 tree.add_node("ذهن و بدن", "فلسفه ذهن")
 tree.add_node("امکان و ضرورت", "متافیزیک")

 categories = t.categorize(
 "اراده قدرت مفهومی مهم در مابعد الطبیعه است که توسط نیچه مطرح شده",
 tree,
-mode="category_tree",
 )
 print(repr(categories))

@@ -51,7 +52,12 @@ keywords = t.extract_keywords(
 print(repr(keywords))

 # NER Extractor
-entities = t.extract_entities(
+entities = t.extract_entities(
+"Ali will be dead by the car crash",
+entities=["EVENT"],
+with_analysis=True,
+logprobs=True,
+)
 print(repr(entities))


@@ -60,7 +66,7 @@ detection = t.is_question("We will be dead by the car crash")
 print(repr(detection))

 # Question from Text Generator
-question = t.text_to_question("We will be dead by the car crash")
+question = t.text_to_question("We will be dead by the car crash", 2)
 print(repr(question))

 # Question Merger
@@ -77,7 +83,7 @@ rewritten = t.rewrite(
 )
 print(repr(rewritten))

-# Question
+# Question from Subject Generator
 questions = t.subject_to_question("Friendship", 3)
 print(repr(questions))

@@ -89,14 +95,14 @@ print(repr(summary))
 translation = t.translate("سلام حالت چطوره؟", target_language="English")
 print(repr(translation))

-#
+# Propositionizer
 propositionize = t.propositionize(
 "جنگ جهانی دوم در سال ۱۹۳۹ آغاز شد و آلمان به لهستان حمله کرد.",
 output_lang="Persian",
 )
 print(repr(propositionize))

-#
+# Check Fact
 check_fact = t.check_fact(
 text="امام نهم در ایران به خاک سپرده شد",
 source_text="حرم مطهر امام رضا علیه السلام در مشهد مقدس هست",
@@ -104,7 +110,7 @@ check_fact = t.check_fact(
 print(repr(check_fact))


-# Custom
+# Run Custom
 class Student(BaseModel):
 result: list[dict[str, str]]

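
The test_all_tools.py hunks above show the reworked tree-mode API: `CategoryTree()` is now constructed without arguments, `add_node()` takes an explicit parent (with `"root"` for top-level categories), and `categorize(text, tree)` no longer passes `mode="category_tree"`. A minimal sketch with placeholder category names follows, assuming `CategoryTree` and `t` are set up as in the package's tests (their imports and construction are outside the hunks shown).

```python
# Sketch of the 1.1.21 tree-mode categorization flow seen in the updated test.
# English node names are placeholders; `t` and the CategoryTree import are assumed.

tree = CategoryTree()                                  # no constructor arguments
tree.add_node("ethics", "root")                        # parent passed explicitly
tree.add_node("metaphysics", "root", description="hint the model can use")
tree.add_node("philosophy of mind", "root")
tree.add_node("consciousness", "philosophy of mind")   # nested under an existing node

categories = t.categorize("some philosophical text", tree)  # mode argument dropped
print(repr(categories))
```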
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/batch/batch_config.py

@@ -1,7 +1,20 @@
 from dataclasses import dataclass
 from collections.abc import Callable

-
+
+def export_data(data) -> list[dict[str, str]]:
+"""
+Produces a structure of the following form from an initial data structure:
+[{"id": str, "text": str},...]
+"""
+return data
+
+
+def import_data(data) -> object:
+"""
+Takes the output and adds and aggregates it to the original structure.
+"""
+return data


 @dataclass
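
The new module-level `export_data` / `import_data` defaults in batch_config.py document the shape the batch runner works with: `export_data` must yield `[{"id": str, "text": str}, ...]` and `import_data` folds the results back into the caller's structure. A hypothetical pair of replacements under that contract is sketched below; the `pk` / `body` record keys are invented for illustration and are not part of the package.

```python
# Hypothetical custom hooks following the contract documented in batch_config.py.
# The input record keys ("pk", "body") are invented for this example.

def export_data(records: list[dict]) -> list[dict[str, str]]:
    """Flatten arbitrary records into the [{"id": str, "text": str}, ...] shape."""
    return [{"id": str(rec["pk"]), "text": rec["body"]} for rec in records]


def import_data(results) -> object:
    """Merge batch results back into the original structure (caller-defined)."""
    return results
```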
{hamtaa_texttools-1.1.19/texttools/batch/internals → hamtaa_texttools-1.1.21/texttools/batch}/batch_manager.py

@@ -1,7 +1,7 @@
 import json
 import uuid
 from pathlib import Path
-from typing import
+from typing import Type, TypeVar
 import logging

 from pydantic import BaseModel
@@ -31,7 +31,7 @@ class BatchManager:
 prompt_template: str,
 state_dir: Path = Path(".batch_jobs"),
 custom_json_schema_obj_str: dict | None = None,
-**client_kwargs:
+**client_kwargs: object,
 ):
 self._client = client
 self._model = model
@@ -51,7 +51,7 @@ class BatchManager:
 def _state_file(self, job_name: str) -> Path:
 return self._state_dir / f"{job_name}.json"

-def _load_state(self, job_name: str) -> list[dict[str,
+def _load_state(self, job_name: str) -> list[dict[str, object]]:
 """
 Loads the state (job information) from the state file for the given job name.
 Returns an empty list if the state file does not exist.
@@ -62,7 +62,7 @@ class BatchManager:
 return json.load(f)
 return []

-def _save_state(self, job_name: str, jobs: list[dict[str,
+def _save_state(self, job_name: str, jobs: list[dict[str, object]]) -> None:
 """
 Saves the job state to the state file for the given job name.
 """
@@ -77,11 +77,11 @@ class BatchManager:
 if path.exists():
 path.unlink()

-def _build_task(self, text: str, idx: str) -> dict[str,
+def _build_task(self, text: str, idx: str) -> dict[str, object]:
 """
 Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
 """
-response_format_config: dict[str,
+response_format_config: dict[str, object]

 if self._custom_json_schema_obj_str:
 response_format_config = {
{hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/batch/batch_runner.py

@@ -2,16 +2,16 @@ import json
 import os
 import time
 from pathlib import Path
-from typing import
+from typing import Type, TypeVar
 import logging

 from dotenv import load_dotenv
 from openai import OpenAI
 from pydantic import BaseModel

-from texttools.batch.
+from texttools.batch.batch_manager import BatchManager
 from texttools.batch.batch_config import BatchConfig
-from texttools.internals.models import
+from texttools.internals.models import Str
 from texttools.internals.exceptions import TextToolsError, ConfigurationError

 # Base Model type for output models
@@ -26,7 +26,7 @@ class BatchJobRunner:
 """

 def __init__(
-self, config: BatchConfig = BatchConfig(), output_model: Type[T] =
+self, config: BatchConfig = BatchConfig(), output_model: Type[T] = Str
 ):
 try:
 self._config = config
@@ -38,7 +38,7 @@ class BatchJobRunner:
 self._output_model = output_model
 self._manager = self._init_manager()
 self._data = self._load_data()
-self._parts: list[list[dict[str,
+self._parts: list[list[dict[str, object]]] = []
 # Map part index to job name
 self._part_idx_to_job_name: dict[int, str] = {}
 # Track retry attempts per part
@@ -130,8 +130,8 @@ class BatchJobRunner:

 def _save_results(
 self,
-output_data: list[dict[str,
-log: list[
+output_data: list[dict[str, object]] | dict[str, object],
+log: list[object],
 part_idx: int,
 ):
 part_suffix = f"_part_{part_idx + 1}" if len(self._parts) > 1 else ""