hamtaa-texttools 1.1.19__tar.gz → 1.1.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {hamtaa_texttools-1.1.19/hamtaa_texttools.egg-info → hamtaa_texttools-1.1.21}/PKG-INFO +15 -34
  2. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/README.md +14 -33
  3. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21/hamtaa_texttools.egg-info}/PKG-INFO +15 -34
  4. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/SOURCES.txt +2 -3
  5. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/pyproject.toml +1 -1
  6. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_all_async_tools.py +5 -3
  7. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_all_tools.py +19 -13
  8. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/batch/batch_config.py +14 -1
  9. {hamtaa_texttools-1.1.19/texttools/batch/internals → hamtaa_texttools-1.1.21/texttools/batch}/batch_manager.py +6 -6
  10. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/batch/batch_runner.py +7 -7
  11. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/async_operator.py +48 -84
  12. hamtaa_texttools-1.1.21/texttools/internals/models.py +150 -0
  13. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/operator_utils.py +2 -2
  14. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/prompt_loader.py +3 -20
  15. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/sync_operator.py +47 -83
  16. hamtaa_texttools-1.1.21/texttools/internals/text_to_chunks.py +97 -0
  17. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/README.md +2 -2
  18. hamtaa_texttools-1.1.21/texttools/prompts/categorize.yaml +35 -0
  19. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/check_fact.yaml +2 -2
  20. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/extract_entities.yaml +3 -3
  21. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/extract_keywords.yaml +6 -6
  22. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/is_question.yaml +2 -2
  23. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/merge_questions.yaml +4 -4
  24. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/propositionize.yaml +2 -2
  25. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/rewrite.yaml +6 -6
  26. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/run_custom.yaml +1 -1
  27. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/subject_to_question.yaml +2 -2
  28. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/summarize.yaml +2 -2
  29. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/text_to_question.yaml +8 -6
  30. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/prompts/translate.yaml +2 -2
  31. hamtaa_texttools-1.1.21/texttools/tools/async_tools.py +1106 -0
  32. hamtaa_texttools-1.1.21/texttools/tools/sync_tools.py +1106 -0
  33. hamtaa_texttools-1.1.19/texttools/batch/internals/utils.py +0 -16
  34. hamtaa_texttools-1.1.19/texttools/internals/formatters.py +0 -24
  35. hamtaa_texttools-1.1.19/texttools/internals/models.py +0 -190
  36. hamtaa_texttools-1.1.19/texttools/prompts/categorize.yaml +0 -77
  37. hamtaa_texttools-1.1.19/texttools/tools/async_tools.py +0 -1128
  38. hamtaa_texttools-1.1.19/texttools/tools/sync_tools.py +0 -1128
  39. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/LICENSE +0 -0
  40. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/MANIFEST.in +0 -0
  41. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
  42. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/requires.txt +0 -0
  43. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/hamtaa_texttools.egg-info/top_level.txt +0 -0
  44. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/setup.cfg +0 -0
  45. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/tests/test_output_validation.py +0 -0
  46. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/__init__.py +0 -0
  47. {hamtaa_texttools-1.1.19 → hamtaa_texttools-1.1.21}/texttools/internals/exceptions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hamtaa-texttools
3
- Version: 1.1.19
3
+ Version: 1.1.21
4
4
  Summary: A high-level NLP toolkit built on top of modern LLMs.
5
5
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
6
6
  License: MIT License
@@ -50,7 +50,7 @@ It provides ready-to-use utilities for **translation, question detection, keywor
50
50
  TextTools provides a rich collection of high-level NLP utilities,
51
51
  Each tool is designed to work with structured outputs (JSON / Pydantic).
52
52
 
53
- - **`categorize()`** - Classifies text into given categories (You have to create a category tree)
53
+ - **`categorize()`** - Classifies text into given categories
54
54
  - **`extract_keywords()`** - Extracts keywords from text
55
55
  - **`extract_entities()`** - Named Entity Recognition (NER) system
56
56
  - **`is_question()`** - Binary detection of whether input is a question
@@ -61,7 +61,7 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
61
61
  - **`summarize()`** - Text summarization
62
62
  - **`translate()`** - Text translation between languages
63
63
  - **`propositionize()`** - Converts text into atomic, independent, meaningful sentences
64
- - **`check_fact()`** - Check a statement is relevant to source text or not
64
+ - **`check_fact()`** - Check whether a statement is relevant to the source text
65
65
  - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
66
66
 
67
67
  ---
@@ -99,21 +99,21 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
99
99
 
100
100
  TextTools provides several optional flags to customize LLM behavior:
101
101
 
102
- - **`with_analysis (bool)`** → Adds a reasoning step before generating the final output.
102
+ - **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
103
103
  **Note:** This doubles token usage per call because it triggers an additional LLM request.
104
104
 
105
- - **`logprobs (bool)`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
105
+ - **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
106
106
  **Note:** This feature works if it's supported by the model.
107
107
 
108
- - **`output_lang (str)`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.
108
+ - **`output_lang: str`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.
109
109
 
110
- - **`user_prompt (str)`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
110
+ - **`user_prompt: str`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
111
111
 
112
- - **`temperature (float)`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.
112
+ - **`temperature: float`** → Controls how creative the model's responses are. Takes a float from `0.0` to `2.0`.
113
113
 
114
- - **`validator (Callable)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.
114
+ - **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.
115
115
 
116
- - **`priority (int)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
116
+ - **`priority: int (Experimental)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
117
117
  **Note:** This feature works if it's supported by the model and vLLM.
118
118
 
119
119
  **Note:** There might be some tools that don't support some of the parameters above.
@@ -125,11 +125,12 @@ TextTools provides several optional flags to customize LLM behavior:
125
125
  Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
126
126
  - **`result: Any`** → The output of LLM
127
127
  - **`analysis: str`** → The reasoning step before generating the final output
128
- - **`logprobs: list`** → Token-level probabilities for the generated output
129
- - **`process: str`** → The tool name which processed the input
130
- - **`processed_at: datetime`** → The process time
131
- - **`execution_time: float`** → The execution time (seconds)
128
+ - **`logprobs: list`** → Token-level probabilities for the generated output
132
129
  - **`errors: list[str]`** → Any errors that occurred while calling the LLM
130
+ - **`ToolOutputMetadata`** →
131
+ - **`tool_name: str`** → The tool name which processed the input
132
+ - **`processed_at: datetime`** → The process time
133
+ - **`execution_time: float`** → The execution time (seconds)
133
134
 
134
135
  **Note:** You can use `repr(ToolOutput)` to see details of your ToolOutput.
135
136
 
@@ -224,26 +225,6 @@ Use **TextTools** when you need to:
224
225
 
225
226
  ---
226
227
 
227
- ## 🔍 Logging
228
-
229
- TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
230
-
231
-
232
- ```python
233
- import logging
234
-
235
- # Default: warnings and errors only
236
- logging.basicConfig(level=logging.WARNING)
237
-
238
- # Debug everything (verbose)
239
- logging.basicConfig(level=logging.DEBUG)
240
-
241
- # Complete silence
242
- logging.basicConfig(level=logging.CRITICAL)
243
- ```
244
-
245
- ---
246
-
247
228
  ## 📚 Batch Processing
248
229
 
249
230
  Process large datasets efficiently using OpenAI's batch API.
@@ -15,7 +15,7 @@ It provides ready-to-use utilities for **translation, question detection, keywor
15
15
  TextTools provides a rich collection of high-level NLP utilities,
16
16
  Each tool is designed to work with structured outputs (JSON / Pydantic).
17
17
 
18
- - **`categorize()`** - Classifies text into given categories (You have to create a category tree)
18
+ - **`categorize()`** - Classifies text into given categories
19
19
  - **`extract_keywords()`** - Extracts keywords from text
20
20
  - **`extract_entities()`** - Named Entity Recognition (NER) system
21
21
  - **`is_question()`** - Binary detection of whether input is a question
@@ -26,7 +26,7 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
26
26
  - **`summarize()`** - Text summarization
27
27
  - **`translate()`** - Text translation between languages
28
28
  - **`propositionize()`** - Converts text into atomic, independent, meaningful sentences
29
- - **`check_fact()`** - Check a statement is relevant to source text or not
29
+ - **`check_fact()`** - Check whether a statement is relevant to the source text
30
30
  - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
31
31
 
32
32
  ---
@@ -64,21 +64,21 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
64
64
 
65
65
  TextTools provides several optional flags to customize LLM behavior:
66
66
 
67
- - **`with_analysis (bool)`** → Adds a reasoning step before generating the final output.
67
+ - **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
68
68
  **Note:** This doubles token usage per call because it triggers an additional LLM request.
69
69
 
70
- - **`logprobs (bool)`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
70
+ - **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
71
71
  **Note:** This feature works if it's supported by the model.
72
72
 
73
- - **`output_lang (str)`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.
73
+ - **`output_lang: str`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.
74
74
 
75
- - **`user_prompt (str)`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
75
+ - **`user_prompt: str`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
76
76
 
77
- - **`temperature (float)`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.
77
+ - **`temperature: float`** → Controls how creative the model's responses are. Takes a float from `0.0` to `2.0`.
78
78
 
79
- - **`validator (Callable)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.
79
+ - **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.
80
80
 
81
- - **`priority (int)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
81
+ - **`priority: int (Experimental)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
82
82
  **Note:** This feature works if it's supported by the model and vLLM.
83
83
 
84
84
  **Note:** There might be some tools that don't support some of the parameters above.
@@ -90,11 +90,12 @@ TextTools provides several optional flags to customize LLM behavior:
90
90
  Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
91
91
  - **`result: Any`** → The output of LLM
92
92
  - **`analysis: str`** → The reasoning step before generating the final output
93
- - **`logprobs: list`** → Token-level probabilities for the generated output
94
- - **`process: str`** → The tool name which processed the input
95
- - **`processed_at: datetime`** → The process time
96
- - **`execution_time: float`** → The execution time (seconds)
93
+ - **`logprobs: list`** → Token-level probabilities for the generated output
97
94
  - **`errors: list[str]`** → Any errors that occurred while calling the LLM
95
+ - **`ToolOutputMetadata`** →
96
+ - **`tool_name: str`** → The tool name which processed the input
97
+ - **`processed_at: datetime`** → The process time
98
+ - **`execution_time: float`** → The execution time (seconds)
98
99
 
99
100
  **Note:** You can use `repr(ToolOutput)` to see details of your ToolOutput.
100
101
 
@@ -189,26 +190,6 @@ Use **TextTools** when you need to:
189
190
 
190
191
  ---
191
192
 
192
- ## 🔍 Logging
193
-
194
- TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
195
-
196
-
197
- ```python
198
- import logging
199
-
200
- # Default: warnings and errors only
201
- logging.basicConfig(level=logging.WARNING)
202
-
203
- # Debug everything (verbose)
204
- logging.basicConfig(level=logging.DEBUG)
205
-
206
- # Complete silence
207
- logging.basicConfig(level=logging.CRITICAL)
208
- ```
209
-
210
- ---
211
-
212
193
  ## 📚 Batch Processing
213
194
 
214
195
  Process large datasets efficiently using OpenAI's batch API.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hamtaa-texttools
3
- Version: 1.1.19
3
+ Version: 1.1.21
4
4
  Summary: A high-level NLP toolkit built on top of modern LLMs.
5
5
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>, Zareshahi <a.zareshahi1377@gmail.com>
6
6
  License: MIT License
@@ -50,7 +50,7 @@ It provides ready-to-use utilities for **translation, question detection, keywor
50
50
  TextTools provides a rich collection of high-level NLP utilities,
51
51
  Each tool is designed to work with structured outputs (JSON / Pydantic).
52
52
 
53
- - **`categorize()`** - Classifies text into given categories (You have to create a category tree)
53
+ - **`categorize()`** - Classifies text into given categories
54
54
  - **`extract_keywords()`** - Extracts keywords from text
55
55
  - **`extract_entities()`** - Named Entity Recognition (NER) system
56
56
  - **`is_question()`** - Binary detection of whether input is a question
@@ -61,7 +61,7 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
61
61
  - **`summarize()`** - Text summarization
62
62
  - **`translate()`** - Text translation between languages
63
63
  - **`propositionize()`** - Converts text into atomic, independent, meaningful sentences
64
- - **`check_fact()`** - Check a statement is relevant to source text or not
64
+ - **`check_fact()`** - Check whether a statement is relevant to the source text
65
65
  - **`run_custom()`** - Allows users to define a custom tool with an arbitrary BaseModel
66
66
 
67
67
  ---
@@ -99,21 +99,21 @@ Each tool is designed to work with structured outputs (JSON / Pydantic).
99
99
 
100
100
  TextTools provides several optional flags to customize LLM behavior:
101
101
 
102
- - **`with_analysis (bool)`** → Adds a reasoning step before generating the final output.
102
+ - **`with_analysis: bool`** → Adds a reasoning step before generating the final output.
103
103
  **Note:** This doubles token usage per call because it triggers an additional LLM request.
104
104
 
105
- - **`logprobs (bool)`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
105
+ - **`logprobs: bool`** → Returns token-level probabilities for the generated output. You can also specify `top_logprobs=<N>` to get the top N alternative tokens and their probabilities.
106
106
  **Note:** This feature works if it's supported by the model.
107
107
 
108
- - **`output_lang (str)`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.
108
+ - **`output_lang: str`** → Forces the model to respond in a specific language. The model will ignore other instructions about language and respond strictly in the requested language.
109
109
 
110
- - **`user_prompt (str)`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
110
+ - **`user_prompt: str`** → Allows you to inject a custom instruction or prompt into the model alongside the main template. This gives you fine-grained control over how the model interprets or modifies the input text.
111
111
 
112
- - **`temperature (float)`** → Determines how creative the model should respond. Takes a float number from `0.0` to `2.0`.
112
+ - **`temperature: float`** → Controls how creative the model's responses are. Takes a float from `0.0` to `2.0`.
113
113
 
114
- - **`validator (Callable)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.
114
+ - **`validator: Callable (Experimental)`** → Forces TheTool to validate the output result based on your custom validator. Validator should return a bool (True if there were no problem, False if the validation fails.) If the validator fails, TheTool will retry to get another output by modifying `temperature`. You can specify `max_validation_retries=<N>` to change the number of retries.
115
115
 
116
- - **`priority (int)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
116
+ - **`priority: int (Experimental)`** → Task execution priority level. Higher values = higher priority. Affects processing order in queues.
117
117
  **Note:** This feature works if it's supported by the model and vLLM.
118
118
 
119
119
  **Note:** There might be some tools that don't support some of the parameters above.
@@ -125,11 +125,12 @@ TextTools provides several optional flags to customize LLM behavior:
125
125
  Every tool of `TextTools` returns a `ToolOutput` object which is a BaseModel with attributes:
126
126
  - **`result: Any`** → The output of LLM
127
127
  - **`analysis: str`** → The reasoning step before generating the final output
128
- - **`logprobs: list`** → Token-level probabilities for the generated output
129
- - **`process: str`** → The tool name which processed the input
130
- - **`processed_at: datetime`** → The process time
131
- - **`execution_time: float`** → The execution time (seconds)
128
+ - **`logprobs: list`** → Token-level probabilities for the generated output
132
129
  - **`errors: list[str]`** → Any errors that occurred while calling the LLM
130
+ - **`ToolOutputMetadata`** →
131
+ - **`tool_name: str`** → The tool name which processed the input
132
+ - **`processed_at: datetime`** → The process time
133
+ - **`execution_time: float`** → The execution time (seconds)
133
134
 
134
135
  **Note:** You can use `repr(ToolOutput)` to see details of your ToolOutput.
135
136
 
@@ -224,26 +225,6 @@ Use **TextTools** when you need to:
224
225
 
225
226
  ---
226
227
 
227
- ## 🔍 Logging
228
-
229
- TextTools uses Python's standard `logging` module. The library's default logger level is `WARNING`, so if you want to modify it, follow instructions:
230
-
231
-
232
- ```python
233
- import logging
234
-
235
- # Default: warnings and errors only
236
- logging.basicConfig(level=logging.WARNING)
237
-
238
- # Debug everything (verbose)
239
- logging.basicConfig(level=logging.DEBUG)
240
-
241
- # Complete silence
242
- logging.basicConfig(level=logging.CRITICAL)
243
- ```
244
-
245
- ---
246
-
247
228
  ## 📚 Batch Processing
248
229
 
249
230
  Process large datasets efficiently using OpenAI's batch API.
@@ -12,16 +12,15 @@ tests/test_all_tools.py
12
12
  tests/test_output_validation.py
13
13
  texttools/__init__.py
14
14
  texttools/batch/batch_config.py
15
+ texttools/batch/batch_manager.py
15
16
  texttools/batch/batch_runner.py
16
- texttools/batch/internals/batch_manager.py
17
- texttools/batch/internals/utils.py
18
17
  texttools/internals/async_operator.py
19
18
  texttools/internals/exceptions.py
20
- texttools/internals/formatters.py
21
19
  texttools/internals/models.py
22
20
  texttools/internals/operator_utils.py
23
21
  texttools/internals/prompt_loader.py
24
22
  texttools/internals/sync_operator.py
23
+ texttools/internals/text_to_chunks.py
25
24
  texttools/prompts/README.md
26
25
  texttools/prompts/categorize.yaml
27
26
  texttools/prompts/check_fact.yaml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "hamtaa-texttools"
7
- version = "1.1.19"
7
+ version = "1.1.21"
8
8
  authors = [
9
9
  { name = "Tohidi", email = "the.mohammad.tohidi@gmail.com" },
10
10
  { name = "Montazer", email = "montazerh82@gmail.com" },
@@ -24,12 +24,11 @@ async def main():
24
24
  "سلام حالت چطوره؟",
25
25
  categories=["هیچکدام", "دینی", "فلسفه"],
26
26
  logprobs=True,
27
- top_logprobs=-1,
28
27
  )
29
28
  keywords_task = t.extract_keywords("Tomorrow, we will be dead by the car crash")
30
29
  entities_task = t.extract_entities("We will be dead by the car crash")
31
30
  detection_task = t.is_question("We will be dead by the car crash")
32
- question_task = t.text_to_question("We will be dead by the car crash")
31
+ question_task = t.text_to_question("We will be dead by the car crash", 2)
33
32
  merged_task = t.merge_questions(
34
33
  ["چرا ما موجوداتی اجتماعی هستیم؟", "چرا باید در کنار هم زندگی کنیم؟"],
35
34
  mode="default",
@@ -47,7 +46,10 @@ async def main():
47
46
  "جنگ جهانی دوم در سال ۱۹۳۹ آغاز شد و آلمان به لهستان حمله کرد.",
48
47
  output_lang="Persian",
49
48
  )
50
- check_fact_task = t.check_fact(text="امام نهم در ایران به خاک سپرده شد", source_text="حرم مطهر امام رضا علیه السلام در مشهد مقدس هست")
49
+ check_fact_task = t.check_fact(
50
+ text="امام نهم در ایران به خاک سپرده شد",
51
+ source_text="حرم مطهر امام رضا علیه السلام در مشهد مقدس هست",
52
+ )
51
53
  (
52
54
  category,
53
55
  keywords,
@@ -28,19 +28,20 @@ category = t.categorize(
28
28
  print(repr(category))
29
29
 
30
30
  # Categorizer: tree mode
31
- tree = CategoryTree("category_test_tree")
32
- tree.add_node("اخلاق")
33
- tree.add_node("معرفت شناسی")
34
- tree.add_node("متافیزیک", description="اراده قدرت در حیطه متافیزیک است")
35
- tree.add_node("فلسفه ذهن", description="فلسفه ذهن به چگونگی درک ما از جهان می پردازد")
36
- tree.add_node("آگاهی", "فلسفه ذهن", description="آگاهی خیلی مهم است")
31
+ tree = CategoryTree()
32
+ tree.add_node("اخلاق", "root")
33
+ tree.add_node("معرفت شناسی", "root")
34
+ tree.add_node("متافیزیک", "root", description="اراده قدرت در حیطه متافیزیک است")
35
+ tree.add_node(
36
+ "فلسفه ذهن", "root", description="فلسفه ذهن به چگونگی درک ما از جهان می پردازد"
37
+ )
38
+ tree.add_node("آگاهی", "فلسفه ذهن")
37
39
  tree.add_node("ذهن و بدن", "فلسفه ذهن")
38
40
  tree.add_node("امکان و ضرورت", "متافیزیک")
39
41
 
40
42
  categories = t.categorize(
41
43
  "اراده قدرت مفهومی مهم در مابعد الطبیعه است که توسط نیچه مطرح شده",
42
44
  tree,
43
- mode="category_tree",
44
45
  )
45
46
  print(repr(categories))
46
47
 
@@ -51,7 +52,12 @@ keywords = t.extract_keywords(
51
52
  print(repr(keywords))
52
53
 
53
54
  # NER Extractor
54
- entities = t.extract_entities("We will be dead by the car crash", with_analysis=True)
55
+ entities = t.extract_entities(
56
+ "Ali will be dead by the car crash",
57
+ entities=["EVENT"],
58
+ with_analysis=True,
59
+ logprobs=True,
60
+ )
55
61
  print(repr(entities))
56
62
 
57
63
 
@@ -60,7 +66,7 @@ detection = t.is_question("We will be dead by the car crash")
60
66
  print(repr(detection))
61
67
 
62
68
  # Question from Text Generator
63
- question = t.text_to_question("We will be dead by the car crash")
69
+ question = t.text_to_question("We will be dead by the car crash", 2)
64
70
  print(repr(question))
65
71
 
66
72
  # Question Merger
@@ -77,7 +83,7 @@ rewritten = t.rewrite(
77
83
  )
78
84
  print(repr(rewritten))
79
85
 
80
- # Question Generator from Subject
86
+ # Question from Subject Generator
81
87
  questions = t.subject_to_question("Friendship", 3)
82
88
  print(repr(questions))
83
89
 
@@ -89,14 +95,14 @@ print(repr(summary))
89
95
  translation = t.translate("سلام حالت چطوره؟", target_language="English")
90
96
  print(repr(translation))
91
97
 
92
- # propositionize
98
+ # Propositionizer
93
99
  propositionize = t.propositionize(
94
100
  "جنگ جهانی دوم در سال ۱۹۳۹ آغاز شد و آلمان به لهستان حمله کرد.",
95
101
  output_lang="Persian",
96
102
  )
97
103
  print(repr(propositionize))
98
104
 
99
- # check_fact
105
+ # Check Fact
100
106
  check_fact = t.check_fact(
101
107
  text="امام نهم در ایران به خاک سپرده شد",
102
108
  source_text="حرم مطهر امام رضا علیه السلام در مشهد مقدس هست",
@@ -104,7 +110,7 @@ check_fact = t.check_fact(
104
110
  print(repr(check_fact))
105
111
 
106
112
 
107
- # Custom tool
113
+ # Run Custom
108
114
  class Student(BaseModel):
109
115
  result: list[dict[str, str]]
110
116
 
@@ -1,7 +1,20 @@
1
1
  from dataclasses import dataclass
2
2
  from collections.abc import Callable
3
3
 
4
- from texttools.batch.internals.utils import import_data, export_data
4
+
5
+ def export_data(data) -> list[dict[str, str]]:
6
+ """
7
+ Produces a structure of the following form from an initial data structure:
8
+ [{"id": str, "text": str},...]
9
+ """
10
+ return data
11
+
12
+
13
+ def import_data(data) -> object:
14
+ """
15
+ Takes the output and adds and aggregates it to the original structure.
16
+ """
17
+ return data
5
18
 
6
19
 
7
20
  @dataclass
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  import uuid
3
3
  from pathlib import Path
4
- from typing import Any, Type, TypeVar
4
+ from typing import Type, TypeVar
5
5
  import logging
6
6
 
7
7
  from pydantic import BaseModel
@@ -31,7 +31,7 @@ class BatchManager:
31
31
  prompt_template: str,
32
32
  state_dir: Path = Path(".batch_jobs"),
33
33
  custom_json_schema_obj_str: dict | None = None,
34
- **client_kwargs: Any,
34
+ **client_kwargs: object,
35
35
  ):
36
36
  self._client = client
37
37
  self._model = model
@@ -51,7 +51,7 @@ class BatchManager:
51
51
  def _state_file(self, job_name: str) -> Path:
52
52
  return self._state_dir / f"{job_name}.json"
53
53
 
54
- def _load_state(self, job_name: str) -> list[dict[str, Any]]:
54
+ def _load_state(self, job_name: str) -> list[dict[str, object]]:
55
55
  """
56
56
  Loads the state (job information) from the state file for the given job name.
57
57
  Returns an empty list if the state file does not exist.
@@ -62,7 +62,7 @@ class BatchManager:
62
62
  return json.load(f)
63
63
  return []
64
64
 
65
- def _save_state(self, job_name: str, jobs: list[dict[str, Any]]) -> None:
65
+ def _save_state(self, job_name: str, jobs: list[dict[str, object]]) -> None:
66
66
  """
67
67
  Saves the job state to the state file for the given job name.
68
68
  """
@@ -77,11 +77,11 @@ class BatchManager:
77
77
  if path.exists():
78
78
  path.unlink()
79
79
 
80
- def _build_task(self, text: str, idx: str) -> dict[str, Any]:
80
+ def _build_task(self, text: str, idx: str) -> dict[str, object]:
81
81
  """
82
82
  Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
83
83
  """
84
- response_format_config: dict[str, Any]
84
+ response_format_config: dict[str, object]
85
85
 
86
86
  if self._custom_json_schema_obj_str:
87
87
  response_format_config = {
@@ -2,16 +2,16 @@ import json
2
2
  import os
3
3
  import time
4
4
  from pathlib import Path
5
- from typing import Any, Type, TypeVar
5
+ from typing import Type, TypeVar
6
6
  import logging
7
7
 
8
8
  from dotenv import load_dotenv
9
9
  from openai import OpenAI
10
10
  from pydantic import BaseModel
11
11
 
12
- from texttools.batch.internals.batch_manager import BatchManager
12
+ from texttools.batch.batch_manager import BatchManager
13
13
  from texttools.batch.batch_config import BatchConfig
14
- from texttools.internals.models import StrOutput
14
+ from texttools.internals.models import Str
15
15
  from texttools.internals.exceptions import TextToolsError, ConfigurationError
16
16
 
17
17
  # Base Model type for output models
@@ -26,7 +26,7 @@ class BatchJobRunner:
26
26
  """
27
27
 
28
28
  def __init__(
29
- self, config: BatchConfig = BatchConfig(), output_model: Type[T] = StrOutput
29
+ self, config: BatchConfig = BatchConfig(), output_model: Type[T] = Str
30
30
  ):
31
31
  try:
32
32
  self._config = config
@@ -38,7 +38,7 @@ class BatchJobRunner:
38
38
  self._output_model = output_model
39
39
  self._manager = self._init_manager()
40
40
  self._data = self._load_data()
41
- self._parts: list[list[dict[str, Any]]] = []
41
+ self._parts: list[list[dict[str, object]]] = []
42
42
  # Map part index to job name
43
43
  self._part_idx_to_job_name: dict[int, str] = {}
44
44
  # Track retry attempts per part
@@ -130,8 +130,8 @@ class BatchJobRunner:
130
130
 
131
131
  def _save_results(
132
132
  self,
133
- output_data: list[dict[str, Any]] | dict[str, Any],
134
- log: list[Any],
133
+ output_data: list[dict[str, object]] | dict[str, object],
134
+ log: list[object],
135
135
  part_idx: int,
136
136
  ):
137
137
  part_suffix = f"_part_{part_idx + 1}" if len(self._parts) > 1 else ""