hamtaa-texttools 1.1.1__py3-none-any.whl → 1.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/METADATA +98 -26
- hamtaa_texttools-1.1.16.dist-info/RECORD +31 -0
- texttools/__init__.py +6 -8
- texttools/batch/batch_config.py +26 -0
- texttools/batch/batch_runner.py +105 -151
- texttools/batch/{batch_manager.py → internals/batch_manager.py} +39 -40
- texttools/batch/internals/utils.py +16 -0
- texttools/prompts/README.md +4 -4
- texttools/prompts/categorize.yaml +77 -0
- texttools/prompts/detect_entity.yaml +22 -0
- texttools/prompts/extract_keywords.yaml +68 -18
- texttools/tools/async_tools.py +804 -0
- texttools/tools/internals/async_operator.py +90 -69
- texttools/tools/internals/models.py +183 -0
- texttools/tools/internals/operator_utils.py +54 -0
- texttools/tools/internals/prompt_loader.py +13 -14
- texttools/tools/internals/sync_operator.py +201 -0
- texttools/tools/sync_tools.py +804 -0
- hamtaa_texttools-1.1.1.dist-info/RECORD +0 -30
- texttools/batch/__init__.py +0 -4
- texttools/prompts/categorizer.yaml +0 -28
- texttools/tools/__init__.py +0 -4
- texttools/tools/async_the_tool.py +0 -414
- texttools/tools/internals/base_operator.py +0 -91
- texttools/tools/internals/operator.py +0 -179
- texttools/tools/internals/output_models.py +0 -59
- texttools/tools/the_tool.py +0 -412
- {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/top_level.txt +0 -0
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import uuid
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any, Type
|
|
4
|
+
from typing import Any, Type, TypeVar
|
|
5
5
|
import logging
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
from openai import OpenAI
|
|
9
9
|
from openai.lib._pydantic import to_strict_json_schema
|
|
10
10
|
|
|
11
|
-
#
|
|
12
|
-
|
|
13
|
-
logger.setLevel(logging.INFO)
|
|
11
|
+
# Base Model type for output models
|
|
12
|
+
T = TypeVar("T", bound=BaseModel)
|
|
14
13
|
|
|
14
|
+
logger = logging.getLogger("texttools.batch_manager")
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
|
|
17
|
+
class BatchManager:
|
|
17
18
|
"""
|
|
18
19
|
Manages batch processing jobs for OpenAI's chat completions with structured outputs.
|
|
19
20
|
|
|
@@ -26,30 +27,29 @@ class SimpleBatchManager:
|
|
|
26
27
|
self,
|
|
27
28
|
client: OpenAI,
|
|
28
29
|
model: str,
|
|
29
|
-
output_model: Type[
|
|
30
|
+
output_model: Type[T],
|
|
30
31
|
prompt_template: str,
|
|
31
|
-
handlers: list[Any] | None = None,
|
|
32
32
|
state_dir: Path = Path(".batch_jobs"),
|
|
33
33
|
custom_json_schema_obj_str: dict | None = None,
|
|
34
34
|
**client_kwargs: Any,
|
|
35
35
|
):
|
|
36
|
-
self.
|
|
37
|
-
self.
|
|
38
|
-
self.
|
|
39
|
-
self.
|
|
40
|
-
self.
|
|
41
|
-
self.
|
|
42
|
-
self.
|
|
43
|
-
self.
|
|
44
|
-
self.
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
36
|
+
self._client = client
|
|
37
|
+
self._model = model
|
|
38
|
+
self._output_model = output_model
|
|
39
|
+
self._prompt_template = prompt_template
|
|
40
|
+
self._state_dir = state_dir
|
|
41
|
+
self._custom_json_schema_obj_str = custom_json_schema_obj_str
|
|
42
|
+
self._client_kwargs = client_kwargs
|
|
43
|
+
self._dict_input = False
|
|
44
|
+
self._state_dir.mkdir(parents=True, exist_ok=True)
|
|
45
|
+
|
|
46
|
+
if custom_json_schema_obj_str and not isinstance(
|
|
47
|
+
custom_json_schema_obj_str, dict
|
|
48
|
+
):
|
|
49
|
+
raise ValueError("Schema should be a dict")
|
|
50
50
|
|
|
51
51
|
def _state_file(self, job_name: str) -> Path:
|
|
52
|
-
return self.
|
|
52
|
+
return self._state_dir / f"{job_name}.json"
|
|
53
53
|
|
|
54
54
|
def _load_state(self, job_name: str) -> list[dict[str, Any]]:
|
|
55
55
|
"""
|
|
@@ -83,17 +83,17 @@ class SimpleBatchManager:
|
|
|
83
83
|
"""
|
|
84
84
|
response_format_config: dict[str, Any]
|
|
85
85
|
|
|
86
|
-
if self.
|
|
86
|
+
if self._custom_json_schema_obj_str:
|
|
87
87
|
response_format_config = {
|
|
88
88
|
"type": "json_schema",
|
|
89
|
-
"json_schema": self.
|
|
89
|
+
"json_schema": self._custom_json_schema_obj_str,
|
|
90
90
|
}
|
|
91
91
|
else:
|
|
92
|
-
raw_schema = to_strict_json_schema(self.
|
|
92
|
+
raw_schema = to_strict_json_schema(self._output_model)
|
|
93
93
|
response_format_config = {
|
|
94
94
|
"type": "json_schema",
|
|
95
95
|
"json_schema": {
|
|
96
|
-
"name": self.
|
|
96
|
+
"name": self._output_model.__name__,
|
|
97
97
|
"schema": raw_schema,
|
|
98
98
|
},
|
|
99
99
|
}
|
|
@@ -105,11 +105,11 @@ class SimpleBatchManager:
|
|
|
105
105
|
"body": {
|
|
106
106
|
"model": self.model,
|
|
107
107
|
"messages": [
|
|
108
|
-
{"role": "system", "content": self.
|
|
108
|
+
{"role": "system", "content": self._prompt_template},
|
|
109
109
|
{"role": "user", "content": text},
|
|
110
110
|
],
|
|
111
111
|
"response_format": response_format_config,
|
|
112
|
-
**self.
|
|
112
|
+
**self._client_kwargs,
|
|
113
113
|
},
|
|
114
114
|
}
|
|
115
115
|
|
|
@@ -127,10 +127,10 @@ class SimpleBatchManager:
|
|
|
127
127
|
|
|
128
128
|
else:
|
|
129
129
|
raise TypeError(
|
|
130
|
-
"The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}
|
|
130
|
+
"The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}"
|
|
131
131
|
)
|
|
132
132
|
|
|
133
|
-
file_path = self.
|
|
133
|
+
file_path = self._state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
|
|
134
134
|
with open(file_path, "w", encoding="utf-8") as f:
|
|
135
135
|
for task in tasks:
|
|
136
136
|
f.write(json.dumps(task) + "\n")
|
|
@@ -143,9 +143,10 @@ class SimpleBatchManager:
|
|
|
143
143
|
"""
|
|
144
144
|
if self._load_state(job_name):
|
|
145
145
|
return
|
|
146
|
+
|
|
146
147
|
path = self._prepare_file(payload)
|
|
147
|
-
upload = self.
|
|
148
|
-
job = self.
|
|
148
|
+
upload = self._client.files.create(file=open(path, "rb"), purpose="batch")
|
|
149
|
+
job = self._client.batches.create(
|
|
149
150
|
input_file_id=upload.id,
|
|
150
151
|
endpoint="/v1/chat/completions",
|
|
151
152
|
completion_window="24h",
|
|
@@ -161,7 +162,7 @@ class SimpleBatchManager:
|
|
|
161
162
|
if not job:
|
|
162
163
|
return "completed"
|
|
163
164
|
|
|
164
|
-
info = self.
|
|
165
|
+
info = self._client.batches.retrieve(job["id"])
|
|
165
166
|
job = info.to_dict()
|
|
166
167
|
self._save_state(job_name, [job])
|
|
167
168
|
logger.info("Batch job status: %s", job)
|
|
@@ -179,18 +180,18 @@ class SimpleBatchManager:
|
|
|
179
180
|
return {}
|
|
180
181
|
batch_id = job["id"]
|
|
181
182
|
|
|
182
|
-
info = self.
|
|
183
|
+
info = self._client.batches.retrieve(batch_id)
|
|
183
184
|
out_file_id = info.output_file_id
|
|
184
185
|
if not out_file_id:
|
|
185
186
|
error_file_id = info.error_file_id
|
|
186
187
|
if error_file_id:
|
|
187
188
|
err_content = (
|
|
188
|
-
self.
|
|
189
|
+
self._client.files.content(error_file_id).read().decode("utf-8")
|
|
189
190
|
)
|
|
190
|
-
logger.
|
|
191
|
+
logger.error("Error file content:", err_content)
|
|
191
192
|
return {}
|
|
192
193
|
|
|
193
|
-
content = self.
|
|
194
|
+
content = self._client.files.content(out_file_id).read().decode("utf-8")
|
|
194
195
|
lines = content.splitlines()
|
|
195
196
|
results = {}
|
|
196
197
|
log = []
|
|
@@ -201,7 +202,7 @@ class SimpleBatchManager:
|
|
|
201
202
|
content = result["response"]["body"]["choices"][0]["message"]["content"]
|
|
202
203
|
try:
|
|
203
204
|
parsed_content = json.loads(content)
|
|
204
|
-
model_instance = self.
|
|
205
|
+
model_instance = self._output_model(**parsed_content)
|
|
205
206
|
results[custom_id] = model_instance.model_dump(mode="json")
|
|
206
207
|
except json.JSONDecodeError:
|
|
207
208
|
results[custom_id] = {"error": "Failed to parse content as JSON"}
|
|
@@ -221,8 +222,6 @@ class SimpleBatchManager:
|
|
|
221
222
|
error_d = {custom_id: results[custom_id]}
|
|
222
223
|
log.append(error_d)
|
|
223
224
|
|
|
224
|
-
for handler in self.handlers:
|
|
225
|
-
handler.handle(results)
|
|
226
225
|
if remove_cache:
|
|
227
226
|
self._clear_state(job_name)
|
|
228
227
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def export_data(data) -> list[dict[str, str]]:
|
|
5
|
+
"""
|
|
6
|
+
Produces a structure of the following form from an initial data structure:
|
|
7
|
+
[{"id": str, "text": str},...]
|
|
8
|
+
"""
|
|
9
|
+
return data
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def import_data(data) -> Any:
|
|
13
|
+
"""
|
|
14
|
+
Takes the output and adds and aggregates it to the original structure.
|
|
15
|
+
"""
|
|
16
|
+
return data
|
texttools/prompts/README.md
CHANGED
|
@@ -14,15 +14,15 @@ This folder contains YAML files for all prompts used in the project. Each file r
|
|
|
14
14
|
### Example YAML Structure
|
|
15
15
|
```yaml
|
|
16
16
|
main_template:
|
|
17
|
-
|
|
17
|
+
mode_1: |
|
|
18
18
|
Your main instructions here with placeholders like {input}.
|
|
19
|
-
|
|
19
|
+
mode_2: |
|
|
20
20
|
Optional reasoning instructions here.
|
|
21
21
|
|
|
22
22
|
analyze_template:
|
|
23
|
-
|
|
23
|
+
mode_1: |
|
|
24
24
|
Analyze and summarize the input.
|
|
25
|
-
|
|
25
|
+
mode_2: |
|
|
26
26
|
Optional detailed analysis template.
|
|
27
27
|
```
|
|
28
28
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
main_template:
|
|
2
|
+
|
|
3
|
+
category_list: |
|
|
4
|
+
You are an expert classification agent.
|
|
5
|
+
You receive a list of categories.
|
|
6
|
+
|
|
7
|
+
Your task:
|
|
8
|
+
- Read all provided categories carefully.
|
|
9
|
+
- Consider the user query, intent, and task explanation.
|
|
10
|
+
- Select exactly one category name from the list that best matches the user’s intent.
|
|
11
|
+
- Return only the category name, nothing else.
|
|
12
|
+
|
|
13
|
+
Rules:
|
|
14
|
+
- Never invent categories that are not in the list.
|
|
15
|
+
- If multiple categories seem possible, choose the closest match based on the description and user intent.
|
|
16
|
+
- If descriptions are missing or empty, rely on the category name.
|
|
17
|
+
- If the correct answer cannot be determined with certainty, choose the most likely one.
|
|
18
|
+
|
|
19
|
+
Output format:
|
|
20
|
+
{{
|
|
21
|
+
"reason": "Explanation of why the input belongs to the category",
|
|
22
|
+
"result": "<category_name_only>"
|
|
23
|
+
}}
|
|
24
|
+
|
|
25
|
+
Available categories with their descriptions:
|
|
26
|
+
{category_list}
|
|
27
|
+
|
|
28
|
+
The text that has to be categorized:
|
|
29
|
+
{input}
|
|
30
|
+
|
|
31
|
+
category_tree: |
|
|
32
|
+
You are an expert classification agent.
|
|
33
|
+
You receive a list of categories at the current level of a hierarchical category tree.
|
|
34
|
+
|
|
35
|
+
Your task:
|
|
36
|
+
- Read all provided categories carefully.
|
|
37
|
+
- Consider the user query, intent, and task explanation.
|
|
38
|
+
- Select exactly one category name from the list that best matches the user’s intent.
|
|
39
|
+
- Return only the category name, nothing else.
|
|
40
|
+
|
|
41
|
+
Rules:
|
|
42
|
+
- Never invent categories that are not in the list.
|
|
43
|
+
- If multiple categories seem possible, choose the closest match based on the description and user intent.
|
|
44
|
+
- If descriptions are missing or empty, rely on the category name.
|
|
45
|
+
- If the correct answer cannot be determined with certainty, choose the most likely one.
|
|
46
|
+
|
|
47
|
+
Output format:
|
|
48
|
+
{{
|
|
49
|
+
"reason": "Explanation of why the input belongs to the category",
|
|
50
|
+
"result": "<category_name_only>"
|
|
51
|
+
}}
|
|
52
|
+
|
|
53
|
+
Available categories with their descriptions at this level:
|
|
54
|
+
{category_list}
|
|
55
|
+
|
|
56
|
+
Do not include category descriptions at all. Only write the raw category.
|
|
57
|
+
|
|
58
|
+
The text that has to be categorized:
|
|
59
|
+
{input}
|
|
60
|
+
|
|
61
|
+
analyze_template:
|
|
62
|
+
|
|
63
|
+
category_list: |
|
|
64
|
+
We want to categorize the given text.
|
|
65
|
+
To improve categorization, we need an analysis of the text.
|
|
66
|
+
Analyze the given text and write its main idea and a short analysis of it.
|
|
67
|
+
Analysis should be very short.
|
|
68
|
+
Text:
|
|
69
|
+
{input}
|
|
70
|
+
|
|
71
|
+
category_tree: |
|
|
72
|
+
We want to categorize the given text.
|
|
73
|
+
To improve categorization, we need an analysis of the text.
|
|
74
|
+
Analyze the given text and write its main idea and a short analysis of it.
|
|
75
|
+
Analysis should be very short.
|
|
76
|
+
Text:
|
|
77
|
+
{input}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
main_template: |
|
|
2
|
+
You are an expert Named Entity Recognition (NER) system. Extract entities from the text.
|
|
3
|
+
The output must strictly follow the provided Pydantic schema.
|
|
4
|
+
|
|
5
|
+
Mapping Rule:
|
|
6
|
+
- Person: شخص
|
|
7
|
+
- Location: مکان
|
|
8
|
+
- Time: زمان
|
|
9
|
+
- Living Beings: موجود زنده
|
|
10
|
+
- Organization: سازمان
|
|
11
|
+
- Concept: مفهوم
|
|
12
|
+
|
|
13
|
+
CRITICAL:
|
|
14
|
+
1. The final output structure must be a complete JSON object matching the Pydantic schema (List[Entity]).
|
|
15
|
+
2. Both the extracted text and the type must be in Persian, using the exact mapping provided above.
|
|
16
|
+
|
|
17
|
+
Here is the text: {input}
|
|
18
|
+
|
|
19
|
+
analyze_template: |
|
|
20
|
+
Analyze the following text to identify all potential named entities and their categories (Person, Location, Time, Living Beings, Organization, Concept).
|
|
21
|
+
Provide a brief summary of the entities identified that will help the main process to extract them accurately and apply the correct Persian type label.
|
|
22
|
+
Here is the text: {input}
|
|
@@ -1,18 +1,68 @@
|
|
|
1
|
-
main_template:
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
1
|
+
main_template:
|
|
2
|
+
|
|
3
|
+
auto: |
|
|
4
|
+
You are an expert keyword extractor.
|
|
5
|
+
Extract the most relevant keywords from the given text.
|
|
6
|
+
Guidelines:
|
|
7
|
+
- Keywords must represent the main concepts of the text.
|
|
8
|
+
- If two words have overlapping meanings, choose only one.
|
|
9
|
+
- Do not include generic or unrelated words.
|
|
10
|
+
- Keywords must be single, self-contained words (no phrases).
|
|
11
|
+
- Output between 3 and 7 keywords based on the input length.
|
|
12
|
+
- Respond only in JSON format:
|
|
13
|
+
{{"result": ["keyword1", "keyword2", etc.]}}
|
|
14
|
+
Here is the text:
|
|
15
|
+
{input}
|
|
16
|
+
|
|
17
|
+
threshold: |
|
|
18
|
+
You are an expert keyword extractor specialized in fine-grained concept identification.
|
|
19
|
+
Extract the most specific, content-bearing keywords from the text.
|
|
20
|
+
|
|
21
|
+
Requirements:
|
|
22
|
+
- Choose fine-grained conceptual terms, not general domain labels.
|
|
23
|
+
- Avoid words that only describe the broad topic (e.g., Islam, religion, philosophy, history).
|
|
24
|
+
- Prefer specific names, concepts, doctrines, events, arguments, or terminology.
|
|
25
|
+
- Do not select words only because they appear frequently. A keyword must represent a central conceptual idea, not a repeated surface term.
|
|
26
|
+
- If multiple words express overlapping meaning, select the more specific one.
|
|
27
|
+
- Keywords must be single words (no multi-word expressions).
|
|
28
|
+
- Extract N keywords depending on input length:
|
|
29
|
+
- Short texts (a few sentences): 3 keywords
|
|
30
|
+
- Medium texts (1–4 paragraphs): 4–5 keywords
|
|
31
|
+
- Long texts (more than 4 paragraphs): 6–7 keywords
|
|
32
|
+
- Respond only in JSON format:
|
|
33
|
+
{{"result": ["keyword1", "keyword2", etc.]}}
|
|
34
|
+
Here is the text:
|
|
35
|
+
{input}
|
|
36
|
+
|
|
37
|
+
count: |
|
|
38
|
+
You are an expert keyword extractor with precise output requirements.
|
|
39
|
+
Extract exactly {number_of_keywords} keywords from the given text.
|
|
40
|
+
|
|
41
|
+
Requirements:
|
|
42
|
+
- Extract exactly {number_of_keywords} keywords, no more, no less.
|
|
43
|
+
- Select the {number_of_keywords} most relevant and specific keywords that represent core concepts.
|
|
44
|
+
- Prefer specific terms, names, and concepts over general topic labels.
|
|
45
|
+
- If the text doesn't contain enough distinct keywords, include the most relevant ones even if some are less specific.
|
|
46
|
+
- Keywords must be single words (no multi-word expressions).
|
|
47
|
+
- Order keywords by relevance (most relevant first).
|
|
48
|
+
- Respond only in JSON format:
|
|
49
|
+
{{"result": ["keyword1", "keyword2", "keyword3", ...]}}
|
|
50
|
+
|
|
51
|
+
Here is the text:
|
|
52
|
+
{input}
|
|
53
|
+
|
|
54
|
+
analyze_template:
|
|
55
|
+
auto: |
|
|
56
|
+
Analyze the following text to identify its main topics, concepts, and important terms.
|
|
57
|
+
Provide a concise summary of your findings that will help in extracting relevant keywords.
|
|
58
|
+
{input}
|
|
59
|
+
|
|
60
|
+
threshold: |
|
|
61
|
+
Analyze the following text to identify its main topics, concepts, and important terms.
|
|
62
|
+
Provide a concise summary of your findings that will help in extracting relevant keywords.
|
|
63
|
+
{input}
|
|
64
|
+
|
|
65
|
+
count: |
|
|
66
|
+
Analyze the following text to identify its main topics, concepts, and important terms.
|
|
67
|
+
Provide a concise summary of your findings that will help in extracting relevant keywords.
|
|
68
|
+
{input}
|