hamtaa-texttools 1.1.1__py3-none-any.whl → 1.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/METADATA +98 -26
  2. hamtaa_texttools-1.1.16.dist-info/RECORD +31 -0
  3. texttools/__init__.py +6 -8
  4. texttools/batch/batch_config.py +26 -0
  5. texttools/batch/batch_runner.py +105 -151
  6. texttools/batch/{batch_manager.py → internals/batch_manager.py} +39 -40
  7. texttools/batch/internals/utils.py +16 -0
  8. texttools/prompts/README.md +4 -4
  9. texttools/prompts/categorize.yaml +77 -0
  10. texttools/prompts/detect_entity.yaml +22 -0
  11. texttools/prompts/extract_keywords.yaml +68 -18
  12. texttools/tools/async_tools.py +804 -0
  13. texttools/tools/internals/async_operator.py +90 -69
  14. texttools/tools/internals/models.py +183 -0
  15. texttools/tools/internals/operator_utils.py +54 -0
  16. texttools/tools/internals/prompt_loader.py +13 -14
  17. texttools/tools/internals/sync_operator.py +201 -0
  18. texttools/tools/sync_tools.py +804 -0
  19. hamtaa_texttools-1.1.1.dist-info/RECORD +0 -30
  20. texttools/batch/__init__.py +0 -4
  21. texttools/prompts/categorizer.yaml +0 -28
  22. texttools/tools/__init__.py +0 -4
  23. texttools/tools/async_the_tool.py +0 -414
  24. texttools/tools/internals/base_operator.py +0 -91
  25. texttools/tools/internals/operator.py +0 -179
  26. texttools/tools/internals/output_models.py +0 -59
  27. texttools/tools/the_tool.py +0 -412
  28. {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/WHEEL +0 -0
  29. {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/licenses/LICENSE +0 -0
  30. {hamtaa_texttools-1.1.1.dist-info → hamtaa_texttools-1.1.16.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,20 @@
1
1
  import json
2
2
  import uuid
3
3
  from pathlib import Path
4
- from typing import Any, Type
4
+ from typing import Any, Type, TypeVar
5
5
  import logging
6
6
 
7
7
  from pydantic import BaseModel
8
8
  from openai import OpenAI
9
9
  from openai.lib._pydantic import to_strict_json_schema
10
10
 
11
- # Configure logger
12
- logger = logging.getLogger("batch_runner")
13
- logger.setLevel(logging.INFO)
11
+ # Base Model type for output models
12
+ T = TypeVar("T", bound=BaseModel)
14
13
 
14
+ logger = logging.getLogger("texttools.batch_manager")
15
15
 
16
- class SimpleBatchManager:
16
+
17
+ class BatchManager:
17
18
  """
18
19
  Manages batch processing jobs for OpenAI's chat completions with structured outputs.
19
20
 
@@ -26,30 +27,29 @@ class SimpleBatchManager:
26
27
  self,
27
28
  client: OpenAI,
28
29
  model: str,
29
- output_model: Type[BaseModel],
30
+ output_model: Type[T],
30
31
  prompt_template: str,
31
- handlers: list[Any] | None = None,
32
32
  state_dir: Path = Path(".batch_jobs"),
33
33
  custom_json_schema_obj_str: dict | None = None,
34
34
  **client_kwargs: Any,
35
35
  ):
36
- self.client = client
37
- self.model = model
38
- self.output_model = output_model
39
- self.prompt_template = prompt_template
40
- self.handlers = handlers or []
41
- self.state_dir = state_dir
42
- self.state_dir.mkdir(parents=True, exist_ok=True)
43
- self.custom_json_schema_obj_str = custom_json_schema_obj_str
44
- self.client_kwargs = client_kwargs
45
- self.dict_input = False
46
-
47
- if self.custom_json_schema_obj_str:
48
- if self.custom_json_schema_obj_str is not dict:
49
- raise ValueError("schema should be a dict")
36
+ self._client = client
37
+ self._model = model
38
+ self._output_model = output_model
39
+ self._prompt_template = prompt_template
40
+ self._state_dir = state_dir
41
+ self._custom_json_schema_obj_str = custom_json_schema_obj_str
42
+ self._client_kwargs = client_kwargs
43
+ self._dict_input = False
44
+ self._state_dir.mkdir(parents=True, exist_ok=True)
45
+
46
+ if custom_json_schema_obj_str and not isinstance(
47
+ custom_json_schema_obj_str, dict
48
+ ):
49
+ raise ValueError("Schema should be a dict")
50
50
 
51
51
  def _state_file(self, job_name: str) -> Path:
52
- return self.state_dir / f"{job_name}.json"
52
+ return self._state_dir / f"{job_name}.json"
53
53
 
54
54
  def _load_state(self, job_name: str) -> list[dict[str, Any]]:
55
55
  """
@@ -83,17 +83,17 @@ class SimpleBatchManager:
83
83
  """
84
84
  response_format_config: dict[str, Any]
85
85
 
86
- if self.custom_json_schema_obj_str:
86
+ if self._custom_json_schema_obj_str:
87
87
  response_format_config = {
88
88
  "type": "json_schema",
89
- "json_schema": self.custom_json_schema_obj_str,
89
+ "json_schema": self._custom_json_schema_obj_str,
90
90
  }
91
91
  else:
92
- raw_schema = to_strict_json_schema(self.output_model)
92
+ raw_schema = to_strict_json_schema(self._output_model)
93
93
  response_format_config = {
94
94
  "type": "json_schema",
95
95
  "json_schema": {
96
- "name": self.output_model.__name__,
96
+ "name": self._output_model.__name__,
97
97
  "schema": raw_schema,
98
98
  },
99
99
  }
@@ -105,11 +105,11 @@ class SimpleBatchManager:
105
105
  "body": {
106
106
  "model": self.model,
107
107
  "messages": [
108
- {"role": "system", "content": self.prompt_template},
108
+ {"role": "system", "content": self._prompt_template},
109
109
  {"role": "user", "content": text},
110
110
  ],
111
111
  "response_format": response_format_config,
112
- **self.client_kwargs,
112
+ **self._client_kwargs,
113
113
  },
114
114
  }
115
115
 
@@ -127,10 +127,10 @@ class SimpleBatchManager:
127
127
 
128
128
  else:
129
129
  raise TypeError(
130
- "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}."
130
+ "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}"
131
131
  )
132
132
 
133
- file_path = self.state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
133
+ file_path = self._state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
134
134
  with open(file_path, "w", encoding="utf-8") as f:
135
135
  for task in tasks:
136
136
  f.write(json.dumps(task) + "\n")
@@ -143,9 +143,10 @@ class SimpleBatchManager:
143
143
  """
144
144
  if self._load_state(job_name):
145
145
  return
146
+
146
147
  path = self._prepare_file(payload)
147
- upload = self.client.files.create(file=open(path, "rb"), purpose="batch")
148
- job = self.client.batches.create(
148
+ upload = self._client.files.create(file=open(path, "rb"), purpose="batch")
149
+ job = self._client.batches.create(
149
150
  input_file_id=upload.id,
150
151
  endpoint="/v1/chat/completions",
151
152
  completion_window="24h",
@@ -161,7 +162,7 @@ class SimpleBatchManager:
161
162
  if not job:
162
163
  return "completed"
163
164
 
164
- info = self.client.batches.retrieve(job["id"])
165
+ info = self._client.batches.retrieve(job["id"])
165
166
  job = info.to_dict()
166
167
  self._save_state(job_name, [job])
167
168
  logger.info("Batch job status: %s", job)
@@ -179,18 +180,18 @@ class SimpleBatchManager:
179
180
  return {}
180
181
  batch_id = job["id"]
181
182
 
182
- info = self.client.batches.retrieve(batch_id)
183
+ info = self._client.batches.retrieve(batch_id)
183
184
  out_file_id = info.output_file_id
184
185
  if not out_file_id:
185
186
  error_file_id = info.error_file_id
186
187
  if error_file_id:
187
188
  err_content = (
188
- self.client.files.content(error_file_id).read().decode("utf-8")
189
+ self._client.files.content(error_file_id).read().decode("utf-8")
189
190
  )
190
- logger.info("Error file content:", err_content)
191
+ logger.error("Error file content:", err_content)
191
192
  return {}
192
193
 
193
- content = self.client.files.content(out_file_id).read().decode("utf-8")
194
+ content = self._client.files.content(out_file_id).read().decode("utf-8")
194
195
  lines = content.splitlines()
195
196
  results = {}
196
197
  log = []
@@ -201,7 +202,7 @@ class SimpleBatchManager:
201
202
  content = result["response"]["body"]["choices"][0]["message"]["content"]
202
203
  try:
203
204
  parsed_content = json.loads(content)
204
- model_instance = self.output_model(**parsed_content)
205
+ model_instance = self._output_model(**parsed_content)
205
206
  results[custom_id] = model_instance.model_dump(mode="json")
206
207
  except json.JSONDecodeError:
207
208
  results[custom_id] = {"error": "Failed to parse content as JSON"}
@@ -221,8 +222,6 @@ class SimpleBatchManager:
221
222
  error_d = {custom_id: results[custom_id]}
222
223
  log.append(error_d)
223
224
 
224
- for handler in self.handlers:
225
- handler.handle(results)
226
225
  if remove_cache:
227
226
  self._clear_state(job_name)
228
227
 
@@ -0,0 +1,16 @@
1
+ from typing import Any
2
+
3
+
4
+ def export_data(data) -> list[dict[str, str]]:
5
+ """
6
+ Produces a structure of the following form from an initial data structure:
7
+ [{"id": str, "text": str},...]
8
+ """
9
+ return data
10
+
11
+
12
+ def import_data(data) -> Any:
13
+ """
14
+ Takes the output and aggregates it back into the original structure.
15
+ """
16
+ return data
@@ -14,15 +14,15 @@ This folder contains YAML files for all prompts used in the project. Each file r
14
14
  ### Example YAML Structure
15
15
  ```yaml
16
16
  main_template:
17
- default: |
17
+ mode_1: |
18
18
  Your main instructions here with placeholders like {input}.
19
- reason: |
19
+ mode_2: |
20
20
  Optional reasoning instructions here.
21
21
 
22
22
  analyze_template:
23
- default: |
23
+ mode_1: |
24
24
  Analyze and summarize the input.
25
- reason: |
25
+ mode_2: |
26
26
  Optional detailed analysis template.
27
27
  ```
28
28
 
@@ -0,0 +1,77 @@
1
+ main_template:
2
+
3
+ category_list: |
4
+ You are an expert classification agent.
5
+ You receive a list of categories.
6
+
7
+ Your task:
8
+ - Read all provided categories carefully.
9
+ - Consider the user query, intent, and task explanation.
10
+ - Select exactly one category name from the list that best matches the user’s intent.
11
+ - Return only the category name in the "result" field, nothing else.
12
+
13
+ Rules:
14
+ - Never invent categories that are not in the list.
15
+ - If multiple categories seem possible, choose the closest match based on the description and user intent.
16
+ - If descriptions are missing or empty, rely on the category name.
17
+ - If the correct answer cannot be determined with certainty, choose the most likely one.
18
+
19
+ Output format:
20
+ {{
21
+ "reason": "Explanation of why the input belongs to the category"
22
+ "result": "<category_name_only>"
23
+ }}
24
+
25
+ Available categories with their descriptions:
26
+ {category_list}
27
+
28
+ The text that has to be categorized:
29
+ {input}
30
+
31
+ category_tree: |
32
+ You are an expert classification agent.
33
+ You receive a list of categories at the current level of a hierarchical category tree.
34
+
35
+ Your task:
36
+ - Read all provided categories carefully.
37
+ - Consider the user query, intent, and task explanation.
38
+ - Select exactly one category name from the list that best matches the user’s intent.
39
+ - Return only the category name in the "result" field, nothing else.
40
+
41
+ Rules:
42
+ - Never invent categories that are not in the list.
43
+ - If multiple categories seem possible, choose the closest match based on the description and user intent.
44
+ - If descriptions are missing or empty, rely on the category name.
45
+ - If the correct answer cannot be determined with certainty, choose the most likely one.
46
+
47
+ Output format:
48
+ {{
49
+ "reason": "Explanation of why the input belongs to the category"
50
+ "result": "<category_name_only>"
51
+ }}
52
+
53
+ Available categories with their descriptions at this level:
54
+ {category_list}
55
+
56
+ Do not include category descriptions at all. Only write the raw category.
57
+
58
+ The text that has to be categorized:
59
+ {input}
60
+
61
+ analyze_template:
62
+
63
+ category_list: |
64
+ We want to categorize the given text.
65
+ To improve categorization, we need an analysis of the text.
66
+ Analyze the given text and write its main idea and a short analysis of that.
67
+ Analysis should be very short.
68
+ Text:
69
+ {input}
70
+
71
+ category_tree: |
72
+ We want to categorize the given text.
73
+ To improve categorization, we need an analysis of the text.
74
+ Analyze the given text and write its main idea and a short analysis of that.
75
+ Analysis should be very short.
76
+ Text:
77
+ {input}
@@ -0,0 +1,22 @@
1
+ main_template: |
2
+ You are an expert Named Entity Recognition (NER) system. Extract entities from the text.
3
+ The output must strictly follow the provided Pydantic schema.
4
+
5
+ Mapping Rule:
6
+ - Person: شخص
7
+ - Location: مکان
8
+ - Time: زمان
9
+ - Living Beings: موجود زنده
10
+ - Organization: سازمان
11
+ - Concept: مفهوم
12
+
13
+ CRITICAL:
14
+ 1. The final output structure must be a complete JSON object matching the Pydantic schema (List[Entity]).
15
+ 2. Both the extracted text and the type must be in Persian, using the exact mapping provided above.
16
+
17
+ Here is the text: {input}
18
+
19
+ analyze_template: |
20
+ Analyze the following text to identify all potential named entities and their categories (Person, Location, Time, Living Beings, Organization, Concept).
21
+ Provide a brief summary of the entities identified that will help the main process to extract them accurately and apply the correct Persian type label.
22
+ Here is the text: {input}
@@ -1,18 +1,68 @@
1
- main_template: |
2
- You are an expert keyword extractor.
3
- Extract the most relevant keywords from the given text.
4
- Guidelines:
5
- - Keywords must represent the main concepts of the text.
6
- - If two words have overlapping meanings, choose only one.
7
- - Do not include generic or unrelated words.
8
- - Keywords must be single, self-contained words (no phrases).
9
- - Output between 3 and 7 keywords based on the input length.
10
- - Respond only in JSON format:
11
- {{"result": ["keyword1", "keyword2", etc.]}}
12
- Here is the text:
13
- {input}
14
-
15
- analyze_template: |
16
- Analyze the following text to identify its main topics, concepts, and important terms.
17
- Provide a concise summary of your findings that will help in extracting relevant keywords.
18
- {input}
1
+ main_template:
2
+
3
+ auto: |
4
+ You are an expert keyword extractor.
5
+ Extract the most relevant keywords from the given text.
6
+ Guidelines:
7
+ - Keywords must represent the main concepts of the text.
8
+ - If two words have overlapping meanings, choose only one.
9
+ - Do not include generic or unrelated words.
10
+ - Keywords must be single, self-contained words (no phrases).
11
+ - Output between 3 and 7 keywords based on the input length.
12
+ - Respond only in JSON format:
13
+ {{"result": ["keyword1", "keyword2", etc.]}}
14
+ Here is the text:
15
+ {input}
16
+
17
+ threshold: |
18
+ You are an expert keyword extractor specialized in fine-grained concept identification.
19
+ Extract the most specific, content-bearing keywords from the text.
20
+
21
+ Requirements:
22
+ - Choose fine-grained conceptual terms, not general domain labels.
23
+ - Avoid words that only describe the broad topic (e.g., Islam, religion, philosophy, history).
24
+ - Prefer specific names, concepts, doctrines, events, arguments, or terminology.
25
+ - Do not select words only because they appear frequently. A keyword must represent a central conceptual idea, not a repeated surface term.
26
+ - If multiple words express overlapping meaning, select the more specific one.
27
+ - Keywords must be single words (no multi-word expressions).
28
+ - Extract N keywords depending on input length:
29
+ - Short texts (a few sentences): 3 keywords
30
+ - Medium texts (1–4 paragraphs): 4–5 keywords
31
+ - Long texts (more than 4 paragraphs): 6–7 keywords
32
+ - Respond only in JSON format:
33
+ {{"result": ["keyword1", "keyword2", etc.]}}
34
+ Here is the text:
35
+ {input}
36
+
37
+ count: |
38
+ You are an expert keyword extractor with precise output requirements.
39
+ Extract exactly {number_of_keywords} keywords from the given text.
40
+
41
+ Requirements:
42
+ - Extract exactly {number_of_keywords} keywords, no more, no less.
43
+ - Select the {number_of_keywords} most relevant and specific keywords that represent core concepts.
44
+ - Prefer specific terms, names, and concepts over general topic labels.
45
+ - If the text doesn't contain enough distinct keywords, include the most relevant ones even if some are less specific.
46
+ - Keywords must be single words (no multi-word expressions).
47
+ - Order keywords by relevance (most relevant first).
48
+ - Respond only in JSON format:
49
+ {{"result": ["keyword1", "keyword2", "keyword3", ...]}}
50
+
51
+ Here is the text:
52
+ {input}
53
+
54
+ analyze_template:
55
+ auto: |
56
+ Analyze the following text to identify its main topics, concepts, and important terms.
57
+ Provide a concise summary of your findings that will help in extracting relevant keywords.
58
+ {input}
59
+
60
+ threshold: |
61
+ Analyze the following text to identify its main topics, concepts, and important terms.
62
+ Provide a concise summary of your findings that will help in extracting relevant keywords.
63
+ {input}
64
+
65
+ count: |
66
+ Analyze the following text to identify its main topics, concepts, and important terms.
67
+ Provide a concise summary of your findings that will help in extracting relevant keywords.
68
+ {input}