hamtaa-texttools 1.0.3-py3-none-any.whl → 1.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hamtaa-texttools might be problematic.

Files changed (32)
  1. {hamtaa_texttools-1.0.3.dist-info → hamtaa_texttools-1.0.5.dist-info}/METADATA +192 -141
  2. hamtaa_texttools-1.0.5.dist-info/RECORD +30 -0
  3. {hamtaa_texttools-1.0.3.dist-info → hamtaa_texttools-1.0.5.dist-info}/licenses/LICENSE +20 -20
  4. {hamtaa_texttools-1.0.3.dist-info → hamtaa_texttools-1.0.5.dist-info}/top_level.txt +0 -0
  5. texttools/__init__.py +9 -9
  6. texttools/batch/__init__.py +4 -4
  7. texttools/batch/batch_manager.py +240 -240
  8. texttools/batch/batch_runner.py +212 -212
  9. texttools/formatters/base_formatter.py +33 -33
  10. texttools/formatters/{user_merge_formatter/user_merge_formatter.py → user_merge_formatter.py} +30 -30
  11. texttools/prompts/README.md +31 -31
  12. texttools/prompts/categorizer.yaml +28 -31
  13. texttools/prompts/custom_tool.yaml +7 -0
  14. texttools/prompts/keyword_extractor.yaml +18 -14
  15. texttools/prompts/ner_extractor.yaml +20 -21
  16. texttools/prompts/question_detector.yaml +13 -14
  17. texttools/prompts/question_generator.yaml +19 -22
  18. texttools/prompts/question_merger.yaml +45 -48
  19. texttools/prompts/rewriter.yaml +111 -0
  20. texttools/prompts/subject_question_generator.yaml +22 -26
  21. texttools/prompts/summarizer.yaml +13 -11
  22. texttools/prompts/translator.yaml +14 -14
  23. texttools/tools/__init__.py +4 -4
  24. texttools/tools/async_the_tool.py +277 -263
  25. texttools/tools/internals/async_operator.py +297 -288
  26. texttools/tools/internals/operator.py +295 -306
  27. texttools/tools/internals/output_models.py +52 -62
  28. texttools/tools/internals/prompt_loader.py +76 -82
  29. texttools/tools/the_tool.py +501 -400
  30. hamtaa_texttools-1.0.3.dist-info/RECORD +0 -29
  31. texttools/prompts/question_rewriter.yaml +0 -46
  32. {hamtaa_texttools-1.0.3.dist-info → hamtaa_texttools-1.0.5.dist-info}/WHEEL +0 -0
texttools/batch/batch_manager.py
@@ -1,240 +1,240 @@
- import json
- import uuid
- from pathlib import Path
- from typing import Any, Optional, Type
-
- from pydantic import BaseModel
- from openai import OpenAI
- from openai.lib._pydantic import to_strict_json_schema
-
-
- class SimpleBatchManager:
-     """
-     Manages batch processing jobs for OpenAI's chat completions with structured outputs.
-
-     Handles the full lifecycle of a batch job: creating tasks from input texts,
-     starting the job, monitoring status, and fetching results. Results are automatically
-     parsed into the specified Pydantic output model. Job state is persisted to disk.
-     """
-
-     def __init__(
-         self,
-         client: OpenAI,
-         model: str,
-         output_model: Type[BaseModel],
-         prompt_template: str,
-         handlers: Optional[list[Any]] = None,
-         state_dir: Path = Path(".batch_jobs"),
-         custom_json_schema_obj_str: Optional[dict] = None,
-         **client_kwargs: Any,
-     ):
-         self.client = client
-         self.model = model
-         self.output_model = output_model
-         self.prompt_template = prompt_template
-         self.handlers = handlers or []
-         self.state_dir = state_dir
-         self.state_dir.mkdir(parents=True, exist_ok=True)
-         self.custom_json_schema_obj_str = custom_json_schema_obj_str
-         self.client_kwargs = client_kwargs
-         self.dict_input = False
-
-         if self.custom_json_schema_obj_str:
-             if self.custom_json_schema_obj_str is not dict:
-                 raise ValueError("schema should be a dict")
-
-     def _state_file(self, job_name: str) -> Path:
-         return self.state_dir / f"{job_name}.json"
-
-     def _load_state(self, job_name: str) -> list[dict[str, Any]]:
-         """
-         Loads the state (job information) from the state file for the given job name.
-         Returns an empty list if the state file does not exist.
-         """
-         path = self._state_file(job_name)
-         if path.exists():
-             with open(path, "r", encoding="utf-8") as f:
-                 return json.load(f)
-         return []
-
-     def _save_state(self, job_name: str, jobs: list[dict[str, Any]]) -> None:
-         """
-         Saves the job state to the state file for the given job name.
-         """
-         with open(self._state_file(job_name), "w", encoding="utf-8") as f:
-             json.dump(jobs, f)
-
-     def _clear_state(self, job_name: str) -> None:
-         """
-         Deletes the state file for the given job name if it exists.
-         """
-         path = self._state_file(job_name)
-         if path.exists():
-             path.unlink()
-
-     def _build_task(self, text: str, idx: str) -> dict[str, Any]:
-         """
-         Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
-         """
-         response_format_config: dict[str, Any]
-
-         if self.custom_json_schema_obj_str:
-             response_format_config = {
-                 "type": "json_schema",
-                 "json_schema": self.custom_json_schema_obj_str,
-             }
-         else:
-             raw_schema = to_strict_json_schema(self.output_model)
-             response_format_config = {
-                 "type": "json_schema",
-                 "json_schema": {
-                     "name": self.output_model.__name__,
-                     "schema": raw_schema,
-                 },
-             }
-
-         return {
-             "custom_id": str(idx),
-             "method": "POST",
-             "url": "/v1/chat/completions",
-             "body": {
-                 "model": self.model,
-                 "messages": [
-                     {"role": "system", "content": self.prompt_template},
-                     {"role": "user", "content": text},
-                 ],
-                 "response_format": response_format_config,
-                 **self.client_kwargs,
-             },
-         }
-
-     def _prepare_file(self, payload: list[str] | list[dict[str, str]]) -> Path:
-         """
-         Prepares a JSONL file containing all tasks for the batch job, based on the input payload.
-         Returns the path to the created file.
-         """
-         if not payload:
-             raise ValueError("Payload must not be empty")
-         if isinstance(payload[0], str):
-             tasks = [self._build_task(text, uuid.uuid4().hex) for text in payload]
-         elif isinstance(payload[0], dict):
-             tasks = [self._build_task(dic["text"], dic["id"]) for dic in payload]
-
-         else:
-             raise TypeError(
-                 "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}."
-             )
-
-         file_path = self.state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
-         with open(file_path, "w", encoding="utf-8") as f:
-             for task in tasks:
-                 f.write(json.dumps(task) + "\n")
-         return file_path
-
-     def start(self, payload: list[str | dict[str, str]], job_name: str):
-         """
-         Starts a new batch job by uploading the prepared file and creating a batch job on the server.
-         If a job with the same name already exists, it does nothing.
-         """
-         if self._load_state(job_name):
-             return
-         path = self._prepare_file(payload)
-         upload = self.client.files.create(file=open(path, "rb"), purpose="batch")
-         job = self.client.batches.create(
-             input_file_id=upload.id,
-             endpoint="/v1/chat/completions",
-             completion_window="24h",
-         ).to_dict()
-         self._save_state(job_name, [job])
-
-     def check_status(self, job_name: str) -> str:
-         """
-         Checks and returns the current status of the batch job with the given job name.
-         Updates the job state with the latest information from the server.
-         """
-         job = self._load_state(job_name)[0]
-         if not job:
-             return "completed"
-
-         info = self.client.batches.retrieve(job["id"])
-         job = info.to_dict()
-         self._save_state(job_name, [job])
-         print("HERE is the job", job)
-         return job["status"]
-
-     def _parsed(self, result: dict) -> list:
-         """
-         Parses the result dictionary, extracting the desired output or error for each item.
-         Returns a list of dictionaries with 'id' and 'output' keys.
-         """
-         modified_result = []
-
-         for key, d in result.items():
-             if "desired_output" in d:
-                 new_dict = {"id": key, "output": d["desired_output"]}
-                 modified_result.append(new_dict)
-             else:
-                 new_dict = {"id": key, "output": d["error"]}
-                 modified_result.append(new_dict)
-         return modified_result
-
-     def fetch_results(
-         self, job_name: str, remove_cache: bool = True
-     ) -> tuple[dict[str, str], list]:
-         """
-         Fetches the results of a completed batch job. Optionally saves the results to a file and/or removes the job cache.
-         Returns a tuple containing the parsed results and a log of errors (if any).
-         """
-         job = self._load_state(job_name)[0]
-         if not job:
-             return {}
-         batch_id = job["id"]
-
-         info = self.client.batches.retrieve(batch_id)
-         out_file_id = info.output_file_id
-         if not out_file_id:
-             error_file_id = info.error_file_id
-             if error_file_id:
-                 err_content = (
-                     self.client.files.content(error_file_id).read().decode("utf-8")
-                 )
-                 print("Error file content:", err_content)
-             return {}
-
-         content = self.client.files.content(out_file_id).read().decode("utf-8")
-         lines = content.splitlines()
-         results = {}
-         log = []
-         for line in lines:
-             result = json.loads(line)
-             custom_id = result["custom_id"]
-             if result["response"]["status_code"] == 200:
-                 content = result["response"]["body"]["choices"][0]["message"]["content"]
-                 try:
-                     parsed_content = json.loads(content)
-                     model_instance = self.output_model(**parsed_content)
-                     results[custom_id] = model_instance.model_dump(mode="json")
-                 except json.JSONDecodeError:
-                     results[custom_id] = {"error": "Failed to parse content as JSON"}
-                     error_d = {custom_id: results[custom_id]}
-                     log.append(error_d)
-                 except Exception as e:
-                     results[custom_id] = {"error": str(e)}
-                     error_d = {custom_id: results[custom_id]}
-                     log.append(error_d)
-             else:
-                 error_message = (
-                     result["response"]["body"]
-                     .get("error", {})
-                     .get("message", "Unknown error")
-                 )
-                 results[custom_id] = {"error": error_message}
-                 error_d = {custom_id: results[custom_id]}
-                 log.append(error_d)
-
-         for handler in self.handlers:
-             handler.handle(results)
-         if remove_cache:
-             self._clear_state(job_name)
-
-         return results, log
+ import json
+ import uuid
+ from pathlib import Path
+ from typing import Any, Type
+
+ from pydantic import BaseModel
+ from openai import OpenAI
+ from openai.lib._pydantic import to_strict_json_schema
+
+
+ class SimpleBatchManager:
+     """
+     Manages batch processing jobs for OpenAI's chat completions with structured outputs.
+
+     Handles the full lifecycle of a batch job: creating tasks from input texts,
+     starting the job, monitoring status, and fetching results. Results are automatically
+     parsed into the specified Pydantic output model. Job state is persisted to disk.
+     """
+
+     def __init__(
+         self,
+         client: OpenAI,
+         model: str,
+         output_model: Type[BaseModel],
+         prompt_template: str,
+         handlers: list[Any] | None = None,
+         state_dir: Path = Path(".batch_jobs"),
+         custom_json_schema_obj_str: dict | None = None,
+         **client_kwargs: Any,
+     ):
+         self.client = client
+         self.model = model
+         self.output_model = output_model
+         self.prompt_template = prompt_template
+         self.handlers = handlers or []
+         self.state_dir = state_dir
+         self.state_dir.mkdir(parents=True, exist_ok=True)
+         self.custom_json_schema_obj_str = custom_json_schema_obj_str
+         self.client_kwargs = client_kwargs
+         self.dict_input = False
+
+         if self.custom_json_schema_obj_str:
+             if self.custom_json_schema_obj_str is not dict:
+                 raise ValueError("schema should be a dict")
+
+     def _state_file(self, job_name: str) -> Path:
+         return self.state_dir / f"{job_name}.json"
+
+     def _load_state(self, job_name: str) -> list[dict[str, Any]]:
+         """
+         Loads the state (job information) from the state file for the given job name.
+         Returns an empty list if the state file does not exist.
+         """
+         path = self._state_file(job_name)
+         if path.exists():
+             with open(path, "r", encoding="utf-8") as f:
+                 return json.load(f)
+         return []
+
+     def _save_state(self, job_name: str, jobs: list[dict[str, Any]]) -> None:
+         """
+         Saves the job state to the state file for the given job name.
+         """
+         with open(self._state_file(job_name), "w", encoding="utf-8") as f:
+             json.dump(jobs, f)
+
+     def _clear_state(self, job_name: str) -> None:
+         """
+         Deletes the state file for the given job name if it exists.
+         """
+         path = self._state_file(job_name)
+         if path.exists():
+             path.unlink()
+
+     def _build_task(self, text: str, idx: str) -> dict[str, Any]:
+         """
+         Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
+         """
+         response_format_config: dict[str, Any]
+
+         if self.custom_json_schema_obj_str:
+             response_format_config = {
+                 "type": "json_schema",
+                 "json_schema": self.custom_json_schema_obj_str,
+             }
+         else:
+             raw_schema = to_strict_json_schema(self.output_model)
+             response_format_config = {
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": self.output_model.__name__,
+                     "schema": raw_schema,
+                 },
+             }
+
+         return {
+             "custom_id": str(idx),
+             "method": "POST",
+             "url": "/v1/chat/completions",
+             "body": {
+                 "model": self.model,
+                 "messages": [
+                     {"role": "system", "content": self.prompt_template},
+                     {"role": "user", "content": text},
+                 ],
+                 "response_format": response_format_config,
+                 **self.client_kwargs,
+             },
+         }
+
+     def _prepare_file(self, payload: list[str] | list[dict[str, str]]) -> Path:
+         """
+         Prepares a JSONL file containing all tasks for the batch job, based on the input payload.
+         Returns the path to the created file.
+         """
+         if not payload:
+             raise ValueError("Payload must not be empty")
+         if isinstance(payload[0], str):
+             tasks = [self._build_task(text, uuid.uuid4().hex) for text in payload]
+         elif isinstance(payload[0], dict):
+             tasks = [self._build_task(dic["text"], dic["id"]) for dic in payload]
+
+         else:
+             raise TypeError(
+                 "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}."
+             )
+
+         file_path = self.state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
+         with open(file_path, "w", encoding="utf-8") as f:
+             for task in tasks:
+                 f.write(json.dumps(task) + "\n")
+         return file_path
+
+     def start(self, payload: list[str | dict[str, str]], job_name: str):
+         """
+         Starts a new batch job by uploading the prepared file and creating a batch job on the server.
+         If a job with the same name already exists, it does nothing.
+         """
+         if self._load_state(job_name):
+             return
+         path = self._prepare_file(payload)
+         upload = self.client.files.create(file=open(path, "rb"), purpose="batch")
+         job = self.client.batches.create(
+             input_file_id=upload.id,
+             endpoint="/v1/chat/completions",
+             completion_window="24h",
+         ).to_dict()
+         self._save_state(job_name, [job])
+
+     def check_status(self, job_name: str) -> str:
+         """
+         Checks and returns the current status of the batch job with the given job name.
+         Updates the job state with the latest information from the server.
+         """
+         job = self._load_state(job_name)[0]
+         if not job:
+             return "completed"
+
+         info = self.client.batches.retrieve(job["id"])
+         job = info.to_dict()
+         self._save_state(job_name, [job])
+         print("HERE is the job", job)
+         return job["status"]
+
+     def _parsed(self, result: dict) -> list:
+         """
+         Parses the result dictionary, extracting the desired output or error for each item.
+         Returns a list of dictionaries with 'id' and 'output' keys.
+         """
+         modified_result = []
+
+         for key, d in result.items():
+             if "desired_output" in d:
+                 new_dict = {"id": key, "output": d["desired_output"]}
+                 modified_result.append(new_dict)
+             else:
+                 new_dict = {"id": key, "output": d["error"]}
+                 modified_result.append(new_dict)
+         return modified_result
+
+     def fetch_results(
+         self, job_name: str, remove_cache: bool = True
+     ) -> tuple[dict[str, str], list]:
+         """
+         Fetches the results of a completed batch job. Optionally saves the results to a file and/or removes the job cache.
+         Returns a tuple containing the parsed results and a log of errors (if any).
+         """
+         job = self._load_state(job_name)[0]
+         if not job:
+             return {}
+         batch_id = job["id"]
+
+         info = self.client.batches.retrieve(batch_id)
+         out_file_id = info.output_file_id
+         if not out_file_id:
+             error_file_id = info.error_file_id
+             if error_file_id:
+                 err_content = (
+                     self.client.files.content(error_file_id).read().decode("utf-8")
+                 )
+                 print("Error file content:", err_content)
+             return {}
+
+         content = self.client.files.content(out_file_id).read().decode("utf-8")
+         lines = content.splitlines()
+         results = {}
+         log = []
+         for line in lines:
+             result = json.loads(line)
+             custom_id = result["custom_id"]
+             if result["response"]["status_code"] == 200:
+                 content = result["response"]["body"]["choices"][0]["message"]["content"]
+                 try:
+                     parsed_content = json.loads(content)
+                     model_instance = self.output_model(**parsed_content)
+                     results[custom_id] = model_instance.model_dump(mode="json")
+                 except json.JSONDecodeError:
+                     results[custom_id] = {"error": "Failed to parse content as JSON"}
+                     error_d = {custom_id: results[custom_id]}
+                     log.append(error_d)
+                 except Exception as e:
+                     results[custom_id] = {"error": str(e)}
+                     error_d = {custom_id: results[custom_id]}
+                     log.append(error_d)
+             else:
+                 error_message = (
+                     result["response"]["body"]
+                     .get("error", {})
+                     .get("message", "Unknown error")
+                 )
+                 results[custom_id] = {"error": error_message}
+                 error_d = {custom_id: results[custom_id]}
+                 log.append(error_d)
+
+         for handler in self.handlers:
+             handler.handle(results)
+         if remove_cache:
+             self._clear_state(job_name)
+
+         return results, log
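
For readers assessing the flag above, the only change to this file between 1.0.3 and 1.0.5 is cosmetic: `Optional[list[Any]]` and `Optional[dict]` become `list[Any] | None` and `dict | None`, and the now-unused `Optional` import is dropped. The class docstring's lifecycle (create tasks, start the job, poll status, fetch results) maps directly onto the public methods. A minimal usage sketch of the shipped API, assuming `SimpleBatchManager` is re-exported from `texttools.batch` (the wheel's `__init__.py` suggests so, but this is unverified) and that `OPENAI_API_KEY` is set; the `Sentiment` model, the model name, and the job name are hypothetical placeholders:

from openai import OpenAI
from pydantic import BaseModel

from texttools.batch import SimpleBatchManager  # assumed re-export; unverified


class Sentiment(BaseModel):  # hypothetical structured-output model
    label: str
    confidence: float


manager = SimpleBatchManager(
    client=OpenAI(),  # reads OPENAI_API_KEY from the environment
    model="gpt-4o-mini",  # placeholder model name
    output_model=Sentiment,
    prompt_template="Classify the sentiment of the user's text.",
)

# start() no-ops if a state file for this job_name already exists in .batch_jobs/.
manager.start(["I love this!", "This is terrible."], job_name="sentiment-demo")

# The batch runs inside a 24h completion window; poll, then fetch once done.
if manager.check_status("sentiment-demo") == "completed":
    results, errors = manager.fetch_results("sentiment-demo")
    # results maps each custom_id to a validated Sentiment dump (or an error entry).

Note that in both versions the constructor guard `if self.custom_json_schema_obj_str is not dict:` tests identity against the `dict` type itself, which is true for every dict instance, so any non-empty custom schema raises `ValueError`; an `isinstance(self.custom_json_schema_obj_str, dict)` check was presumably intended.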