hamtaa-texttools 1.1.3__tar.gz → 1.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {hamtaa_texttools-1.1.3/hamtaa_texttools.egg-info → hamtaa_texttools-1.1.7}/PKG-INFO +52 -9
  2. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/README.md +51 -8
  3. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7/hamtaa_texttools.egg-info}/PKG-INFO +52 -9
  4. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/pyproject.toml +1 -1
  5. hamtaa_texttools-1.1.7/texttools/__init__.py +4 -0
  6. hamtaa_texttools-1.1.7/texttools/batch/__init__.py +3 -0
  7. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/batch/batch_manager.py +9 -11
  8. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/batch/batch_runner.py +53 -61
  9. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/async_the_tool.py +11 -11
  10. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/internals/async_operator.py +6 -6
  11. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/internals/base_operator.py +1 -2
  12. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/internals/operator.py +6 -6
  13. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/internals/output_models.py +7 -4
  14. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/the_tool.py +11 -11
  15. hamtaa_texttools-1.1.3/texttools/__init__.py +0 -9
  16. hamtaa_texttools-1.1.3/texttools/batch/__init__.py +0 -4
  17. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/LICENSE +0 -0
  18. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/MANIFEST.in +0 -0
  19. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/hamtaa_texttools.egg-info/SOURCES.txt +0 -0
  20. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
  21. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/hamtaa_texttools.egg-info/requires.txt +0 -0
  22. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/hamtaa_texttools.egg-info/top_level.txt +0 -0
  23. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/setup.cfg +0 -0
  24. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/README.md +0 -0
  25. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/categorizer.yaml +0 -0
  26. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/extract_entities.yaml +0 -0
  27. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/extract_keywords.yaml +0 -0
  28. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/is_question.yaml +0 -0
  29. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/merge_questions.yaml +0 -0
  30. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/rewrite.yaml +0 -0
  31. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/run_custom.yaml +0 -0
  32. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/subject_to_question.yaml +0 -0
  33. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/summarize.yaml +0 -0
  34. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/text_to_question.yaml +0 -0
  35. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/prompts/translate.yaml +0 -0
  36. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/__init__.py +0 -0
  37. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/internals/formatters.py +0 -0
  38. {hamtaa_texttools-1.1.3 → hamtaa_texttools-1.1.7}/texttools/tools/internals/prompt_loader.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hamtaa-texttools
- Version: 1.1.3
+ Version: 1.1.7
  Summary: A high-level NLP toolkit built on top of modern LLMs.
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>
  License: MIT License
@@ -86,6 +86,18 @@ All these parameters can be used individually or together to tailor the behavior
  
  ---
  
+ ## 🧩 ToolOutput
+
+ Every tool in `TextTools` returns a `ToolOutput` object, a Pydantic `BaseModel` with the following attributes:
+ - **`result`** → The output of the LLM (`type=Any`)
+ - **`analysis`** → The reasoning step before generating the final output (`type=str`)
+ - **`logprobs`** → Token-level probabilities for the generated output (`type=list`)
+ - **`errors`** → Any errors that occurred while calling the LLM (`type=list`)
+
+ **Note:** You can use `repr(ToolOutput)` to see the details of an output.
+
+ ---
+
  ## 🚀 Installation
  
  Install the latest release via PyPI:
@@ -123,13 +135,13 @@ the_tool = TheTool(client=client, model=model)
  detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
  print(detection.result)
  print(detection.logprobs)
- # Output: True \n --logprobs
+ # Output: True + logprobs
  
  # Example: Translation
  translation = the_tool.translate("سلام، حالت چطوره؟", target_language="English", with_analysis=True)
  print(translation.result)
  print(translation.analysis)
- # Output: "Hi! How are you?" \n --analysis
+ # Output: "Hi! How are you?" + analysis
  ```
  
  ---
@@ -149,19 +161,22 @@ async def main():
      model = "gpt-4o-mini"
  
      # Create an instance of AsyncTheTool
-     the_tool = AsyncTheTool(client=async_client, model=model)
+     async_the_tool = AsyncTheTool(client=async_client, model=model)
+
+     # Example: Async Translation and Keyword Extraction
+     translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_language="English")
+     keywords_task = async_the_tool.extract_keywords("Tomorrow, we will be dead by the car crash")
  
-     # Example: Async Translation
-     translation = await the_tool.translate("سلام، حالت چطوره؟", target_language="English")
+     (translation, keywords) = await asyncio.gather(translation_task, keywords_task)
      print(translation.result)
-     # Output: "Hi! How are you?"
+     print(keywords.result)
  
  asyncio.run(main())
  ```
  
  ---
  
- ## 📚 Use Cases
+ ## 👍 Use Cases
  
  Use **TextTools** when you need to:
  
@@ -169,7 +184,35 @@ Use **TextTools** when you need to:
  - 🌍 **Translate** and process multilingual corpora with ease
  - 🧩 **Integrate** LLMs into production pipelines (structured outputs)
  - 📊 **Analyze** large text collections using embeddings and categorization
- - 👍 **Automate** common text-processing tasks without reinventing the wheel
+
+ ---
+
+ ## 📚 Batch Processing
+
+ Process large datasets efficiently using OpenAI's batch API.
+
+ ### Quick Start
+
+ ```python
+ from pydantic import BaseModel
+
+ from texttools import BatchJobRunner, BatchConfig
+
+ # Configure your batch job
+ config = BatchConfig(
+     system_prompt="Extract entities from the text",
+     job_name="entity_extraction",
+     input_data_path="data.json",
+     output_data_filename="results.json",
+     model="gpt-4o-mini"
+ )
+
+ # Define your output schema
+ class Output(BaseModel):
+     entities: list[str]
+
+ # Run the batch job
+ runner = BatchJobRunner(config, output_model=Output)
+ runner.run()
+ ```
  
  ---
  
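A minimal sketch of consuming the new `ToolOutput` return type documented above. This assumes `OPENAI_API_KEY` is set in the environment and reuses the README's `gpt-4o-mini` model name; it is an illustration, not part of the diff:

```python
from openai import OpenAI

from texttools import TheTool

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
the_tool = TheTool(client=client, model="gpt-4o-mini")

out = the_tool.is_question("Is this project open source?")
print(repr(out))   # the new __repr__ shows result, analysis, logprobs, and errors
if out.errors:     # errors is a list[str]; empty when the call succeeded
    print("call failed:", out.errors)
else:
    print(out.result)  # e.g. True
```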
@@ -52,6 +52,18 @@ All these parameters can be used individually or together to tailor the behavior
  
  ---
  
+ ## 🧩 ToolOutput
+
+ Every tool in `TextTools` returns a `ToolOutput` object, a Pydantic `BaseModel` with the following attributes:
+ - **`result`** → The output of the LLM (`type=Any`)
+ - **`analysis`** → The reasoning step before generating the final output (`type=str`)
+ - **`logprobs`** → Token-level probabilities for the generated output (`type=list`)
+ - **`errors`** → Any errors that occurred while calling the LLM (`type=list`)
+
+ **Note:** You can use `repr(ToolOutput)` to see the details of an output.
+
+ ---
+
  ## 🚀 Installation
  
  Install the latest release via PyPI:
@@ -89,13 +101,13 @@ the_tool = TheTool(client=client, model=model)
  detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
  print(detection.result)
  print(detection.logprobs)
- # Output: True \n --logprobs
+ # Output: True + logprobs
  
  # Example: Translation
  translation = the_tool.translate("سلام، حالت چطوره؟", target_language="English", with_analysis=True)
  print(translation.result)
  print(translation.analysis)
- # Output: "Hi! How are you?" \n --analysis
+ # Output: "Hi! How are you?" + analysis
  ```
  
  ---
@@ -115,19 +127,22 @@ async def main():
      model = "gpt-4o-mini"
  
      # Create an instance of AsyncTheTool
-     the_tool = AsyncTheTool(client=async_client, model=model)
+     async_the_tool = AsyncTheTool(client=async_client, model=model)
+
+     # Example: Async Translation and Keyword Extraction
+     translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_language="English")
+     keywords_task = async_the_tool.extract_keywords("Tomorrow, we will be dead by the car crash")
  
-     # Example: Async Translation
-     translation = await the_tool.translate("سلام، حالت چطوره؟", target_language="English")
+     (translation, keywords) = await asyncio.gather(translation_task, keywords_task)
      print(translation.result)
-     # Output: "Hi! How are you?"
+     print(keywords.result)
  
  asyncio.run(main())
  ```
  
  ---
  
- ## 📚 Use Cases
+ ## 👍 Use Cases
  
  Use **TextTools** when you need to:
  
@@ -135,7 +150,35 @@ Use **TextTools** when you need to:
  - 🌍 **Translate** and process multilingual corpora with ease
  - 🧩 **Integrate** LLMs into production pipelines (structured outputs)
  - 📊 **Analyze** large text collections using embeddings and categorization
- - 👍 **Automate** common text-processing tasks without reinventing the wheel
+
+ ---
+
+ ## 📚 Batch Processing
+
+ Process large datasets efficiently using OpenAI's batch API.
+
+ ### Quick Start
+
+ ```python
+ from pydantic import BaseModel
+
+ from texttools import BatchJobRunner, BatchConfig
+
+ # Configure your batch job
+ config = BatchConfig(
+     system_prompt="Extract entities from the text",
+     job_name="entity_extraction",
+     input_data_path="data.json",
+     output_data_filename="results.json",
+     model="gpt-4o-mini"
+ )
+
+ # Define your output schema
+ class Output(BaseModel):
+     entities: list[str]
+
+ # Run the batch job
+ runner = BatchJobRunner(config, output_model=Output)
+ runner.run()
+ ```
  
  ---
  
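The updated async example above fires two tool calls concurrently instead of awaiting them one at a time. A minimal sketch of the same pattern with basic error isolation, assuming only what the README shows (`AsyncTheTool`, `translate`, `extract_keywords`, and an `OPENAI_API_KEY` in the environment):

```python
import asyncio

from openai import AsyncOpenAI

from texttools import AsyncTheTool


async def main():
    tool = AsyncTheTool(client=AsyncOpenAI(), model="gpt-4o-mini")
    # Create both coroutines first, then await them together so the two
    # requests are in flight at the same time.
    translation, keywords = await asyncio.gather(
        tool.translate("سلام، حالت چطوره؟", target_language="English"),
        tool.extract_keywords("LLMs make batch text processing cheap"),
        return_exceptions=True,  # one failed call won't cancel the other
    )
    for output in (translation, keywords):
        print(output if isinstance(output, Exception) else output.result)


asyncio.run(main())
```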
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hamtaa-texttools
- Version: 1.1.3
+ Version: 1.1.7
  Summary: A high-level NLP toolkit built on top of modern LLMs.
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>
  License: MIT License
@@ -86,6 +86,18 @@ All these parameters can be used individually or together to tailor the behavior
  
  ---
  
+ ## 🧩 ToolOutput
+
+ Every tool in `TextTools` returns a `ToolOutput` object, a Pydantic `BaseModel` with the following attributes:
+ - **`result`** → The output of the LLM (`type=Any`)
+ - **`analysis`** → The reasoning step before generating the final output (`type=str`)
+ - **`logprobs`** → Token-level probabilities for the generated output (`type=list`)
+ - **`errors`** → Any errors that occurred while calling the LLM (`type=list`)
+
+ **Note:** You can use `repr(ToolOutput)` to see the details of an output.
+
+ ---
+
  ## 🚀 Installation
  
  Install the latest release via PyPI:
@@ -123,13 +135,13 @@ the_tool = TheTool(client=client, model=model)
  detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
  print(detection.result)
  print(detection.logprobs)
- # Output: True \n --logprobs
+ # Output: True + logprobs
  
  # Example: Translation
  translation = the_tool.translate("سلام، حالت چطوره؟", target_language="English", with_analysis=True)
  print(translation.result)
  print(translation.analysis)
- # Output: "Hi! How are you?" \n --analysis
+ # Output: "Hi! How are you?" + analysis
  ```
  
  ---
@@ -149,19 +161,22 @@ async def main():
      model = "gpt-4o-mini"
  
      # Create an instance of AsyncTheTool
-     the_tool = AsyncTheTool(client=async_client, model=model)
+     async_the_tool = AsyncTheTool(client=async_client, model=model)
+
+     # Example: Async Translation and Keyword Extraction
+     translation_task = async_the_tool.translate("سلام، حالت چطوره؟", target_language="English")
+     keywords_task = async_the_tool.extract_keywords("Tomorrow, we will be dead by the car crash")
  
-     # Example: Async Translation
-     translation = await the_tool.translate("سلام، حالت چطوره؟", target_language="English")
+     (translation, keywords) = await asyncio.gather(translation_task, keywords_task)
      print(translation.result)
-     # Output: "Hi! How are you?"
+     print(keywords.result)
  
  asyncio.run(main())
  ```
  
  ---
  
- ## 📚 Use Cases
+ ## 👍 Use Cases
  
  Use **TextTools** when you need to:
  
@@ -169,7 +184,35 @@ Use **TextTools** when you need to:
  - 🌍 **Translate** and process multilingual corpora with ease
  - 🧩 **Integrate** LLMs into production pipelines (structured outputs)
  - 📊 **Analyze** large text collections using embeddings and categorization
- - 👍 **Automate** common text-processing tasks without reinventing the wheel
+
+ ---
+
+ ## 📚 Batch Processing
+
+ Process large datasets efficiently using OpenAI's batch API.
+
+ ### Quick Start
+
+ ```python
+ from pydantic import BaseModel
+
+ from texttools import BatchJobRunner, BatchConfig
+
+ # Configure your batch job
+ config = BatchConfig(
+     system_prompt="Extract entities from the text",
+     job_name="entity_extraction",
+     input_data_path="data.json",
+     output_data_filename="results.json",
+     model="gpt-4o-mini"
+ )
+
+ # Define your output schema
+ class Output(BaseModel):
+     entities: list[str]
+
+ # Run the batch job
+ runner = BatchJobRunner(config, output_model=Output)
+ runner.run()
+ ```
  
  ---
  
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
  
  [project]
  name = "hamtaa-texttools"
- version = "1.1.3"
+ version = "1.1.7"
  authors = [
      { name = "Tohidi", email = "the.mohammad.tohidi@gmail.com" },
      { name = "Montazer", email = "montazerh82@gmail.com" },
@@ -0,0 +1,4 @@
+ from .batch import BatchJobRunner, BatchConfig
+ from .tools import AsyncTheTool, TheTool
+
+ __all__ = ["TheTool", "AsyncTheTool", "BatchJobRunner", "BatchConfig"]
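A short sketch of what the rewritten package root means for callers: the four public names now import flatly, while the renamed `BatchManager` (formerly `SimpleBatchManager`) is no longer re-exported and is only reachable through its full module path:

```python
# Public API after 1.1.7: flat imports from the package root.
from texttools import TheTool, AsyncTheTool, BatchJobRunner, BatchConfig

# The renamed manager is not re-exported from texttools.batch's __init__,
# so it must be imported from its defining module.
from texttools.batch.batch_manager import BatchManager
```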
@@ -0,0 +1,3 @@
+ from .batch_runner import BatchJobRunner, BatchConfig
+
+ __all__ = ["BatchJobRunner", "BatchConfig"]
@@ -1,18 +1,20 @@
  import json
  import uuid
  from pathlib import Path
- from typing import Any, Type
+ from typing import Any, Type, TypeVar
  import logging
  
  from pydantic import BaseModel
  from openai import OpenAI
  from openai.lib._pydantic import to_strict_json_schema
  
- logger = logging.getLogger("batch_runner")
- logger.setLevel(logging.INFO)
+ # Base Model type for output models
+ T = TypeVar("T", bound=BaseModel)
  
+ logger = logging.getLogger("texttools.batch_runner")
  
- class SimpleBatchManager:
+
+ class BatchManager:
      """
      Manages batch processing jobs for OpenAI's chat completions with structured outputs.
  
@@ -25,9 +27,8 @@ class SimpleBatchManager:
          self,
          client: OpenAI,
          model: str,
-         output_model: Type[BaseModel],
+         output_model: Type[T],
          prompt_template: str,
-         handlers: list[Any] | None = None,
          state_dir: Path = Path(".batch_jobs"),
          custom_json_schema_obj_str: dict | None = None,
          **client_kwargs: Any,
@@ -36,7 +37,6 @@ class SimpleBatchManager:
          self.model = model
          self.output_model = output_model
          self.prompt_template = prompt_template
-         self.handlers = handlers or []
          self.state_dir = state_dir
          self.state_dir.mkdir(parents=True, exist_ok=True)
          self.custom_json_schema_obj_str = custom_json_schema_obj_str
@@ -45,7 +45,7 @@
  
          if self.custom_json_schema_obj_str:
              if self.custom_json_schema_obj_str is not dict:
-                 raise ValueError("schema should be a dict")
+                 raise ValueError("Schema should be a dict")
  
      def _state_file(self, job_name: str) -> Path:
          return self.state_dir / f"{job_name}.json"
@@ -126,7 +126,7 @@ class SimpleBatchManager:
  
          else:
              raise TypeError(
-                 "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}."
+                 "The input must be either a list of texts or a dictionary in the form {'id': str, 'text': str}"
              )
  
          file_path = self.state_dir / f"batch_{uuid.uuid4().hex}.jsonl"
@@ -220,8 +220,6 @@
              error_d = {custom_id: results[custom_id]}
              log.append(error_d)
  
-         for handler in self.handlers:
-             handler.handle(results)
          if remove_cache:
              self._clear_state(job_name)
  
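One context line retained above, `if self.custom_json_schema_obj_str is not dict:`, compares the value to the `dict` class by identity, so the `ValueError` only fires when a caller passes the type `dict` itself. A hedged sketch of the isinstance-based guard the error message seems to intend (`validate_schema` is a hypothetical helper, not part of the package):

```python
def validate_schema(custom_json_schema_obj_str: object) -> dict:
    # `x is not dict` is an identity test against the dict class; it does not
    # check the value's type. isinstance does what the message describes.
    if not isinstance(custom_json_schema_obj_str, dict):
        raise ValueError("Schema should be a dict")
    return custom_json_schema_obj_str
```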
@@ -3,24 +3,23 @@ import os
  import time
  from dataclasses import dataclass
  from pathlib import Path
- from typing import Any, Callable
+ from typing import Any, Callable, Type, TypeVar
  import logging
  
  from dotenv import load_dotenv
  from openai import OpenAI
  from pydantic import BaseModel
  
- from texttools.batch import SimpleBatchManager
+ from texttools.batch.batch_manager import BatchManager
+ from texttools.tools.internals.output_models import StrOutput
  
- logger = logging.getLogger("batch_runner")
- logger.setLevel(logging.INFO)
+ # Base Model type for output models
+ T = TypeVar("T", bound=BaseModel)
  
+ logger = logging.getLogger("texttools.batch_runner")
  
- class OutputModel(BaseModel):
-     desired_output: str
  
-
- def export_data(data):
+ def export_data(data) -> list[dict[str, str]]:
      """
      Produces a structure of the following form from an initial data structure:
      [{"id": str, "text": str},...]
@@ -28,7 +27,7 @@ def export_data(data):
      return data
  
  
- def import_data(data):
+ def import_data(data) -> Any:
      """
      Takes the output and adds and aggregates it to the original structure.
      """
@@ -47,9 +46,9 @@ class BatchConfig:
      output_data_filename: str = ""
      model: str = "gpt-4.1-mini"
      MAX_BATCH_SIZE: int = 100
-     MAX_TOTAL_TOKENS: int = 2000000
+     MAX_TOTAL_TOKENS: int = 2_000_000
      CHARS_PER_TOKEN: float = 2.7
-     PROMPT_TOKEN_MULTIPLIER: int = 1000
+     PROMPT_TOKEN_MULTIPLIER: int = 1_000
      BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
      import_function: Callable = import_data
      export_function: Callable = export_data
@@ -63,7 +62,7 @@ class BatchJobRunner:
      """
  
      def __init__(
-         self, config: BatchConfig = BatchConfig(), output_model: type = OutputModel
+         self, config: BatchConfig = BatchConfig(), output_model: Type[T] = StrOutput
      ):
          self.config = config
          self.system_prompt = config.system_prompt
@@ -82,11 +81,11 @@
          # Track retry attempts per part
          self.part_attempts: dict[int, int] = {}
  
-     def _init_manager(self) -> SimpleBatchManager:
+     def _init_manager(self) -> BatchManager:
          load_dotenv()
          api_key = os.getenv("OPENAI_API_KEY")
          client = OpenAI(api_key=api_key)
-         return SimpleBatchManager(
+         return BatchManager(
              client=client,
              model=self.model,
              prompt_template=self.system_prompt,
@@ -101,12 +100,12 @@
          # Ensure data is a list of dicts with 'id' and 'content' as strings
          if not isinstance(data, list):
              raise ValueError(
-                 'Exported data must be a list in this form: [ {"id": str, "content": str},...]'
+                 "Exported data must be a list of dicts with 'id' and 'content' keys"
              )
          for item in data:
              if not (isinstance(item, dict) and "id" in item and "content" in item):
                  raise ValueError(
-                     "Each item must be a dict with 'id' and 'content' keys."
+                     f"Item must be a dict with 'id' and 'content' keys. Got: {type(item)}"
                  )
              if not (isinstance(item["id"], str) and isinstance(item["content"], str)):
                  raise ValueError("'id' and 'content' must be strings.")
@@ -161,7 +160,45 @@
          logger.info("Uploading...")
          time.sleep(30)
  
+     def _save_results(
+         self,
+         output_data: list[dict[str, Any]] | dict[str, Any],
+         log: list[Any],
+         part_idx: int,
+     ):
+         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
+         result_path = (
+             Path(self.config.BASE_OUTPUT_DIR)
+             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
+         )
+         if not output_data:
+             logger.info("No output data to save. Skipping this part.")
+             return
+         else:
+             with open(result_path, "w", encoding="utf-8") as f:
+                 json.dump(output_data, f, ensure_ascii=False, indent=4)
+         if log:
+             log_path = (
+                 Path(self.config.BASE_OUTPUT_DIR)
+                 / f"{Path(self.output_data_filename).stem}{part_suffix}_log.json"
+             )
+             with open(log_path, "w", encoding="utf-8") as f:
+                 json.dump(log, f, ensure_ascii=False, indent=4)
+
+     def _result_exists(self, part_idx: int) -> bool:
+         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
+         result_path = (
+             Path(self.config.BASE_OUTPUT_DIR)
+             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
+         )
+         return result_path.exists()
+
      def run(self):
+         """
+         Execute the batch job processing pipeline.
+
+         Submits jobs, monitors progress, handles retries, and saves results.
+         """
          # Submit all jobs up-front for concurrent execution
          self._submit_all_jobs()
          pending_parts: set[int] = set(self.part_idx_to_job_name.keys())
@@ -215,48 +252,3 @@
              f"Waiting {self.config.poll_interval_seconds}s before next status check for parts: {sorted(pending_parts)}"
          )
          time.sleep(self.config.poll_interval_seconds)
-
-     def _save_results(
-         self,
-         output_data: list[dict[str, Any]] | dict[str, Any],
-         log: list[Any],
-         part_idx: int,
-     ):
-         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
-         result_path = (
-             Path(self.config.BASE_OUTPUT_DIR)
-             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
-         )
-         if not output_data:
-             logger.info("No output data to save. Skipping this part.")
-             return
-         else:
-             with open(result_path, "w", encoding="utf-8") as f:
-                 json.dump(output_data, f, ensure_ascii=False, indent=4)
-         if log:
-             log_path = (
-                 Path(self.config.BASE_OUTPUT_DIR)
-                 / f"{Path(self.output_data_filename).stem}{part_suffix}_log.json"
-             )
-             with open(log_path, "w", encoding="utf-8") as f:
-                 json.dump(log, f, ensure_ascii=False, indent=4)
-
-     def _result_exists(self, part_idx: int) -> bool:
-         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
-         result_path = (
-             Path(self.config.BASE_OUTPUT_DIR)
-             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
-         )
-         return result_path.exists()
-
-
- if __name__ == "__main__":
-     logger.info("=== Batch Job Runner ===")
-     config = BatchConfig(
-         system_prompt="",
-         job_name="job_name",
-         input_data_path="Data.json",
-         output_data_filename="output",
-     )
-     runner = BatchJobRunner(config)
-     runner.run()
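The config constants above (`MAX_BATCH_SIZE`, `MAX_TOTAL_TOKENS`, `CHARS_PER_TOKEN`) suggest how inputs get split into parts before submission. The splitting logic itself is outside this diff, so the sketch below is an assumption about how such a budget could be applied, not the package's actual algorithm:

```python
def estimate_tokens(text: str, chars_per_token: float = 2.7) -> int:
    # Character-count heuristic mirroring BatchConfig.CHARS_PER_TOKEN.
    return int(len(text) / chars_per_token)


def split_into_parts(
    texts: list[str],
    max_batch_size: int = 100,
    max_total_tokens: int = 2_000_000,
) -> list[list[str]]:
    # Greedily pack texts into parts without exceeding either limit.
    parts: list[list[str]] = []
    current: list[str] = []
    budget = 0
    for text in texts:
        cost = estimate_tokens(text)
        if current and (budget + cost > max_total_tokens or len(current) >= max_batch_size):
            parts.append(current)
            current, budget = [], 0
        current.append(text)
        budget += cost
    if current:
        parts.append(current)
    return parts
```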
@@ -1,4 +1,4 @@
- from typing import Literal, Any
+ from typing import Literal, Any, Callable
  
  from openai import AsyncOpenAI
  
@@ -34,7 +34,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Categorize a text into a single Islamic studies domain category.
@@ -71,7 +71,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Extract salient keywords from text.
@@ -108,7 +108,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Perform Named Entity Recognition (NER) over the input text.
@@ -144,7 +144,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Detect if the input is phrased as a question.
@@ -181,7 +181,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Generate a single question from the given text.
@@ -219,7 +219,7 @@ class AsyncTheTool:
          logprobs: bool = False,
          top_logprobs: int | None = None,
          mode: Literal["default", "reason"] = "default",
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Merge multiple questions into a single unified question.
@@ -258,7 +258,7 @@ class AsyncTheTool:
          logprobs: bool = False,
          top_logprobs: int | None = None,
          mode: Literal["positive", "negative", "hard_negative"] = "positive",
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Rewrite a text with different modes.
@@ -296,7 +296,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Generate a list of questions about a subject.
@@ -334,7 +334,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Summarize the given subject text.
@@ -371,7 +371,7 @@ class AsyncTheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Translate text between languages.
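The signature change repeated across every tool above tightens `validator` from `Any` to `Callable[[Any], bool]`: any callable that receives the parsed result and returns whether it is acceptable. A minimal sketch (the predicate itself is hypothetical, not shipped with the package):

```python
def non_empty_list(result: object) -> bool:
    # Accept the LLM output only if it parsed into a non-empty list.
    return isinstance(result, list) and len(result) > 0

# keywords = await async_the_tool.extract_keywords(
#     "LLMs make batch text processing cheap",
#     validator=non_empty_list,
# )
```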
@@ -1,4 +1,4 @@
- from typing import Any, TypeVar, Type, Literal
+ from typing import Any, TypeVar, Type, Literal, Callable
  import logging
  
  from openai import AsyncOpenAI
@@ -12,8 +12,7 @@ from texttools.tools.internals.prompt_loader import PromptLoader
  # Base Model type for output models
  T = TypeVar("T", bound=BaseModel)
  
- logger = logging.getLogger("async_operator")
- logger.setLevel(logging.INFO)
+ logger = logging.getLogger("texttools.async_operator")
  
  
  class AsyncOperator(BaseOperator):
@@ -115,7 +114,7 @@ class AsyncOperator(BaseOperator):
          temperature: float,
          logprobs: bool,
          top_logprobs: int | None,
-         validator: Any | None,
+         validator: Callable[[Any], bool] | None,
          # Internal parameters
          prompt_file: str,
          output_model: Type[T],
@@ -128,7 +127,7 @@ class AsyncOperator(BaseOperator):
          """
          prompt_loader = PromptLoader()
          formatter = Formatter()
-         output = ToolOutput(result="", analysis="", logprobs=[], errors=[])
+         output = ToolOutput()
  
          try:
              # Prompt configs contain two keys: main_template and analyze template; both are strings
@@ -239,4 +238,5 @@ class AsyncOperator(BaseOperator):
  
          except Exception as e:
              logger.error(f"AsyncTheTool failed: {e}")
-             return output.errors.append(str(e))
+             output.errors.append(str(e))
+             return output
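The error-path change above fixes a subtle bug: `list.append` returns `None`, so the old `return output.errors.append(str(e))` returned `None` instead of a `ToolOutput`. A two-line demonstration of why the fixed version appends first and then returns the object:

```python
errors: list[str] = []
assert errors.append("boom") is None  # append mutates in place and returns None
assert errors == ["boom"]             # the mutation happened; the return value didn't carry it
```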
@@ -11,8 +11,7 @@ from openai import OpenAI, AsyncOpenAI
  # Base Model type for output models
  T = TypeVar("T", bound=BaseModel)
  
- logger = logging.getLogger("base_operator")
- logger.setLevel(logging.INFO)
+ logger = logging.getLogger("texttools.base_operator")
  
  
  class BaseOperator:
@@ -1,4 +1,4 @@
- from typing import Any, TypeVar, Type, Literal
+ from typing import Any, TypeVar, Type, Literal, Callable
  import logging
  
  from openai import OpenAI
@@ -12,8 +12,7 @@ from texttools.tools.internals.prompt_loader import PromptLoader
  # Base Model type for output models
  T = TypeVar("T", bound=BaseModel)
  
- logger = logging.getLogger("operator")
- logger.setLevel(logging.INFO)
+ logger = logging.getLogger("texttools.operator")
  
  
  class Operator(BaseOperator):
@@ -115,7 +114,7 @@ class Operator(BaseOperator):
          temperature: float,
          logprobs: bool,
          top_logprobs: int | None,
-         validator: Any | None,
+         validator: Callable[[Any], bool] | None,
          # Internal parameters
          prompt_file: str,
          output_model: Type[T],
@@ -128,7 +127,7 @@ class Operator(BaseOperator):
          """
          prompt_loader = PromptLoader()
          formatter = Formatter()
-         output = ToolOutput(result="", analysis="", logprobs=[], errors=[])
+         output = ToolOutput()
  
          try:
              # Prompt configs contain two keys: main_template and analyze template; both are strings
@@ -239,4 +238,5 @@ class Operator(BaseOperator):
  
          except Exception as e:
              logger.error(f"TheTool failed: {e}")
-             return output.errors.append(str(e))
+             output.errors.append(str(e))
+             return output
@@ -4,10 +4,13 @@ from pydantic import BaseModel, Field
  
  
  class ToolOutput(BaseModel):
-     result: str
-     analysis: str
-     logprobs: list[dict[str, Any]]
-     errors: list[str]
+     result: Any = None
+     analysis: str = ""
+     logprobs: list[dict[str, Any]] = []
+     errors: list[str] = []
+
+     def __repr__(self) -> str:
+         return f"ToolOutput(result_type='{type(self.result)}', result='{self.result}', analysis='{self.analysis}', logprobs='{self.logprobs}', errors='{self.errors}'"
  
  
  class StrOutput(BaseModel):
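The new field defaults above rely on Pydantic's handling of mutable defaults: unlike plain class attributes, each model instance gets its own copy of `[]`, so two outputs never share an error list. A small sketch of that behavior (the trimmed model mirrors the diff):

```python
from typing import Any

from pydantic import BaseModel


class ToolOutput(BaseModel):
    result: Any = None
    analysis: str = ""
    logprobs: list[dict[str, Any]] = []
    errors: list[str] = []


a, b = ToolOutput(), ToolOutput()
a.errors.append("boom")
assert b.errors == []  # pydantic copies the default list per instance
```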
@@ -1,4 +1,4 @@
- from typing import Literal, Any
+ from typing import Literal, Any, Callable
  
  from openai import OpenAI
  
@@ -32,7 +32,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Categorize a text into a single Islamic studies domain category.
@@ -69,7 +69,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Extract salient keywords from text.
@@ -106,7 +106,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Perform Named Entity Recognition (NER) over the input text.
@@ -142,7 +142,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Detect if the input is phrased as a question.
@@ -179,7 +179,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Generate a single question from the given text.
@@ -217,7 +217,7 @@ class TheTool:
          logprobs: bool = False,
          top_logprobs: int | None = None,
          mode: Literal["default", "reason"] = "default",
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Merge multiple questions into a single unified question.
@@ -256,7 +256,7 @@ class TheTool:
          logprobs: bool = False,
          top_logprobs: int | None = None,
          mode: Literal["positive", "negative", "hard_negative"] = "positive",
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Rewrite a text with different modes.
@@ -294,7 +294,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Generate a list of questions about a subject.
@@ -332,7 +332,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Summarize the given subject text.
@@ -369,7 +369,7 @@ class TheTool:
          temperature: float | None = 0.0,
          logprobs: bool = False,
          top_logprobs: int | None = None,
-         validator: Any | None = None,
+         validator: Callable[[Any], bool] | None = None,
      ) -> OutputModels.ToolOutput:
          """
          Translate text between languages.
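Since every sync tool accepts `logprobs`/`top_logprobs` alongside the tightened `validator`, here is a hedged sketch of turning the `logprobs` field into readable probabilities. The entry shape (`token`/`logprob` keys) is assumed from OpenAI's chat-completions logprobs format and is not confirmed by this diff; `the_tool` is the instance from the README example:

```python
import math

detection = the_tool.is_question(
    "Is this project open source?", logprobs=True, top_logprobs=2
)
for entry in detection.logprobs:  # list[dict[str, Any]]
    token = entry.get("token")
    logprob = entry.get("logprob")
    if token is not None and logprob is not None:
        print(token, f"{math.exp(logprob):.2%}")  # convert logprob to a probability
```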
@@ -1,9 +0,0 @@
- from .batch import BatchJobRunner, SimpleBatchManager
- from .tools import AsyncTheTool, TheTool
-
- __all__ = [
-     "TheTool",
-     "AsyncTheTool",
-     "SimpleBatchManager",
-     "BatchJobRunner",
- ]
@@ -1,4 +0,0 @@
- from .batch_manager import SimpleBatchManager
- from .batch_runner import BatchJobRunner
-
- __all__ = ["SimpleBatchManager", "BatchJobRunner"]