hamtaa-texttools 0.1.48__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hamtaa-texttools might be problematic.
- hamtaa_texttools-1.1.7.dist-info/METADATA +228 -0
- hamtaa_texttools-1.1.7.dist-info/RECORD +30 -0
- hamtaa_texttools-1.1.7.dist-info/licenses/LICENSE +21 -0
- texttools/__init__.py +4 -26
- texttools/batch/__init__.py +3 -0
- texttools/{utils/batch_manager → batch}/batch_manager.py +226 -241
- texttools/batch/batch_runner.py +254 -0
- texttools/prompts/README.md +35 -0
- texttools/prompts/categorizer.yaml +28 -0
- texttools/prompts/extract_entities.yaml +20 -0
- texttools/prompts/extract_keywords.yaml +18 -0
- texttools/prompts/is_question.yaml +14 -0
- texttools/prompts/merge_questions.yaml +46 -0
- texttools/prompts/rewrite.yaml +111 -0
- texttools/prompts/run_custom.yaml +7 -0
- texttools/prompts/subject_to_question.yaml +22 -0
- texttools/prompts/summarize.yaml +14 -0
- texttools/prompts/text_to_question.yaml +20 -0
- texttools/prompts/translate.yaml +15 -0
- texttools/tools/__init__.py +4 -33
- texttools/tools/async_the_tool.py +435 -0
- texttools/tools/internals/async_operator.py +242 -0
- texttools/tools/internals/base_operator.py +100 -0
- texttools/tools/internals/formatters.py +24 -0
- texttools/tools/internals/operator.py +242 -0
- texttools/tools/internals/output_models.py +62 -0
- texttools/tools/internals/prompt_loader.py +60 -0
- texttools/tools/the_tool.py +433 -0
- hamtaa_texttools-0.1.48.dist-info/METADATA +0 -60
- hamtaa_texttools-0.1.48.dist-info/RECORD +0 -61
- texttools/base/__init__.py +0 -3
- texttools/base/base_categorizer.py +0 -40
- texttools/base/base_keyword_extractor.py +0 -35
- texttools/base/base_ner_extractor.py +0 -61
- texttools/base/base_question_detector.py +0 -35
- texttools/base/base_question_generator.py +0 -99
- texttools/base/base_question_merger.py +0 -59
- texttools/base/base_question_rewriter.py +0 -61
- texttools/base/base_router.py +0 -33
- texttools/base/base_summarizer.py +0 -55
- texttools/base/base_task_performer.py +0 -53
- texttools/base/base_translator.py +0 -38
- texttools/formatter/__init__.py +0 -1
- texttools/formatter/base.py +0 -26
- texttools/formatter/gemma3_formatter.py +0 -54
- texttools/handlers/__init__.py +0 -6
- texttools/handlers/categorizer/__init__.py +0 -6
- texttools/handlers/categorizer/categorizer.py +0 -61
- texttools/handlers/handlers.py +0 -88
- texttools/tools/categorizer/__init__.py +0 -2
- texttools/tools/categorizer/encoder_model/__init__.py +0 -1
- texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
- texttools/tools/categorizer/llm/__init__.py +0 -2
- texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
- texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
- texttools/tools/keyword_extractor/__init__.py +0 -1
- texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
- texttools/tools/merger/__init__.py +0 -2
- texttools/tools/merger/gemma_question_merger.py +0 -214
- texttools/tools/ner/__init__.py +0 -1
- texttools/tools/ner/gemma_ner_extractor.py +0 -157
- texttools/tools/question_detector/__init__.py +0 -2
- texttools/tools/question_detector/gemma_detector.py +0 -114
- texttools/tools/question_detector/llm_detector.py +0 -112
- texttools/tools/question_generator/__init__.py +0 -1
- texttools/tools/question_generator/gemma_question_generator.py +0 -198
- texttools/tools/reranker/__init__.py +0 -3
- texttools/tools/reranker/reranker.py +0 -137
- texttools/tools/reranker/scorer.py +0 -216
- texttools/tools/reranker/sorter.py +0 -278
- texttools/tools/rewriter/__init__.py +0 -2
- texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
- texttools/tools/router/__init__.py +0 -0
- texttools/tools/router/gemma_router.py +0 -169
- texttools/tools/subject_to_question/__init__.py +0 -1
- texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
- texttools/tools/summarizer/__init__.py +0 -2
- texttools/tools/summarizer/gemma_summarizer.py +0 -140
- texttools/tools/summarizer/llm_summerizer.py +0 -108
- texttools/tools/translator/__init__.py +0 -1
- texttools/tools/translator/gemma_translator.py +0 -189
- texttools/utils/batch_manager/__init__.py +0 -2
- texttools/utils/batch_manager/batch_runner.py +0 -207
- texttools/utils/flex_processor.py +0 -78
- {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/top_level.txt +0 -0
texttools/utils/batch_manager/batch_runner.py
@@ -1,207 +0,0 @@
-import json
-import os
-import time
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Callable
-
-# from dotenv import load_dotenv
-from openai import OpenAI
-from pydantic import BaseModel
-
-from texttools.batch_manager import SimpleBatchManager
-
-
-class OutputModel(BaseModel):
-    desired_output: str
-
-
-def exporting_data(data):
-    """
-    Produces a structure of the following form from an initial data structure:
-    [
-        {"id": str, "content": str},...
-    ]
-    """
-    return data
-
-
-def importing_data(data):
-    """
-    Takes the output and adds and aggregates it to the original structure.
-    """
-    return data
-
-
-@dataclass
-class BatchConfig:
-    """
-    Configuration for batch job runner.
-    """
-
-    system_prompt: str = ""
-    job_name: str = ""
-    input_data_path: str = ""
-    output_data_filename: str = ""
-    model: str = "gpt-4.1-mini"
-    MAX_BATCH_SIZE: int = 100
-    MAX_TOTAL_TOKENS: int = 2000000
-    CHARS_PER_TOKEN: float = 2.7
-    PROMPT_TOKEN_MULTIPLIER: int = 1000
-    BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
-    import_function: Callable = importing_data
-    export_function: Callable = exporting_data
-
-
-class BatchJobRunner:
-    """
-    Handles running batch jobs using a batch manager and configuration.
-    """
-
-    def __init__(
-        self, config: BatchConfig = BatchConfig(), output_model: type = OutputModel
-    ):
-        self.config = config
-        self.system_prompt = config.system_prompt
-        self.job_name = config.job_name
-        self.input_data_path = config.input_data_path
-        self.output_data_filename = config.output_data_filename
-        self.model = config.model
-        self.output_model = output_model
-        self.manager = self._init_manager()
-        self.data = self._load_data()
-        self.parts: list[list[dict[str, Any]]] = []
-        self._partition_data()
-        Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
-
-    def _init_manager(self) -> SimpleBatchManager:
-        # load_dotenv()
-        api_key = os.getenv("OPENAI_API_KEY")
-        client = OpenAI(api_key=api_key)
-        return SimpleBatchManager(
-            client=client,
-            model=self.model,
-            prompt_template=self.system_prompt,
-            output_model=self.output_model,
-        )
-
-    def _load_data(self):
-        with open(self.input_data_path, "r", encoding="utf-8") as f:
-            data = json.load(f)
-        data = self.config.export_function(data)
-
-        # Validation: ensure data is a list of dicts with 'id' and 'content' as strings
-        if not isinstance(data, list):
-            raise ValueError(
-                'Exported data must be a list in this form: [ {"id": str, "content": str},...]'
-            )
-        for item in data:
-            if not (isinstance(item, dict) and "id" in item and "content" in item):
-                raise ValueError(
-                    "Each item must be a dict with 'id' and 'content' keys."
-                )
-            if not (isinstance(item["id"], str) and isinstance(item["content"], str)):
-                raise ValueError("'id' and 'content' must be strings.")
-        return data
-
-    def _partition_data(self):
-        total_length = sum(len(item["content"]) for item in self.data)
-        prompt_length = len(self.system_prompt)
-        total = total_length + (prompt_length * len(self.data))
-        calculation = total / self.config.CHARS_PER_TOKEN
-        print(
-            f"Total chars: {total_length}, Prompt chars: {prompt_length}, Total: {total}, Tokens: {calculation}"
-        )
-        if calculation < self.config.MAX_TOTAL_TOKENS:
-            self.parts = [self.data]
-        else:
-            # Partition into chunks of MAX_BATCH_SIZE
-            self.parts = [
-                self.data[i : i + self.config.MAX_BATCH_SIZE]
-                for i in range(0, len(self.data), self.config.MAX_BATCH_SIZE)
-            ]
-        print(f"Data split into {len(self.parts)} part(s)")
-
-    def run(self):
-        for idx, part in enumerate(self.parts):
-            if self._result_exists(idx):
-                print(f"Skipping part {idx + 1}: result already exists.")
-                continue
-            part_job_name = (
-                f"{self.job_name}_part_{idx + 1}"
-                if len(self.parts) > 1
-                else self.job_name
-            )
-            print(
-                f"\n--- Processing part {idx + 1}/{len(self.parts)}: {part_job_name} ---"
-            )
-            self._process_part(part, part_job_name, idx)
-
-    def _process_part(
-        self, part: list[dict[str, Any]], part_job_name: str, part_idx: int
-    ):
-        while True:
-            print(f"Starting job for part: {part_job_name}")
-            self.manager.start(part, job_name=part_job_name)
-            print("Started batch job. Checking status...")
-            while True:
-                status = self.manager.check_status(job_name=part_job_name)
-                print(f"Status: {status}")
-                if status == "completed":
-                    print("Job completed. Fetching results...")
-                    output_data, log = self.manager.fetch_results(
-                        job_name=part_job_name, remove_cache=False
-                    )
-                    output_data = self.config.import_function(output_data)
-                    self._save_results(output_data, log, part_idx)
-                    print("Fetched and saved results for this part.")
-                    return
-                elif status == "failed":
-                    print("Job failed. Clearing state, waiting, and retrying...")
-                    self.manager._clear_state(part_job_name)
-                    time.sleep(10)  # Wait before retrying
-                    break  # Break inner loop to restart the job
-                else:
-                    time.sleep(5)  # Wait before checking again
-
-    def _save_results(
-        self, output_data: list[dict[str, Any]], log: list[Any], part_idx: int
-    ):
-        part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
-        result_path = (
-            Path(self.config.BASE_OUTPUT_DIR)
-            / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
-        )
-        if not output_data:
-            print("No output data to save. Skipping this part.")
-            return
-        else:
-            with open(result_path, "w", encoding="utf-8") as f:
-                json.dump(output_data, f, ensure_ascii=False, indent=4)
-        if log:
-            log_path = (
-                Path(self.config.BASE_OUTPUT_DIR)
-                / f"{Path(self.output_data_filename).stem}{part_suffix}_log.json"
-            )
-            with open(log_path, "w", encoding="utf-8") as f:
-                json.dump(log, f, ensure_ascii=False, indent=4)
-
-    def _result_exists(self, part_idx: int) -> bool:
-        part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
-        result_path = (
-            Path(self.config.BASE_OUTPUT_DIR)
-            / f"{Path(self.output_data_path).stem}{part_suffix}.json"
-        )
-        return result_path.exists()
-
-
-if __name__ == "__main__":
-    print("=== Batch Job Runner ===")
-    config = BatchConfig(
-        system_prompt="",
-        job_name="job_name",
-        input_data_path="Data.json",
-        output_data_filename="output",
-    )
-    runner = BatchJobRunner(config)
-    runner.run()
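The removed runner's only data contract sits in the export_function / import_function hooks on BatchConfig: export must return a list of {"id": str, "content": str} dicts (this is exactly what _load_data() validates), and import maps the fetched results back onto the caller's structure. A minimal sketch of a custom adapter pair follows; the module path assumes the old 0.1.48 layout, and the input shape {"articles": [{"doc_id": ..., "text": ...}]} is purely illustrative.

# Hypothetical adapter pair for the removed BatchJobRunner (0.1.48 layout).
# Requires OPENAI_API_KEY in the environment (read in _init_manager).
from texttools.utils.batch_manager.batch_runner import BatchConfig, BatchJobRunner

def export_articles(data):
    # Flatten the illustrative {"articles": [...]} input into the
    # [{"id": str, "content": str}, ...] form that _load_data() validates.
    return [
        {"id": str(a["doc_id"]), "content": a["text"]}
        for a in data["articles"]
    ]

def import_articles(output_data):
    # Pass per-item results through unchanged; a real adapter could merge
    # them back into the original structure by id.
    return output_data

config = BatchConfig(
    system_prompt="Extract the named entities from the text.",
    job_name="entity_extraction",
    input_data_path="Data.json",
    output_data_filename="entities",
    export_function=export_articles,
    import_function=import_articles,
)
BatchJobRunner(config).run()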
texttools/utils/flex_processor.py
@@ -1,78 +0,0 @@
-import random
-import asyncio
-from openai import OpenAI, RateLimitError, APIError
-from typing import Optional
-from pydantic import BaseModel, ValidationError
-import httpx
-
-# http_client = httpx()
-# test_client = OpenAI(http_client=http_client)
-
-async def flex_processing(
-    LLM_client: OpenAI,
-    system_prompt: str,
-    user_prompt: str,
-    output_model: Optional[BaseModel] = None,
-    prompt_cache_key: Optional[str] = None,
-    max_retries: int = 10,
-    base_delay: float = 2.0,
-    model_name: Optional[str] = "gpt-5-mini",
-    **client_kwargs):
-    """
-    Wrapper for flex processing with retry and exponential backoff.
-    Handles 429 'Resource Unavailable' errors gracefully.
-    """
-    for attempt in range(max_retries):
-        try:
-            request_kwargs = {
-                "model": model_name,
-                "messages": [
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt},
-                ],
-                "service_tier": "flex",
-                "timeout": 900.0,
-                **client_kwargs
-            }
-            if output_model:
-                request_kwargs["response_format"] = output_model
-            if prompt_cache_key:
-                request_kwargs["prompt_cache_key"] = prompt_cache_key
-
-            response = LLM_client.chat.completions.parse(**request_kwargs)
-            # response = self.client.chat.completions.parse(output_model)
-            content = response.choices[0].message.content
-            # ✅ Validate structured output if a model is provided
-            if output_model is not None:
-                try:
-                    output_model.model_validate_json(content)
-                    base_content = response.choices[0].message.parsed
-                    # base_content = output_model(**content)
-                    return base_content
-                except ValidationError as ve:
-                    # Treat invalid output as retryable
-                    wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
-                    print(
-                        f"[Flex Retry] Attempt {attempt+1}/{max_retries} produced invalid structured output. "
-                        f"Retrying in {wait_time:.2f}s... (ValidationError: {ve})"
-                    )
-                    await asyncio.sleep(wait_time)
-                    continue
-        except (RateLimitError, APIError) as e:
-            wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
-            print(
-                f"[Flex Retry] Attempt {attempt+1}/{max_retries} failed "
-                f"with error: {type(e).__name__} - {e}. "
-                f"Retrying in {wait_time:.2f}s..."
-            )
-            await asyncio.sleep(wait_time)
-
-        except Exception as e:
-            # Non-recoverable error: break out immediately
-            raise RuntimeError(
-                f"[Flex Processing] Unrecoverable error for prompt_key={prompt_cache_key}: {e}"
-            )
-
-    raise RuntimeError(
-        f"[Flex Processing] Exhausted {max_retries} retries for prompt_key={prompt_cache_key}"
-    )
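For reference, a minimal call into the removed flex_processing helper could look like the sketch below. The Answer model, the prompts, and the module path are illustrative (assuming the old 0.1.48 layout), and an OPENAI_API_KEY is assumed in the environment.

import asyncio

from openai import OpenAI
from pydantic import BaseModel

# Assumed 0.1.48 module path; the helper is gone in 1.1.7.
from texttools.utils.flex_processor import flex_processing


class Answer(BaseModel):
    # Hypothetical structured-output model; any Pydantic model works here.
    desired_output: str


async def main():
    client = OpenAI()  # picks up OPENAI_API_KEY from the environment
    result = await flex_processing(
        LLM_client=client,
        system_prompt="Answer in one short sentence.",
        user_prompt="Summarize what flex processing trades off.",
        output_model=Answer,
        prompt_cache_key="flex-demo",
        max_retries=5,
    )
    # With an output_model supplied, the helper returns the parsed Pydantic instance.
    print(result.desired_output)


asyncio.run(main())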
File without changes

File without changes