hamtaa-texttools 0.1.48__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic. Click here for more details.

Files changed (86) hide show
  1. hamtaa_texttools-1.1.7.dist-info/METADATA +228 -0
  2. hamtaa_texttools-1.1.7.dist-info/RECORD +30 -0
  3. hamtaa_texttools-1.1.7.dist-info/licenses/LICENSE +21 -0
  4. texttools/__init__.py +4 -26
  5. texttools/batch/__init__.py +3 -0
  6. texttools/{utils/batch_manager → batch}/batch_manager.py +226 -241
  7. texttools/batch/batch_runner.py +254 -0
  8. texttools/prompts/README.md +35 -0
  9. texttools/prompts/categorizer.yaml +28 -0
  10. texttools/prompts/extract_entities.yaml +20 -0
  11. texttools/prompts/extract_keywords.yaml +18 -0
  12. texttools/prompts/is_question.yaml +14 -0
  13. texttools/prompts/merge_questions.yaml +46 -0
  14. texttools/prompts/rewrite.yaml +111 -0
  15. texttools/prompts/run_custom.yaml +7 -0
  16. texttools/prompts/subject_to_question.yaml +22 -0
  17. texttools/prompts/summarize.yaml +14 -0
  18. texttools/prompts/text_to_question.yaml +20 -0
  19. texttools/prompts/translate.yaml +15 -0
  20. texttools/tools/__init__.py +4 -33
  21. texttools/tools/async_the_tool.py +435 -0
  22. texttools/tools/internals/async_operator.py +242 -0
  23. texttools/tools/internals/base_operator.py +100 -0
  24. texttools/tools/internals/formatters.py +24 -0
  25. texttools/tools/internals/operator.py +242 -0
  26. texttools/tools/internals/output_models.py +62 -0
  27. texttools/tools/internals/prompt_loader.py +60 -0
  28. texttools/tools/the_tool.py +433 -0
  29. hamtaa_texttools-0.1.48.dist-info/METADATA +0 -60
  30. hamtaa_texttools-0.1.48.dist-info/RECORD +0 -61
  31. texttools/base/__init__.py +0 -3
  32. texttools/base/base_categorizer.py +0 -40
  33. texttools/base/base_keyword_extractor.py +0 -35
  34. texttools/base/base_ner_extractor.py +0 -61
  35. texttools/base/base_question_detector.py +0 -35
  36. texttools/base/base_question_generator.py +0 -99
  37. texttools/base/base_question_merger.py +0 -59
  38. texttools/base/base_question_rewriter.py +0 -61
  39. texttools/base/base_router.py +0 -33
  40. texttools/base/base_summarizer.py +0 -55
  41. texttools/base/base_task_performer.py +0 -53
  42. texttools/base/base_translator.py +0 -38
  43. texttools/formatter/__init__.py +0 -1
  44. texttools/formatter/base.py +0 -26
  45. texttools/formatter/gemma3_formatter.py +0 -54
  46. texttools/handlers/__init__.py +0 -6
  47. texttools/handlers/categorizer/__init__.py +0 -6
  48. texttools/handlers/categorizer/categorizer.py +0 -61
  49. texttools/handlers/handlers.py +0 -88
  50. texttools/tools/categorizer/__init__.py +0 -2
  51. texttools/tools/categorizer/encoder_model/__init__.py +0 -1
  52. texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
  53. texttools/tools/categorizer/llm/__init__.py +0 -2
  54. texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
  55. texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
  56. texttools/tools/keyword_extractor/__init__.py +0 -1
  57. texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
  58. texttools/tools/merger/__init__.py +0 -2
  59. texttools/tools/merger/gemma_question_merger.py +0 -214
  60. texttools/tools/ner/__init__.py +0 -1
  61. texttools/tools/ner/gemma_ner_extractor.py +0 -157
  62. texttools/tools/question_detector/__init__.py +0 -2
  63. texttools/tools/question_detector/gemma_detector.py +0 -114
  64. texttools/tools/question_detector/llm_detector.py +0 -112
  65. texttools/tools/question_generator/__init__.py +0 -1
  66. texttools/tools/question_generator/gemma_question_generator.py +0 -198
  67. texttools/tools/reranker/__init__.py +0 -3
  68. texttools/tools/reranker/reranker.py +0 -137
  69. texttools/tools/reranker/scorer.py +0 -216
  70. texttools/tools/reranker/sorter.py +0 -278
  71. texttools/tools/rewriter/__init__.py +0 -2
  72. texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
  73. texttools/tools/router/__init__.py +0 -0
  74. texttools/tools/router/gemma_router.py +0 -169
  75. texttools/tools/subject_to_question/__init__.py +0 -1
  76. texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
  77. texttools/tools/summarizer/__init__.py +0 -2
  78. texttools/tools/summarizer/gemma_summarizer.py +0 -140
  79. texttools/tools/summarizer/llm_summerizer.py +0 -108
  80. texttools/tools/translator/__init__.py +0 -1
  81. texttools/tools/translator/gemma_translator.py +0 -189
  82. texttools/utils/batch_manager/__init__.py +0 -2
  83. texttools/utils/batch_manager/batch_runner.py +0 -207
  84. texttools/utils/flex_processor.py +0 -78
  85. {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/WHEEL +0 -0
  86. {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/top_level.txt +0 -0
@@ -1,207 +0,0 @@
1
- import json
2
- import os
3
- import time
4
- from dataclasses import dataclass
5
- from pathlib import Path
6
- from typing import Any, Callable
7
-
8
- # from dotenv import load_dotenv
9
- from openai import OpenAI
10
- from pydantic import BaseModel
11
-
12
- from texttools.batch_manager import SimpleBatchManager
13
-
14
-
15
class OutputModel(BaseModel):
    """Default structured-output schema: a single string field."""

    # The one value the model is asked to produce.
    desired_output: str
17
-
18
-
19
def exporting_data(data):
    """
    Default export hook: hand the loaded input back unchanged.

    A replacement hook is expected to convert its input into a list shaped
    like:
    [
        {"id": str, "content": str},...
    ]
    """
    return data
27
-
28
-
29
def importing_data(data):
    """
    Default import hook: hand the batch output back unchanged.

    A replacement hook can merge the output back into the original
    data structure.
    """
    return data
34
-
35
-
36
@dataclass
class BatchConfig:
    """
    Settings consumed by the batch job runner.
    """

    # Prompt handed to the batch manager as its prompt template.
    system_prompt: str = ""
    # Base name of the job; a "_part_N" suffix is added when data is split.
    job_name: str = ""
    # JSON file the input records are read from.
    input_data_path: str = ""
    # Only the stem of this name is used when naming result files.
    output_data_filename: str = ""
    # Model name passed to the batch manager.
    model: str = "gpt-4.1-mini"
    # Number of records per part when the input has to be split.
    MAX_BATCH_SIZE: int = 100
    # Estimated-token threshold at or above which the input is split.
    MAX_TOTAL_TOKENS: int = 2000000
    # Divisor used to turn a character count into a token estimate.
    CHARS_PER_TOKEN: float = 2.7
    # NOTE(review): not referenced by the runner code visible here.
    PROMPT_TOKEN_MULTIPLIER: int = 1000
    # Directory that result and log files are written to.
    BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
    # Hook applied to fetched results before they are saved.
    import_function: Callable = importing_data
    # Hook applied to the loaded input before it is validated.
    export_function: Callable = exporting_data
54
-
55
-
56
class BatchJobRunner:
    """
    Run a batch job end to end using a batch manager and a configuration.

    Construction loads and validates the input file, splits it into parts,
    and creates the output directory. `run()` then submits each part,
    polls until it finishes, and writes the results to disk.
    """

    def __init__(
        self, config: "BatchConfig | None" = None, output_model: type = OutputModel
    ):
        """
        Args:
            config: Runner settings. A fresh `BatchConfig()` is created when
                omitted.
            output_model: Model class forwarded to the batch manager.
        """
        # Build the default lazily: a `BatchConfig()` default argument is
        # evaluated once at definition time and would be shared (and
        # mutable) across every runner created without a config.
        if config is None:
            config = BatchConfig()
        self.config = config
        self.system_prompt = config.system_prompt
        self.job_name = config.job_name
        self.input_data_path = config.input_data_path
        self.output_data_filename = config.output_data_filename
        self.model = config.model
        self.output_model = output_model
        self.manager = self._init_manager()
        self.data = self._load_data()
        self.parts: list[list[dict[str, Any]]] = []
        self._partition_data()
        Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    def _init_manager(self) -> "SimpleBatchManager":
        """Create the batch manager with an OpenAI client keyed from OPENAI_API_KEY."""
        # load_dotenv()
        api_key = os.getenv("OPENAI_API_KEY")
        client = OpenAI(api_key=api_key)
        return SimpleBatchManager(
            client=client,
            model=self.model,
            prompt_template=self.system_prompt,
            output_model=self.output_model,
        )

    def _load_data(self):
        """
        Read the input JSON, pass it through the export hook, and validate it.

        Returns:
            A list of dicts, each with string 'id' and 'content' values.

        Raises:
            ValueError: If the exported data is not a list of such dicts.
        """
        with open(self.input_data_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        data = self.config.export_function(data)

        # Validation: ensure data is a list of dicts with 'id' and 'content' as strings
        if not isinstance(data, list):
            raise ValueError(
                'Exported data must be a list in this form: [ {"id": str, "content": str},...]'
            )
        for item in data:
            if not (isinstance(item, dict) and "id" in item and "content" in item):
                raise ValueError(
                    "Each item must be a dict with 'id' and 'content' keys."
                )
            if not (isinstance(item["id"], str) and isinstance(item["content"], str)):
                raise ValueError("'id' and 'content' must be strings.")
        return data

    def _partition_data(self):
        """
        Split `self.data` into `self.parts` based on an estimated token count.

        The estimate is (content chars + prompt chars per record) divided by
        CHARS_PER_TOKEN. Below MAX_TOTAL_TOKENS everything stays in one part;
        otherwise the data is cut into chunks of MAX_BATCH_SIZE records.
        """
        total_length = sum(len(item["content"]) for item in self.data)
        prompt_length = len(self.system_prompt)
        # The prompt is counted once per record.
        total = total_length + (prompt_length * len(self.data))
        calculation = total / self.config.CHARS_PER_TOKEN
        print(
            f"Total chars: {total_length}, Prompt chars: {prompt_length}, Total: {total}, Tokens: {calculation}"
        )
        if calculation < self.config.MAX_TOTAL_TOKENS:
            self.parts = [self.data]
        else:
            # Partition into chunks of MAX_BATCH_SIZE
            self.parts = [
                self.data[i : i + self.config.MAX_BATCH_SIZE]
                for i in range(0, len(self.data), self.config.MAX_BATCH_SIZE)
            ]
        print(f"Data split into {len(self.parts)} part(s)")

    def run(self):
        """Process every part in order, skipping parts whose result file exists."""
        for idx, part in enumerate(self.parts):
            if self._result_exists(idx):
                print(f"Skipping part {idx + 1}: result already exists.")
                continue
            part_job_name = (
                f"{self.job_name}_part_{idx + 1}"
                if len(self.parts) > 1
                else self.job_name
            )
            print(
                f"\n--- Processing part {idx + 1}/{len(self.parts)}: {part_job_name} ---"
            )
            self._process_part(part, part_job_name, idx)

    def _process_part(
        self, part: list[dict[str, Any]], part_job_name: str, part_idx: int
    ):
        """
        Submit one part and poll until it completes, then save its results.

        A "failed" status clears the manager state and resubmits the job after
        a 10 second pause; any other non-"completed" status is polled again
        after 5 seconds. There is no upper bound on retries or polling time.
        """
        while True:
            print(f"Starting job for part: {part_job_name}")
            self.manager.start(part, job_name=part_job_name)
            print("Started batch job. Checking status...")
            while True:
                status = self.manager.check_status(job_name=part_job_name)
                print(f"Status: {status}")
                if status == "completed":
                    print("Job completed. Fetching results...")
                    output_data, log = self.manager.fetch_results(
                        job_name=part_job_name, remove_cache=False
                    )
                    output_data = self.config.import_function(output_data)
                    self._save_results(output_data, log, part_idx)
                    print("Fetched and saved results for this part.")
                    return
                elif status == "failed":
                    print("Job failed. Clearing state, waiting, and retrying...")
                    self.manager._clear_state(part_job_name)
                    time.sleep(10)  # Wait before retrying
                    break  # Break inner loop to restart the job
                else:
                    time.sleep(5)  # Wait before checking again

    def _result_path(self, part_idx: int, suffix: str = "") -> Path:
        """
        Build the output path for a part.

        Single source of truth for file naming so that saving results and
        checking for existing results can never disagree.
        """
        part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
        stem = Path(self.output_data_filename).stem
        return Path(self.config.BASE_OUTPUT_DIR) / f"{stem}{part_suffix}{suffix}.json"

    def _save_results(
        self, output_data: list[dict[str, Any]], log: list[Any], part_idx: int
    ):
        """
        Write a part's results (and its log, if any) as JSON files.

        Nothing is written, including the log, when `output_data` is empty.
        """
        if not output_data:
            print("No output data to save. Skipping this part.")
            return
        with open(self._result_path(part_idx), "w", encoding="utf-8") as f:
            json.dump(output_data, f, ensure_ascii=False, indent=4)
        if log:
            with open(
                self._result_path(part_idx, "_log"), "w", encoding="utf-8"
            ) as f:
                json.dump(log, f, ensure_ascii=False, indent=4)

    def _result_exists(self, part_idx: int) -> bool:
        """Return True when the result file for this part is already on disk."""
        # Uses the same path as _save_results. The previous version read
        # `self.output_data_path`, an attribute that is never set, so this
        # check raised AttributeError instead of answering.
        return self._result_path(part_idx).exists()
196
-
197
-
198
if __name__ == "__main__":
    # Example run with placeholder settings.
    print("=== Batch Job Runner ===")
    demo_settings = {
        "system_prompt": "",
        "job_name": "job_name",
        "input_data_path": "Data.json",
        "output_data_filename": "output",
    }
    BatchJobRunner(BatchConfig(**demo_settings)).run()
@@ -1,78 +0,0 @@
1
- import random
2
- import asyncio
3
- from openai import OpenAI, RateLimitError, APIError
4
- from typing import Optional
5
- from pydantic import BaseModel, ValidationError
6
- import httpx
7
-
8
- # http_client = httpx()
9
- # test_client = OpenAI(http_client=http_client)
10
-
11
async def flex_processing(
    LLM_client: "OpenAI",
    system_prompt: str,
    user_prompt: str,
    output_model: "Optional[type[BaseModel]]" = None,
    prompt_cache_key: Optional[str] = None,
    max_retries: int = 10,
    base_delay: float = 2.0,
    model_name: Optional[str] = "gpt-5-mini",
    **client_kwargs,
):
    """
    Send one flex-tier chat completion with retry and exponential backoff.

    Args:
        LLM_client: Synchronous OpenAI client used to issue the request.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        output_model: Optional pydantic model class. When given it is sent as
            `response_format` and the reply is validated against it.
        prompt_cache_key: Optional cache key forwarded with the request.
        max_retries: Maximum number of attempts.
        base_delay: Base of the backoff delay, in seconds.
        model_name: Model to call.
        **client_kwargs: Extra request arguments; these override the defaults
            built here when keys collide.

    Returns:
        The parsed model instance when `output_model` is given, otherwise the
        raw message content.

    Raises:
        RuntimeError: On any error other than RateLimitError/APIError, or
            once all attempts are used up.
    """
    # The request does not change between attempts, so build it once.
    request_kwargs = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "service_tier": "flex",
        "timeout": 900.0,
        **client_kwargs,
    }
    if output_model is not None:
        request_kwargs["response_format"] = output_model
    if prompt_cache_key:
        request_kwargs["prompt_cache_key"] = prompt_cache_key

    for attempt in range(max_retries):
        try:
            # The client call is synchronous and may take up to the request
            # timeout; run it in the default executor so the event loop is
            # not blocked while waiting.
            response = await asyncio.get_running_loop().run_in_executor(
                None, lambda: LLM_client.chat.completions.parse(**request_kwargs)
            )
            message = response.choices[0].message
            content = message.content

            if output_model is None:
                # No schema to validate against: the text is the result.
                # (Previously nothing was returned here, so every attempt
                # was spent and the call ended in "Exhausted".)
                return content

            try:
                output_model.model_validate_json(content)
                return message.parsed
            except ValidationError as ve:
                # Treat invalid output as retryable
                wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
                print(
                    f"[Flex Retry] Attempt {attempt+1}/{max_retries} produced invalid structured output. "
                    f"Retrying in {wait_time:.2f}s... (ValidationError: {ve})"
                )
                await asyncio.sleep(wait_time)
                continue
        except (RateLimitError, APIError) as e:
            wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(
                f"[Flex Retry] Attempt {attempt+1}/{max_retries} failed "
                f"with error: {type(e).__name__} - {e}. "
                f"Retrying in {wait_time:.2f}s..."
            )
            await asyncio.sleep(wait_time)

        except Exception as e:
            # Non-recoverable error: break out immediately
            raise RuntimeError(
                f"[Flex Processing] Unrecoverable error for prompt_key={prompt_cache_key}: {e}"
            ) from e

    raise RuntimeError(
        f"[Flex Processing] Exhausted {max_retries} retries for prompt_key={prompt_cache_key}"
    )