hamtaa-texttools 1.0.5.tar.gz → 1.0.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic.

Files changed (36):
  1. {hamtaa_texttools-1.0.5/hamtaa_texttools.egg-info → hamtaa_texttools-1.0.6}/PKG-INFO +15 -15
  2. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/README.md +14 -14
  3. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6/hamtaa_texttools.egg-info}/PKG-INFO +15 -15
  4. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/hamtaa_texttools.egg-info/SOURCES.txt +4 -4
  5. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/pyproject.toml +2 -2
  6. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/batch/batch_manager.py +7 -18
  7. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/batch/batch_runner.py +96 -45
  8. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/README.md +4 -0
  9. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/keyword_extractor.yaml +6 -6
  10. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/question_merger.yaml +5 -5
  11. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/async_the_tool.py +6 -6
  12. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/internals/async_operator.py +21 -10
  13. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/internals/operator.py +2 -2
  14. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/internals/prompt_loader.py +12 -22
  15. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/the_tool.py +12 -12
  16. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/LICENSE +0 -0
  17. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/MANIFEST.in +0 -0
  18. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/hamtaa_texttools.egg-info/dependency_links.txt +0 -0
  19. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/hamtaa_texttools.egg-info/requires.txt +0 -0
  20. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/hamtaa_texttools.egg-info/top_level.txt +0 -0
  21. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/setup.cfg +0 -0
  22. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/__init__.py +0 -0
  23. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/batch/__init__.py +0 -0
  24. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/formatters/base_formatter.py +0 -0
  25. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/formatters/user_merge_formatter.py +0 -0
  26. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/categorizer.yaml +0 -0
  27. /hamtaa_texttools-1.0.5/texttools/prompts/question_detector.yaml → /hamtaa_texttools-1.0.6/texttools/prompts/is_question.yaml +0 -0
  28. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/ner_extractor.yaml +0 -0
  29. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/rewriter.yaml +0 -0
  30. /hamtaa_texttools-1.0.5/texttools/prompts/custom_tool.yaml → /hamtaa_texttools-1.0.6/texttools/prompts/run_custom.yaml +0 -0
  31. /hamtaa_texttools-1.0.5/texttools/prompts/subject_question_generator.yaml → /hamtaa_texttools-1.0.6/texttools/prompts/subject_to_question.yaml +0 -0
  32. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/summarizer.yaml +0 -0
  33. /hamtaa_texttools-1.0.5/texttools/prompts/question_generator.yaml → /hamtaa_texttools-1.0.6/texttools/prompts/text_to_question.yaml +0 -0
  34. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/prompts/translator.yaml +0 -0
  35. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/__init__.py +0 -0
  36. {hamtaa_texttools-1.0.5 → hamtaa_texttools-1.0.6}/texttools/tools/internals/output_models.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hamtaa-texttools
- Version: 1.0.5
+ Version: 1.0.6
  Summary: TextTools is a high-level NLP toolkit built on top of modern LLMs.
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>
  License: MIT License
@@ -51,17 +51,17 @@ It provides ready-to-use utilities for **translation, question detection, keywor
  TextTools provides a rich collection of high-level NLP utilities built on top of LLMs.
  Each tool is designed to work out-of-the-box with structured outputs (JSON / Pydantic).

- - **Categorizer** → Zero-finetuning text categorization for fast, scalable classification.
- - **Keyword Extractor** Identify the most important keywords in a text.
- - **Question Merger** Merge the provided questions, preserving all the main points
- - **NER (Named Entity Recognition) Extractor** → Extract people, places, organizations, and other entities.
- - **Question Detector** Determine whether a text is a question or not.
- - **Question Generator From Text** → Generate high-quality, context-relevant questions from provided text.
- - **Question Generator From Subject** → Generate high-quality, context-relevant questions from a subject.
- - **Rewriter** Rewrite text while preserving meaning or without it.
- - **Summarizer** Condense long passages into clear, structured summaries.
- - **Translator** Translate text across multiple languages, with support for custom rules.
- - **Custom Tool** Allows users to define a custom tool with arbitrary BaseModel.
+ - **`categorize()`** - Classifies text into Islamic studies categories
+ - **`is_question()`** - Binary detection of whether input is a question
+ - **`extract_keywords()`** - Extracts keywords from text
+ - **`extract_entities()`** - Named Entity Recognition (NER) system
+ - **`summarize()`** - Text summarization
+ - **`text_to_question()`** - Generates questions from text
+ - **`merge_questions()`** - Merges multiple questions with different modes
+ - **`rewrite()`** - Rewrites text with different wording/meaning
+ - **`subject_to_question()`** - Generates questions about a specific subject
+ - **`translate()`** - Text translation between languages
+ - **`run_custom()`** - Allows users to define a custom tool with arbitrary BaseModel

  ---

@@ -87,7 +87,7 @@ All these flags can be used individually or together to tailor the behavior of a
  Install the latest release via PyPI:

  ```bash
- pip install -U hamta-texttools
+ pip install -U hamtaa-texttools
  ```

  ---
@@ -118,7 +118,7 @@ model = "gpt-4o-mini"
  the_tool = TheTool(client=client, model=model, with_analysis=True, output_lang="English")

  # Example: Question Detection
- detection = the_tool.detect_question("Is this project open source?", logpobs=True, top_logprobs=2)
+ detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
  print(detection["result"])
  print(detection["logprobs"])
  # Output: True
@@ -135,7 +135,7 @@ class Custom(BaseModel):
  result: list[list[dict[str, int]]]

  custom_prompt = "Something"
- custom_result = the_tool.custom_tool(custom_prompt, Custom)
+ custom_result = the_tool.run_custom(custom_prompt, Custom)
  print(custom_result)
  ```

README.md
@@ -17,17 +17,17 @@ It provides ready-to-use utilities for **translation, question detection, keywor
  TextTools provides a rich collection of high-level NLP utilities built on top of LLMs.
  Each tool is designed to work out-of-the-box with structured outputs (JSON / Pydantic).

- - **Categorizer** → Zero-finetuning text categorization for fast, scalable classification.
- - **Keyword Extractor** Identify the most important keywords in a text.
- - **Question Merger** Merge the provided questions, preserving all the main points
- - **NER (Named Entity Recognition) Extractor** → Extract people, places, organizations, and other entities.
- - **Question Detector** Determine whether a text is a question or not.
- - **Question Generator From Text** → Generate high-quality, context-relevant questions from provided text.
- - **Question Generator From Subject** → Generate high-quality, context-relevant questions from a subject.
- - **Rewriter** Rewrite text while preserving meaning or without it.
- - **Summarizer** Condense long passages into clear, structured summaries.
- - **Translator** Translate text across multiple languages, with support for custom rules.
- - **Custom Tool** Allows users to define a custom tool with arbitrary BaseModel.
+ - **`categorize()`** - Classifies text into Islamic studies categories
+ - **`is_question()`** - Binary detection of whether input is a question
+ - **`extract_keywords()`** - Extracts keywords from text
+ - **`extract_entities()`** - Named Entity Recognition (NER) system
+ - **`summarize()`** - Text summarization
+ - **`text_to_question()`** - Generates questions from text
+ - **`merge_questions()`** - Merges multiple questions with different modes
+ - **`rewrite()`** - Rewrites text with different wording/meaning
+ - **`subject_to_question()`** - Generates questions about a specific subject
+ - **`translate()`** - Text translation between languages
+ - **`run_custom()`** - Allows users to define a custom tool with arbitrary BaseModel

  ---

@@ -53,7 +53,7 @@ All these flags can be used individually or together to tailor the behavior of a
  Install the latest release via PyPI:

  ```bash
- pip install -U hamta-texttools
+ pip install -U hamtaa-texttools
  ```

  ---
@@ -84,7 +84,7 @@ model = "gpt-4o-mini"
  the_tool = TheTool(client=client, model=model, with_analysis=True, output_lang="English")

  # Example: Question Detection
- detection = the_tool.detect_question("Is this project open source?", logpobs=True, top_logprobs=2)
+ detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
  print(detection["result"])
  print(detection["logprobs"])
  # Output: True
@@ -101,7 +101,7 @@ class Custom(BaseModel):
  result: list[list[dict[str, int]]]

  custom_prompt = "Something"
- custom_result = the_tool.custom_tool(custom_prompt, Custom)
+ custom_result = the_tool.run_custom(custom_prompt, Custom)
  print(custom_result)
  ```

hamtaa_texttools.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: hamtaa-texttools
- Version: 1.0.5
+ Version: 1.0.6
  Summary: TextTools is a high-level NLP toolkit built on top of modern LLMs.
  Author-email: Tohidi <the.mohammad.tohidi@gmail.com>, Montazer <montazerh82@gmail.com>, Givechi <mohamad.m.givechi@gmail.com>, MoosaviNejad <erfanmoosavi84@gmail.com>
  License: MIT License
@@ -51,17 +51,17 @@ It provides ready-to-use utilities for **translation, question detection, keywor
  TextTools provides a rich collection of high-level NLP utilities built on top of LLMs.
  Each tool is designed to work out-of-the-box with structured outputs (JSON / Pydantic).

- - **Categorizer** → Zero-finetuning text categorization for fast, scalable classification.
- - **Keyword Extractor** Identify the most important keywords in a text.
- - **Question Merger** Merge the provided questions, preserving all the main points
- - **NER (Named Entity Recognition) Extractor** → Extract people, places, organizations, and other entities.
- - **Question Detector** Determine whether a text is a question or not.
- - **Question Generator From Text** → Generate high-quality, context-relevant questions from provided text.
- - **Question Generator From Subject** → Generate high-quality, context-relevant questions from a subject.
- - **Rewriter** Rewrite text while preserving meaning or without it.
- - **Summarizer** Condense long passages into clear, structured summaries.
- - **Translator** Translate text across multiple languages, with support for custom rules.
- - **Custom Tool** Allows users to define a custom tool with arbitrary BaseModel.
+ - **`categorize()`** - Classifies text into Islamic studies categories
+ - **`is_question()`** - Binary detection of whether input is a question
+ - **`extract_keywords()`** - Extracts keywords from text
+ - **`extract_entities()`** - Named Entity Recognition (NER) system
+ - **`summarize()`** - Text summarization
+ - **`text_to_question()`** - Generates questions from text
+ - **`merge_questions()`** - Merges multiple questions with different modes
+ - **`rewrite()`** - Rewrites text with different wording/meaning
+ - **`subject_to_question()`** - Generates questions about a specific subject
+ - **`translate()`** - Text translation between languages
+ - **`run_custom()`** - Allows users to define a custom tool with arbitrary BaseModel

  ---

@@ -87,7 +87,7 @@ All these flags can be used individually or together to tailor the behavior of a
  Install the latest release via PyPI:

  ```bash
- pip install -U hamta-texttools
+ pip install -U hamtaa-texttools
  ```

  ---
@@ -118,7 +118,7 @@ model = "gpt-4o-mini"
  the_tool = TheTool(client=client, model=model, with_analysis=True, output_lang="English")

  # Example: Question Detection
- detection = the_tool.detect_question("Is this project open source?", logpobs=True, top_logprobs=2)
+ detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
  print(detection["result"])
  print(detection["logprobs"])
  # Output: True
@@ -135,7 +135,7 @@ class Custom(BaseModel):
  result: list[list[dict[str, int]]]

  custom_prompt = "Something"
- custom_result = the_tool.custom_tool(custom_prompt, Custom)
+ custom_result = the_tool.run_custom(custom_prompt, Custom)
  print(custom_result)
  ```

hamtaa_texttools.egg-info/SOURCES.txt
@@ -15,15 +15,15 @@ texttools/formatters/base_formatter.py
  texttools/formatters/user_merge_formatter.py
  texttools/prompts/README.md
  texttools/prompts/categorizer.yaml
- texttools/prompts/custom_tool.yaml
+ texttools/prompts/is_question.yaml
  texttools/prompts/keyword_extractor.yaml
  texttools/prompts/ner_extractor.yaml
- texttools/prompts/question_detector.yaml
- texttools/prompts/question_generator.yaml
  texttools/prompts/question_merger.yaml
  texttools/prompts/rewriter.yaml
- texttools/prompts/subject_question_generator.yaml
+ texttools/prompts/run_custom.yaml
+ texttools/prompts/subject_to_question.yaml
  texttools/prompts/summarizer.yaml
+ texttools/prompts/text_to_question.yaml
  texttools/prompts/translator.yaml
  texttools/tools/__init__.py
  texttools/tools/async_the_tool.py
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "hamtaa-texttools"
- version = "1.0.5"
+ version = "1.0.6"
  authors = [
  { name = "Tohidi", email = "the.mohammad.tohidi@gmail.com" },
  { name = "Montazer", email = "montazerh82@gmail.com" },
@@ -17,7 +17,7 @@ license = {file = "LICENSE"}
  requires-python = ">=3.8"
  dependencies = [
  "openai==1.97.1",
- "PyYAML>=6.0"
+ "PyYAML>=6.0",
  ]
  keywords = ["nlp", "llm", "text-processing", "openai"]

texttools/batch/batch_manager.py
@@ -2,11 +2,16 @@ import json
  import uuid
  from pathlib import Path
  from typing import Any, Type
+ import logging

  from pydantic import BaseModel
  from openai import OpenAI
  from openai.lib._pydantic import to_strict_json_schema

+ # Configure logger
+ logger = logging.getLogger("batch_runner")
+ logger.setLevel(logging.INFO)
+

  class SimpleBatchManager:
  """
@@ -159,25 +164,9 @@ class SimpleBatchManager:
  info = self.client.batches.retrieve(job["id"])
  job = info.to_dict()
  self._save_state(job_name, [job])
- print("HERE is the job", job)
+ logger.info("Batch job status: %s", job)
  return job["status"]

- def _parsed(self, result: dict) -> list:
- """
- Parses the result dictionary, extracting the desired output or error for each item.
- Returns a list of dictionaries with 'id' and 'output' keys.
- """
- modified_result = []
-
- for key, d in result.items():
- if "desired_output" in d:
- new_dict = {"id": key, "output": d["desired_output"]}
- modified_result.append(new_dict)
- else:
- new_dict = {"id": key, "output": d["error"]}
- modified_result.append(new_dict)
- return modified_result
-
  def fetch_results(
  self, job_name: str, remove_cache: bool = True
  ) -> tuple[dict[str, str], list]:
@@ -198,7 +187,7 @@
  err_content = (
  self.client.files.content(error_file_id).read().decode("utf-8")
  )
- print("Error file content:", err_content)
+ logger.info("Error file content:", err_content)
  return {}

  content = self.client.files.content(out_file_id).read().decode("utf-8")
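The batch modules in 1.0.6 route their progress messages through `logging.getLogger("batch_runner")` instead of `print()`. The package only creates the logger and sets its level; it does not attach a handler, so a consuming script has to configure logging itself to see the output. A minimal sketch, assuming application-side `basicConfig` setup (not part of the package):

```python
import logging

# Assumed application-side setup: without a configured handler, the package's
# logger records fall through to the root logger's default handling.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)

logger = logging.getLogger("batch_runner")
logger.info("Batch job status: %s", {"id": "batch_123", "status": "completed"})
```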
texttools/batch/batch_runner.py
@@ -4,23 +4,27 @@ import time
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, Callable
+ import logging

+ from dotenv import load_dotenv
  from openai import OpenAI
  from pydantic import BaseModel

- from texttools.batch.batch_manager import SimpleBatchManager
+ from texttools.batch import SimpleBatchManager

+ # Configure logger
+ logger = logging.getLogger("batch_runner")
+ logger.setLevel(logging.INFO)

- class Output(BaseModel):
- output: str
+
+ class OutputModel(BaseModel):
+ desired_output: str


  def export_data(data):
  """
  Produces a structure of the following form from an initial data structure:
- [
- {"id": str, "content": str},...
- ]
+ [{"id": str, "text": str},...]
  """
  return data

@@ -50,19 +54,17 @@ class BatchConfig:
  BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
  import_function: Callable = import_data
  export_function: Callable = export_data
+ poll_interval_seconds: int = 30
+ max_retries: int = 3


  class BatchJobRunner:
  """
- Orchestrates the execution of batched LLM processing jobs.
-
- Handles data loading, partitioning, job execution via SimpleBatchManager,
- and result saving. Manages the complete workflow from input data to processed outputs,
- including retries and progress tracking across multiple batch parts.
+ Handles running batch jobs using a batch manager and configuration.
  """

  def __init__(
- self, config: BatchConfig = BatchConfig(), output_model: type = Output
+ self, config: BatchConfig = BatchConfig(), output_model: type = OutputModel
  ):
  self.config = config
  self.system_prompt = config.system_prompt
@@ -76,8 +78,13 @@
  self.parts: list[list[dict[str, Any]]] = []
  self._partition_data()
  Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
+ # Map part index to job name
+ self.part_idx_to_job_name: dict[int, str] = {}
+ # Track retry attempts per part
+ self.part_attempts: dict[int, int] = {}

  def _init_manager(self) -> SimpleBatchManager:
+ load_dotenv()
  api_key = os.getenv("OPENAI_API_KEY")
  client = OpenAI(api_key=api_key)
  return SimpleBatchManager(
@@ -111,7 +118,7 @@
  prompt_length = len(self.system_prompt)
  total = total_length + (prompt_length * len(self.data))
  calculation = total / self.config.CHARS_PER_TOKEN
- print(
+ logger.info(
  f"Total chars: {total_length}, Prompt chars: {prompt_length}, Total: {total}, Tokens: {calculation}"
  )
  if calculation < self.config.MAX_TOTAL_TOKENS:
@@ -122,55 +129,99 @@
  self.data[i : i + self.config.MAX_BATCH_SIZE]
  for i in range(0, len(self.data), self.config.MAX_BATCH_SIZE)
  ]
- print(f"Data split into {len(self.parts)} part(s)")
+ logger.info(f"Data split into {len(self.parts)} part(s)")

- def run(self):
+ def _submit_all_jobs(self) -> None:
  for idx, part in enumerate(self.parts):
  if self._result_exists(idx):
- print(f"Skipping part {idx + 1}: result already exists.")
+ logger.info(f"Skipping part {idx + 1}: result already exists.")
  continue
  part_job_name = (
  f"{self.job_name}_part_{idx + 1}"
  if len(self.parts) > 1
  else self.job_name
  )
- print(
- f"\n--- Processing part {idx + 1}/{len(self.parts)}: {part_job_name} ---"
+ # If a job with this name already exists, register and skip submitting
+ existing_job = self.manager._load_state(part_job_name)
+ if existing_job:
+ logger.info(
+ f"Skipping part {idx + 1}: job already exists ({part_job_name})."
+ )
+ self.part_idx_to_job_name[idx] = part_job_name
+ self.part_attempts.setdefault(idx, 0)
+ continue
+
+ payload = part
+ logger.info(
+ f"Submitting job for part {idx + 1}/{len(self.parts)}: {part_job_name}"
  )
- self._process_part(part, part_job_name, idx)
+ self.manager.start(payload, job_name=part_job_name)
+ self.part_idx_to_job_name[idx] = part_job_name
+ self.part_attempts.setdefault(idx, 0)
+ # This is added for letting file get uploaded, before starting the next part.
+ logger.info("Uploading...")
+ time.sleep(30)

- def _process_part(
- self, part: list[dict[str, Any]], part_job_name: str, part_idx: int
- ):
- while True:
- print(f"Starting job for part: {part_job_name}")
- self.manager.start(part, job_name=part_job_name)
- print("Started batch job. Checking status...")
- while True:
- status = self.manager.check_status(job_name=part_job_name)
- print(f"Status: {status}")
+ def run(self):
+ # Submit all jobs up-front for concurrent execution
+ self._submit_all_jobs()
+ pending_parts: set[int] = set(self.part_idx_to_job_name.keys())
+ logger.info(f"Pending parts: {sorted(pending_parts)}")
+ # Polling loop
+ while pending_parts:
+ finished_this_round: list[int] = []
+ for part_idx in list(pending_parts):
+ job_name = self.part_idx_to_job_name[part_idx]
+ status = self.manager.check_status(job_name=job_name)
+ logger.info(f"Status for {job_name}: {status}")
  if status == "completed":
- print("Job completed. Fetching results...")
+ logger.info(
+ f"Job completed. Fetching results for part {part_idx + 1}..."
+ )
  output_data, log = self.manager.fetch_results(
- job_name=part_job_name, remove_cache=False
+ job_name=job_name, remove_cache=False
  )
  output_data = self.config.import_function(output_data)
  self._save_results(output_data, log, part_idx)
- print("Fetched and saved results for this part.")
- return
+ logger.info(f"Fetched and saved results for part {part_idx + 1}.")
+ finished_this_round.append(part_idx)
  elif status == "failed":
- print("Job failed. Clearing state, waiting, and retrying...")
- self.manager._clear_state(part_job_name)
- # Wait before retrying
- time.sleep(10)
- # Break inner loop to restart the job
- break
+ attempt = self.part_attempts.get(part_idx, 0) + 1
+ self.part_attempts[part_idx] = attempt
+ if attempt <= self.config.max_retries:
+ logger.info(
+ f"Job {job_name} failed (attempt {attempt}). Retrying after short backoff..."
+ )
+ self.manager._clear_state(job_name)
+ time.sleep(10)
+ payload = self._to_manager_payload(self.parts[part_idx])
+ new_job_name = (
+ f"{self.job_name}_part_{part_idx + 1}_retry_{attempt}"
+ )
+ self.manager.start(payload, job_name=new_job_name)
+ self.part_idx_to_job_name[part_idx] = new_job_name
+ else:
+ logger.info(
+ f"Job {job_name} failed after {attempt - 1} retries. Marking as failed."
+ )
+ finished_this_round.append(part_idx)
  else:
- # Wait before checking again
- time.sleep(5)
+ # Still running or queued
+ continue
+ # Remove finished parts
+ for part_idx in finished_this_round:
+ pending_parts.discard(part_idx)
+ if pending_parts:
+ logger.info(
+ f"Waiting {self.config.poll_interval_seconds}s before next status check for parts: {sorted(pending_parts)}"
+ )
+ time.sleep(self.config.poll_interval_seconds)

  def _save_results(
- self, output_data: list[dict[str, Any]], log: list[Any], part_idx: int
+ self,
+ output_data: list[dict[str, Any]] | dict[str, Any],
+ log: list[Any],
+ part_idx: int,
  ):
  part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
  result_path = (
@@ -178,7 +229,7 @@ class BatchJobRunner:
  / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
  )
  if not output_data:
- print("No output data to save. Skipping this part.")
+ logger.info("No output data to save. Skipping this part.")
  return
  else:
  with open(result_path, "w", encoding="utf-8") as f:
@@ -195,13 +246,13 @@
  part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
  result_path = (
  Path(self.config.BASE_OUTPUT_DIR)
- / f"{Path(self.output_data_path).stem}{part_suffix}.json"
+ / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
  )
  return result_path.exists()


  if __name__ == "__main__":
- print("=== Batch Job Runner ===")
+ logger.info("=== Batch Job Runner ===")
  config = BatchConfig(
  system_prompt="",
  job_name="job_name",
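With 1.0.6 the runner submits every part up front and then polls them together, governed by the new `poll_interval_seconds` and `max_retries` fields on `BatchConfig`. A rough usage sketch; the prompt text and job name are placeholders, any `BatchConfig` fields not visible in this diff are assumed to keep their defaults, and `OPENAI_API_KEY` is expected in the environment (or a `.env` file, since the runner now calls `load_dotenv()`):

```python
from texttools.batch.batch_runner import BatchConfig, BatchJobRunner

# Illustrative configuration; system_prompt and job_name are placeholders.
config = BatchConfig(
    system_prompt="Extract the named entities from the given text.",
    job_name="entity_extraction",
    poll_interval_seconds=60,  # new in 1.0.6: seconds between status sweeps
    max_retries=3,             # new in 1.0.6: retry budget per failed part
)

runner = BatchJobRunner(config=config)
# Submits all parts, then loops: completed parts are fetched and saved,
# failed parts are resubmitted until max_retries is exhausted.
runner.run()
```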
texttools/prompts/README.md
@@ -3,6 +3,8 @@
  ## Overview
  This folder contains YAML files for all prompts used in the project. Each file represents a separate prompt template, which can be loaded by tools or scripts that require structured prompts for AI models.

+ ---
+
  ## Structure
  - **prompt_file.yaml**: Each YAML file represents a single prompt template.
  - **main_template**: The main instruction template for the model.
@@ -24,6 +26,8 @@ analyze_template:
  Optional detailed analysis template.
  ```

+ ---
+
  ## Guidelines
  1. **Naming**: Use descriptive names for each YAML file corresponding to the tool or task it serves.
  2. **Placeholders**: Use `{input}` or other relevant placeholders to dynamically inject data.
texttools/prompts/keyword_extractor.yaml
@@ -2,12 +2,12 @@ main_template: |
  You are an expert keyword extractor.
  Extract the most relevant keywords from the given text.
  Guidelines:
- 1. Keywords must represent the main concepts of the text.
- 2. If two words have overlapping meanings, choose only one.
- 3. Do not include generic or unrelated words.
- 4. Keywords must be single, self-contained words (no phrases).
- 5. Output between 3 and 7 keywords based on the input length.
- 6. Respond only in JSON format:
+ - Keywords must represent the main concepts of the text.
+ - If two words have overlapping meanings, choose only one.
+ - Do not include generic or unrelated words.
+ - Keywords must be single, self-contained words (no phrases).
+ - Output between 3 and 7 keywords based on the input length.
+ - Respond only in JSON format:
  {{"result": ["keyword1", "keyword2", etc.]}}
  Here is the text:
  {input}
texttools/prompts/question_merger.yaml
@@ -5,11 +5,11 @@ main_template:
  I will give you a list of questions that are semantically similar.
  Your task is to merge them into one unified question.
  Guidelines:
- 1. Preserves all the information and intent from the original questions.
- 2. Sounds natural, fluent, and concise.
- 3. Avoids redundancy or unnecessary repetition.
- 4. Does not omit any unique idea from the originals.
- 5. Respond only in JSON format:
+ - Preserves all the information and intent from the original questions.
+ - Sounds natural, fluent, and concise.
+ - Avoids redundancy or unnecessary repetition.
+ - Does not omit any unique idea from the originals.
+ - Respond only in JSON format:
  {{"result": "string"}}
  Here is the questions:
  {input}
texttools/tools/async_the_tool.py
@@ -99,7 +99,7 @@ class AsyncTheTool:
  )
  return results

- async def detect_question(
+ async def is_question(
  self,
  question: str,
  output_lang: str | None = None,
@@ -111,7 +111,7 @@
  ) -> dict[str, bool]:
  results = await self.operator.run(
  question,
- prompt_file="question_detector.yaml",
+ prompt_file="is_question.yaml",
  output_model=OutputModels.BoolOutput,
  with_analysis=with_analysis,
  resp_format="parse",
@@ -123,7 +123,7 @@
  )
  return results

- async def generate_question_from_text(
+ async def text_to_question(
  self,
  text: str,
  output_lang: str | None = None,
@@ -135,7 +135,7 @@
  ) -> dict[str, str]:
  results = await self.operator.run(
  text,
- prompt_file="question_generator.yaml",
+ prompt_file="text_to_question.yaml",
  output_model=OutputModels.StrOutput,
  with_analysis=with_analysis,
  resp_format="parse",
@@ -202,7 +202,7 @@
  )
  return results

- async def generate_questions_from_subject(
+ async def subject_to_question(
  self,
  subject: str,
  number_of_questions: int,
@@ -215,7 +215,7 @@
  ) -> dict[str, list[str]]:
  results = await self.operator.run(
  subject,
- prompt_file="subject_question_generator.yaml",
+ prompt_file="subject_to_question.yaml",
  output_model=OutputModels.ReasonListStrOutput,
  with_analysis=with_analysis,
  resp_format="parse",
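The async surface mirrors the renames: `detect_question`, `generate_question_from_text`, and `generate_questions_from_subject` become `is_question`, `text_to_question`, and `subject_to_question`. A hedged sketch of calling the new names; the import path and constructor arguments are assumptions modeled on the synchronous `TheTool` example in the README:

```python
import asyncio

from openai import AsyncOpenAI
from texttools import AsyncTheTool  # import path assumed to mirror TheTool


async def main() -> None:
    client = AsyncOpenAI(api_key="...")
    # Constructor arguments assumed to match the synchronous TheTool example.
    tool = AsyncTheTool(client=client, model="gpt-4o-mini")

    # 1.0.6 method names
    detection = await tool.is_question("Is this project open source?")
    question = await tool.text_to_question("TextTools is built on top of modern LLMs.")
    print(detection["result"], question["result"])


asyncio.run(main())
```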
texttools/tools/internals/async_operator.py
@@ -3,7 +3,8 @@ from __future__ import annotations
  import json
  import math
  import re
- from typing import Any, Literal, Optional, TypeVar
+ from typing import Any, Literal, TypeVar
+ import logging

  from openai import AsyncOpenAI
  from pydantic import BaseModel
@@ -16,6 +17,10 @@ from texttools.tools.internals.prompt_loader import PromptLoader
  # Base Model type for output models
  T = TypeVar("T", bound=BaseModel)

+ # Configure logger
+ logger = logging.getLogger("async_operator")
+ logger.setLevel(logging.INFO)
+

  class AsyncOperator:
  """
@@ -190,6 +195,7 @@

  for choice in completion.choices:
  if not getattr(choice, "logprobs", None):
+ logger.info("No logprobs found.")
  continue

  for logprob_item in choice.logprobs.content:
@@ -237,11 +243,10 @@
  try:
  cleaned_text = input_text.strip()

- # FIXED: Correct parameter order for load
  prompt_configs = prompt_loader.load(
- prompt_file=prompt_file, # prompt_file
- text=cleaned_text, # text
- mode=mode if use_modes else "", # mode
+ prompt_file=prompt_file,
+ text=cleaned_text,
+ mode=mode if use_modes else "",
  **extra_kwargs,
  )

@@ -269,7 +274,7 @@
  output_model,
  logprobs,
  top_logprobs,
- max_tokens, # Pass max_tokens
+ max_tokens,
  )
  elif resp_format == "parse":
  parsed, completion = await self._parse_completion(
@@ -277,10 +282,16 @@
  output_model,
  logprobs,
  top_logprobs,
- max_tokens, # Pass max_tokens
+ max_tokens,
  )
  else:
- raise ValueError(f"Unknown resp_format: {resp_format}")
+ logger.error(f"Unknown resp_format: {resp_format}")
+
+ # Ensure output_model has a `result` field
+ if not hasattr(parsed, "result"):
+ logger.error(
+ "The provided output_model must define a field named 'result'"
+ )

  results = {"result": parsed.result}

@@ -293,5 +304,5 @@
  return results

  except Exception as e:
- print(f"[ERROR] Async operation failed: {e}")
- raise
+ logger.error(f"Async TheTool failed: {e}")
+ return {"Error": str(e), "result": ""}
texttools/tools/internals/operator.py
@@ -2,7 +2,7 @@ from __future__ import annotations

  import math
  import re
- from typing import Any, TypeVar, Type, Literal, Optional
+ from typing import Any, TypeVar, Type, Literal
  import json
  import logging

@@ -291,5 +291,5 @@
  return results

  except Exception as e:
- logger.error(f"Operation failed: {e}")
+ logger.error(f"TheTool failed: {e}")
  return {"Error": str(e), "result": ""}
texttools/tools/internals/prompt_loader.py
@@ -1,4 +1,4 @@
- from typing import Optional
+ from functools import lru_cache
  from pathlib import Path
  import yaml

@@ -7,10 +7,6 @@ class PromptLoader:
  """
  Utility for loading and formatting YAML prompt templates.

- Each YAML file under `prompts/` must define at least a `main_template`,
- and optionally an `analyze_template`. These can either be a single string
- or a dictionary keyed by mode names (if `use_modes=True`).
-
  Responsibilities:
  - Load and parse YAML prompt definitions.
  - Select the right template (by mode, if applicable).
@@ -22,31 +18,30 @@
  }
  """

+ def __init__(self):
+ self.base_dir = Path(__file__).parent.parent.parent / Path("prompts")
+
  MAIN_TEMPLATE: str = "main_template"
  ANALYZE_TEMPLATE: str = "analyze_template"

- def _load_templates(
- self,
- prompts_dir: str,
- prompt_file: str,
- mode: str | None,
- ) -> dict[str, str]:
- prompt_path = Path(__file__).parent.parent.parent / prompts_dir / prompt_file
+ # Use lru_cache to load each file once
+ @lru_cache(maxsize=32)
+ def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
+ prompt_path = self.base_dir / prompt_file

  if not prompt_path.exists():
  raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

  try:
- # Load the data
  data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
  except yaml.YAMLError as e:
  raise ValueError(f"Invalid YAML in {prompt_path}: {e}")

  return {
- "main_template": data[self.MAIN_TEMPLATE][mode]
+ self.MAIN_TEMPLATE: data[self.MAIN_TEMPLATE][mode]
  if mode
  else data[self.MAIN_TEMPLATE],
- "analyze_template": data.get(self.ANALYZE_TEMPLATE)[mode]
+ self.ANALYZE_TEMPLATE: data.get(self.ANALYZE_TEMPLATE)[mode]
  if mode
  else data.get(self.ANALYZE_TEMPLATE),
  }
@@ -59,14 +54,9 @@
  return format_args

  def load(
- self,
- prompt_file: str,
- text: str,
- mode: str,
- prompts_dir: str = "prompts",
- **extra_kwargs,
+ self, prompt_file: str, text: str, mode: str, **extra_kwargs
  ) -> dict[str, str]:
- template_configs = self._load_templates(prompts_dir, prompt_file, mode)
+ template_configs = self._load_templates(prompt_file, mode)
  format_args = self._build_format_args(text, **extra_kwargs)

  # Inject variables inside each template
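The loader now caches parsed YAML with `functools.lru_cache` keyed on the method arguments. A standalone sketch of the same pattern (a toy class, not the library's `PromptLoader`), mainly to illustrate the caching behaviour and its usual caveats:

```python
from __future__ import annotations

from functools import lru_cache
from pathlib import Path

import yaml


class TinyLoader:
    """Toy illustration of the caching approach PromptLoader adopts in 1.0.6."""

    def __init__(self, base_dir: str) -> None:
        self.base_dir = Path(base_dir)

    @lru_cache(maxsize=32)
    def _load_templates(self, prompt_file: str, mode: str | None) -> dict:
        # lru_cache on a method includes `self` in the cache key, so each
        # instance gets its own entries and stays referenced by the cache;
        # all arguments must be hashable for the lookup to work.
        return yaml.safe_load((self.base_dir / prompt_file).read_text(encoding="utf-8"))
```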
texttools/tools/the_tool.py
@@ -17,11 +17,11 @@ class TheTool:
  - categorize: assign a text to one of several Islamic categories.
  - extract_keywords: produce a keyword list from text.
  - extract_entities: simple NER (name/type pairs).
- - detect_question: binary check whether input is a question.
- - generate_question_from_text: produce a new question from a text.
+ - is_question: binary check whether input is a question.
+ - text_to_question: produce a new question from a text.
  - merge_questions: combine multiple questions (default/reason modes).
  - rewrite: rephrase questions (same meaning/different wording, or vice versa).
- - generate_questions_from_subject: generate multiple questions given a subject.
+ - subject_to_question: generate multiple questions given a subject.
  - summarize: produce a concise summary of a subject.
  - translate: translate text between languages.

@@ -174,7 +174,7 @@
  top_logprobs=self.top_logprobs if top_logprobs is None else top_logprobs,
  )

- def detect_question(
+ def is_question(
  self,
  text: str,
  model: str | None = None,
@@ -196,7 +196,7 @@
  """
  return self.operator.run(
  # Internal parameters
- prompt_file="question_detector.yaml",
+ prompt_file="is_question.yaml",
  output_model=OutputModels.BoolOutput,
  resp_format="parse",
  output_lang=False,
@@ -212,7 +212,7 @@
  top_logprobs=self.top_logprobs if top_logprobs is None else top_logprobs,
  )

- def generate_question_from_text(
+ def text_to_question(
  self,
  text: str,
  model: str | None = None,
@@ -235,7 +235,7 @@
  """
  return self.operator.run(
  # Internal parameters
- prompt_file="question_generator.yaml",
+ prompt_file="text_to_question.yaml",
  output_model=OutputModels.StrOutput,
  resp_format="parse",
  # User parameters
@@ -340,7 +340,7 @@
  top_logprobs=self.top_logprobs if top_logprobs is None else top_logprobs,
  )

- def generate_questions_from_subject(
+ def subject_to_question(
  self,
  text: str,
  number_of_questions: int,
@@ -366,7 +366,7 @@
  """
  return self.operator.run(
  # Internal parameters
- prompt_file="subject_question_generator.yaml",
+ prompt_file="subject_to_question.yaml",
  output_model=OutputModels.ReasonListStrOutput,
  resp_format="parse",
  # User parameters
@@ -463,14 +463,14 @@
  top_logprobs=self.top_logprobs if top_logprobs is None else top_logprobs,
  )

- def custom_tool(
+ def run_custom(
  self,
  prompt: str,
  output_model: Any,
  model: str | None = None,
  output_lang: str | None = None,
  temperature: float | None = None,
- logprobs: float | None = None,
+ logprobs: bool | None = None,
  top_logprobs: int | None = None,
  ) -> dict[str, Any]:
  """
@@ -485,7 +485,7 @@
  """
  return self.operator.run(
  # Internal parameters
- prompt_file="custom_tool.yaml",
+ prompt_file="run_custom.yaml",
  resp_format="parse",
  user_prompt=False,
  with_analysis=False,
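Taken together, the public surface in 1.0.6 is the renamed methods plus the corrected `logprobs: bool | None` annotation on `run_custom()`. A short end-to-end sketch using the new names; the import path and constructor follow the README example, while the prompt text, model name, and output model are placeholders:

```python
from openai import OpenAI
from pydantic import BaseModel

from texttools import TheTool  # import path assumed from the README example


class Pairs(BaseModel):
    result: list[dict[str, int]]  # output models are expected to expose a `result` field


client = OpenAI(api_key="...")
the_tool = TheTool(client=client, model="gpt-4o-mini")

# Renamed in 1.0.6 (was detect_question); logprobs is a boolean flag.
detection = the_tool.is_question("Is this project open source?", logprobs=True, top_logprobs=2)
print(detection["result"], detection.get("logprobs"))

# Renamed in 1.0.6 (was custom_tool).
custom = the_tool.run_custom("Count the words in: 'open source tools'.", Pairs)
print(custom["result"])
```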