hamtaa-texttools 0.1.44__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic. Click here for more details.

Files changed (70) hide show
  1. hamtaa_texttools-1.0.1.dist-info/METADATA +129 -0
  2. hamtaa_texttools-1.0.1.dist-info/RECORD +18 -0
  3. hamtaa_texttools-1.0.1.dist-info/licenses/LICENSE +21 -0
  4. {hamtaa_texttools-0.1.44.dist-info → hamtaa_texttools-1.0.1.dist-info}/top_level.txt +0 -0
  5. texttools/__init__.py +4 -21
  6. texttools/formatters/base_formatter.py +33 -0
  7. texttools/formatters/user_merge_formatter/user_merge_formatter.py +47 -0
  8. texttools/tools/__init__.py +2 -32
  9. texttools/tools/operator.py +236 -0
  10. texttools/tools/output_models.py +54 -0
  11. texttools/tools/prompt_loader.py +84 -0
  12. texttools/tools/the_tool.py +291 -0
  13. texttools/utils/__init__.py +4 -0
  14. texttools/{batch_manager → utils/batch_manager}/__init__.py +2 -0
  15. texttools/{batch_manager → utils/batch_manager}/batch_manager.py +11 -12
  16. texttools/{batch_manager → utils/batch_manager}/batch_runner.py +20 -15
  17. hamtaa_texttools-0.1.44.dist-info/METADATA +0 -60
  18. hamtaa_texttools-0.1.44.dist-info/RECORD +0 -60
  19. texttools/base/__init__.py +0 -3
  20. texttools/base/base_categorizer.py +0 -40
  21. texttools/base/base_keyword_extractor.py +0 -35
  22. texttools/base/base_ner_extractor.py +0 -61
  23. texttools/base/base_question_detector.py +0 -35
  24. texttools/base/base_question_generator.py +0 -99
  25. texttools/base/base_question_merger.py +0 -59
  26. texttools/base/base_question_rewriter.py +0 -61
  27. texttools/base/base_router.py +0 -33
  28. texttools/base/base_summarizer.py +0 -55
  29. texttools/base/base_task_performer.py +0 -53
  30. texttools/base/base_translator.py +0 -38
  31. texttools/formatter/__init__.py +0 -1
  32. texttools/formatter/base.py +0 -26
  33. texttools/formatter/gemma3_formatter.py +0 -54
  34. texttools/handlers/__init__.py +0 -6
  35. texttools/handlers/categorizer/__init__.py +0 -6
  36. texttools/handlers/categorizer/categorizer.py +0 -61
  37. texttools/handlers/handlers.py +0 -88
  38. texttools/tools/categorizer/__init__.py +0 -2
  39. texttools/tools/categorizer/encoder_model/__init__.py +0 -1
  40. texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
  41. texttools/tools/categorizer/llm/__init__.py +0 -2
  42. texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
  43. texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
  44. texttools/tools/keyword_extractor/__init__.py +0 -1
  45. texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
  46. texttools/tools/merger/__init__.py +0 -2
  47. texttools/tools/merger/gemma_question_merger.py +0 -214
  48. texttools/tools/ner/__init__.py +0 -1
  49. texttools/tools/ner/gemma_ner_extractor.py +0 -157
  50. texttools/tools/question_detector/__init__.py +0 -2
  51. texttools/tools/question_detector/gemma_detector.py +0 -114
  52. texttools/tools/question_detector/llm_detector.py +0 -112
  53. texttools/tools/question_generator/__init__.py +0 -1
  54. texttools/tools/question_generator/gemma_question_generator.py +0 -198
  55. texttools/tools/reranker/__init__.py +0 -3
  56. texttools/tools/reranker/reranker.py +0 -137
  57. texttools/tools/reranker/scorer.py +0 -216
  58. texttools/tools/reranker/sorter.py +0 -278
  59. texttools/tools/rewriter/__init__.py +0 -2
  60. texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
  61. texttools/tools/router/gemma_router.py +0 -169
  62. texttools/tools/subject_to_question/__init__.py +0 -1
  63. texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
  64. texttools/tools/summarizer/__init__.py +0 -2
  65. texttools/tools/summarizer/gemma_summarizer.py +0 -140
  66. texttools/tools/summarizer/llm_summerizer.py +0 -108
  67. texttools/tools/translator/__init__.py +0 -1
  68. texttools/tools/translator/gemma_translator.py +0 -202
  69. {hamtaa_texttools-0.1.44.dist-info → hamtaa_texttools-1.0.1.dist-info}/WHEEL +0 -0
  70. /texttools/{tools/router → prompts}/__init__.py +0 -0
@@ -0,0 +1,84 @@
1
+ from typing import Optional
2
+ from pathlib import Path
3
+ import yaml
4
+
5
+
6
class PromptLoader:
    """
    Utility for loading and formatting YAML prompt templates.

    Each YAML file under `prompts/` must define at least a `main_template`,
    and optionally an `analyze_template`. These can either be a single string
    or a dictionary keyed by mode names (if `use_modes=True`).

    Responsibilities:
    - Load and parse YAML prompt definitions.
    - Select the right template (by mode, if applicable).
    - Inject variables (`{input}`, plus any extra kwargs) into the templates.
    - Return a dict with:
        {
            "main_template": "...",
            "analyze_template": "..." | None
        }
    """

    MAIN_TEMPLATE: str = "main_template"
    ANALYZE_TEMPLATE: str = "analyze_template"

    def __init__(self, prompts_dir: Optional[str] = None):
        """
        Args:
            prompts_dir: Directory holding the YAML prompt files, resolved
                relative to the package root (the parent of this module's
                directory). Falls back to "prompts" when omitted or empty.
        """
        self.PROMPTS_DIR = prompts_dir or "prompts"

    def _get_prompt_path(self, prompt_file: str) -> Path:
        """Return the full path of `prompt_file` inside the prompts directory."""
        return Path(__file__).parent.parent / self.PROMPTS_DIR / prompt_file

    def _select_templates(
        self, data: object, prompt_file: str, use_modes: bool, mode: str
    ) -> dict[str, Optional[str]]:
        """
        Pick the main/analyze templates out of parsed YAML content.

        Args:
            data: Whatever the YAML parser returned for the prompt file.
            prompt_file: File name, used only in error messages.
            use_modes: If True, each template is a mapping indexed by `mode`.
            mode: Mode name to look up when `use_modes` is True.

        Returns:
            A dict with the `main_template` and `analyze_template` keys; the
            latter is None when the file does not define an analyze template.

        Raises:
            ValueError: If `data` is not a mapping or lacks `main_template`.
            KeyError: If `use_modes` is True and `mode` is not defined for a
                template that is present.
        """
        # An empty YAML document parses to None and a top-level list/scalar is
        # not a mapping either; report those the same way as a missing key
        # instead of failing with an unrelated TypeError.
        if not isinstance(data, dict) or self.MAIN_TEMPLATE not in data:
            raise ValueError(
                f"Missing required '{self.MAIN_TEMPLATE}' in {prompt_file}"
            )

        main_template = data[self.MAIN_TEMPLATE]
        analyze_template = data.get(self.ANALYZE_TEMPLATE)

        if use_modes:
            main_template = main_template[mode]
            # The analyze template is optional: only index it when defined.
            if analyze_template is not None:
                analyze_template = analyze_template[mode]

        return {
            self.MAIN_TEMPLATE: main_template,
            self.ANALYZE_TEMPLATE: analyze_template,
        }

    def _load_templates(
        self, prompt_file: str, use_modes: bool, mode: str
    ) -> dict[str, Optional[str]]:
        """
        Read `prompt_file`, parse it as YAML and select its templates.

        Raises:
            FileNotFoundError: If the prompt file does not exist.
            ValueError: If the file is not valid YAML or lacks `main_template`.
        """
        prompt_path = self._get_prompt_path(prompt_file)

        if not prompt_path.exists():
            raise FileNotFoundError(f"Prompt file not found: {prompt_path}")

        try:
            data = yaml.safe_load(prompt_path.read_text(encoding="utf-8"))
        except yaml.YAMLError as e:
            # Chain the parser error so the original traceback is preserved.
            raise ValueError(f"Invalid YAML in {prompt_path}: {e}") from e

        return self._select_templates(data, prompt_file, use_modes, mode)

    def _build_format_args(self, input_text: str, **extra_kwargs) -> dict[str, str]:
        """Build the substitution mapping: `input` plus any extra kwargs."""
        # Base formatting args
        format_args = {"input": input_text}
        # Merge extras
        format_args.update(extra_kwargs)
        return format_args

    @staticmethod
    def _format_templates(
        templates: dict[str, Optional[str]], format_args: dict[str, str]
    ) -> dict[str, Optional[str]]:
        """
        Apply `str.format(**format_args)` to every template that is present.

        Templates that are None (e.g. an undefined analyze template) are
        passed through unchanged rather than formatted.
        """
        return {
            key: None if template is None else template.format(**format_args)
            for key, template in templates.items()
        }

    def load_prompts(
        self,
        prompt_file: str,
        use_modes: bool,
        mode: str,
        input_text: str,
        **extra_kwargs,
    ) -> dict[str, Optional[str]]:
        """
        Load the templates from `prompt_file` and inject variables into them.

        Args:
            prompt_file: YAML file name inside the prompts directory.
            use_modes: Whether templates are keyed by mode name.
            mode: Mode to select when `use_modes` is True.
            input_text: Value substituted for `{input}`.
            **extra_kwargs: Additional placeholder values.

        Returns:
            {"main_template": <formatted str>,
             "analyze_template": <formatted str> or None}
        """
        template_configs = self._load_templates(prompt_file, use_modes, mode)
        format_args = self._build_format_args(input_text, **extra_kwargs)

        # Inject variables inside each template
        return self._format_templates(template_configs, format_args)
@@ -0,0 +1,291 @@
1
+ from typing import Literal, Any
2
+
3
+ from openai import OpenAI
4
+
5
+ from texttools.tools.operator import Operator
6
+ import texttools.tools.output_models as OutputModels
7
+
8
+
9
class TheTool:
    """
    High-level interface exposing specialized text tools.

    Each method configures the operator with a specific YAML prompt,
    output schema, and flags, then delegates execution to `operator.run()`.

    Supported capabilities:
    - categorize: assign a text to one of several Islamic categories.
    - extract_keywords: produce a keyword list from text.
    - extract_entities: simple NER (name/type pairs).
    - detect_question: binary check whether input is a question.
    - generate_question_from_text: produce a new question from a text.
    - merge_questions: combine multiple questions (default/reason modes).
    - rewrite_question: rephrase questions (same meaning/different wording, or vice versa).
    - generate_questions_from_subject: generate multiple questions given a subject.
    - summarize: produce a concise summary of a subject.
    - translate: translate text between languages.

    Usage pattern:
        client = OpenAI(...)
        tool = TheTool(client, model="gemma-3")
        # "متن ورودی" is Persian for "input text"
        result = tool.categorize("متن ورودی ...", with_analysis=True)
    """

    def __init__(
        self,
        client: OpenAI,
        *,
        model: str,
        temperature: float = 0.0,
        **client_kwargs: Any,
    ):
        """
        Args:
            client: OpenAI client handed to the underlying operator.
            model: Model name forwarded to the operator.
            temperature: Sampling temperature forwarded to the operator.
            **client_kwargs: Extra keyword arguments forwarded to the operator.
        """
        self.operator = Operator(
            client=client,
            model=model,
            temperature=temperature,
            **client_kwargs,
        )

    def _run_tool(
        self,
        input_text: str,
        *,
        prompt_file: str,
        output_model: Any,
        with_analysis: bool,
        mode: Any = None,
        **prompt_kwargs: Any,
    ) -> Any:
        """
        Configure the shared operator for one tool and execute it.

        Args:
            input_text: Text passed as the positional argument of `operator.run`.
            prompt_file: YAML prompt file name assigned to `PROMPT_FILE`.
            output_model: Output schema assigned to `OUTPUT_MODEL`.
            with_analysis: Value assigned to `WITH_ANALYSIS`.
            mode: Mode name (str) for mode-based prompts, or None. When None,
                `USE_MODES` is set to False and `MODE` is left untouched.
            **prompt_kwargs: Extra keyword arguments forwarded to `operator.run`.

        Returns:
            Whatever `operator.run` returns.
        """
        self.operator.PROMPT_FILE = prompt_file
        self.operator.OUTPUT_MODEL = output_model
        self.operator.WITH_ANALYSIS = with_analysis
        self.operator.USE_MODES = mode is not None
        if mode is not None:
            self.operator.MODE = mode

        return self.operator.run(input_text, **prompt_kwargs)

    def categorize(self, text: str, with_analysis: bool = False) -> dict[str, str]:
        """
        Categorize a text into a single Islamic studies domain category.

        Args:
            text: Input string to categorize.
            with_analysis: If True, first runs an LLM "analysis" step and
                conditions the main prompt on that analysis.

        Returns:
            {"result": <category string>}
            Example: {"result": "باورهای دینی"}  (Persian for "religious beliefs")
        """
        return self._run_tool(
            text,
            prompt_file="categorizer.yaml",
            output_model=OutputModels.CategorizerOutput,
            with_analysis=with_analysis,
        )

    def extract_keywords(
        self, text: str, with_analysis: bool = False
    ) -> dict[str, list[str]]:
        """
        Extract salient keywords from text.

        Args:
            text: Input string to analyze.
            with_analysis: Whether to run an extra LLM reasoning step.

        Returns:
            {"result": [<keyword1>, <keyword2>, ...]}
        """
        return self._run_tool(
            text,
            prompt_file="keyword_extractor.yaml",
            output_model=OutputModels.ListStrOutput,
            with_analysis=with_analysis,
        )

    def extract_entities(
        self, text: str, with_analysis: bool = False
    ) -> dict[str, list[dict[str, str]]]:
        """
        Perform Named Entity Recognition (NER) over the input text.

        Args:
            text: Input string.
            with_analysis: Whether to run an extra LLM reasoning step.

        Returns:
            {"result": [{"text": <entity>, "type": <entity_type>}, ...]}
        """
        return self._run_tool(
            text,
            prompt_file="ner_extractor.yaml",
            output_model=OutputModels.ListDictStrStrOutput,
            with_analysis=with_analysis,
        )

    def detect_question(
        self, question: str, with_analysis: bool = False
    ) -> dict[str, str]:
        """
        Detect if the input is phrased as a question.

        Args:
            question: Input string to evaluate.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": "true"} or {"result": "false"}
        """
        return self._run_tool(
            question,
            prompt_file="question_detector.yaml",
            output_model=OutputModels.StrOutput,
            with_analysis=with_analysis,
        )

    def generate_question_from_text(
        self, text: str, with_analysis: bool = False
    ) -> dict[str, str]:
        """
        Generate a single question from the given text.

        Args:
            text: Source text to derive a question from.
            with_analysis: Whether to use analysis before generation.

        Returns:
            {"result": <generated_question>}
        """
        return self._run_tool(
            text,
            prompt_file="question_generator.yaml",
            output_model=OutputModels.StrOutput,
            with_analysis=with_analysis,
        )

    def merge_questions(
        self,
        questions: list[str],
        mode: Literal["default_mode", "reason_mode"] = "default_mode",
        with_analysis: bool = False,
    ) -> dict[str, str]:
        """
        Merge multiple questions into a single unified question.

        Args:
            questions: List of question strings.
            mode: Merge strategy:
                - "default_mode": simple merging.
                - "reason_mode": merging with reasoning explanation.
            with_analysis: Whether to use an analysis step.

        Returns:
            {"result": <merged_question>}
        """
        # The prompt receives the questions as one comma-separated string.
        question_str = ", ".join(questions)

        return self._run_tool(
            question_str,
            prompt_file="question_merger.yaml",
            output_model=OutputModels.StrOutput,
            with_analysis=with_analysis,
            mode=mode,
        )

    def rewrite_question(
        self,
        question: str,
        mode: Literal[
            "same_meaning_different_wording_mode",
            "different_meaning_similar_wording_mode",
        ] = "same_meaning_different_wording_mode",
        with_analysis: bool = False,
    ) -> dict[str, str]:
        """
        Rewrite a question with different wording or meaning.

        Args:
            question: Input question to rewrite.
            mode: Rewrite strategy:
                - "same_meaning_different_wording_mode": keep meaning, change words.
                - "different_meaning_similar_wording_mode": alter meaning, preserve wording style.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": <rewritten_question>}
        """
        return self._run_tool(
            question,
            prompt_file="question_rewriter.yaml",
            output_model=OutputModels.StrOutput,
            with_analysis=with_analysis,
            mode=mode,
        )

    def generate_questions_from_subject(
        self,
        subject: str,
        number_of_questions: int,
        language: str = "English",
        with_analysis: bool = False,
    ) -> dict[str, list[str]]:
        """
        Generate a list of questions about a subject.

        Args:
            subject: Topic of interest.
            number_of_questions: Number of questions to produce.
            language: Target language for generated questions.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": [<question1>, <question2>, ...]}
        """
        return self._run_tool(
            subject,
            prompt_file="subject_question_generator.yaml",
            output_model=OutputModels.ReasonListStrOutput,
            with_analysis=with_analysis,
            number_of_questions=number_of_questions,
            language=language,
        )

    def summarize(self, subject: str, with_analysis: bool = False) -> dict[str, str]:
        """
        Summarize the given subject text.

        Args:
            subject: Input text to summarize.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": <summary>}
        """
        return self._run_tool(
            subject,
            prompt_file="summarizer.yaml",
            output_model=OutputModels.StrOutput,
            with_analysis=with_analysis,
        )

    def translate(
        self,
        text: str,
        target_language: str,
        with_analysis: bool = False,
    ) -> dict[str, str]:
        """
        Translate text between languages.

        Args:
            text: Input string to translate.
            target_language: Language code or name to translate into.
            with_analysis: Whether to include an analysis step.

        Returns:
            {"result": <translated_text>}
        """
        return self._run_tool(
            text,
            prompt_file="translator.yaml",
            output_model=OutputModels.StrOutput,
            with_analysis=with_analysis,
            target_language=target_language,
        )
@@ -0,0 +1,4 @@
1
+ from .batch_manager.batch_manager import SimpleBatchManager
2
+ from .batch_manager.batch_runner import BatchJobRunner
3
+
4
+ __all__ = ["SimpleBatchManager", "BatchJobRunner"]
@@ -1,2 +1,4 @@
1
1
  from .batch_manager import SimpleBatchManager
2
2
  from .batch_runner import BatchJobRunner
3
+
4
+ __all__ = ["SimpleBatchManager", "BatchJobRunner"]
@@ -6,10 +6,17 @@ from typing import Any, Optional, Type
6
6
  from pydantic import BaseModel
7
7
  from openai import OpenAI
8
8
  from openai.lib._pydantic import to_strict_json_schema
9
- # from openai.lib._parsing._completions import type_to_response_format_param
10
9
 
11
10
 
12
11
  class SimpleBatchManager:
12
+ """
13
+ Manages batch processing jobs for OpenAI's chat completions with structured outputs.
14
+
15
+ Handles the full lifecycle of a batch job: creating tasks from input texts,
16
+ starting the job, monitoring status, and fetching results. Results are automatically
17
+ parsed into the specified Pydantic output model. Job state is persisted to disk.
18
+ """
19
+
13
20
  def __init__(
14
21
  self,
15
22
  client: OpenAI,
@@ -70,18 +77,12 @@ class SimpleBatchManager:
70
77
  Builds a single task dictionary for the batch job, including the prompt, model, and response format configuration.
71
78
  """
72
79
  response_format_config: dict[str, Any]
80
+
73
81
  if self.custom_json_schema_obj_str:
74
- # try:
75
- # parsed_custom_schema = json.loads(self.custom_json_schema_obj_str)
76
82
  response_format_config = {
77
83
  "type": "json_schema",
78
84
  "json_schema": self.custom_json_schema_obj_str,
79
85
  }
80
- # except json.JSONDecodeError as e:
81
- # raise ValueError(
82
- # "Failed to parse custom_json_schema_obj_str. "
83
- # "Please ensure it's a valid JSON string."
84
- # ) from e
85
86
  else:
86
87
  raw_schema = to_strict_json_schema(self.output_model)
87
88
  response_format_config = {
@@ -167,7 +168,7 @@ class SimpleBatchManager:
167
168
  Returns a list of dictionaries with 'id' and 'output' keys.
168
169
  """
169
170
  modified_result = []
170
- # errors = []
171
+
171
172
  for key, d in result.items():
172
173
  if "desired_output" in d:
173
174
  new_dict = {"id": key, "output": d["desired_output"]}
@@ -176,7 +177,6 @@ class SimpleBatchManager:
176
177
  new_dict = {"id": key, "output": d["error"]}
177
178
  modified_result.append(new_dict)
178
179
  return modified_result
179
- # return modified_result , errors
180
180
 
181
181
  def fetch_results(
182
182
  self, job_name: str, remove_cache: bool = True
@@ -236,6 +236,5 @@ class SimpleBatchManager:
236
236
  handler.handle(results)
237
237
  if remove_cache:
238
238
  self._clear_state(job_name)
239
- # results = {"results": results, "log": log}
240
- # return results
239
+
241
240
  return results, log
@@ -5,18 +5,17 @@ from dataclasses import dataclass
5
5
  from pathlib import Path
6
6
  from typing import Any, Callable
7
7
 
8
- # from dotenv import load_dotenv
9
8
  from openai import OpenAI
10
9
  from pydantic import BaseModel
11
10
 
12
- from texttools.batch_manager import SimpleBatchManager
11
+ from texttools.utils.batch_manager import SimpleBatchManager
13
12
 
14
13
 
15
- class OutputModel(BaseModel):
16
- desired_output: str
14
+ class Output(BaseModel):
15
+ output: str
17
16
 
18
17
 
19
- def exporting_data(data):
18
+ def export_data(data):
20
19
  """
21
20
  Produces a structure of the following form from an initial data structure:
22
21
  [
@@ -26,7 +25,7 @@ def exporting_data(data):
26
25
  return data
27
26
 
28
27
 
29
- def importing_data(data):
28
+ def import_data(data):
30
29
  """
31
30
  Takes the output and adds and aggregates it to the original structure.
32
31
  """
@@ -49,17 +48,21 @@ class BatchConfig:
49
48
  CHARS_PER_TOKEN: float = 2.7
50
49
  PROMPT_TOKEN_MULTIPLIER: int = 1000
51
50
  BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
52
- import_function: Callable = importing_data
53
- export_function: Callable = exporting_data
51
+ import_function: Callable = import_data
52
+ export_function: Callable = export_data
54
53
 
55
54
 
56
55
  class BatchJobRunner:
57
56
  """
58
- Handles running batch jobs using a batch manager and configuration.
57
+ Orchestrates the execution of batched LLM processing jobs.
58
+
59
+ Handles data loading, partitioning, job execution via SimpleBatchManager,
60
+ and result saving. Manages the complete workflow from input data to processed outputs,
61
+ including retries and progress tracking across multiple batch parts.
59
62
  """
60
63
 
61
64
  def __init__(
62
- self, config: BatchConfig = BatchConfig(), output_model: type = OutputModel
65
+ self, config: BatchConfig = BatchConfig(), output_model: type = Output
63
66
  ):
64
67
  self.config = config
65
68
  self.system_prompt = config.system_prompt
@@ -75,7 +78,6 @@ class BatchJobRunner:
75
78
  Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
76
79
 
77
80
  def _init_manager(self) -> SimpleBatchManager:
78
- # load_dotenv()
79
81
  api_key = os.getenv("OPENAI_API_KEY")
80
82
  client = OpenAI(api_key=api_key)
81
83
  return SimpleBatchManager(
@@ -90,7 +92,7 @@ class BatchJobRunner:
90
92
  data = json.load(f)
91
93
  data = self.config.export_function(data)
92
94
 
93
- # Validation: ensure data is a list of dicts with 'id' and 'content' as strings
95
+ # Ensure data is a list of dicts with 'id' and 'content' as strings
94
96
  if not isinstance(data, list):
95
97
  raise ValueError(
96
98
  'Exported data must be a list in this form: [ {"id": str, "content": str},...]'
@@ -159,10 +161,13 @@ class BatchJobRunner:
159
161
  elif status == "failed":
160
162
  print("Job failed. Clearing state, waiting, and retrying...")
161
163
  self.manager._clear_state(part_job_name)
162
- time.sleep(10) # Wait before retrying
163
- break # Break inner loop to restart the job
164
+ # Wait before retrying
165
+ time.sleep(10)
166
+ # Break inner loop to restart the job
167
+ break
164
168
  else:
165
- time.sleep(5) # Wait before checking again
169
+ # Wait before checking again
170
+ time.sleep(5)
166
171
 
167
172
  def _save_results(
168
173
  self, output_data: list[dict[str, Any]], log: list[Any], part_idx: int
@@ -1,60 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: hamtaa-texttools
3
- Version: 0.1.44
4
- Summary: A set of high-level NLP tools
5
- Author: Tohidi, Montazer, Givechi, Mousavinezhad
6
- Requires-Python: >=3.8
7
- Description-Content-Type: text/markdown
8
- Requires-Dist: openai>=1.97.0
9
- Requires-Dist: numpy>=1.26.4
10
-
11
- # Text Tools
12
-
13
- <p align="center">
14
- <img src="https://img.shields.io/badge/TextTools-Python%20Text%20Processing-black?style=for-the-badge&logo=python&logoColor=white">
15
- </p>
16
-
17
-
18
- <p align="center">
19
- <img src="docs/logo.png" alt="Preview" width="300" height="300">
20
- </p>
21
-
22
-
23
- ## How to Install
24
-
25
- Install the package using:
26
-
27
- ```bash
28
- pip install -U hamtaa-texttools
29
- ```
30
-
31
-
32
- ---
33
-
34
- ## What This Library Is *Not*
35
-
36
- This is **not** a collection of low-level utilities.
37
-
38
- To clarify: this library **does not** include things like:
39
- - A standard `regex`
40
- - Word normalization utilities
41
-
42
- ---
43
-
44
- ## What This Library *Provides*
45
-
46
- This is a set of **high-level natural language processing (NLP)** tools.
47
-
48
- Some of the features include:
49
- - `question_detector`: Detecting if an incoming text is a question or not
50
- - `categorizer`: A categorizer that requires no fine-tuning
51
- - ... (Tell me what you want!)
52
-
53
- ---
54
-
55
- ## When to Use This Library
56
-
57
- Use `texttools` when:
58
- - You need to **process large volumes of data using OpenAI’s GPT models** via the BATCH API.
59
- - You want to treat an **LLM as a function** in Python that outputs structured JSON or Pydantic models.
60
- - You need to **categorize large datasets** using vector embeddings, efficiently and at scale.
@@ -1,60 +0,0 @@
1
- texttools/__init__.py,sha256=cI10Q_zaM9DPUCVOM79gZceuyt6Pjgpj3R-AG7xgUM8,778
2
- texttools/base/__init__.py,sha256=KUGm-Oe0BxlrRhPS-Jm2q1NCmwX8MdtZtloia7bcLaM,189
3
- texttools/base/base_categorizer.py,sha256=ojup94iXLxh92TjiJmrFXeRbsWKlon7PPAqez96B1bs,1130
4
- texttools/base/base_keyword_extractor.py,sha256=uKpxb3xI-sim-vXWe1R4_36QRhSNsWDR4IuVdpkZMME,868
5
- texttools/base/base_ner_extractor.py,sha256=D0LRNSyq1uIU9Qtepi7zpCWWzYz-AOxpVNjq97S1oUA,1933
6
- texttools/base/base_question_detector.py,sha256=FR9yDP0Z8aAfGafZy3kcpSDUUYWLJM7saRKdeVN5TiM,829
7
- texttools/base/base_question_generator.py,sha256=L_2ZwqyV9GxsKiQynWKRJG15OBFgQqiCic5H0i8R5yk,3238
8
- texttools/base/base_question_merger.py,sha256=TYhsihKaIdyGCVu4AcjxPZ1_HocHt__voV8WWGMRpMs,1945
9
- texttools/base/base_question_rewriter.py,sha256=K6ZnAjxi2qw4yLxm92zTI1IStCfX6c_6lCfIuBDSx8w,1973
10
- texttools/base/base_router.py,sha256=pFDjIXFqAhPiS9Onu5py_GxOq8geDGJDQh6k6IhCkvw,933
11
- texttools/base/base_summarizer.py,sha256=7NAilhUPs6ZUwkBpTtXAj6n2XxQH1w6SOolf3gQX2gc,1627
12
- texttools/base/base_task_performer.py,sha256=3-6qshkie50S7pRG4WHRNC_RdUbSmHOPKW56CD92-rM,1852
13
- texttools/base/base_translator.py,sha256=BoOxqaoPoUs8t1O3m2yL9pQa5iwisl097immTVcGZoE,1020
14
- texttools/batch_manager/__init__.py,sha256=3ZkxA395lRD4gNxJ1vp0fNuz_XuBr50GoP51rrwQ0Ks,87
15
- texttools/batch_manager/batch_manager.py,sha256=jAmKskL3OTYwwsO1mWsWAB3VxMlOF07c2GW1Ev83ZhY,9283
16
- texttools/batch_manager/batch_runner.py,sha256=kW0IPauI11xpssApMA7b4XI19FePImywym3V7tBaa-o,7404
17
- texttools/formatter/__init__.py,sha256=KHz2tFZctbit_HVbQNCTMi46JzmKlg-uB6Ost63IpVU,46
18
- texttools/formatter/base.py,sha256=0fiM6E7NdJevAVpL6yyPaUZVJGKWxE3fr-Ay1oqgJqQ,879
19
- texttools/formatter/gemma3_formatter.py,sha256=AmdKBYLj6HMsI2DDX4KHNEEVYJmz_VVNUBOv8ScGjsY,1865
20
- texttools/handlers/__init__.py,sha256=sv0JloipQ57AI0xo-3w9k6cK5rYjZP3ltR2EbBhkHTA,121
21
- texttools/handlers/handlers.py,sha256=LtC4FBuzRUDy3Jw-Fp21WR-QS1jOcDhsGaMPFQGjfTw,2381
22
- texttools/handlers/categorizer/__init__.py,sha256=mE05vt_ma6vcP8pQ37BZ85WVQ8jhcjDS0iZV81_LFCY,127
23
- texttools/handlers/categorizer/categorizer.py,sha256=HBpdhtCGUPl1TJUOxbgSLmVWD7o9xeIjmSWXvYzGrCA,1592
24
- texttools/tools/__init__.py,sha256=V3ZjSj_ZI9r02sOmxpxxxKBbBbtuYS1MQqtrdGZHC_A,1121
25
- texttools/tools/categorizer/__init__.py,sha256=VY0SVdik0et0fwLDj7qn-d5LtVqVBIalvlRVci699i4,48
26
- texttools/tools/categorizer/encoder_model/__init__.py,sha256=7UwoPlQ09VGN0cqfi5fPQRfsZZ8hoZj6fL6cax1BLSU,53
27
- texttools/tools/categorizer/encoder_model/encoder_vectorizer.py,sha256=MHPVJQJlvNhZ5xLVXk4FtvrORW2yxPSAnjEhjPbkQts,1476
28
- texttools/tools/categorizer/llm/__init__.py,sha256=0VbxvInITfNUlOF6bJqcUKKaYWlIe9K3vRmIRuvAGcY,95
29
- texttools/tools/categorizer/llm/gemma_categorizer.py,sha256=tjwKonTjT5cAhxWQaVyvyooRyOlGACHpnn72PNoLk-8,5636
30
- texttools/tools/categorizer/llm/openai_categorizer.py,sha256=omRk77Z5ZCIAz17h4wPDP_EcBSsscA-PQJpQjtI6--o,2547
31
- texttools/tools/keyword_extractor/__init__.py,sha256=eTpujS85MmRRbnNwc2ifKUh60W8OG4RQFmWki3Z7C_0,84
32
- texttools/tools/keyword_extractor/gemma_extractor.py,sha256=TJ4wMPWRuuzRi_Q0hr7UauKhEg8U_5U5j1D_lTFrn4s,4349
33
- texttools/tools/merger/__init__.py,sha256=bh2RBpqJvDaqEmDrM9y_GcjRqibagifAxiZVu8nEHc0,115
34
- texttools/tools/merger/gemma_question_merger.py,sha256=JAC-52kBbabIzEWp0MFi9viiu8nZOAMPaJZALHvNMqo,8035
35
- texttools/tools/ner/__init__.py,sha256=BW84BcItel6Mc2JlaDL6qvAktVMkti67VXceeCnOB1g,70
36
- texttools/tools/ner/gemma_ner_extractor.py,sha256=YhyIwX_8bdwkFb4gY8g9mZdYHW_r1jCvbmjjNCK9Wfo,5384
37
- texttools/tools/question_detector/__init__.py,sha256=ulArGttooSoxEe0vUDQSxUQrnsxr7gH9l-LjSER2dVI,162
38
- texttools/tools/question_detector/gemma_detector.py,sha256=DhlCAA6Hws_OTuYil6UY4sYlbjdQQU6EqHdoTl3a--w,3772
39
- texttools/tools/question_detector/llm_detector.py,sha256=zo89eh359hqQGGf83-6M22AaiH7q-m0m91SjTyxZaYs,3862
40
- texttools/tools/question_generator/__init__.py,sha256=EAElpB_YeyMoBqvFNjbW2a_j18SLtiKQ7sRmdS58Fww,61
41
- texttools/tools/question_generator/gemma_question_generator.py,sha256=V5QcXmHZ5shTvrThOxUrKJ4FqP0P58NIJbsPdyyy5IM,6744
42
- texttools/tools/reranker/__init__.py,sha256=70jqJ9cjpPzzvnMYgHYGVZ9PrWrN9N97visqD_PVxwU,100
43
- texttools/tools/reranker/reranker.py,sha256=2SiTMIxempMuHui2n4GJV_2dLGBeoC7WAn_rVVXlMBA,5518
44
- texttools/tools/reranker/scorer.py,sha256=fQ3Ya8QmNhrcmb-Rf-72hvhweGvVj6gQ4KOlham2eE8,8176
45
- texttools/tools/reranker/sorter.py,sha256=_ed5zGz7K60skPFFuEQZ1ObBFA71LAfVT6FyWicA-Pw,11419
46
- texttools/tools/rewriter/__init__.py,sha256=U_qwGeEOqHAcV4p2CHVb0AIvHKFfdvykRzGyWD54aWA,121
47
- texttools/tools/rewriter/gemma_question_rewriter.py,sha256=jXtRswfBvHn9QmE90JvxEmLvCTbwZqZhD_A5ONWeCzo,7925
48
- texttools/tools/router/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
- texttools/tools/router/gemma_router.py,sha256=VX-kHphZVZNd0_ajugN08hGkWNUeUriwfonpYy2TIS4,5619
50
- texttools/tools/subject_to_question/__init__.py,sha256=VJpns16Qe5OL_-4WuGDUNShcJsodB2khGWT3Q1Hc-WU,72
51
- texttools/tools/subject_to_question/gemma_question_generator.py,sha256=VKXHhYHEvhFLUR87iEh0eFpD_4ueX4np8IjF-NkgWrY,7417
52
- texttools/tools/summarizer/__init__.py,sha256=phrR7qO20CNhO3hjXQBzhTRVumdVdGSufmH4GEYkhj4,140
53
- texttools/tools/summarizer/gemma_summarizer.py,sha256=ikhsBv7AiZD1dT_d12AyjXxojzSW92e2y5WjchI_3bE,4474
54
- texttools/tools/summarizer/llm_summerizer.py,sha256=-0rUKbSnl1aDeBfJ5DCSbIlwd2k-9qIaCKgoQJa0hWc,3412
55
- texttools/tools/translator/__init__.py,sha256=KO1m08J2BZwRqBGO9ICB4l4cnH1jfHLHL5HbgYFUWM8,72
56
- texttools/tools/translator/gemma_translator.py,sha256=57NMfJAZHQjZSr_eCBePE_Pnag8pu3O00Jicxhzn6Jc,7572
57
- hamtaa_texttools-0.1.44.dist-info/METADATA,sha256=OImC1zmuJh7p8SY3s3mhm8poOzYOuuqx6vjOeDy5O3k,1481
58
- hamtaa_texttools-0.1.44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
59
- hamtaa_texttools-0.1.44.dist-info/top_level.txt,sha256=5Mh0jIxxZ5rOXHGJ6Mp-JPKviywwN0MYuH0xk5bEWqE,10
60
- hamtaa_texttools-0.1.44.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- from texttools.base.base_categorizer import BaseCategorizer
2
- from texttools.base.base_question_detector import BaseQuestionDetector
3
- from texttools.base.base_summarizer import BaseSummarizer