hamtaa-texttools 0.1.48__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hamtaa-texttools might be problematic.

Files changed (86)
  1. hamtaa_texttools-1.1.7.dist-info/METADATA +228 -0
  2. hamtaa_texttools-1.1.7.dist-info/RECORD +30 -0
  3. hamtaa_texttools-1.1.7.dist-info/licenses/LICENSE +21 -0
  4. texttools/__init__.py +4 -26
  5. texttools/batch/__init__.py +3 -0
  6. texttools/{utils/batch_manager → batch}/batch_manager.py +226 -241
  7. texttools/batch/batch_runner.py +254 -0
  8. texttools/prompts/README.md +35 -0
  9. texttools/prompts/categorizer.yaml +28 -0
  10. texttools/prompts/extract_entities.yaml +20 -0
  11. texttools/prompts/extract_keywords.yaml +18 -0
  12. texttools/prompts/is_question.yaml +14 -0
  13. texttools/prompts/merge_questions.yaml +46 -0
  14. texttools/prompts/rewrite.yaml +111 -0
  15. texttools/prompts/run_custom.yaml +7 -0
  16. texttools/prompts/subject_to_question.yaml +22 -0
  17. texttools/prompts/summarize.yaml +14 -0
  18. texttools/prompts/text_to_question.yaml +20 -0
  19. texttools/prompts/translate.yaml +15 -0
  20. texttools/tools/__init__.py +4 -33
  21. texttools/tools/async_the_tool.py +435 -0
  22. texttools/tools/internals/async_operator.py +242 -0
  23. texttools/tools/internals/base_operator.py +100 -0
  24. texttools/tools/internals/formatters.py +24 -0
  25. texttools/tools/internals/operator.py +242 -0
  26. texttools/tools/internals/output_models.py +62 -0
  27. texttools/tools/internals/prompt_loader.py +60 -0
  28. texttools/tools/the_tool.py +433 -0
  29. hamtaa_texttools-0.1.48.dist-info/METADATA +0 -60
  30. hamtaa_texttools-0.1.48.dist-info/RECORD +0 -61
  31. texttools/base/__init__.py +0 -3
  32. texttools/base/base_categorizer.py +0 -40
  33. texttools/base/base_keyword_extractor.py +0 -35
  34. texttools/base/base_ner_extractor.py +0 -61
  35. texttools/base/base_question_detector.py +0 -35
  36. texttools/base/base_question_generator.py +0 -99
  37. texttools/base/base_question_merger.py +0 -59
  38. texttools/base/base_question_rewriter.py +0 -61
  39. texttools/base/base_router.py +0 -33
  40. texttools/base/base_summarizer.py +0 -55
  41. texttools/base/base_task_performer.py +0 -53
  42. texttools/base/base_translator.py +0 -38
  43. texttools/formatter/__init__.py +0 -1
  44. texttools/formatter/base.py +0 -26
  45. texttools/formatter/gemma3_formatter.py +0 -54
  46. texttools/handlers/__init__.py +0 -6
  47. texttools/handlers/categorizer/__init__.py +0 -6
  48. texttools/handlers/categorizer/categorizer.py +0 -61
  49. texttools/handlers/handlers.py +0 -88
  50. texttools/tools/categorizer/__init__.py +0 -2
  51. texttools/tools/categorizer/encoder_model/__init__.py +0 -1
  52. texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +0 -51
  53. texttools/tools/categorizer/llm/__init__.py +0 -2
  54. texttools/tools/categorizer/llm/gemma_categorizer.py +0 -169
  55. texttools/tools/categorizer/llm/openai_categorizer.py +0 -80
  56. texttools/tools/keyword_extractor/__init__.py +0 -1
  57. texttools/tools/keyword_extractor/gemma_extractor.py +0 -138
  58. texttools/tools/merger/__init__.py +0 -2
  59. texttools/tools/merger/gemma_question_merger.py +0 -214
  60. texttools/tools/ner/__init__.py +0 -1
  61. texttools/tools/ner/gemma_ner_extractor.py +0 -157
  62. texttools/tools/question_detector/__init__.py +0 -2
  63. texttools/tools/question_detector/gemma_detector.py +0 -114
  64. texttools/tools/question_detector/llm_detector.py +0 -112
  65. texttools/tools/question_generator/__init__.py +0 -1
  66. texttools/tools/question_generator/gemma_question_generator.py +0 -198
  67. texttools/tools/reranker/__init__.py +0 -3
  68. texttools/tools/reranker/reranker.py +0 -137
  69. texttools/tools/reranker/scorer.py +0 -216
  70. texttools/tools/reranker/sorter.py +0 -278
  71. texttools/tools/rewriter/__init__.py +0 -2
  72. texttools/tools/rewriter/gemma_question_rewriter.py +0 -213
  73. texttools/tools/router/__init__.py +0 -0
  74. texttools/tools/router/gemma_router.py +0 -169
  75. texttools/tools/subject_to_question/__init__.py +0 -1
  76. texttools/tools/subject_to_question/gemma_question_generator.py +0 -224
  77. texttools/tools/summarizer/__init__.py +0 -2
  78. texttools/tools/summarizer/gemma_summarizer.py +0 -140
  79. texttools/tools/summarizer/llm_summerizer.py +0 -108
  80. texttools/tools/translator/__init__.py +0 -1
  81. texttools/tools/translator/gemma_translator.py +0 -189
  82. texttools/utils/batch_manager/__init__.py +0 -2
  83. texttools/utils/batch_manager/batch_runner.py +0 -207
  84. texttools/utils/flex_processor.py +0 -78
  85. {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/WHEEL +0 -0
  86. {hamtaa_texttools-0.1.48.dist-info → hamtaa_texttools-1.1.7.dist-info}/top_level.txt +0 -0
texttools/batch/batch_runner.py
@@ -0,0 +1,254 @@
+ import json
+ import os
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Callable, Type, TypeVar
+ import logging
+
+ from dotenv import load_dotenv
+ from openai import OpenAI
+ from pydantic import BaseModel
+
+ from texttools.batch.batch_manager import BatchManager
+ from texttools.tools.internals.output_models import StrOutput
+
+ # Base model type for output models
+ T = TypeVar("T", bound=BaseModel)
+
+ logger = logging.getLogger("texttools.batch_runner")
+
+
+ def export_data(data) -> list[dict[str, str]]:
+     """
+     Produces a structure of the form [{"id": str, "content": str}, ...]
+     from the initial data structure.
+     """
+     return data
+
+
+ def import_data(data) -> Any:
+     """
+     Merges the batch output back into the original data structure.
+     """
+     return data
+
+
+ @dataclass
+ class BatchConfig:
+     """
+     Configuration for the batch job runner.
+     """
+
+     system_prompt: str = ""
+     job_name: str = ""
+     input_data_path: str = ""
+     output_data_filename: str = ""
+     model: str = "gpt-4.1-mini"
+     MAX_BATCH_SIZE: int = 100
+     MAX_TOTAL_TOKENS: int = 2_000_000
+     CHARS_PER_TOKEN: float = 2.7
+     PROMPT_TOKEN_MULTIPLIER: int = 1_000
+     BASE_OUTPUT_DIR: str = "Data/batch_entity_result"
+     import_function: Callable = import_data
+     export_function: Callable = export_data
+     poll_interval_seconds: int = 30
+     max_retries: int = 3
+
+
+ class BatchJobRunner:
+     """
+     Handles running batch jobs using a batch manager and configuration.
+     """
+
+     def __init__(
+         self, config: BatchConfig = BatchConfig(), output_model: Type[T] = StrOutput
+     ):
+         self.config = config
+         self.system_prompt = config.system_prompt
+         self.job_name = config.job_name
+         self.input_data_path = config.input_data_path
+         self.output_data_filename = config.output_data_filename
+         self.model = config.model
+         self.output_model = output_model
+         self.manager = self._init_manager()
+         self.data = self._load_data()
+         self.parts: list[list[dict[str, Any]]] = []
+         self._partition_data()
+         Path(self.config.BASE_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
+         # Map part index to job name
+         self.part_idx_to_job_name: dict[int, str] = {}
+         # Track retry attempts per part
+         self.part_attempts: dict[int, int] = {}
+
+     def _init_manager(self) -> BatchManager:
+         load_dotenv()
+         api_key = os.getenv("OPENAI_API_KEY")
+         client = OpenAI(api_key=api_key)
+         return BatchManager(
+             client=client,
+             model=self.model,
+             prompt_template=self.system_prompt,
+             output_model=self.output_model,
+         )
+
+     def _load_data(self):
+         with open(self.input_data_path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         data = self.config.export_function(data)
+
+         # Ensure data is a list of dicts with 'id' and 'content' as strings
+         if not isinstance(data, list):
+             raise ValueError(
+                 "Exported data must be a list of dicts with 'id' and 'content' keys"
+             )
+         for item in data:
+             if not (isinstance(item, dict) and "id" in item and "content" in item):
+                 raise ValueError(
+                     f"Item must be a dict with 'id' and 'content' keys. Got: {type(item)}"
+                 )
+             if not (isinstance(item["id"], str) and isinstance(item["content"], str)):
+                 raise ValueError("'id' and 'content' must be strings.")
+         return data
+
+     def _partition_data(self):
+         total_length = sum(len(item["content"]) for item in self.data)
+         prompt_length = len(self.system_prompt)
+         total = total_length + (prompt_length * len(self.data))
+         calculation = total / self.config.CHARS_PER_TOKEN
+         logger.info(
+             f"Total chars: {total_length}, Prompt chars: {prompt_length}, Total: {total}, Tokens: {calculation}"
+         )
+         if calculation < self.config.MAX_TOTAL_TOKENS:
+             self.parts = [self.data]
+         else:
+             # Partition into chunks of MAX_BATCH_SIZE
+             self.parts = [
+                 self.data[i : i + self.config.MAX_BATCH_SIZE]
+                 for i in range(0, len(self.data), self.config.MAX_BATCH_SIZE)
+             ]
+         logger.info(f"Data split into {len(self.parts)} part(s)")
+
+     def _submit_all_jobs(self) -> None:
+         for idx, part in enumerate(self.parts):
+             if self._result_exists(idx):
+                 logger.info(f"Skipping part {idx + 1}: result already exists.")
+                 continue
+             part_job_name = (
+                 f"{self.job_name}_part_{idx + 1}"
+                 if len(self.parts) > 1
+                 else self.job_name
+             )
+             # If a job with this name already exists, register it and skip submitting
+             existing_job = self.manager._load_state(part_job_name)
+             if existing_job:
+                 logger.info(
+                     f"Skipping part {idx + 1}: job already exists ({part_job_name})."
+                 )
+                 self.part_idx_to_job_name[idx] = part_job_name
+                 self.part_attempts.setdefault(idx, 0)
+                 continue
+
+             payload = part
+             logger.info(
+                 f"Submitting job for part {idx + 1}/{len(self.parts)}: {part_job_name}"
+             )
+             self.manager.start(payload, job_name=part_job_name)
+             self.part_idx_to_job_name[idx] = part_job_name
+             self.part_attempts.setdefault(idx, 0)
+             # Give the file time to upload before starting the next part
+             logger.info("Uploading...")
+             time.sleep(30)
+
+     def _save_results(
+         self,
+         output_data: list[dict[str, Any]] | dict[str, Any],
+         log: list[Any],
+         part_idx: int,
+     ):
+         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
+         result_path = (
+             Path(self.config.BASE_OUTPUT_DIR)
+             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
+         )
+         if not output_data:
+             logger.info("No output data to save. Skipping this part.")
+             return
+         with open(result_path, "w", encoding="utf-8") as f:
+             json.dump(output_data, f, ensure_ascii=False, indent=4)
+         if log:
+             log_path = (
+                 Path(self.config.BASE_OUTPUT_DIR)
+                 / f"{Path(self.output_data_filename).stem}{part_suffix}_log.json"
+             )
+             with open(log_path, "w", encoding="utf-8") as f:
+                 json.dump(log, f, ensure_ascii=False, indent=4)
+
+     def _result_exists(self, part_idx: int) -> bool:
+         part_suffix = f"_part_{part_idx + 1}" if len(self.parts) > 1 else ""
+         result_path = (
+             Path(self.config.BASE_OUTPUT_DIR)
+             / f"{Path(self.output_data_filename).stem}{part_suffix}.json"
+         )
+         return result_path.exists()
+
+     def run(self):
+         """
+         Execute the batch job processing pipeline.
+
+         Submits jobs, monitors progress, handles retries, and saves results.
+         """
+         # Submit all jobs up-front for concurrent execution
+         self._submit_all_jobs()
+         pending_parts: set[int] = set(self.part_idx_to_job_name.keys())
+         logger.info(f"Pending parts: {sorted(pending_parts)}")
+         # Polling loop
+         while pending_parts:
+             finished_this_round: list[int] = []
+             for part_idx in list(pending_parts):
+                 job_name = self.part_idx_to_job_name[part_idx]
+                 status = self.manager.check_status(job_name=job_name)
+                 logger.info(f"Status for {job_name}: {status}")
+                 if status == "completed":
+                     logger.info(
+                         f"Job completed. Fetching results for part {part_idx + 1}..."
+                     )
+                     output_data, log = self.manager.fetch_results(
+                         job_name=job_name, remove_cache=False
+                     )
+                     output_data = self.config.import_function(output_data)
+                     self._save_results(output_data, log, part_idx)
+                     logger.info(f"Fetched and saved results for part {part_idx + 1}.")
+                     finished_this_round.append(part_idx)
+                 elif status == "failed":
+                     attempt = self.part_attempts.get(part_idx, 0) + 1
+                     self.part_attempts[part_idx] = attempt
+                     if attempt <= self.config.max_retries:
+                         logger.info(
+                             f"Job {job_name} failed (attempt {attempt}). Retrying after short backoff..."
+                         )
+                         self.manager._clear_state(job_name)
+                         time.sleep(10)
+                         payload = self.parts[part_idx]
+                         new_job_name = (
+                             f"{self.job_name}_part_{part_idx + 1}_retry_{attempt}"
+                         )
+                         self.manager.start(payload, job_name=new_job_name)
+                         self.part_idx_to_job_name[part_idx] = new_job_name
+                     else:
+                         logger.info(
+                             f"Job {job_name} failed after {attempt - 1} retries. Marking as failed."
+                         )
+                         finished_this_round.append(part_idx)
+                 else:
+                     # Still running or queued
+                     continue
+             # Remove finished parts
+             for part_idx in finished_this_round:
+                 pending_parts.discard(part_idx)
+             if pending_parts:
+                 logger.info(
+                     f"Waiting {self.config.poll_interval_seconds}s before next status check for parts: {sorted(pending_parts)}"
+                 )
+                 time.sleep(self.config.poll_interval_seconds)
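The new `texttools/batch/batch_runner.py` above is configuration-driven: a `BatchConfig` points at a JSON input file of `{"id", "content"}` records, optional `export_function`/`import_function` hooks reshape data on the way in and out, and `run()` submits, polls, retries, and saves. A minimal usage sketch under those assumptions; the paths, prompt, and job name below are illustrative, not part of the package:

```python
# Hypothetical usage sketch for BatchJobRunner; paths, prompt, and job
# name are illustrative examples, not values shipped with the package.
from texttools.batch.batch_runner import BatchConfig, BatchJobRunner
from texttools.tools.internals.output_models import StrOutput

config = BatchConfig(
    system_prompt="Summarize the given text.",    # prompt applied to every item
    job_name="summarize_corpus",
    input_data_path="Data/corpus.json",           # JSON list of {"id", "content"} dicts
    output_data_filename="corpus_summaries.json",
    model="gpt-4.1-mini",
    poll_interval_seconds=30,
    max_retries=3,
)

# Requires OPENAI_API_KEY in the environment (loaded via dotenv).
runner = BatchJobRunner(config, output_model=StrOutput)
runner.run()  # submits all parts, polls until completed/failed, saves JSON results
```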
texttools/prompts/README.md
@@ -0,0 +1,35 @@
+ # Prompts
+
+ ## Overview
+ This folder contains YAML files for all prompts used in the project. Each file represents a separate prompt template, which can be loaded by tools or scripts that require structured prompts for AI models.
+
+ ---
+
+ ## Structure
+ - **prompt_file.yaml**: Each YAML file represents a single prompt template.
+ - **main_template**: The main instruction template for the model.
+ - **analyze_template** (optional): A secondary reasoning template used before generating the final response.
+ - **Modes** (optional): Some prompts may have multiple modes (e.g., `default`, `reason`) to allow different behaviors.
+
+ ### Example YAML Structure
+ ```yaml
+ main_template:
+   default: |
+     Your main instructions here with placeholders like {input}.
+   reason: |
+     Optional reasoning instructions here.
+
+ analyze_template:
+   default: |
+     Analyze and summarize the input.
+   reason: |
+     Optional detailed analysis template.
+ ```
+
+ ---
+
+ ## Guidelines
+ 1. **Naming**: Use descriptive names for each YAML file corresponding to the tool or task it serves.
+ 2. **Placeholders**: Use `{input}` or other relevant placeholders to dynamically inject data.
+ 3. **Modes**: If using modes, ensure both `main_template` and `analyze_template` contain the corresponding keys.
+ 4. **Consistency**: Keep formatting consistent across files for easier parsing by scripts.
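The loader that consumes these files (`texttools/tools/internals/prompt_loader.py`) is added in this release but its body is not shown in this diff. The sketch below only illustrates the mode-keyed layout the README describes, assuming PyYAML and the `merge_questions.yaml` file added later in this diff; it is not the package's actual loader:

```python
# Minimal sketch of reading a mode-keyed prompt template, assuming PyYAML.
# Illustrative only; the package's own prompt_loader.py is not shown here.
import yaml

with open("texttools/prompts/merge_questions.yaml", encoding="utf-8") as f:
    prompt = yaml.safe_load(f)

mode = "reason"  # or "default"
analysis_template = prompt["analyze_template"][mode]
main_template = prompt["main_template"][mode]

questions = "1. Who wrote this book? 2. Who is the book's author?"
print(analysis_template.format(input=questions))  # analysis pass first
print(main_template.format(input=questions))      # then the main instruction
```

Note that literal JSON braces in the templates are escaped as `{{ }}`, so `str.format` can inject `{input}` safely.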
texttools/prompts/categorizer.yaml (translated from Persian)
@@ -0,0 +1,28 @@
+ main_template: |
+   You are an expert in religious studies.
+   I will give you a text, and you must
+   classify that text into one of the following categories.
+   Categories:
+   "Religious beliefs",
+   "Islamic ethics",
+   "Rulings and jurisprudence",
+   "Islamic history and figures",
+   "Religious sources",
+   "Religion and society/politics",
+   "Mysticism and spirituality",
+   "None of the above",
+   Respond only in this JSON format:
+   {{
+   "reason": "<briefly state the reason for your choice>",
+   "result": "<one of the categories>"
+   }}
+   Here is the text to classify:
+   {input}
+
+ analyze_template: |
+   We want to classify the text that is provided.
+   To improve the classification, we need an analysis of the text.
+   Analyze the given text and write its main idea and a short analysis of it.
+   The analysis must be very concise,
+   at most 20 words.
+   {input}
texttools/prompts/extract_entities.yaml
@@ -0,0 +1,20 @@
+ main_template: |
+   You are a Named Entity Recognition (NER) extractor.
+   Identify and extract all named entities (e.g., PER, ORG, LOC, DAT, etc.) from the given text.
+   For each entity, provide its text and a clear type.
+   Respond only in JSON format:
+   {{
+     "result": [
+       {{
+         "text": "string",
+         "type": "string"
+       }}
+     ]
+   }}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   Read the following text and identify any proper nouns, key concepts, or specific mentions that might represent named entities.
+   Provide a brief, summarized analysis that could help in categorizing these entities.
+   {input}
texttools/prompts/extract_keywords.yaml
@@ -0,0 +1,18 @@
+ main_template: |
+   You are an expert keyword extractor.
+   Extract the most relevant keywords from the given text.
+   Guidelines:
+   - Keywords must represent the main concepts of the text.
+   - If two words have overlapping meanings, choose only one.
+   - Do not include generic or unrelated words.
+   - Keywords must be single, self-contained words (no phrases).
+   - Output between 3 and 7 keywords based on the input length.
+   - Respond only in JSON format:
+   {{"result": ["keyword1", "keyword2", etc.]}}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   Analyze the following text to identify its main topics, concepts, and important terms.
+   Provide a concise summary of your findings that will help in extracting relevant keywords.
+   {input}
texttools/prompts/is_question.yaml
@@ -0,0 +1,14 @@
+ main_template: |
+   You are a question detector.
+   Determine whether the given text contains a question or not.
+   Respond only in JSON format (the output should be a boolean):
+   {{"result": true/false}}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   We want to analyze this text snippet to see whether it contains a question or request of some kind.
+   Read the text, and reason about whether it is a request or not.
+   Give a summarized, short answer.
+   {input}
+
texttools/prompts/merge_questions.yaml
@@ -0,0 +1,46 @@
+ main_template:
+
+   default: |
+     You are a language expert.
+     I will give you a list of questions that are semantically similar.
+     Your task is to merge them into one unified question.
+     Guidelines:
+     - Preserve all the information and intent from the original questions.
+     - Make the merged question sound natural, fluent, and concise.
+     - Avoid redundancy or unnecessary repetition.
+     - Do not omit any unique idea from the originals.
+     - Respond only in JSON format:
+     {{"result": "string"}}
+     Here are the questions:
+     {input}
+
+   reason: |
+     You are an AI assistant helping to unify semantically similar questions.
+     First, briefly extract the unique intent or content from each input question.
+     Then, write one merged question that combines all their content clearly and naturally, without redundancy.
+     Step 1: Extract key ideas.
+     Step 2: Write the final merged question.
+     Respond only in JSON format:
+     {{"result": "string"}}
+     Here are the questions:
+     {input}
+
+ analyze_template:
+
+   default: |
+     You are a language expert.
+     Analyze the following questions to identify their core intent, key concepts,
+     and the specific information they are seeking.
+     Provide a brief, summarized understanding of the questions' meaning that
+     will help in merging and rephrasing them accurately without changing their intent.
+     Here are the questions:
+     {input}
+
+   reason: |
+     Analyze the following questions to identify their exact wording, phrasing,
+     and the literal meaning they convey.
+     Provide a brief, summarized analysis of their linguistic structure and current meaning,
+     which will then be used to create a new question containing all of their contents.
+     Here are the questions:
+     {input}
+
texttools/prompts/rewrite.yaml
@@ -0,0 +1,111 @@
+ main_template:
+
+   positive: |
+     You are an AI assistant designed to generate high-quality training data for semantic text embedding models.
+     Your task is to create a positive pair for a given "Anchor" text.
+
+     A high-quality positive pair consists of two sentences that are semantically equivalent or highly similar in meaning, but differ in wording, syntax, and sentence structure.
+     They should be paraphrases of each other.
+
+     Instructions:
+     - Preserve Core Meaning: The generated sentence must convey the same key information, intent, and context as the Anchor.
+     - Vary Lexicon: Use different words and phrases (synonyms, related terms).
+     - Vary Syntax: Change the sentence structure (e.g., active to passive voice, change clause order, combine or split sentences).
+     - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
+     - Avoid Minor Changes: Do not just add/remove a few words or swap names. Create a fundamentally different sentence.
+
+     Respond only in JSON format:
+     {{"result": "str"}}
+
+     Anchor Text:
+     "{input}"
+
+   negative: |
+     You are an AI assistant designed to generate high-quality training data for semantic text embedding models.
+     Your task is to create a negative sample for a given "Anchor" text.
+
+     A high-quality negative sample is a sentence that is semantically unrelated to the Anchor's specific question, while staying within the same general domain (religious topics).
+
+     Instructions:
+     - Stay in Domain: The sentence must be about the text's topics, but on a different subject.
+     - Ensure Clear Distinction: The topic should be clearly different from the anchor's specific focus.
+     - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
+
+     Respond only in JSON format:
+     {{"result": "str"}}
+
+     Anchor Text:
+     "{input}"
+
+   hard_negative: |
+     You are an AI assistant designed to generate high-quality training data for semantic text embedding models.
+     Your task is to create a hard-negative sample for a given "Anchor" text.
+
+     A high-quality hard-negative sample is a sentence that is topically related but semantically distinct from the Anchor.
+     It should share some context (e.g., same domain, same entities) but differ in a crucial piece of information, action, conclusion, or specific detail.
+
+     Instructions:
+     - Stay in General Domain: Remain in the same broad domain (e.g., religious topics), but choose a completely different subject matter.
+     - Maintain Topical Overlap: Keep the same domain, subject, or entities (e.g., people, products, concepts) as the Anchor.
+     - Alter a Key Semantic Element: Change a key word, condition, place, or proper name so that the meaning of the sentence is completely reversed.
+     - Avoid Being a Paraphrase: The sentence must NOT be semantically equivalent. The core factual claim or intent must be different.
+     - Make it Challenging: The difference should be subtle enough that it requires a deep understanding of the text to identify, not just a simple keyword mismatch.
+     - Maintain Similar Length: The generated sentence should be of roughly the same length and level of detail as the Anchor.
+
+     Respond only in JSON format:
+     {{"result": "str"}}
+
+     Anchor Text:
+     "{input}"
+
+
+ analyze_template:
+
+   positive: |
+     Analyze the following text to understand its CORE SEMANTIC MEANING for creating a high-quality POSITIVE sample.
+
+     Focus on:
+     - Core Intent: What is the fundamental question or statement being made?
+     - Key Entities/Concepts: What are the main subjects, objects, and concepts?
+     - Semantic Relationships: How are the entities related?
+     - Context & Domain: What is the broader context and domain?
+
+     Your analysis should capture the ESSENTIAL MEANING that must be preserved in any paraphrase.
+
+     Text:
+     {input}
+
+   negative: |
+     Analyze the following text to identify its SPECIFIC TOPIC and DOMAIN for creating a high-quality NEGATIVE sample.
+
+     Focus on:
+     - Specific Topic: What exact subject is this text about?
+     - Domain Context: What broader domain does this belong to?
+     - Key Elements to AVOID: What concepts, entities, or phrases must NOT appear in the negative sample?
+     - Alternative Topics: What are related but DISTINCT topics within the same domain?
+
+     The goal is to find topics that are in the same domain but semantically unrelated to this specific text.
+
+     Text:
+     {input}
+
+   hard_negative: |
+     Analyze this text to identify EXACTLY ONE ELEMENT that can be changed to create a hard-negative sample.
+
+     CRITICAL: The hard-negative must keep the SAME TOPIC and MOST WORDS identical.
+
+     Identify ONE change from these options:
+     - Change a quantity/order word (first→last, one→many)
+     - Change a key location/entity to a related one (paradise→hell, heaven→earth)
+     - Change the question focus slightly (who→what, what→how)
+     - Change a key action verb to a related action (enter→exit, give→take)
+
+     PRESERVE:
+     - Main topic and subject
+     - Sentence structure
+     - 80-90% of the vocabulary
+
+     Text:
+     {input}
+
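The three modes above correspond to the anchor/positive/negative (and hard-negative) samples commonly used to train embedding models with contrastive objectives. A purely illustrative example of the record shape this prompt family is meant to produce; the sentences are invented, not package output:

```python
# Illustrative anchor/positive/negative triplet for embedding-model training.
# The sentences are made-up examples in the same religious domain the prompts target.
triplet = {
    "anchor": "Is fasting obligatory during the month of Ramadan?",
    "positive": "Does Ramadan require one to fast?",             # paraphrase: same meaning, new wording
    "negative": "What are the rules for distributing zakat?",    # same domain, unrelated subject
    "hard_negative": "Is fasting obligatory during the month of Shawwal?",  # one key element changed
}
```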
texttools/prompts/run_custom.yaml
@@ -0,0 +1,7 @@
+ main_template: |
+   {input}
+   Respond only in JSON format:
+   {output_model_str}
+
+ analyze_template: |
+
texttools/prompts/subject_to_question.yaml
@@ -0,0 +1,22 @@
+ main_template: |
+   You are a generator of questions from subjects.
+   Given the following subject, generate {number_of_questions} appropriate questions that this subject would directly answer.
+   Each generated question should be independently meaningful,
+   and it must not use pronouns such as this, that, he, or she in the question.
+   There is a `reason` key; fill it with a summarized version of your thoughts.
+   The `reason` must be less than 20 words.
+   Don't forget to fill in the reason.
+   Respond only in JSON format:
+   {{"result": ["question1", "question2", ...], "reason": "string"}}
+   Here is the text:
+   {input}
+
+ analyze_template: |
+   Our goal is to generate questions from the given subject.
+   The questions must be meaningful; some of them should be specific and some should be general.
+   But first, in this step we want to analyze the subject we were asked to generate questions for.
+   We need a summarized analysis of the subject.
+   What is the subject about?
+   What points of view can we consider and generate questions from? (Questions that real users might have.)
+   Here is the subject:
+   {input}
texttools/prompts/summarize.yaml
@@ -0,0 +1,14 @@
+ main_template: |
+   You are a summarizer.
+   You must summarize the given text, preserving its meaning.
+   Respond only in JSON format:
+   {{"result": "string"}}
+   Provide a concise summary of the following text:
+   {input}
+
+ analyze_template: |
+   Read the following text and identify its main points, key arguments, and overall purpose.
+   Provide a brief, summarized analysis that will help in generating an accurate and concise summary.
+   {input}
+
texttools/prompts/text_to_question.yaml
@@ -0,0 +1,20 @@
+ main_template: |
+   You are a question generator.
+   Given the following answer, generate one
+   appropriate question that this answer would directly respond to.
+   The generated question should be independently meaningful,
+   and must not use pronouns such as this, that, he, or she in the question.
+   Respond only in JSON format:
+   {{"result": "string"}}
+   Here is the answer:
+   {input}
+
+ analyze_template: |
+   Analyze the following answer to identify its key facts,
+   main subject, and what kind of information it provides.
+   Provide a brief, summarized understanding of the answer's content that will
+   help in formulating a relevant and direct question.
+   Mention only the key points that were provided in the answer.
+   Here is the answer:
+   {input}
+
texttools/prompts/translate.yaml
@@ -0,0 +1,15 @@
+ main_template: |
+   You are a {target_language} translator.
+   Output only the translated text.
+   Respond only in JSON format:
+   {{"result": "string"}}
+   Don't translate proper names; only transliterate them into {target_language}.
+   Translate the following text to {target_language}:
+   {input}
+
+ analyze_template: |
+   Analyze the following text and identify important linguistic considerations for translation.
+   Point out any idioms, cultural references, or complex structures that need special attention.
+   Also, list all proper nouns that should not be translated. Write your analysis in {target_language}.
+   {input}
+
texttools/tools/__init__.py
@@ -1,33 +1,4 @@
- from .categorizer import EmbeddingCategorizer, GemmaCategorizer, LLMCategorizer
- from .keyword_extractor import GemmaKeywordExtractor
- from .ner import GemmaNERExtractor
- from .question_detector import GemmaQuestionDetector, LLMQuestionDetector
- from .question_generator import GemmaQuestionGenerator
- from .reranker import GemmaReranker, GemmaScorer, GemmaSorter
- from .rewriter import GemmaQuestionRewriter, RewriteMode
- from .merger import GemmaQuestionMerger, MergingMode
- from .subject_to_question import GemmaQuestionGeneratorFromSubject
- from .summarizer import GemmaSummarizer, LLMSummarizer
- from .translator import GemmaTranslator
-
- __all__ = [
-     "EmbeddingCategorizer",
-     "GemmaCategorizer",
-     "LLMCategorizer",
-     "GemmaTranslator",
-     "GemmaSummarizer",
-     "LLMSummarizer",
-     "GemmaNERExtractor",
-     "GemmaQuestionDetector",
-     "LLMQuestionDetector",
-     "GemmaQuestionGenerator",
-     "GemmaScorer",
-     "GemmaSorter",
-     "GemmaReranker",
-     "GemmaQuestionRewriter",
-     "RewriteMode",
-     "GemmaKeywordExtractor",
-     "GemmaQuestionGeneratorFromSubject",
-     "GemmaQuestionMerger",
-     "MergingMode",
- ]
+ from .async_the_tool import AsyncTheTool
+ from .the_tool import TheTool
+
+ __all__ = ["TheTool", "AsyncTheTool"]