hamtaa-texttools 0.1.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hamtaa-texttools might be problematic.

Files changed (60)
  1. hamtaa_texttools-0.1.43.dist-info/METADATA +60 -0
  2. hamtaa_texttools-0.1.43.dist-info/RECORD +60 -0
  3. hamtaa_texttools-0.1.43.dist-info/WHEEL +5 -0
  4. hamtaa_texttools-0.1.43.dist-info/top_level.txt +1 -0
  5. texttools/__init__.py +26 -0
  6. texttools/base/__init__.py +3 -0
  7. texttools/base/base_categorizer.py +40 -0
  8. texttools/base/base_keyword_extractor.py +35 -0
  9. texttools/base/base_ner_extractor.py +61 -0
  10. texttools/base/base_question_detector.py +35 -0
  11. texttools/base/base_question_generator.py +99 -0
  12. texttools/base/base_question_merger.py +59 -0
  13. texttools/base/base_question_rewriter.py +61 -0
  14. texttools/base/base_router.py +33 -0
  15. texttools/base/base_summarizer.py +55 -0
  16. texttools/base/base_task_performer.py +53 -0
  17. texttools/base/base_translator.py +38 -0
  18. texttools/batch_manager/__init__.py +2 -0
  19. texttools/batch_manager/batch_manager.py +241 -0
  20. texttools/batch_manager/batch_runner.py +207 -0
  21. texttools/formatter/__init__.py +1 -0
  22. texttools/formatter/base.py +26 -0
  23. texttools/formatter/gemma3_formatter.py +51 -0
  24. texttools/handlers/__init__.py +6 -0
  25. texttools/handlers/categorizer/__init__.py +6 -0
  26. texttools/handlers/categorizer/categorizer.py +61 -0
  27. texttools/handlers/handlers.py +88 -0
  28. texttools/tools/__init__.py +33 -0
  29. texttools/tools/categorizer/__init__.py +2 -0
  30. texttools/tools/categorizer/encoder_model/__init__.py +1 -0
  31. texttools/tools/categorizer/encoder_model/encoder_vectorizer.py +51 -0
  32. texttools/tools/categorizer/llm/__init__.py +2 -0
  33. texttools/tools/categorizer/llm/gemma_categorizer.py +169 -0
  34. texttools/tools/categorizer/llm/openai_categorizer.py +80 -0
  35. texttools/tools/keyword_extractor/__init__.py +1 -0
  36. texttools/tools/keyword_extractor/gemma_extractor.py +138 -0
  37. texttools/tools/merger/__init__.py +2 -0
  38. texttools/tools/merger/gemma_question_merger.py +214 -0
  39. texttools/tools/ner/__init__.py +1 -0
  40. texttools/tools/ner/gemma_ner_extractor.py +157 -0
  41. texttools/tools/question_detector/__init__.py +2 -0
  42. texttools/tools/question_detector/gemma_detector.py +130 -0
  43. texttools/tools/question_detector/llm_detector.py +112 -0
  44. texttools/tools/question_generator/__init__.py +1 -0
  45. texttools/tools/question_generator/gemma_question_generator.py +198 -0
  46. texttools/tools/reranker/__init__.py +3 -0
  47. texttools/tools/reranker/reranker.py +137 -0
  48. texttools/tools/reranker/scorer.py +216 -0
  49. texttools/tools/reranker/sorter.py +278 -0
  50. texttools/tools/rewriter/__init__.py +2 -0
  51. texttools/tools/rewriter/gemma_question_rewriter.py +213 -0
  52. texttools/tools/router/__init__.py +0 -0
  53. texttools/tools/router/gemma_router.py +169 -0
  54. texttools/tools/subject_to_question/__init__.py +1 -0
  55. texttools/tools/subject_to_question/gemma_question_generator.py +224 -0
  56. texttools/tools/summarizer/__init__.py +2 -0
  57. texttools/tools/summarizer/gemma_summarizer.py +140 -0
  58. texttools/tools/summarizer/llm_summerizer.py +108 -0
  59. texttools/tools/translator/__init__.py +1 -0
  60. texttools/tools/translator/gemma_translator.py +202 -0
texttools/tools/question_generator/gemma_question_generator.py
@@ -0,0 +1,198 @@
+from typing import Any, Optional
+
+from openai import OpenAI
+
+from texttools.base.base_question_generator import BaseQuestionGenerator
+from texttools.formatter import Gemma3Formatter
+
+# class QuestionGeneration(BaseModel):
+#     generated_question: str
+
+
+class GemmaQuestionGenerator(BaseQuestionGenerator):
+    """
+    Question generator for Gemma-style models with an optional reasoning step.
+    Outputs JSON with a single string field: {"generated_question": "..."}.
+
+    Allows optional extra instructions via `prompt_template`.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        chat_formatter: Optional[Any] = None,
+        use_reason: bool = False,
+        temperature: float = 0.0,
+        prompt_template: Optional[str] = None,
+        handlers: Optional[list[Any]] = None,
+        **client_kwargs: Any,
+    ):
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+
+        self.chat_formatter = chat_formatter or Gemma3Formatter()
+
+        self.use_reason = use_reason
+        self.prompt_template = prompt_template
+
+        # Define the JSON schema for the generated question output
+        self.json_schema = {"generated_question": "string"}
+
+    def _build_messages(
+        self, answer: str, reason: Optional[str] = None
+    ) -> list[dict[str, str]]:
+        """
+        Builds the message list for the LLM API call for question generation.
+        """
+        clean_answer = self.preprocess(answer)
+        messages: list[dict[str, str]] = []
+
+        if self.prompt_template:
+            messages.append({"role": "user", "content": self.prompt_template})
+
+        if reason:
+            messages.append(
+                {
+                    "role": "user",
+                    "content": f"Based on this analysis of the answer: {reason}",
+                }
+            )
+
+        messages.append(
+            {
+                "role": "user",
+                "content": """Given the following answer, generate a single,
+                appropriate question that this answer would directly respond to.
+                The generated question should be independently meaningful and must not
+                rely on pronouns such as "this", "that", "he", or "she".
+                **The generated question must be in the language of the user's input.**
+                """,
+            }
+        )
+        messages.append(
+            {"role": "user", "content": f"Here is the text: {clean_answer}"}
+        )
+
+        # Ensure the schema is dumped as a valid JSON string for the LLM
+        # schema_instr = f"Respond only in JSON format: {json.dumps(self.json_schema)}"
+        messages.append(
+            {
+                "role": "user",
+                "content": """
+                Respond only with the newly generated question, without any additional information.
+                **The generated question must be in the language of the user's input.**
+                """,
+            }
+        )
+
+        # messages.append(
+        #     {"role": "assistant", "content": "{\n"}
+        # )  # Hint to start JSON output
+        # In this new version we use the parse function of the openai library.
+
+        # This call restructures the messages based on the provided formatter;
+        # some models require custom settings.
+        restructured = self.chat_formatter.format(messages=messages)
+
+        return restructured
+
+    def _reason(self, answer: str) -> str:
+        """
+        Internal reasoning step to help the model understand the core information
+        and implications of the answer.
+        """
+        messages = [
+            {
+                "role": "user",
+                "content": """
+                Analyze the following answer to identify its key facts,
+                main subject, and what kind of information it provides.
+                Provide a brief, summarized understanding of the answer's content that will
+                help in formulating a relevant and direct question.
+
+                Provide the summary in the language of the content.
+                Mention only the key points given in the answer.
+                """,
+            },
+            {
+                "role": "user",
+                "content": f"""
+                Here is the content:
+
+                {answer}
+
+                Respond only in the language of the content.
+                """,
+            },
+        ]
+
+        restructured = self.chat_formatter.format(messages=messages)
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=restructured,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+
+        reason_summary = resp.choices[0].message.content.strip()
+        return reason_summary
+
+    def generate_question(self, answer: str) -> str:
+        """
+        Generates a question for the input `answer`.
+        Optionally uses an internal reasoning step for better accuracy.
+        """
+        reason_summary = None
+        if self.use_reason:
+            reason_summary = self._reason(answer)
+
+        messages = self._build_messages(answer, reason_summary)
+
+        # Structured output is deprecated for tasks where both the input and the
+        # output are plain strings, as we noticed a large drop in output quality.
+        #
+        # completion = self.client.beta.chat.completions.parse(
+        #     model=self.model,
+        #     messages=messages,
+        #     response_format=QuestionGeneration,
+        #     temperature=self.temperature,
+        #     extra_body=dict(guided_decoding_backend="outlines"),
+        #     **self.client_kwargs,
+        # )
+        # message = completion.choices[0].message
+        # if message.parsed:
+        #     result = message.parsed.generated_question
+        # else:
+        #     raise ValueError(f"Failed to parse the response. Raw content: {message.content}")
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+
+        result = resp.choices[0].message.content.strip()
+
+        # Dispatch and return
+        self._dispatch(
+            {
+                "original_answer": answer,
+                "generated_question": result,
+            }
+        )
+        return result
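
A minimal usage sketch of the GemmaQuestionGenerator defined above (not part of the package diff): the import path follows the file layout in the changed-files list, while the base URL, API key, and model name are illustrative assumptions for an OpenAI-compatible endpoint.

from openai import OpenAI

from texttools.tools.question_generator.gemma_question_generator import GemmaQuestionGenerator

# Hypothetical local OpenAI-compatible server and model identifier; adjust to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

generator = GemmaQuestionGenerator(
    client=client,
    model="gemma-3-12b-it",  # assumed model name
    use_reason=True,         # run the extra analysis pass before generating
)

print(generator.generate_question("The Eiffel Tower is 330 metres tall."))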
texttools/tools/reranker/__init__.py
@@ -0,0 +1,3 @@
+from .reranker import GemmaReranker
+from .scorer import GemmaScorer
+from .sorter import GemmaSorter
texttools/tools/reranker/reranker.py
@@ -0,0 +1,137 @@
+import logging
+from typing import Any, Optional
+
+from openai import OpenAI
+
+from texttools.base.base_task_performer import BaseTaskPerformer
+
+from .scorer import GemmaScorer
+from .sorter import GemmaSorter
+
+
+class GemmaReranker(BaseTaskPerformer):
+    """
+    A reranker component that orchestrates a GemmaScorer and a GemmaSorter
+    to refine the order of a list of search results based on a query.
+    It first scores individual results and then sorts them.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        scorer_prompt_template: Optional[str] = None,
+        sorter_prompt_template: Optional[str] = None,
+        scorer_use_reason: bool = False,
+        sorter_use_reason: bool = False,
+        temperature: float = 0.0,
+        handlers: Optional[list[Any]] = None,
+        **client_kwargs: Any,
+    ):
+        """
+        Initializes the GemmaReranker with configurations for its internal scorer and sorter.
+
+        :param client: An initialized OpenAI client (or compatible).
+        :param model: The LLM model to use for both scoring and sorting (e.g., "gemma-7b-it").
+        :param scorer_prompt_template: Optional initial system-level prompt for the scorer.
+        :param sorter_prompt_template: Optional initial system-level prompt for the sorter.
+        :param scorer_use_reason: If True, the internal scorer will use an internal reasoning step.
+        :param sorter_use_reason: If True, the internal sorter will use an internal reasoning step.
+        :param temperature: The sampling temperature for LLM generation (0.0 for deterministic).
+        :param handlers: Optional list of handlers for dispatching reranking results.
+        :param client_kwargs: Additional keyword arguments for the OpenAI client.
+        """
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+
+        # Initialize the internal scorer and sorter components
+        self.scorer = GemmaScorer(
+            client=self.client,
+            model=self.model,
+            temperature=self.temperature,
+            prompt_template=scorer_prompt_template,
+            use_reason=scorer_use_reason,
+            handlers=[],
+            **self.client_kwargs,
+        )
+        self.sorter = GemmaSorter(
+            client=self.client,
+            model=self.model,
+            temperature=self.temperature,
+            prompt_template=sorter_prompt_template,
+            use_reason=sorter_use_reason,
+            handlers=[],
+            **self.client_kwargs,
+        )
+
+    def perform(
+        self, query: str, results: list[dict[str, Any]]
+    ) -> list[dict[str, Any]]:
+        """
+        Performs the complete reranking operation: scoring each result and then sorting them.
+
+        :param query: The original search query.
+        :param results: A list of result dictionaries to be reranked.
+                        Each dictionary should ideally have a 'text' key.
+        :return: A list of result dictionaries, reranked by relevance.
+                 Each dictionary will have an added 'score' key.
+                 An empty input list returns an empty list.
+        :raises ValueError: If a sub-component fails irrecoverably.
+        """
+        if not results:
+            logging.info(
+                "Received empty list of results for reranking. Returning empty list."
+            )
+            self._dispatch(
+                {"query": query, "original_results": results, "reranked_results": []}
+            )
+            return []
+
+        scored_results: list[dict[str, Any]] = []
+
+        # Step 1: Score each individual result
+        for i, res in enumerate(results):
+            # Create a unique internal ID if not already present
+            _internal_id = res.get("id", f"rerank_id_{i}")
+
+            result_text = res.get("text")
+            if not result_text:
+                logging.warning(
+                    f"Result with ID '{_internal_id}' has no 'text' key. Skipping scoring."
+                )
+                score = 0  # Default score if no text
+            else:
+                try:
+                    score = self.scorer.perform(query, result_text)
+                except Exception as e:
+                    logging.error(
+                        f"Scorer failed for result ID '{_internal_id}': {e}",
+                        exc_info=True,
+                    )
+                    score = 0  # Default score on scorer failure
+
+            # Copy the original result and add the score and internal ID,
+            # since the sorter relies on both fields.
+            scored_res_copy = res.copy()
+            scored_res_copy["score"] = score
+            scored_res_copy["_internal_id"] = _internal_id  # Sorter expects this
+            scored_results.append(scored_res_copy)
+
+        # Step 2: Sort the scored results.
+        # The sorter's perform method expects a list of dictionaries with '_internal_id' and 'score'.
+        reranked_results = self.sorter.perform(query, scored_results)
+
+        # Dispatch the final reranked results
+        self._dispatch(
+            {
+                "query": query,
+                "original_results": results,
+                "scored_results": scored_results,  # Includes internal IDs for debugging if needed
+                "reranked_results": reranked_results,  # '_internal_id' is removed by the sorter
+            }
+        )
+
+        return reranked_results
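
A sketch of how the reranker above might be driven end to end. The import via the package's reranker __init__ is confirmed by the diff; the endpoint settings, model name, and sample results are assumptions, with each result dict carrying the 'text' key that perform() looks up.

from openai import OpenAI

from texttools.tools.reranker import GemmaReranker

# Hypothetical OpenAI-compatible endpoint and model identifier.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
reranker = GemmaReranker(client=client, model="gemma-3-12b-it", scorer_use_reason=True)

results = [
    {"id": "a", "text": "The Eiffel Tower is located in Paris."},
    {"id": "b", "text": "Bananas are rich in potassium."},
]
reranked = reranker.perform("Where is the Eiffel Tower?", results)
for r in reranked:
    print(r["score"], r["text"])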
texttools/tools/reranker/scorer.py
@@ -0,0 +1,216 @@
+import json
+import logging
+from typing import Any, Optional
+
+from openai import OpenAI
+
+from texttools.base.base_task_performer import BaseTaskPerformer
+
+
+class GemmaScorer(BaseTaskPerformer):
+    """
+    A scorer component utilizing Gemma-style LLMs to evaluate the relevance of
+    individual text results against a given query. It assigns a score from 0-5.
+    Can optionally include a reasoning step for each result to enhance accuracy.
+    """
+
+    def __init__(
+        self,
+        client: OpenAI,
+        *,
+        model: str,
+        temperature: float = 0.0,
+        prompt_template: Optional[str] = None,
+        use_reason: bool = False,
+        handlers: Optional[list[Any]] = None,
+        **client_kwargs: Any,
+    ):
+        """
+        Initializes the GemmaScorer.
+
+        :param client: An initialized OpenAI client (or compatible).
+        :param model: The name of the LLM model to use for scoring (e.g., "gemma-7b-it").
+        :param temperature: The sampling temperature for LLM generation (0.0 for deterministic).
+        :param prompt_template: An optional initial system-level prompt for the LLM.
+        :param use_reason: If True, the scorer will perform an internal reasoning step
+                           for each result and include it in the scoring prompt.
+        :param handlers: Optional list of handlers for dispatching scoring results.
+        :param client_kwargs: Additional keyword arguments for the OpenAI client.
+        """
+        super().__init__(handlers)
+        self.client = client
+        self.model = model
+        self.temperature = temperature
+        self.client_kwargs = client_kwargs
+        self.prompt_template = prompt_template
+        self.use_reason = use_reason
+
+        # Defines the expected JSON schema for the LLM's score output.
+        self.score_schema = {"score": "integer"}
+
+    def _build_messages(
+        self, query: str, result_text: str, reason: Optional[str] = None
+    ) -> list[dict[str, str]]:
+        """
+        Constructs the messages payload for the LLM API call to score a single result.
+        Includes an optional 'reason' parameter.
+
+        :param query: The search query.
+        :param result_text: The text content of the result to be scored.
+        :param reason: An optional reasoning summary generated internally for this specific result.
+        :return: A list of message dictionaries formatted for the LLM API.
+        """
+        clean_query = self._preprocess(query)
+        clean_result_text = self._preprocess(result_text)
+
+        messages: list[dict[str, str]] = []
+
+        if self.prompt_template:
+            messages.append({"role": "user", "content": self.prompt_template})
+
+        # Conditionally add the reason to the prompt
+        if self.use_reason and reason:
+            messages.append(
+                {
+                    "role": "user",
+                    "content": f"Consider this preliminary analysis for scoring: {reason}",
+                }
+            )
+
+        messages.append({"role": "user", "content": f"Query: {clean_query}"})
+        messages.append(
+            {"role": "user", "content": f"Result text to score: {clean_result_text}"}
+        )
+
+        scoring_instruction = (
+            "Score this result from 0 (not relevant) to 5 (highly relevant) based on how well it matches the query. "
+            "Return only the score as a JSON object with a 'score' key."
+        )
+        messages.append({"role": "user", "content": scoring_instruction})
+
+        schema_instr = f"Respond only in JSON format: {json.dumps(self.score_schema)}"
+        messages.append({"role": "user", "content": schema_instr})
+
+        messages.append({"role": "assistant", "content": "{"})
+        return messages
+
+    def _reason(self, query: str, result_text: str) -> str:
+        """
+        Generates a brief reasoning summary for a single result's relevance to the query.
+        This summary is intended to provide additional context to the LLM for scoring.
+
+        :param query: The search query.
+        :param result_text: The text content of the result being analyzed.
+        :return: A string containing the reasoning summary.
+        """
+        clean_query = self._preprocess(query)
+        clean_result_text = self._preprocess(result_text)
+
+        # Truncate long result texts for the reasoning prompt
+        display_result_text = clean_result_text
+        if len(clean_result_text) > 200:
+            display_result_text = clean_result_text[:200] + "..."
+
+        reason_prompt = f"""
+        Analyze the relevance of the following result text to the given query.
+        Focus on key terms, concepts, and overall intent.
+        Query: "{clean_query}"
+        Result Text: "{display_result_text}"
+
+        Provide a brief summary of why this result might or might not be relevant to the query.
+        """
+
+        messages = [
+            {"role": "user", "content": reason_prompt},
+        ]
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        return resp.choices[0].message.content.strip()
+
+    def perform(self, query: str, result_text: str) -> int:
+        """
+        Scores a single result's text against a given query using the configured LLM.
+        This is the main public method for the scorer.
+
+        :param query: The search query string.
+        :param result_text: The text content of the result to be scored.
+        :return: An integer score (0-5) representing relevance; 0 is returned for empty
+                 input, malformed LLM output, or an out-of-range score.
+        """
+        if not result_text:
+            logging.warning("Received empty result text for scoring.")
+            self._dispatch(
+                {
+                    "query": query,
+                    "result_text": result_text,
+                    "score": 0,
+                    "error": "Empty result text",
+                }
+            )
+            return 0
+
+        # Generate reason if enabled
+        reason_for_scoring = None
+        if self.use_reason:
+            reason_for_scoring = self._reason(query, result_text)
+
+        messages = self._build_messages(query, result_text, reason_for_scoring)
+
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=self.temperature,
+            **self.client_kwargs,
+        )
+        raw_score_output = resp.choices[0].message.content.strip()
+
+        # The assistant turn was primed with "{", so the completion may omit the
+        # opening brace; restore it before parsing.
+        if not raw_score_output.startswith("{"):
+            raw_score_output = "{" + raw_score_output
+        try:
+            parsed_score = json.loads(raw_score_output)
+        except json.JSONDecodeError as e:
+            logging.error(
+                f"Failed to parse JSON for single result scoring: {e}\nRaw output: {raw_score_output}"
+            )
+            self._dispatch(
+                {
+                    "query": query,
+                    "result_text": result_text,
+                    "score": 0,
+                    "error": f"JSON parsing failed: {e}",
+                }
+            )
+            return 0
+
+        score = parsed_score.get("score")
+        if not isinstance(score, (int, float)) or not (0 <= score <= 5):
+            logging.warning(
+                f"LLM returned invalid score for result text '{result_text[:50]}...': {score}. "
+                f"Expected integer 0-5. Raw: {raw_score_output}"
+            )
+            self._dispatch(
+                {
+                    "query": query,
+                    "result_text": result_text,
+                    "score": 0,
+                    "warning": "Invalid score format",
+                }
+            )
+            return 0
+
+        final_score = int(score)
+        self._dispatch(
+            {
+                "query": query,
+                "result_text": result_text,
+                "score": final_score,
+                "reason": reason_for_scoring,
+            }
+        )
+        return final_score
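
The scorer primes the final assistant turn with "{" and then restores that brace before parsing, so the model effectively continues a JSON object it has already "started". A small illustration of that parse path, using a made-up completion string:

import json

raw_completion = '"score": 4}'  # hypothetical continuation returned by the model
if not raw_completion.startswith("{"):  # same guard used in GemmaScorer.perform()
    raw_completion = "{" + raw_completion

print(json.loads(raw_completion)["score"])  # -> 4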