hamtaa-texttools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/METADATA +15 -34
- hamtaa_texttools-1.1.21.dist-info/RECORD +32 -0
- texttools/batch/batch_config.py +14 -1
- texttools/batch/{internals/batch_manager.py → batch_manager.py} +6 -6
- texttools/batch/batch_runner.py +7 -7
- texttools/internals/async_operator.py +48 -84
- texttools/internals/models.py +73 -113
- texttools/internals/operator_utils.py +2 -2
- texttools/internals/prompt_loader.py +3 -20
- texttools/internals/sync_operator.py +47 -83
- texttools/internals/text_to_chunks.py +97 -0
- texttools/prompts/README.md +2 -2
- texttools/prompts/categorize.yaml +35 -77
- texttools/prompts/check_fact.yaml +2 -2
- texttools/prompts/extract_entities.yaml +3 -3
- texttools/prompts/extract_keywords.yaml +6 -6
- texttools/prompts/is_question.yaml +2 -2
- texttools/prompts/merge_questions.yaml +4 -4
- texttools/prompts/propositionize.yaml +2 -2
- texttools/prompts/rewrite.yaml +6 -6
- texttools/prompts/run_custom.yaml +1 -1
- texttools/prompts/subject_to_question.yaml +2 -2
- texttools/prompts/summarize.yaml +2 -2
- texttools/prompts/text_to_question.yaml +8 -6
- texttools/prompts/translate.yaml +2 -2
- texttools/tools/async_tools.py +497 -519
- texttools/tools/sync_tools.py +498 -520
- hamtaa_texttools-1.1.19.dist-info/RECORD +0 -33
- texttools/batch/internals/utils.py +0 -16
- texttools/internals/formatters.py +0 -24
- {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/WHEEL +0 -0
- {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/licenses/LICENSE +0 -0
- {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/top_level.txt +0 -0
texttools/internals/models.py
CHANGED
@@ -1,44 +1,58 @@
+from __future__ import annotations
+
 from datetime import datetime
-from typing import Type, Any
+from typing import Type, Literal, Any
 
 from pydantic import BaseModel, Field, create_model
 
 
+class ToolOutputMetadata(BaseModel):
+    tool_name: str
+    processed_at: datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    execution_time: float | None = None
+
+
 class ToolOutput(BaseModel):
     result: Any = None
-
-
-    process: str | None = None
-    processed_at: datetime = datetime.now()
-    execution_time: float | None = None
+    analysis: str | None = None
+    logprobs: list[dict[str, Any]] | None = None
     errors: list[str] = []
+    metadata: ToolOutputMetadata | None = None
 
     def __repr__(self) -> str:
-        return f"""
-        ToolOutput(process='{self.process}', result_type='{type(self.result)}',
-        result='{self.result}', analysis='{self.analysis}',
-        logprobs='{self.logprobs}', errors='{self.errors}',
-        processed_at='{self.processed_at}', execution_time='{self.execution_time}'
-        """
+        base = f"""ToolOutput(result='{self.result}', result_type='{type(self.result)}', analysis='{self.analysis}', logprobs='{self.logprobs}', errors='{self.errors}'"""
 
+        if self.metadata:
+            base += f""", tool_name='{self.metadata.tool_name}',
+            processed_at='{self.metadata.processed_at}', execution_time='{self.metadata.execution_time}'
+            """
 
-class StrOutput(BaseModel):
+        return base
+
+
+class OperatorOutput(BaseModel):
+    result: Any
+    analysis: str | None
+    logprobs: list[dict[str, Any]] | None
+
+
+class Str(BaseModel):
     result: str = Field(..., description="The output string", example="text")
 
 
-class BoolOutput(BaseModel):
+class Bool(BaseModel):
     result: bool = Field(
         ..., description="Boolean indicating the output state", example=True
     )
 
 
-class ListStrOutput(BaseModel):
+class ListStr(BaseModel):
     result: list[str] = Field(
         ..., description="The output list of strings", example=["text_1", "text_2"]
     )
 
 
-class ListDictStrStrOutput(BaseModel):
+class ListDictStrStr(BaseModel):
     result: list[dict[str, str]] = Field(
         ...,
         description="List of dictionaries containing string key-value pairs",
@@ -46,121 +60,76 @@ class ListDictStrStrOutput(BaseModel):
     )
 
 
-class ReasonListStrOutput(BaseModel):
+class ReasonListStr(BaseModel):
     reason: str = Field(..., description="Thinking process that led to the output")
     result: list[str] = Field(
         ..., description="The output list of strings", example=["text_1", "text_2"]
     )
 
 
-class Node
-
-
-
-
-
+class Node:
+    def __init__(self, name: str, description: str, level: int, parent: Node | None):
+        self.name = name
+        self.description = description
+        self.level = level
+        self.parent = parent
+        self.children = {}
 
 
 class CategoryTree:
-    def __init__(self
-        self._root = Node(
-
-
-
-        self._new_id = 1
-
-    def get_all_nodes(self) -> list[Node]:
+    def __init__(self):
+        self._root = Node(name="root", description="root", level=0, parent=None)
+        self._all_nodes = {"root": self._root}
+
+    def get_all_nodes(self) -> dict[str, Node]:
         return self._all_nodes
 
     def get_level_count(self) -> int:
-        return max(
-
-    def get_node(self,
-        if isinstance(identifier, str):
-            for node in self.get_all_nodes():
-                if node.name == identifier:
-                    return node
-            return None
-        elif isinstance(identifier, int):
-            for node in self.get_all_nodes():
-                if node.node_id == identifier:
-                    return node
-            return None
-        else:
-            return None
-
-    def get_children(self, parent_node: Node) -> list[Node] | None:
-        children = [
-            node
-            for node in self.get_all_nodes()
-            if parent_node.node_id == node.parent_id
-        ]
-        return children if children else None
+        return max(node.level for node in self._all_nodes.values())
+
+    def get_node(self, name: str) -> Node | None:
+        return self._all_nodes.get(name)
 
     def add_node(
         self,
-        node_name: str,
-        parent_name: str
+        name: str,
+        parent_name: str,
         description: str | None = None,
     ) -> None:
-        if self.get_node(
-            raise ValueError(f"{
-
-
-
-
-
-            parent_id = parent_node.node_id
-            level = parent_node.level + 1
-        else:
-            level = 1
-            parent_id = 0
+        if self.get_node(name):
+            raise ValueError(f"Cannot add {name} category twice")
+
+        parent = self.get_node(parent_name)
+
+        if not parent:
+            raise ValueError(f"Parent category '{parent_name}' not found")
 
         node_data = {
-            "node_id": self._new_id,
-            "name": node_name,
-            "level": level,
-            "parent_id": parent_id,
+            "name": name,
             "description": description if description else "No description provided",
+            "level": parent.level + 1,
+            "parent": parent,
         }
 
-
-
-
-    def remove_node(self, identifier: int | str) -> None:
-        node = self.get_node(identifier)
-
-        if node:
-            # Remove node's children recursively
-            children = self.get_children(node)
+        new_node = Node(**node_data)
+        parent.children[name] = new_node
+        self._all_nodes[name] = new_node
 
-
-
-
+    def remove_node(self, name: str) -> None:
+        if name == "root":
+            raise ValueError("Cannot remove the root node")
 
-
-
+        node = self.get_node(name)
+        if not node:
+            raise ValueError(f"Category: '{name}' not found")
 
-
-
-            raise ValueError(f"Node with identifier: '{identifier}' not found.")
+        for child_name in list(node.children.keys()):
+            self.remove_node(child_name)
 
-
-
-        children = [
-            build_dict(child)
-            for child in self._all_nodes
-            if child.parent_id == node.node_id
-        ]
-        return {
-            "node_id": node.node_id,
-            "name": node.name,
-            "level": node.level,
-            "parent_id": node.parent_id,
-            "children": children,
-        }
+        if node.parent:
+            del node.parent.children[name]
 
-
+        del self._all_nodes[name]
 
 
 # This function is needed to create CategorizerOutput with dynamic categories
@@ -179,12 +148,3 @@ def create_dynamic_model(allowed_values: list[str]) -> Type[BaseModel]:
     )
 
     return CategorizerOutput
-
-
-class Entity(BaseModel):
-    text: str = Field(description="The exact text of the entity")
-    entity_type: str = Field(description="The type of the entity")
-
-
-class EntityDetectorOutput(BaseModel):
-    result: list[Entity] = Field(description="List of all extracted entities")
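The `CategoryTree` rework replaces numeric `node_id`/`parent_id` bookkeeping with name-keyed lookups and direct parent/child object references. A minimal usage sketch of the new API, based only on the methods shown above (the import path follows the file layout; how the package re-exports it may differ):

```python
from texttools.internals.models import CategoryTree

tree = CategoryTree()
tree.add_node(name="science", parent_name="root", description="Scientific topics")
tree.add_node(name="physics", parent_name="science")

tree.get_level_count()            # 2 (root=0, science=1, physics=2)
tree.get_node("physics").parent   # the "science" Node object

# Removal is recursive: deleting "science" also deletes "physics"
tree.remove_node("science")
list(tree.get_all_nodes())        # ["root"]
```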
texttools/internals/operator_utils.py
CHANGED
@@ -5,8 +5,8 @@ import random
 
 class OperatorUtils:
     @staticmethod
-    def build_user_message(prompt: str) -> dict[str, str]:
-        return {"role": "user", "content": prompt}
+    def build_user_message(prompt: str) -> list[dict[str, str]]:
+        return [{"role": "user", "content": prompt}]
 
     @staticmethod
     def extract_logprobs(completion: dict) -> list[dict]:
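Since `build_user_message` now returns a one-element list, its result can be passed directly as the `messages` argument instead of being wrapped by the caller. Illustrative sketch (client setup and model name are placeholders):

```python
from openai import OpenAI
from texttools.internals.operator_utils import OperatorUtils

client = OpenAI()  # placeholder client configuration
messages = OperatorUtils.build_user_message("Summarize this text.")
# messages == [{"role": "user", "content": "Summarize this text."}]

completion = client.chat.completions.create(model="some-model", messages=messages)
```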
texttools/internals/prompt_loader.py
CHANGED
@@ -12,20 +12,12 @@ class PromptLoader:
     Responsibilities:
     - Load and parse YAML prompt definitions.
     - Select the right template (by mode, if applicable).
-    - Inject variables (`{input}`, plus any extra kwargs) into the templates.
+    - Inject variables (`{text}`, plus any extra kwargs) into the templates.
     """
 
     MAIN_TEMPLATE = "main_template"
     ANALYZE_TEMPLATE = "analyze_template"
 
-    @staticmethod
-    def _build_format_args(text: str, **extra_kwargs) -> dict[str, str]:
-        # Base formatting args
-        format_args = {"input": text}
-        # Merge extras
-        format_args.update(extra_kwargs)
-        return format_args
-
     # Use lru_cache to load each file once
     @lru_cache(maxsize=32)
     def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
@@ -69,16 +61,6 @@ class PromptLoader:
                 + (f" for mode '{mode}'" if mode else "")
             )
 
-        if (
-            not analyze_template
-            or not analyze_template.strip()
-            or analyze_template.strip() in ["{analyze_template}", "{}"]
-        ):
-            raise PromptError(
-                "analyze_template cannot be empty"
-                + (f" for mode '{mode}'" if mode else "")
-            )
-
         return {
             self.MAIN_TEMPLATE: main_template,
             self.ANALYZE_TEMPLATE: analyze_template,
@@ -94,7 +76,8 @@ class PromptLoader:
     ) -> dict[str, str]:
         try:
             template_configs = self._load_templates(prompt_file, mode)
-            format_args = self._build_format_args(text, **extra_kwargs)
+            format_args = {"text": text}
+            format_args.update(extra_kwargs)
 
             # Inject variables inside each template
             for key in template_configs.keys():
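With `_build_format_args` removed, `load` builds the format dict inline and the base placeholder is renamed from `{input}` to `{text}`. The injection step amounts to plain keyword formatting over each template, roughly as follows (values are illustrative):

```python
format_args = {"text": "the raw input document"}
format_args.update({"num_questions": 3})  # hypothetical extra kwarg

template = "Generate {num_questions} questions about:\n{text}"
prompt = template.format(**format_args)
```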
texttools/internals/sync_operator.py
CHANGED
@@ -1,13 +1,11 @@
-from typing import
+from typing import TypeVar, Type
 from collections.abc import Callable
-import logging
 
 from openai import OpenAI
 from pydantic import BaseModel
 
-from texttools.internals.models import
+from texttools.internals.models import OperatorOutput
 from texttools.internals.operator_utils import OperatorUtils
-from texttools.internals.formatters import Formatter
 from texttools.internals.prompt_loader import PromptLoader
 from texttools.internals.exceptions import (
     TextToolsError,
@@ -19,35 +17,23 @@ from texttools.internals.exceptions import (
 # Base Model type for output models
 T = TypeVar("T", bound=BaseModel)
 
-logger = logging.getLogger("texttools.sync_operator")
-
 
 class Operator:
     """
-    Core engine for running text-processing operations with an LLM
-
-    It wires together:
-    - `PromptLoader` → loads YAML prompt templates.
-    - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
-    - OpenAI client → executes completions/parsed completions.
+    Core engine for running text-processing operations with an LLM.
     """
 
     def __init__(self, client: OpenAI, model: str):
         self._client = client
         self._model = model
 
-    def
-        """
-        Calls OpenAI API for analysis using the configured prompt template.
-        Returns the analyzed content as a string.
-        """
+    def _analyze_completion(self, analyze_prompt: str, temperature: float) -> str:
         try:
-            analyze_prompt = prompt_configs["analyze_template"]
-
             if not analyze_prompt:
                 raise PromptError("Analyze template is empty")
 
-            analyze_message =
+            analyze_message = OperatorUtils.build_user_message(analyze_prompt)
+
             completion = self._client.chat.completions.create(
                 model=self._model,
                 messages=analyze_message,
@@ -62,7 +48,7 @@ class Operator:
             if not analysis:
                 raise LLMError("Empty analysis response")
 
-            return analysis
+            return analysis
 
         except Exception as e:
             if isinstance(e, (PromptError, LLMError)):
@@ -71,21 +57,23 @@ class Operator:
 
     def _parse_completion(
         self,
-
+        main_prompt: str,
         output_model: Type[T],
         temperature: float,
-        logprobs: bool
-        top_logprobs: int
-        priority: int
-    ) -> tuple[T,
+        logprobs: bool,
+        top_logprobs: int,
+        priority: int,
+    ) -> tuple[T, object]:
         """
         Parses a chat completion using OpenAI's structured output format.
        Returns both the parsed object and the raw completion for logprobs.
         """
         try:
+            main_message = OperatorUtils.build_user_message(main_prompt)
+
             request_kwargs = {
                 "model": self._model,
-                "messages":
+                "messages": main_message,
                 "response_format": output_model,
                 "temperature": temperature,
             }
@@ -93,8 +81,10 @@ class Operator:
             if logprobs:
                 request_kwargs["logprobs"] = True
                 request_kwargs["top_logprobs"] = top_logprobs
+
             if priority:
                 request_kwargs["extra_body"] = {"priority": priority}
+
             completion = self._client.beta.chat.completions.parse(**request_kwargs)
 
             if not completion.choices:
@@ -121,25 +111,22 @@ class Operator:
         user_prompt: str | None,
         temperature: float,
         logprobs: bool,
-        top_logprobs: int
-        validator: Callable[[
+        top_logprobs: int,
+        validator: Callable[[object], bool] | None,
         max_validation_retries: int | None,
+        priority: int,
         # Internal parameters
         prompt_file: str,
         output_model: Type[T],
         mode: str | None,
-        priority: int | None = 0,
         **extra_kwargs,
-    ) ->
+    ) -> OperatorOutput:
         """
         Execute the LLM pipeline with the given input text. (Sync)
         """
         try:
             prompt_loader = PromptLoader()
-            formatter = Formatter()
-            output = ToolOutput()
 
-            # Prompt configs contain two keys: main_template and analyze template, both are string
             prompt_configs = prompt_loader.load(
                 prompt_file=prompt_file,
                 text=text.strip(),
@@ -147,47 +134,32 @@ class Operator:
                 **extra_kwargs,
             )
 
-            messages = []
+            main_prompt = ""
+            analysis = ""
 
             if with_analysis:
-                analysis = self.
-                messages.append(
-                    OperatorUtils.build_user_message(
-                        f"Based on this analysis: {analysis}"
-                    )
+                analysis = self._analyze_completion(
+                    prompt_configs["analyze_template"], temperature
                 )
+                main_prompt += f"Based on this analysis:\n{analysis}\n"
 
             if output_lang:
-                messages.append(
-                    OperatorUtils.build_user_message(
-                        f"Respond only in the {output_lang} language."
-                    )
-                )
+                main_prompt += f"Respond only in the {output_lang} language.\n"
 
             if user_prompt:
-                messages.append(
-                    OperatorUtils.build_user_message(
-                        f"Consider this instruction {user_prompt}"
-                    )
-                )
-
-            messages.append(
-                OperatorUtils.build_user_message(prompt_configs["main_template"])
-            )
+                main_prompt += f"Consider this instruction {user_prompt}\n"
 
-
+            main_prompt += prompt_configs["main_template"]
 
             if logprobs and (not isinstance(top_logprobs, int) or top_logprobs < 2):
                 raise ValueError("top_logprobs should be an integer greater than 1")
 
             parsed, completion = self._parse_completion(
-
+                main_prompt, output_model, temperature, logprobs, top_logprobs, priority
             )
 
-            output.result = parsed.result
-
             # Retry logic if validation fails
-            if validator and not validator(
+            if validator and not validator(parsed.result):
                 if (
                     not isinstance(max_validation_retries, int)
                     or max_validation_retries < 1
@@ -197,17 +169,13 @@ class Operator:
                 )
 
                 succeeded = False
-                for attempt in range(max_validation_retries):
-
-                        f"Validation failed, retrying for the {attempt + 1} time."
-                    )
-
-                    # Generate new temperature for retry
+                for _ in range(max_validation_retries):
+                    # Generate a new temperature to retry
                     retry_temperature = OperatorUtils.get_retry_temp(temperature)
 
                     try:
                         parsed, completion = self._parse_completion(
-
+                            main_prompt,
                             output_model,
                             retry_temperature,
                             logprobs,
@@ -215,30 +183,26 @@ class Operator:
                             priority=priority,
                         )
 
-                        output.result = parsed.result
-
                         # Check if retry was successful
-                        if validator(
+                        if validator(parsed.result):
                             succeeded = True
                             break
 
-                    except LLMError
-
+                    except LLMError:
+                        pass
 
                 if not succeeded:
-                    raise ValidationError(
-
-
-
-
-
-
-
-
-
-            output.process = prompt_file[:-5]
+                    raise ValidationError("Validation failed after all retries")
+
+            operator_output = OperatorOutput(
+                result=parsed.result,
+                analysis=analysis if with_analysis else None,
+                logprobs=OperatorUtils.extract_logprobs(completion)
+                if logprobs
+                else None,
+            )
 
-            return
+            return operator_output
 
         except (PromptError, LLMError, ValidationError):
             raise
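Note the tightened validation contract: `validator` now receives `parsed.result` directly, and failed attempts retry silently (the logger was removed) with a fresh temperature until `max_validation_retries` is exhausted. A sketch of a compatible validator (the name is illustrative):

```python
def non_empty_list(result: object) -> bool:
    # Accept the parsed output only if the model returned at least one item
    return isinstance(result, list) and len(result) > 0
```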
texttools/internals/text_to_chunks.py
ADDED
@@ -0,0 +1,97 @@
+import re
+
+
+def text_to_chunks(text: str, size: int, overlap: int) -> list[str]:
+    separators = ["\n\n", "\n", " ", ""]
+    is_separator_regex = False
+    keep_separator = True  # Equivalent to 'start'
+    length_function = len
+    strip_whitespace = True
+    chunk_size = size
+    chunk_overlap = overlap
+
+    def _split_text_with_regex(
+        text: str, separator: str, keep_separator: bool
+    ) -> list[str]:
+        if not separator:
+            return [text]
+        if not keep_separator:
+            return re.split(separator, text)
+        _splits = re.split(f"({separator})", text)
+        splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+        if len(_splits) % 2 == 0:
+            splits += [_splits[-1]]
+        return [_splits[0]] + splits if _splits[0] else splits
+
+    def _join_docs(docs: list[str], separator: str) -> str | None:
+        text = separator.join(docs)
+        if strip_whitespace:
+            text = text.strip()
+        return text if text else None
+
+    def _merge_splits(splits: list[str], separator: str) -> list[str]:
+        separator_len = length_function(separator)
+        docs = []
+        current_doc = []
+        total = 0
+        for d in splits:
+            len_ = length_function(d)
+            if total + len_ + (separator_len if current_doc else 0) > chunk_size:
+                if total > chunk_size:
+                    pass
+                if current_doc:
+                    doc = _join_docs(current_doc, separator)
+                    if doc is not None:
+                        docs.append(doc)
+                    while total > chunk_overlap or (
+                        total + len_ + (separator_len if current_doc else 0)
+                        > chunk_size
+                        and total > 0
+                    ):
+                        total -= length_function(current_doc[0]) + (
+                            separator_len if len(current_doc) > 1 else 0
+                        )
+                        current_doc = current_doc[1:]
+            current_doc.append(d)
+            total += len_ + (separator_len if len(current_doc) > 1 else 0)
+        doc = _join_docs(current_doc, separator)
+        if doc is not None:
+            docs.append(doc)
+        return docs
+
+    def _split_text(text: str, separators: list[str]) -> list[str]:
+        final_chunks = []
+        separator = separators[-1]
+        new_separators = []
+        for i, _s in enumerate(separators):
+            separator_ = _s if is_separator_regex else re.escape(_s)
+            if not _s:
+                separator = _s
+                break
+            if re.search(separator_, text):
+                separator = _s
+                new_separators = separators[i + 1 :]
+                break
+        separator_ = separator if is_separator_regex else re.escape(separator)
+        splits = _split_text_with_regex(text, separator_, keep_separator)
+        _separator = "" if keep_separator else separator
+        good_splits = []
+        for s in splits:
+            if length_function(s) < chunk_size:
+                good_splits.append(s)
+            else:
+                if good_splits:
+                    merged_text = _merge_splits(good_splits, _separator)
+                    final_chunks.extend(merged_text)
+                    good_splits = []
+                if not new_separators:
+                    final_chunks.append(s)
+                else:
+                    other_info = _split_text(s, new_separators)
+                    final_chunks.extend(other_info)
+        if good_splits:
+            merged_text = _merge_splits(good_splits, _separator)
+            final_chunks.extend(merged_text)
+        return final_chunks
+
+    return _split_text(text, separators)
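`text_to_chunks` is a self-contained recursive character splitter: it tries `"\n\n"`, then `"\n"`, then `" "`, then falls back to character-level splits, merging pieces back together so chunks stay near `size` characters with `overlap` characters carried between neighbors. Usage sketch:

```python
from texttools.internals.text_to_chunks import text_to_chunks

text = "First paragraph.\n\nSecond, somewhat longer paragraph.\n\nThird paragraph."
chunks = text_to_chunks(text, size=40, overlap=10)
for chunk in chunks:
    print(len(chunk), repr(chunk))  # each chunk targets at most 40 characters
```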
texttools/prompts/README.md
CHANGED
@@ -15,7 +15,7 @@ This folder contains YAML files for all prompts used in the project. Each file r
 ```yaml
 main_template:
   mode_1: |
-    Your main instructions here with placeholders like {input}.
+    Your main instructions here with placeholders like {text}.
   mode_2: |
     Optional reasoning instructions here.
 
@@ -30,6 +30,6 @@ analyze_template:
 
 ## Guidelines
 1. **Naming**: Use descriptive names for each YAML file corresponding to the tool or task it serves.
-2. **Placeholders**: Use `{input}` or other relevant placeholders to dynamically inject data.
+2. **Placeholders**: Use `{text}` or other relevant placeholders to dynamically inject data.
 3. **Modes**: If using modes, ensure both `main_template` and `analyze_template` contain the corresponding keys.
 4. **Consistency**: Keep formatting consistent across files for easier parsing by scripts.