hamtaa-texttools 1.1.19__py3-none-any.whl → 1.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/METADATA +15 -34
  2. hamtaa_texttools-1.1.21.dist-info/RECORD +32 -0
  3. texttools/batch/batch_config.py +14 -1
  4. texttools/batch/{internals/batch_manager.py → batch_manager.py} +6 -6
  5. texttools/batch/batch_runner.py +7 -7
  6. texttools/internals/async_operator.py +48 -84
  7. texttools/internals/models.py +73 -113
  8. texttools/internals/operator_utils.py +2 -2
  9. texttools/internals/prompt_loader.py +3 -20
  10. texttools/internals/sync_operator.py +47 -83
  11. texttools/internals/text_to_chunks.py +97 -0
  12. texttools/prompts/README.md +2 -2
  13. texttools/prompts/categorize.yaml +35 -77
  14. texttools/prompts/check_fact.yaml +2 -2
  15. texttools/prompts/extract_entities.yaml +3 -3
  16. texttools/prompts/extract_keywords.yaml +6 -6
  17. texttools/prompts/is_question.yaml +2 -2
  18. texttools/prompts/merge_questions.yaml +4 -4
  19. texttools/prompts/propositionize.yaml +2 -2
  20. texttools/prompts/rewrite.yaml +6 -6
  21. texttools/prompts/run_custom.yaml +1 -1
  22. texttools/prompts/subject_to_question.yaml +2 -2
  23. texttools/prompts/summarize.yaml +2 -2
  24. texttools/prompts/text_to_question.yaml +8 -6
  25. texttools/prompts/translate.yaml +2 -2
  26. texttools/tools/async_tools.py +497 -519
  27. texttools/tools/sync_tools.py +498 -520
  28. hamtaa_texttools-1.1.19.dist-info/RECORD +0 -33
  29. texttools/batch/internals/utils.py +0 -16
  30. texttools/internals/formatters.py +0 -24
  31. {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/WHEEL +0 -0
  32. {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/licenses/LICENSE +0 -0
  33. {hamtaa_texttools-1.1.19.dist-info → hamtaa_texttools-1.1.21.dist-info}/top_level.txt +0 -0
@@ -1,44 +1,58 @@
+ from __future__ import annotations
+
  from datetime import datetime
- from typing import Type, Any, Literal
+ from typing import Type, Literal, Any

  from pydantic import BaseModel, Field, create_model


+ class ToolOutputMetadata(BaseModel):
+     tool_name: str
+     processed_at: datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     execution_time: float | None = None
+
+
  class ToolOutput(BaseModel):
      result: Any = None
-     logprobs: list[dict[str, Any]] = []
-     analysis: str = ""
-     process: str | None = None
-     processed_at: datetime = datetime.now()
-     execution_time: float | None = None
+     analysis: str | None = None
+     logprobs: list[dict[str, Any]] | None = None
      errors: list[str] = []
+     metadata: ToolOutputMetadata | None = None

      def __repr__(self) -> str:
-         return f"""
-         ToolOutput(process='{self.process}', result_type='{type(self.result)}',
-         result='{self.result}', analysis='{self.analysis}',
-         logprobs='{self.logprobs}', errors='{self.errors}',
-         processed_at='{self.processed_at}', execution_time='{self.execution_time}'
-         """
+         base = f"""ToolOutput(result='{self.result}', result_type='{type(self.result)}', analysis='{self.analysis}', logprobs='{self.logprobs}', errors='{self.errors}'"""

+         if self.metadata:
+             base += f""", tool_name='{self.metadata.tool_name}',
+             processed_at='{self.metadata.processed_at}', execution_time='{self.metadata.execution_time}'
+             """

- class StrOutput(BaseModel):
+         return base
+
+
+ class OperatorOutput(BaseModel):
+     result: Any
+     analysis: str | None
+     logprobs: list[dict[str, Any]] | None
+
+
+ class Str(BaseModel):
      result: str = Field(..., description="The output string", example="text")


- class BoolOutput(BaseModel):
+ class Bool(BaseModel):
      result: bool = Field(
          ..., description="Boolean indicating the output state", example=True
      )


- class ListStrOutput(BaseModel):
+ class ListStr(BaseModel):
      result: list[str] = Field(
          ..., description="The output list of strings", example=["text_1", "text_2"]
      )


- class ListDictStrStrOutput(BaseModel):
+ class ListDictStrStr(BaseModel):
      result: list[dict[str, str]] = Field(
          ...,
          description="List of dictionaries containing string key-value pairs",
@@ -46,121 +60,76 @@ class ListDictStrStrOutput(BaseModel):
      )


- class ReasonListStrOutput(BaseModel):
+ class ReasonListStr(BaseModel):
      reason: str = Field(..., description="Thinking process that led to the output")
      result: list[str] = Field(
          ..., description="The output list of strings", example=["text_1", "text_2"]
      )


- class Node(BaseModel):
-     node_id: int
-     name: str
-     level: int
-     parent_id: int | None
-     description: str
+ class Node:
+     def __init__(self, name: str, description: str, level: int, parent: Node | None):
+         self.name = name
+         self.description = description
+         self.level = level
+         self.parent = parent
+         self.children = {}


  class CategoryTree:
-     def __init__(self, tree_name):
-         self._root = Node(
-             node_id=0, name=tree_name, level=0, parent_id=None, description="Root node"
-         )
-         self._all_nodes: list[Node] = [self._root]
-         self._new_id = 1
-
-     def get_all_nodes(self) -> list[Node]:
+     def __init__(self):
+         self._root = Node(name="root", description="root", level=0, parent=None)
+         self._all_nodes = {"root": self._root}
+
+     def get_all_nodes(self) -> dict[str, Node]:
          return self._all_nodes

      def get_level_count(self) -> int:
-         return max([item.level for item in self._all_nodes])
-
-     def get_node(self, identifier: int | str) -> Node | None:
-         if isinstance(identifier, str):
-             for node in self.get_all_nodes():
-                 if node.name == identifier:
-                     return node
-             return None
-         elif isinstance(identifier, int):
-             for node in self.get_all_nodes():
-                 if node.node_id == identifier:
-                     return node
-             return None
-         else:
-             return None
-
-     def get_children(self, parent_node: Node) -> list[Node] | None:
-         children = [
-             node
-             for node in self.get_all_nodes()
-             if parent_node.node_id == node.parent_id
-         ]
-         return children if children else None
+         return max(node.level for node in self._all_nodes.values())
+
+     def get_node(self, name: str) -> Node | None:
+         return self._all_nodes.get(name)

      def add_node(
          self,
-         node_name: str,
-         parent_name: str | None = None,
+         name: str,
+         parent_name: str,
          description: str | None = None,
      ) -> None:
-         if self.get_node(node_name):
-             raise ValueError(f"{node_name} has been chosen for another category before")
-
-         if parent_name:
-             parent_node = self.get_node(parent_name)
-             if not parent_node:
-                 raise ValueError(f"Parent category '{parent_name}' not found")
-             parent_id = parent_node.node_id
-             level = parent_node.level + 1
-         else:
-             level = 1
-             parent_id = 0
+         if self.get_node(name):
+             raise ValueError(f"Cannot add {name} category twice")
+
+         parent = self.get_node(parent_name)
+
+         if not parent:
+             raise ValueError(f"Parent category '{parent_name}' not found")

          node_data = {
-             "node_id": self._new_id,
-             "name": node_name,
-             "level": level,
-             "parent_id": parent_id,
+             "name": name,
              "description": description if description else "No description provided",
+             "level": parent.level + 1,
+             "parent": parent,
          }

-         self._all_nodes.append(Node(**node_data))
-         self._new_id += 1
-
-     def remove_node(self, identifier: int | str) -> None:
-         node = self.get_node(identifier)
-
-         if node:
-             # Remove node's children recursively
-             children = self.get_children(node)
+         new_node = Node(**node_data)
+         parent.children[name] = new_node
+         self._all_nodes[name] = new_node

-             if not children:
-                 self._all_nodes.remove(node)
-                 return
+     def remove_node(self, name: str) -> None:
+         if name == "root":
+             raise ValueError("Cannot remove the root node")

-             for child in children:
-                 self.remove_node(child.name)
+         node = self.get_node(name)
+         if not node:
+             raise ValueError(f"Category: '{name}' not found")

-             self._all_nodes.remove(node)
-         else:
-             raise ValueError(f"Node with identifier: '{identifier}' not found.")
+         for child_name in list(node.children.keys()):
+             self.remove_node(child_name)

-     def dump_tree(self) -> dict:
-         def build_dict(node: Node) -> dict:
-             children = [
-                 build_dict(child)
-                 for child in self._all_nodes
-                 if child.parent_id == node.node_id
-             ]
-             return {
-                 "node_id": node.node_id,
-                 "name": node.name,
-                 "level": node.level,
-                 "parent_id": node.parent_id,
-                 "children": children,
-             }
+         if node.parent:
+             del node.parent.children[name]

-         return {"category_tree": build_dict(self._root)["children"]}
+         del self._all_nodes[name]


  # This function is needed to create CategorizerOutput with dynamic categories
@@ -179,12 +148,3 @@ def create_dynamic_model(allowed_values: list[str]) -> Type[BaseModel]:
      )

      return CategorizerOutput
-
-
- class Entity(BaseModel):
-     text: str = Field(description="The exact text of the entity")
-     entity_type: str = Field(description="The type of the entity")
-
-
- class EntityDetectorOutput(BaseModel):
-     result: list[Entity] = Field(description="List of all extracted entities")
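
The net effect of the models.py hunks: output models lose the `Output` suffix, run metadata moves into a nested `ToolOutputMetadata`, and `CategoryTree` drops id-based bookkeeping for name-keyed lookups — each `Node` now holds a direct `parent` reference and a `children` dict, so `get_node` is a dict lookup and removal recurses through children. A minimal usage sketch of the new tree API, using only what the hunks above define (module path per the file list):

```python
from texttools.internals.models import CategoryTree

tree = CategoryTree()  # the root node is always named "root" in 1.1.21
tree.add_node(name="science", parent_name="root", description="Science topics")
tree.add_node(name="physics", parent_name="science")

tree.get_node("physics").level   # 2 — root sits at level 0
tree.get_level_count()           # 2 — deepest level currently in the tree

tree.remove_node("science")      # removes "physics" first, recursively
tree.get_node("physics")         # None
```

Note that `dump_tree()` is gone: callers that serialized the tree must now walk `get_all_nodes()` (now a `dict[str, Node]`) themselves.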
@@ -5,8 +5,8 @@ import random

  class OperatorUtils:
      @staticmethod
-     def build_user_message(prompt: str) -> dict[str, str]:
-         return {"role": "user", "content": prompt}
+     def build_user_message(prompt: str) -> list[dict[str, str]]:
+         return [{"role": "user", "content": prompt}]

      @staticmethod
      def extract_logprobs(completion: dict) -> list[dict]:
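
`build_user_message` now returns the one-element `messages` list directly, so call sites (see sync_operator.py below) no longer wrap it in brackets:

```python
# 1.1.19 — callers wrapped the dict themselves:
#   messages = [OperatorUtils.build_user_message(prompt)]
# 1.1.21 — the helper returns a ready-to-send list:
messages = OperatorUtils.build_user_message("Summarize this text.")
# [{"role": "user", "content": "Summarize this text."}]
```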
@@ -12,20 +12,12 @@ class PromptLoader:
      Responsibilities:
      - Load and parse YAML prompt definitions.
      - Select the right template (by mode, if applicable).
-     - Inject variables (`{input}`, plus any extra kwargs) into the templates.
+     - Inject variables (`{text}`, plus any extra kwargs) into the templates.
      """

      MAIN_TEMPLATE = "main_template"
      ANALYZE_TEMPLATE = "analyze_template"

-     @staticmethod
-     def _build_format_args(text: str, **extra_kwargs) -> dict[str, str]:
-         # Base formatting args
-         format_args = {"input": text}
-         # Merge extras
-         format_args.update(extra_kwargs)
-         return format_args
-
      # Use lru_cache to load each file once
      @lru_cache(maxsize=32)
      def _load_templates(self, prompt_file: str, mode: str | None) -> dict[str, str]:
@@ -69,16 +61,6 @@ class PromptLoader:
                  + (f" for mode '{mode}'" if mode else "")
              )

-         if (
-             not analyze_template
-             or not analyze_template.strip()
-             or analyze_template.strip() in ["{analyze_template}", "{}"]
-         ):
-             raise PromptError(
-                 "analyze_template cannot be empty"
-                 + (f" for mode '{mode}'" if mode else "")
-             )
-
          return {
              self.MAIN_TEMPLATE: main_template,
              self.ANALYZE_TEMPLATE: analyze_template,
@@ -94,7 +76,8 @@ class PromptLoader:
      ) -> dict[str, str]:
          try:
              template_configs = self._load_templates(prompt_file, mode)
-             format_args = self._build_format_args(text, **extra_kwargs)
+             format_args = {"text": text}
+             format_args.update(extra_kwargs)

              # Inject variables inside each template
              for key in template_configs.keys():
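
With `_build_format_args` inlined, the base placeholder is renamed from `{input}` to `{text}`, which is why nearly every YAML file under texttools/prompts/ changes by a line or two in this release. A sketch of the substitution, assuming the injection loop below uses standard `str.format`-style filling (the loop body sits outside this hunk):

```python
format_args = {"text": "What is entropy?", "count": 3}  # extra kwargs ride along
template = "Generate {count} questions about the following text:\n{text}"
template.format(**format_args)
# 'Generate 3 questions about the following text:\nWhat is entropy?'
```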
@@ -1,13 +1,11 @@
- from typing import Any, TypeVar, Type
+ from typing import TypeVar, Type
  from collections.abc import Callable
- import logging

  from openai import OpenAI
  from pydantic import BaseModel

- from texttools.internals.models import ToolOutput
+ from texttools.internals.models import OperatorOutput
  from texttools.internals.operator_utils import OperatorUtils
- from texttools.internals.formatters import Formatter
  from texttools.internals.prompt_loader import PromptLoader
  from texttools.internals.exceptions import (
      TextToolsError,
@@ -19,35 +17,23 @@ from texttools.internals.exceptions import (
  # Base Model type for output models
  T = TypeVar("T", bound=BaseModel)

- logger = logging.getLogger("texttools.sync_operator")
-

  class Operator:
      """
-     Core engine for running text-processing operations with an LLM (Sync).
-
-     It wires together:
-     - `PromptLoader` → loads YAML prompt templates.
-     - `UserMergeFormatter` → applies formatting to messages (e.g., merging).
-     - OpenAI client → executes completions/parsed completions.
+     Core engine for running text-processing operations with an LLM.
      """

      def __init__(self, client: OpenAI, model: str):
          self._client = client
          self._model = model

-     def _analyze(self, prompt_configs: dict[str, str], temperature: float) -> str:
-         """
-         Calls OpenAI API for analysis using the configured prompt template.
-         Returns the analyzed content as a string.
-         """
+     def _analyze_completion(self, analyze_prompt: str, temperature: float) -> str:
          try:
-             analyze_prompt = prompt_configs["analyze_template"]
-
              if not analyze_prompt:
                  raise PromptError("Analyze template is empty")

-             analyze_message = [OperatorUtils.build_user_message(analyze_prompt)]
+             analyze_message = OperatorUtils.build_user_message(analyze_prompt)
+
              completion = self._client.chat.completions.create(
                  model=self._model,
                  messages=analyze_message,
@@ -62,7 +48,7 @@ class Operator:
              if not analysis:
                  raise LLMError("Empty analysis response")

-             return analysis.strip()
+             return analysis

          except Exception as e:
              if isinstance(e, (PromptError, LLMError)):
@@ -71,21 +57,23 @@ class Operator:

      def _parse_completion(
          self,
-         message: list[dict[str, str]],
+         main_prompt: str,
          output_model: Type[T],
          temperature: float,
-         logprobs: bool = False,
-         top_logprobs: int = 3,
-         priority: int | None = 0,
-     ) -> tuple[T, Any]:
+         logprobs: bool,
+         top_logprobs: int,
+         priority: int,
+     ) -> tuple[T, object]:
          """
          Parses a chat completion using OpenAI's structured output format.
          Returns both the parsed object and the raw completion for logprobs.
          """
          try:
+             main_message = OperatorUtils.build_user_message(main_prompt)
+
              request_kwargs = {
                  "model": self._model,
-                 "messages": message,
+                 "messages": main_message,
                  "response_format": output_model,
                  "temperature": temperature,
              }
@@ -93,8 +81,10 @@ class Operator:
              if logprobs:
                  request_kwargs["logprobs"] = True
                  request_kwargs["top_logprobs"] = top_logprobs
+
              if priority:
                  request_kwargs["extra_body"] = {"priority": priority}
+
              completion = self._client.beta.chat.completions.parse(**request_kwargs)

              if not completion.choices:
@@ -121,25 +111,22 @@ class Operator:
          user_prompt: str | None,
          temperature: float,
          logprobs: bool,
-         top_logprobs: int | None,
-         validator: Callable[[Any], bool] | None,
+         top_logprobs: int,
+         validator: Callable[[object], bool] | None,
          max_validation_retries: int | None,
+         priority: int,
          # Internal parameters
          prompt_file: str,
          output_model: Type[T],
          mode: str | None,
-         priority: int | None = 0,
          **extra_kwargs,
-     ) -> ToolOutput:
+     ) -> OperatorOutput:
          """
          Execute the LLM pipeline with the given input text. (Sync)
          """
          try:
              prompt_loader = PromptLoader()
-             formatter = Formatter()
-             output = ToolOutput()

-             # Prompt configs contain two keys: main_template and analyze template, both are string
              prompt_configs = prompt_loader.load(
                  prompt_file=prompt_file,
                  text=text.strip(),
@@ -147,47 +134,32 @@ class Operator:
                  **extra_kwargs,
              )

-             messages = []
+             main_prompt = ""
+             analysis = ""

              if with_analysis:
-                 analysis = self._analyze(prompt_configs, temperature)
-                 messages.append(
-                     OperatorUtils.build_user_message(
-                         f"Based on this analysis: {analysis}"
-                     )
+                 analysis = self._analyze_completion(
+                     prompt_configs["analyze_template"], temperature
                  )
+                 main_prompt += f"Based on this analysis:\n{analysis}\n"

              if output_lang:
-                 messages.append(
-                     OperatorUtils.build_user_message(
-                         f"Respond only in the {output_lang} language."
-                     )
-                 )
+                 main_prompt += f"Respond only in the {output_lang} language.\n"

              if user_prompt:
-                 messages.append(
-                     OperatorUtils.build_user_message(
-                         f"Consider this instruction {user_prompt}"
-                     )
-                 )
-
-             messages.append(
-                 OperatorUtils.build_user_message(prompt_configs["main_template"])
-             )
+                 main_prompt += f"Consider this instruction {user_prompt}\n"

-             messages = formatter.user_merge_format(messages)
+             main_prompt += prompt_configs["main_template"]

              if logprobs and (not isinstance(top_logprobs, int) or top_logprobs < 2):
                  raise ValueError("top_logprobs should be an integer greater than 1")

              parsed, completion = self._parse_completion(
-                 messages, output_model, temperature, logprobs, top_logprobs, priority
+                 main_prompt, output_model, temperature, logprobs, top_logprobs, priority
              )

-             output.result = parsed.result
-
              # Retry logic if validation fails
-             if validator and not validator(output.result):
+             if validator and not validator(parsed.result):
                  if (
                      not isinstance(max_validation_retries, int)
                      or max_validation_retries < 1
@@ -197,17 +169,13 @@ class Operator:
                      )

                  succeeded = False
-                 for attempt in range(max_validation_retries):
-                     logger.warning(
-                         f"Validation failed, retrying for the {attempt + 1} time."
-                     )
-
-                     # Generate new temperature for retry
+                 for _ in range(max_validation_retries):
+                     # Generate a new temperature to retry
                      retry_temperature = OperatorUtils.get_retry_temp(temperature)

                      try:
                          parsed, completion = self._parse_completion(
-                             messages,
+                             main_prompt,
                              output_model,
                              retry_temperature,
                              logprobs,
@@ -215,30 +183,26 @@ class Operator:
                              priority=priority,
                          )

-                         output.result = parsed.result
-
                          # Check if retry was successful
-                         if validator(output.result):
+                         if validator(parsed.result):
                              succeeded = True
                              break

-                     except LLMError as e:
-                         logger.error(f"Retry attempt {attempt + 1} failed: {e}")
+                     except LLMError:
+                         pass

                  if not succeeded:
-                     raise ValidationError(
-                         f"Validation failed after {max_validation_retries} retries"
-                     )
-
-             if logprobs:
-                 output.logprobs = OperatorUtils.extract_logprobs(completion)
-
-             if with_analysis:
-                 output.analysis = analysis
-
-             output.process = prompt_file[:-5]
+                     raise ValidationError("Validation failed after all retries")
+
+             operator_output = OperatorOutput(
+                 result=parsed.result,
+                 analysis=analysis if with_analysis else None,
+                 logprobs=OperatorUtils.extract_logprobs(completion)
+                 if logprobs
+                 else None,
+             )

-             return output
+             return operator_output

          except (PromptError, LLMError, ValidationError):
              raise
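
Summing up the sync_operator.py refactor: the message list plus `Formatter` merging is replaced by one concatenated user prompt, module-level logging is dropped, and the pipeline returns a slim `OperatorOutput` instead of mutating a `ToolOutput`. A hypothetical call against the new signature — the pipeline method's name and the parameters above `user_prompt` are not visible in these hunks, so `run`, `text`, `with_analysis`, and `output_lang` are assumptions:

```python
from openai import OpenAI

from texttools.internals.models import Str
from texttools.internals.sync_operator import Operator

op = Operator(client=OpenAI(), model="gpt-4o-mini")

output = op.run(                      # method name assumed; it sits above the hunks
    text="Photosynthesis converts light into chemical energy.",
    with_analysis=False,
    output_lang=None,
    user_prompt=None,
    temperature=0.2,
    logprobs=False,
    top_logprobs=2,
    validator=lambda r: len(r) > 0,   # failures are retried with a fresh temperature
    max_validation_retries=2,
    priority=0,
    prompt_file="summarize.yaml",     # one of the prompt files listed above
    output_model=Str,                 # structured-output schema with a `result` field
    mode=None,
)
output.result  # parsed result; analysis/logprobs are None unless requested
```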
@@ -0,0 +1,97 @@
+ import re
+
+
+ def text_to_chunks(text: str, size: int, overlap: int) -> list[str]:
+     separators = ["\n\n", "\n", " ", ""]
+     is_separator_regex = False
+     keep_separator = True  # Equivalent to 'start'
+     length_function = len
+     strip_whitespace = True
+     chunk_size = size
+     chunk_overlap = overlap
+
+     def _split_text_with_regex(
+         text: str, separator: str, keep_separator: bool
+     ) -> list[str]:
+         if not separator:
+             return [text]
+         if not keep_separator:
+             return re.split(separator, text)
+         _splits = re.split(f"({separator})", text)
+         splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+         if len(_splits) % 2 == 0:
+             splits += [_splits[-1]]
+         return [_splits[0]] + splits if _splits[0] else splits
+
+     def _join_docs(docs: list[str], separator: str) -> str | None:
+         text = separator.join(docs)
+         if strip_whitespace:
+             text = text.strip()
+         return text if text else None
+
+     def _merge_splits(splits: list[str], separator: str) -> list[str]:
+         separator_len = length_function(separator)
+         docs = []
+         current_doc = []
+         total = 0
+         for d in splits:
+             len_ = length_function(d)
+             if total + len_ + (separator_len if current_doc else 0) > chunk_size:
+                 if total > chunk_size:
+                     pass
+                 if current_doc:
+                     doc = _join_docs(current_doc, separator)
+                     if doc is not None:
+                         docs.append(doc)
+                     while total > chunk_overlap or (
+                         total + len_ + (separator_len if current_doc else 0)
+                         > chunk_size
+                         and total > 0
+                     ):
+                         total -= length_function(current_doc[0]) + (
+                             separator_len if len(current_doc) > 1 else 0
+                         )
+                         current_doc = current_doc[1:]
+             current_doc.append(d)
+             total += len_ + (separator_len if len(current_doc) > 1 else 0)
+         doc = _join_docs(current_doc, separator)
+         if doc is not None:
+             docs.append(doc)
+         return docs
+
+     def _split_text(text: str, separators: list[str]) -> list[str]:
+         final_chunks = []
+         separator = separators[-1]
+         new_separators = []
+         for i, _s in enumerate(separators):
+             separator_ = _s if is_separator_regex else re.escape(_s)
+             if not _s:
+                 separator = _s
+                 break
+             if re.search(separator_, text):
+                 separator = _s
+                 new_separators = separators[i + 1 :]
+                 break
+         separator_ = separator if is_separator_regex else re.escape(separator)
+         splits = _split_text_with_regex(text, separator_, keep_separator)
+         _separator = "" if keep_separator else separator
+         good_splits = []
+         for s in splits:
+             if length_function(s) < chunk_size:
+                 good_splits.append(s)
+             else:
+                 if good_splits:
+                     merged_text = _merge_splits(good_splits, _separator)
+                     final_chunks.extend(merged_text)
+                     good_splits = []
+                 if not new_separators:
+                     final_chunks.append(s)
+                 else:
+                     other_info = _split_text(s, new_separators)
+                     final_chunks.extend(other_info)
+         if good_splits:
+             merged_text = _merge_splits(good_splits, _separator)
+             final_chunks.extend(merged_text)
+         return final_chunks
+
+     return _split_text(text, separators)
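
The new text_to_chunks.py is a self-contained recursive character splitter in the style of LangChain's `RecursiveCharacterTextSplitter`: it cascades through `"\n\n"`, `"\n"`, `" "`, and finally single characters, then merges pieces back up to `size` characters while carrying roughly `overlap` characters between consecutive chunks. A usage sketch (import path per the file list):

```python
from texttools.internals.text_to_chunks import text_to_chunks

doc = "Intro paragraph.\n\n" + "A long body sentence about one topic. " * 30
chunks = text_to_chunks(doc, size=120, overlap=20)

for chunk in chunks:
    print(len(chunk), repr(chunk[:40]))
# Consecutive chunks share up to ~20 characters; a single split longer than
# `size` with no finer separator left is emitted as-is rather than truncated.
```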
@@ -15,7 +15,7 @@ This folder contains YAML files for all prompts used in the project. Each file r
  ```yaml
  main_template:
    mode_1: |
-     Your main instructions here with placeholders like {input}.
+     Your main instructions here with placeholders like {text}.
    mode_2: |
      Optional reasoning instructions here.

@@ -30,6 +30,6 @@ analyze_template:

  ## Guidelines
  1. **Naming**: Use descriptive names for each YAML file corresponding to the tool or task it serves.
- 2. **Placeholders**: Use `{input}` or other relevant placeholders to dynamically inject data.
+ 2. **Placeholders**: Use `{text}` or other relevant placeholders to dynamically inject data.
  3. **Modes**: If using modes, ensure both `main_template` and `analyze_template` contain the corresponding keys.
  4. **Consistency**: Keep formatting consistent across files for easier parsing by scripts.
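
Putting the README and loader changes together, loading a prompt end-to-end might look like the following sketch; `summarize.yaml` comes from the file list, the `main_template`/`analyze_template` keys from the loader hunks, and the `mode` keyword from `_load_templates`:

```python
from texttools.internals.prompt_loader import PromptLoader

loader = PromptLoader()
configs = loader.load(
    prompt_file="summarize.yaml",
    text="Entropy measures disorder.",  # injected wherever {text} appears
    mode=None,
)
configs["main_template"]     # instructions with {text} already filled in
configs["analyze_template"]  # may now be empty — the emptiness check was removed
```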