abstract-utilities 0.2.2.496__py3-none-any.whl → 0.2.2.507__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_utilities/__init__.py +5 -9
- abstract_utilities/class_utils/__init__.py +7 -0
- abstract_utilities/class_utils/abstract_classes.py +74 -0
- abstract_utilities/class_utils/caller_utils.py +35 -0
- abstract_utilities/class_utils/class_utils.py +109 -0
- abstract_utilities/class_utils/function_utils.py +153 -0
- abstract_utilities/class_utils/global_utils.py +56 -0
- abstract_utilities/class_utils/imports/__init__.py +2 -0
- abstract_utilities/class_utils/imports/imports.py +2 -0
- abstract_utilities/class_utils/imports/utils.py +40 -0
- abstract_utilities/class_utils/module_utils.py +63 -0
- abstract_utilities/env_utils/imports/imports.py +3 -2
- abstract_utilities/error_utils/__init__.py +2 -0
- abstract_utilities/error_utils/error_utils.py +25 -0
- abstract_utilities/error_utils/imports/__init__.py +2 -0
- abstract_utilities/error_utils/imports/imports.py +1 -0
- abstract_utilities/error_utils/imports/module_imports.py +1 -0
- abstract_utilities/file_utils/imports/imports.py +3 -18
- abstract_utilities/file_utils/imports/module_imports.py +3 -6
- abstract_utilities/file_utils/src/filter_params.py +6 -6
- abstract_utilities/file_utils/src/type_checks.py +0 -1
- abstract_utilities/hash_utils/__init__.py +2 -0
- abstract_utilities/hash_utils/hash_utils.py +5 -0
- abstract_utilities/hash_utils/imports/__init__.py +2 -0
- abstract_utilities/hash_utils/imports/imports.py +1 -0
- abstract_utilities/hash_utils/imports/module_imports.py +0 -0
- abstract_utilities/history_utils/__init__.py +2 -0
- abstract_utilities/history_utils/history_utils.py +37 -0
- abstract_utilities/history_utils/imports/__init__.py +2 -0
- abstract_utilities/history_utils/imports/imports.py +1 -0
- abstract_utilities/history_utils/imports/module_imports.py +0 -0
- abstract_utilities/import_utils/imports/imports.py +1 -1
- abstract_utilities/import_utils/imports/module_imports.py +1 -1
- abstract_utilities/import_utils/src/__init__.py +1 -1
- abstract_utilities/import_utils/src/clean_imports.py +31 -5
- abstract_utilities/import_utils/src/dot_utils.py +9 -0
- abstract_utilities/import_utils/src/package_utilss/__init__.py +139 -0
- abstract_utilities/import_utils/src/package_utilss/context_utils.py +27 -0
- abstract_utilities/import_utils/src/package_utilss/import_collectors.py +53 -0
- abstract_utilities/import_utils/src/package_utilss/path_utils.py +28 -0
- abstract_utilities/import_utils/src/package_utilss/safe_import.py +27 -0
- abstract_utilities/import_utils/src/pkg_utils.py +140 -0
- abstract_utilities/imports.py +18 -0
- abstract_utilities/json_utils/__init__.py +2 -0
- abstract_utilities/json_utils/imports/__init__.py +2 -0
- abstract_utilities/json_utils/imports/imports.py +2 -0
- abstract_utilities/json_utils/imports/module_imports.py +5 -0
- abstract_utilities/json_utils/json_utils.py +743 -0
- abstract_utilities/list_utils/__init__.py +2 -0
- abstract_utilities/list_utils/imports/__init__.py +2 -0
- abstract_utilities/list_utils/imports/imports.py +1 -0
- abstract_utilities/list_utils/imports/module_imports.py +0 -0
- abstract_utilities/list_utils/list_utils.py +199 -0
- abstract_utilities/log_utils/__init__.py +5 -0
- abstract_utilities/log_utils/abstractLogManager.py +64 -0
- abstract_utilities/log_utils/call_response.py +68 -0
- abstract_utilities/log_utils/imports/__init__.py +2 -0
- abstract_utilities/log_utils/imports/imports.py +7 -0
- abstract_utilities/log_utils/imports/module_imports.py +2 -0
- abstract_utilities/log_utils/log_file.py +58 -0
- abstract_utilities/log_utils/logger_callable.py +49 -0
- abstract_utilities/math_utils/__init__.py +2 -0
- abstract_utilities/math_utils/imports/__init__.py +2 -0
- abstract_utilities/math_utils/imports/imports.py +2 -0
- abstract_utilities/math_utils/imports/module_imports.py +1 -0
- abstract_utilities/math_utils/math_utils.py +208 -0
- abstract_utilities/parse_utils/__init__.py +2 -0
- abstract_utilities/parse_utils/imports/__init__.py +3 -0
- abstract_utilities/parse_utils/imports/constants.py +10 -0
- abstract_utilities/parse_utils/imports/imports.py +2 -0
- abstract_utilities/parse_utils/imports/module_imports.py +4 -0
- abstract_utilities/parse_utils/parse_utils.py +516 -0
- abstract_utilities/path_utils/__init__.py +2 -0
- abstract_utilities/path_utils/imports/__init__.py +2 -0
- abstract_utilities/path_utils/imports/imports.py +1 -0
- abstract_utilities/path_utils/imports/module_imports.py +6 -0
- abstract_utilities/path_utils/path_utils.py +715 -0
- abstract_utilities/path_utils.py +94 -2
- abstract_utilities/read_write_utils/__init__.py +1 -0
- abstract_utilities/read_write_utils/imports/__init__.py +2 -0
- abstract_utilities/read_write_utils/imports/imports.py +2 -0
- abstract_utilities/read_write_utils/imports/module_imports.py +5 -0
- abstract_utilities/read_write_utils/read_write_utils.py +338 -0
- abstract_utilities/read_write_utils.py +2 -4
- abstract_utilities/safe_utils/__init__.py +2 -0
- abstract_utilities/safe_utils/imports/__init__.py +3 -0
- abstract_utilities/safe_utils/imports/imports.py +1 -0
- abstract_utilities/safe_utils/imports/module_imports.py +2 -0
- abstract_utilities/safe_utils/safe_utils.py +130 -0
- abstract_utilities/ssh_utils/__init__.py +2 -1
- abstract_utilities/ssh_utils/classes.py +0 -1
- abstract_utilities/ssh_utils/cmd_utils.py +207 -0
- abstract_utilities/ssh_utils/imports/__init__.py +3 -0
- abstract_utilities/ssh_utils/imports/imports.py +5 -0
- abstract_utilities/ssh_utils/imports/module_imports.py +5 -0
- abstract_utilities/ssh_utils/imports/utils.py +189 -0
- abstract_utilities/ssh_utils/pexpect_utils.py +11 -18
- abstract_utilities/string_utils/__init__.py +4 -0
- abstract_utilities/string_utils/clean_utils.py +28 -0
- abstract_utilities/string_utils/eat_utils.py +103 -0
- abstract_utilities/string_utils/imports/__init__.py +3 -0
- abstract_utilities/string_utils/imports/imports.py +2 -0
- abstract_utilities/string_utils/imports/module_imports.py +2 -0
- abstract_utilities/string_utils/imports/utils.py +81 -0
- abstract_utilities/string_utils/replace_utils.py +27 -0
- abstract_utilities/thread_utils/__init__.py +2 -0
- abstract_utilities/thread_utils/imports/__init__.py +2 -0
- abstract_utilities/thread_utils/imports/imports.py +2 -0
- abstract_utilities/thread_utils/imports/module_imports.py +2 -0
- abstract_utilities/thread_utils/thread_utils.py +140 -0
- abstract_utilities/time_utils/__init__.py +2 -0
- abstract_utilities/time_utils/imports/__init__.py +2 -0
- abstract_utilities/time_utils/imports/imports.py +3 -0
- abstract_utilities/time_utils/imports/module_imports.py +1 -0
- abstract_utilities/time_utils/time_utils.py +392 -0
- abstract_utilities/type_utils/__init__.py +3 -0
- abstract_utilities/type_utils/alpha_utils.py +59 -0
- abstract_utilities/type_utils/imports/__init__.py +2 -0
- abstract_utilities/type_utils/imports/imports.py +4 -0
- abstract_utilities/type_utils/imports/module_imports.py +1 -0
- abstract_utilities/type_utils/num_utils.py +19 -0
- abstract_utilities/type_utils/type_utils.py +981 -0
- {abstract_utilities-0.2.2.496.dist-info → abstract_utilities-0.2.2.507.dist-info}/METADATA +1 -1
- abstract_utilities-0.2.2.507.dist-info/RECORD +229 -0
- abstract_utilities-0.2.2.496.dist-info/RECORD +0 -123
- {abstract_utilities-0.2.2.496.dist-info → abstract_utilities-0.2.2.507.dist-info}/WHEEL +0 -0
- {abstract_utilities-0.2.2.496.dist-info → abstract_utilities-0.2.2.507.dist-info}/top_level.txt +0 -0
abstract_utilities/parse_utils/parse_utils.py (new file)
@@ -0,0 +1,516 @@
from .imports import *

def detect_language_from_text(text: str):
    patterns = {
        'javascript': [
            r'\bfunction\s+\w+\s*\(.*\)\s*{',
            r'\bvar\s+\w+\s*=',
            r'\bconst\s+\w+\s*=',
            r'\blet\s+\w+\s*=',
            r'\bconsole\.log\s*\(',
            r'\bexport\s+(default|function|const|class)',
            r'\bimport\s+.*\s+from\s+[\'"]'
        ],
        'typescript': [
            r'\binterface\s+\w+\s*{',
            r'\btype\s+\w+\s*=',
            r'\blet\s+\w+:\s+\w+',
            r'\bfunction\s+\w+\s*\(.*:\s*\w+\)',
            r'\bimport\s+.*\s+from\s+[\'"]',
            r'\bexport\s+(default|function|const|class)'
        ],
        'python': [
            r'\bdef\s+\w+\(',
            r'\bclass\s+\w+\s*:',
            r'\bimport\s+\w+',
            r'\bfrom\s+\w+\s+import\s+\w+',
            r'\bif\s+__name__\s*==\s*[\'"]__main__[\'"]',
            r'@\w+',
            r'\blambda\s+'
        ],
        'html': [
            r'<!DOCTYPE\s+html>',
            r'<html[^>]*>',
            r'<head>',
            r'<body>',
            r'<div[^>]*>',
            r'<script[^>]*>',
            r'</\w+>'
        ],
        'php': [
            r'<\?php',
            r'\$\w+\s*=',
            r'echo\s+["\']',
            r'->\w+\(',
            r'function\s+\w+\s*\(',
            r'\bclass\s+\w+\s*{'
        ],
        'bash': [
            r'#!/bin/bash',
            r'\becho\s+["\']',
            r'\bif\s+\[\[?',
            r'\bthen\b',
            r'\bfi\b',
            r'\bfor\s+\w+\s+in\b',
            r'\bdo\b',
            r'\bdone\b'
        ]
    }
    text = str(text)
    scores = {lang: sum(bool(re.search(p, text)) for p in pats) for lang, pats in patterns.items()}
    max_score = max(scores.values(), default=0)

    if max_score == 0:
        return 'neither'

    likely = [lang for lang, score in scores.items() if score == max_score]
    return likely[0] if len(likely) == 1 else 'uncertain'

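# Usage sketch (illustrative only, not part of the packaged file):
# detect_language_from_text scores one point per matching pattern and returns
# the single best language, 'neither' when nothing matches, or 'uncertain' on a tie.
#
#   detect_language_from_text("def f(x):\n    return x")    # -> 'python'
#   detect_language_from_text("plain prose, no code here")  # -> 'neither'
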
def search_code(code_languages, parts):
    return [data for datas in parts for data in make_list(datas)
            if detect_language_from_text(data) in code_languages]

def get_token_encoder(model_name: str = "gpt-4", encoding_name: str = None):
    """
    Retrieves the encoder for a given model or encoding name.

    Args:
        model_name (str): The name of the model. Defaults to "gpt-4".
        encoding_name (str, optional): The encoding name to use. If not provided, it defaults based on the model.

    Returns:
        Encoder: A tiktoken encoder object.
    """
    import tiktoken  # imported lazily so tiktoken stays an optional dependency
    if encoding_name:
        return tiktoken.get_encoding(encoding_name)
    else:
        return tiktoken.encoding_for_model(model_name)

def num_tokens_from_string(string: str, model_name: str = "gpt-4", encoding_name: str = None) -> int:
    """
    Returns the number of tokens in a text string.

    Args:
        string (str): The input text.
        model_name (str, optional): The model name to determine encoding if encoding_name is not specified. Defaults to "gpt-4".
        encoding_name (str, optional): The encoding name to use. If not specified, uses model-based encoding.

    Returns:
        int: The count of tokens.
    """
    encoding = get_token_encoder(model_name, encoding_name)
    num_tokens = len(encoding.encode(str(string)))
    return num_tokens

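# Usage sketch (illustrative only, not part of the packaged file): counting
# tokens before sending text to a model. The exact count depends on the
# tokenizer; "gpt-4" resolves to tiktoken's cl100k_base encoding.
#
#   n = num_tokens_from_string("hello world")            # 2 under cl100k_base
#   n = num_tokens_from_string(text, encoding_name="cl100k_base")
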
def infer_tab_size(file_path):
    if not os.path.isfile(file_path):
        write_to_file(file_path=file_path, contents='\t')
    with open(file_path, 'r') as file:
        for line in file:
            if '\t' in line:
                return len(line) - len(line.lstrip())  # the length of the indentation
    return 4  # default if no tab found

def get_blocks(data, delim='\n'):
    if isinstance(data, list):
        return data, None
    if isinstance(data, tuple):
        data, delim = data[0], data[-1]
    return data.split(delim), delim

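# Behavior sketch (derived from the code above, not part of the packaged file):
#   get_blocks("a\nb")         # -> (["a", "b"], "\n")   string: split on delim
#   get_blocks(["a", "b"])     # -> (["a", "b"], None)   list: passed through
#   get_blocks(("a.b", "."))   # -> (["a", "b"], ".")    tuple: (text, delim)
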
def get_indent_levels(text):
    tab_size, indent_list = infer_tab_size('config.txt'), [0]
    for line in text.split('\n'):
        indent = 0
        for char in line:
            if char in [' ', '\t']:
                indent += tab_size if char == '\t' else 1
            else:
                break
        if indent not in indent_list:
            indent_list.append(indent)
    return indent_list

def get_code_blocks(data, indent_level=0):
    blocks = [[]]
    lines, delim = get_blocks(data, '\n')
    delim = delim or '\n'  # get_blocks returns None for list input; guard the joins below
    for line in lines:
        beginning = ''
        for char in line:
            if char in ['', ' ', '\n', '\t']:
                beginning += char
            else:
                break
        if len(beginning) == indent_level:
            blocks[-1] = delim.join(blocks[-1])
            blocks.append([line])
        else:
            blocks[-1].append(line)
    blocks[-1] = delim.join(blocks[-1])
    return blocks, delim

def chunk_any_to_tokens(data, max_tokens, model_name="gpt-4", encoding_name=None, delimiter='\n\n', reverse=False):
    if isinstance(data, list):
        blocks = data
    else:
        blocks, delimiter = get_blocks(data, delimiter)
    delimiter = delimiter or '\n\n'  # callers may pass delimiter=None with a pre-split list

    if reverse:
        blocks = reversed(blocks)

    chunks = []
    current_chunk = []

    for block in blocks:
        if num_tokens_from_string(delimiter.join(current_chunk + [block]), model_name, encoding_name) <= max_tokens:
            current_chunk.append(block)
        else:
            if current_chunk:
                chunks.append(delimiter.join(current_chunk))
            current_chunk = [block]

    if current_chunk:
        chunks.append(delimiter.join(current_chunk))

    return chunks

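# Usage sketch (illustrative only, not part of the packaged file): pack blocks
# into chunks that stay under a token budget; blocks are rejoined with the
# original delimiter.
#
#   chunks = chunk_any_to_tokens(long_text, max_tokens=800)               # paragraph splits
#   chunks = chunk_any_to_tokens(line_list, max_tokens=200, delimiter='\n')
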
def chunk_data_by_type(data, max_tokens, chunk_type=None, model_name="gpt-4", encoding_name=None, reverse=False):
    delimiter = None
    if chunk_type == "URL":
        delimiter = None
        blocks = re.split(r'<h[1-6].*?>.*?</h[1-6]>', data)
    elif chunk_type == "SOUP":
        delimiter = None
        blocks = data
    elif chunk_type == "DOCUMENT":
        delimiter = "."
        blocks = data.split(delimiter)
    elif chunk_type == "CODE":
        return chunk_source_code(data, max_tokens, model_name, encoding_name, reverse=reverse)
    elif chunk_type == "TEXT":
        return chunk_text_by_tokens(data, max_tokens, model_name, encoding_name, reverse=reverse)
    else:
        delimiter = "\n\n"
        blocks = data.split(delimiter)

    return chunk_any_to_tokens(blocks, max_tokens, model_name, encoding_name, delimiter, reverse=reverse)

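# Dispatch summary (derived from the branches above, not part of the packaged file):
#   "URL"      -> split on <h1>-<h6> heading tags, then token-pack
#   "SOUP"     -> data is already a list of blocks; token-pack directly
#   "DOCUMENT" -> sentence-ish split on '.'
#   "CODE"     -> chunk_source_code (top-level def/class blocks)
#   "TEXT"     -> chunk_text_by_tokens (line-based)
#   default    -> paragraph split on blank lines
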
def chunk_by_language_context(
    text: str,
    max_tokens: int = 1000,
    model_name: str = "gpt-4",
    encoding_name: str = None,
    overlap: int = 0,
    verbose: bool = False,
    reverse: bool = False
) -> List[str]:
    """
    Detects the language and applies the chunking strategy best suited to that context.
    """
    task_params = get_class_inputs(ChunkParams,
                                   max_tokens=max_tokens,
                                   model_name=model_name,
                                   encoding_name=encoding_name,
                                   overlap=overlap,
                                   verbose=verbose,
                                   reverse=reverse
                                   )
    language = detect_language_from_text(text)

    if verbose:
        print(f"Detected language: {language}")

    if language == 'python':
        return chunk_source_code(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, task_params.reverse)

    elif language in ('javascript', 'typescript'):
        return chunk_by_braces(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, open='{', close='}', verbose=task_params.verbose)

    elif language in ('html', 'php'):
        return chunk_html_by_tag_blocks(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, verbose=task_params.verbose)

    else:
        return strict_token_chunking(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, task_params.overlap, task_params.verbose)

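# Usage sketch (illustrative only, not part of the packaged file): one entry
# point that routes to the language-appropriate strategy defined below.
#
#   chunks = chunk_by_language_context(source_text, max_tokens=1000, verbose=True)
#   # python -> def/class blocks, js/ts -> brace-balanced, html/php -> tag
#   # blocks, anything else -> paragraph-based strict token chunking
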
def chunk_html_by_tag_blocks(
    html: str,
    max_tokens: int = 1000,
    model_name: str = "gpt-4",
    encoding_name: str = None,
    tags: List[str] = None,
    verbose: bool = False
) -> List[str]:
    """
    Chunks HTML using BeautifulSoup, grouping by selected tag blocks (e.g., div, section).
    Each chunk is ≤ max_tokens.

    Args:
        html (str): HTML input.
        max_tokens (int): Max tokens per chunk.
        model_name (str): Token model name.
        encoding_name (str): Optional encoding override.
        tags (List[str]): Tags to treat as top-level chunks.
        verbose (bool): Print debug info.

    Returns:
        List[str]: List of HTML chunks.
    """
    task_params = get_class_inputs(ChunkParams,
                                   max_tokens=max_tokens,
                                   model_name=model_name,
                                   encoding_name=encoding_name,
                                   verbose=verbose,
                                   tags=tags
                                   )
    tags = tags or ["section", "article", "div", "form", "main"]
    soup = BeautifulSoup(html, "html.parser")

    encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)

    def count_tokens(text):
        return len(encoding.encode(text))

    chunks = []
    current_chunk = ""
    current_tokens = 0

    for element in soup.find_all(tags, recursive=True):
        element_html = str(element)
        element_tokens = count_tokens(element_html)

        if element_tokens > task_params.max_tokens:
            # Include the element directly if it's too large on its own
            chunks.append(element_html)
            continue

        if current_tokens + element_tokens <= task_params.max_tokens:
            current_chunk = current_chunk + "\n" + element_html if current_chunk else element_html
            current_tokens += element_tokens
        else:
            if current_chunk:  # avoid emitting an empty first chunk
                chunks.append(current_chunk)
            current_chunk = element_html
            current_tokens = element_tokens

    if current_chunk:
        chunks.append(current_chunk)

    if task_params.verbose:
        print(f"Chunked into {len(chunks)} HTML segments")

    return chunks

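# Usage sketch (illustrative only, not part of the packaged file): group
# rendered HTML into tag-level blocks that fit a token budget; a single
# oversized element passes through unsplit.
#
#   chunks = chunk_html_by_tag_blocks(page_html, max_tokens=1200,
#                                     tags=["section", "article"], verbose=True)
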
def chunk_text_by_tokens(prompt_data,
                         max_tokens,
                         model_name="gpt-4",
                         encoding_name=None,
                         reverse=False):
    task_params = get_class_inputs(ChunkParams,
                                   max_tokens=max_tokens,
                                   model_name=model_name,
                                   encoding_name=encoding_name,
                                   reverse=reverse
                                   )
    sentences = prompt_data.split("\n")
    if task_params.reverse:
        sentences = reversed(sentences)

    chunks = []
    current_chunk = ""
    current_chunk_tokens = 0

    for sentence in sentences:
        sentence_tokens = num_tokens_from_string(sentence, task_params.model_name, task_params.encoding_name)

        if current_chunk_tokens + sentence_tokens <= task_params.max_tokens:
            current_chunk += "\n" + sentence if current_chunk else sentence
            current_chunk_tokens += sentence_tokens
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_tokens = sentence_tokens

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def chunk_by_braces(text,
                    max_tokens,
                    model_name,
                    encoding_name,
                    open='{',
                    close='}',
                    verbose=False):  # accepted because chunk_by_language_context passes verbose=
    """
    Chunks code using balanced brace logic (useful for JS, PHP, etc.).
    """
    task_params = get_class_inputs(ChunkParams,
                                   max_tokens=max_tokens,
                                   model_name=model_name,
                                   encoding_name=encoding_name
                                   )
    encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)

    tokens = encoding.encode(text)
    decoded = encoding.decode(tokens)  # round-trip normalizes the text to what the tokenizer sees

    stack = []
    chunks = []
    buffer = ''
    for char in decoded:
        buffer += char
        if char == open:
            stack.append(char)
        elif char == close and stack:
            stack.pop()

        if not stack and len(encoding.encode(buffer)) >= task_params.max_tokens:
            chunks.append(buffer)
            buffer = ''

    if buffer.strip():
        chunks.append(buffer)

    if verbose:
        print(f"Chunked into {len(chunks)} brace-balanced segments")

    return chunks

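# Note on the flush condition above (editorial, not part of the packaged file):
# a chunk is emitted only once the braces are balanced AND the buffer has
# reached max_tokens, so emitted chunks are at least max_tokens long (except
# the final remainder) rather than capped at it.
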
def extract_python_blocks(source_code: str, reverse: bool = False) -> List[str]:
    """
    Extracts top-level function and class definitions (including decorators) from Python source.

    Args:
        source_code (str): Python code to analyze.
        reverse (bool): Whether to extract blocks in reverse order.

    Returns:
        List[str]: List of code blocks (functions or classes).
    """
    reverse = reverse or False
    func_pattern = re.compile(r'^\s*def\s+\w+\s*\(.*\)\s*:', re.MULTILINE)
    class_pattern = re.compile(r'^\s*class\s+\w+\s*(\(.*\))?\s*:', re.MULTILINE)
    decorator_pattern = re.compile(r'^\s*@\w+', re.MULTILINE)

    lines = source_code.splitlines()
    if reverse:
        lines = list(reversed(lines))

    blocks = []
    current_block = []

    for line in lines:
        if func_pattern.match(line) or class_pattern.match(line):
            # Move decorator lines that immediately precede this def/class out of
            # the previous block so they stay attached to the definition they decorate
            pending = []
            while current_block and decorator_pattern.match(current_block[-1]):
                pending.insert(0, current_block.pop())
            if current_block:
                if reverse:
                    blocks.append("\n".join(reversed(current_block)))
                else:
                    blocks.append("\n".join(current_block))
            current_block = pending
        current_block.append(line)

    if current_block:
        if reverse:
            blocks.append("\n".join(reversed(current_block)))
        else:
            blocks.append("\n".join(current_block))

    return list(reversed(blocks)) if reverse else blocks

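# Behavior sketch (illustrative only, not part of the packaged file):
#
#   src = "import os\n\n@cached\ndef f():\n    return 1\n\nclass C:\n    pass"
#   extract_python_blocks(src)
#   # -> roughly ['import os\n', '@cached\ndef f():\n    return 1\n', 'class C:\n    pass']
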
def chunk_source_code(source_code: str,
                      max_tokens: int,
                      model_name="gpt-4",
                      encoding_name=None,
                      reverse=False):
    task_params = get_class_inputs(ChunkParams,
                                   max_tokens=max_tokens,
                                   model_name=model_name,
                                   encoding_name=encoding_name,
                                   reverse=reverse
                                   )
    encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)

    def token_count(text): return len(encoding.encode(text))

    chunks = []
    current_chunk = []
    current_tokens = 0

    blocks = extract_python_blocks(source_code, task_params.reverse)

    for block in blocks:
        block_tokens = token_count(block)
        if block_tokens > task_params.max_tokens:
            chunks.append(block)  # too big to pack; include as-is
        elif current_tokens + block_tokens <= task_params.max_tokens:
            current_chunk.append(block)
            current_tokens += block_tokens
        else:
            chunks.append("\n\n".join(current_chunk))
            current_chunk = [block]
            current_tokens = block_tokens

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    return chunks

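# Usage sketch (illustrative only, not part of the packaged file): pack whole
# def/class blocks into chunks.
#
#   chunks = chunk_source_code(python_source, max_tokens=800)
#   # each chunk holds complete top-level blocks joined by blank lines; a block
#   # that alone exceeds the budget is emitted on its own, unsplit
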
def strict_token_chunking(data: str,
                          max_tokens: int,
                          model_name: str = "gpt-4",
                          encoding_name: str = None,
                          overlap: int = 0,
                          verbose: bool = False) -> List[str]:
    """
    Improved chunking method for descriptive summarization. This version uses paragraph-based
    boundaries and ensures token limits are respected. It preserves semantic coherence better
    than line-based splits.

    Args:
        data (str): The full input text.
        max_tokens (int): Maximum number of tokens per chunk.
        model_name (str): Model name for tokenization.
        encoding_name (str): Optional encoding override.

    Returns:
        List[str]: List of token-bound text chunks.
    """
    task_params = get_class_inputs(ChunkParams,
                                   max_tokens=max_tokens,
                                   model_name=model_name,
                                   encoding_name=encoding_name,
                                   overlap=overlap,
                                   verbose=verbose
                                   )
    # Note: overlap is accepted for API symmetry but is not applied in this version.
    encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)

    def count_tokens(text):
        return len(encoding.encode(text))

    paragraphs = re.split(r"\n\s*\n", data.strip())  # split on paragraph gaps
    chunks = []
    current_chunk = []

    for paragraph in paragraphs:
        trial_chunk = "\n\n".join(current_chunk + [paragraph])
        if count_tokens(trial_chunk) <= task_params.max_tokens:
            current_chunk.append(paragraph)
        else:
            if current_chunk:
                chunks.append("\n\n".join(current_chunk))
            current_chunk = [paragraph]

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    return chunks

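# Usage sketch (illustrative only, not part of the packaged file): the
# paragraph-based fallback used for prose.
#
#   chunks = strict_token_chunking(article_text, max_tokens=500)
#   # paragraphs are packed greedily; a paragraph that alone exceeds the
#   # budget still becomes its own (oversized) chunk
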
@@ -0,0 +1 @@
from ...imports import os,shlex