abstract-utilities 0.2.2.496__py3-none-any.whl → 0.2.2.507__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. abstract_utilities/__init__.py +5 -9
  2. abstract_utilities/class_utils/__init__.py +7 -0
  3. abstract_utilities/class_utils/abstract_classes.py +74 -0
  4. abstract_utilities/class_utils/caller_utils.py +35 -0
  5. abstract_utilities/class_utils/class_utils.py +109 -0
  6. abstract_utilities/class_utils/function_utils.py +153 -0
  7. abstract_utilities/class_utils/global_utils.py +56 -0
  8. abstract_utilities/class_utils/imports/__init__.py +2 -0
  9. abstract_utilities/class_utils/imports/imports.py +2 -0
  10. abstract_utilities/class_utils/imports/utils.py +40 -0
  11. abstract_utilities/class_utils/module_utils.py +63 -0
  12. abstract_utilities/env_utils/imports/imports.py +3 -2
  13. abstract_utilities/error_utils/__init__.py +2 -0
  14. abstract_utilities/error_utils/error_utils.py +25 -0
  15. abstract_utilities/error_utils/imports/__init__.py +2 -0
  16. abstract_utilities/error_utils/imports/imports.py +1 -0
  17. abstract_utilities/error_utils/imports/module_imports.py +1 -0
  18. abstract_utilities/file_utils/imports/imports.py +3 -18
  19. abstract_utilities/file_utils/imports/module_imports.py +3 -6
  20. abstract_utilities/file_utils/src/filter_params.py +6 -6
  21. abstract_utilities/file_utils/src/type_checks.py +0 -1
  22. abstract_utilities/hash_utils/__init__.py +2 -0
  23. abstract_utilities/hash_utils/hash_utils.py +5 -0
  24. abstract_utilities/hash_utils/imports/__init__.py +2 -0
  25. abstract_utilities/hash_utils/imports/imports.py +1 -0
  26. abstract_utilities/hash_utils/imports/module_imports.py +0 -0
  27. abstract_utilities/history_utils/__init__.py +2 -0
  28. abstract_utilities/history_utils/history_utils.py +37 -0
  29. abstract_utilities/history_utils/imports/__init__.py +2 -0
  30. abstract_utilities/history_utils/imports/imports.py +1 -0
  31. abstract_utilities/history_utils/imports/module_imports.py +0 -0
  32. abstract_utilities/import_utils/imports/imports.py +1 -1
  33. abstract_utilities/import_utils/imports/module_imports.py +1 -1
  34. abstract_utilities/import_utils/src/__init__.py +1 -1
  35. abstract_utilities/import_utils/src/clean_imports.py +31 -5
  36. abstract_utilities/import_utils/src/dot_utils.py +9 -0
  37. abstract_utilities/import_utils/src/package_utilss/__init__.py +139 -0
  38. abstract_utilities/import_utils/src/package_utilss/context_utils.py +27 -0
  39. abstract_utilities/import_utils/src/package_utilss/import_collectors.py +53 -0
  40. abstract_utilities/import_utils/src/package_utilss/path_utils.py +28 -0
  41. abstract_utilities/import_utils/src/package_utilss/safe_import.py +27 -0
  42. abstract_utilities/import_utils/src/pkg_utils.py +140 -0
  43. abstract_utilities/imports.py +18 -0
  44. abstract_utilities/json_utils/__init__.py +2 -0
  45. abstract_utilities/json_utils/imports/__init__.py +2 -0
  46. abstract_utilities/json_utils/imports/imports.py +2 -0
  47. abstract_utilities/json_utils/imports/module_imports.py +5 -0
  48. abstract_utilities/json_utils/json_utils.py +743 -0
  49. abstract_utilities/list_utils/__init__.py +2 -0
  50. abstract_utilities/list_utils/imports/__init__.py +2 -0
  51. abstract_utilities/list_utils/imports/imports.py +1 -0
  52. abstract_utilities/list_utils/imports/module_imports.py +0 -0
  53. abstract_utilities/list_utils/list_utils.py +199 -0
  54. abstract_utilities/log_utils/__init__.py +5 -0
  55. abstract_utilities/log_utils/abstractLogManager.py +64 -0
  56. abstract_utilities/log_utils/call_response.py +68 -0
  57. abstract_utilities/log_utils/imports/__init__.py +2 -0
  58. abstract_utilities/log_utils/imports/imports.py +7 -0
  59. abstract_utilities/log_utils/imports/module_imports.py +2 -0
  60. abstract_utilities/log_utils/log_file.py +58 -0
  61. abstract_utilities/log_utils/logger_callable.py +49 -0
  62. abstract_utilities/math_utils/__init__.py +2 -0
  63. abstract_utilities/math_utils/imports/__init__.py +2 -0
  64. abstract_utilities/math_utils/imports/imports.py +2 -0
  65. abstract_utilities/math_utils/imports/module_imports.py +1 -0
  66. abstract_utilities/math_utils/math_utils.py +208 -0
  67. abstract_utilities/parse_utils/__init__.py +2 -0
  68. abstract_utilities/parse_utils/imports/__init__.py +3 -0
  69. abstract_utilities/parse_utils/imports/constants.py +10 -0
  70. abstract_utilities/parse_utils/imports/imports.py +2 -0
  71. abstract_utilities/parse_utils/imports/module_imports.py +4 -0
  72. abstract_utilities/parse_utils/parse_utils.py +516 -0
  73. abstract_utilities/path_utils/__init__.py +2 -0
  74. abstract_utilities/path_utils/imports/__init__.py +2 -0
  75. abstract_utilities/path_utils/imports/imports.py +1 -0
  76. abstract_utilities/path_utils/imports/module_imports.py +6 -0
  77. abstract_utilities/path_utils/path_utils.py +715 -0
  78. abstract_utilities/path_utils.py +94 -2
  79. abstract_utilities/read_write_utils/__init__.py +1 -0
  80. abstract_utilities/read_write_utils/imports/__init__.py +2 -0
  81. abstract_utilities/read_write_utils/imports/imports.py +2 -0
  82. abstract_utilities/read_write_utils/imports/module_imports.py +5 -0
  83. abstract_utilities/read_write_utils/read_write_utils.py +338 -0
  84. abstract_utilities/read_write_utils.py +2 -4
  85. abstract_utilities/safe_utils/__init__.py +2 -0
  86. abstract_utilities/safe_utils/imports/__init__.py +3 -0
  87. abstract_utilities/safe_utils/imports/imports.py +1 -0
  88. abstract_utilities/safe_utils/imports/module_imports.py +2 -0
  89. abstract_utilities/safe_utils/safe_utils.py +130 -0
  90. abstract_utilities/ssh_utils/__init__.py +2 -1
  91. abstract_utilities/ssh_utils/classes.py +0 -1
  92. abstract_utilities/ssh_utils/cmd_utils.py +207 -0
  93. abstract_utilities/ssh_utils/imports/__init__.py +3 -0
  94. abstract_utilities/ssh_utils/imports/imports.py +5 -0
  95. abstract_utilities/ssh_utils/imports/module_imports.py +5 -0
  96. abstract_utilities/ssh_utils/imports/utils.py +189 -0
  97. abstract_utilities/ssh_utils/pexpect_utils.py +11 -18
  98. abstract_utilities/string_utils/__init__.py +4 -0
  99. abstract_utilities/string_utils/clean_utils.py +28 -0
  100. abstract_utilities/string_utils/eat_utils.py +103 -0
  101. abstract_utilities/string_utils/imports/__init__.py +3 -0
  102. abstract_utilities/string_utils/imports/imports.py +2 -0
  103. abstract_utilities/string_utils/imports/module_imports.py +2 -0
  104. abstract_utilities/string_utils/imports/utils.py +81 -0
  105. abstract_utilities/string_utils/replace_utils.py +27 -0
  106. abstract_utilities/thread_utils/__init__.py +2 -0
  107. abstract_utilities/thread_utils/imports/__init__.py +2 -0
  108. abstract_utilities/thread_utils/imports/imports.py +2 -0
  109. abstract_utilities/thread_utils/imports/module_imports.py +2 -0
  110. abstract_utilities/thread_utils/thread_utils.py +140 -0
  111. abstract_utilities/time_utils/__init__.py +2 -0
  112. abstract_utilities/time_utils/imports/__init__.py +2 -0
  113. abstract_utilities/time_utils/imports/imports.py +3 -0
  114. abstract_utilities/time_utils/imports/module_imports.py +1 -0
  115. abstract_utilities/time_utils/time_utils.py +392 -0
  116. abstract_utilities/type_utils/__init__.py +3 -0
  117. abstract_utilities/type_utils/alpha_utils.py +59 -0
  118. abstract_utilities/type_utils/imports/__init__.py +2 -0
  119. abstract_utilities/type_utils/imports/imports.py +4 -0
  120. abstract_utilities/type_utils/imports/module_imports.py +1 -0
  121. abstract_utilities/type_utils/num_utils.py +19 -0
  122. abstract_utilities/type_utils/type_utils.py +981 -0
  123. {abstract_utilities-0.2.2.496.dist-info → abstract_utilities-0.2.2.507.dist-info}/METADATA +1 -1
  124. abstract_utilities-0.2.2.507.dist-info/RECORD +229 -0
  125. abstract_utilities-0.2.2.496.dist-info/RECORD +0 -123
  126. {abstract_utilities-0.2.2.496.dist-info → abstract_utilities-0.2.2.507.dist-info}/WHEEL +0 -0
  127. {abstract_utilities-0.2.2.496.dist-info → abstract_utilities-0.2.2.507.dist-info}/top_level.txt +0 -0
abstract_utilities/parse_utils/parse_utils.py
@@ -0,0 +1,516 @@
+ from .imports import *
+ def detect_language_from_text(text: str):
+     patterns = {
+         'javascript': [
+             r'\bfunction\s+\w+\s*\(.*\)\s*{',
+             r'\bvar\s+\w+\s*=',
+             r'\bconst\s+\w+\s*=',
+             r'\blet\s+\w+\s*=',
+             r'\bconsole\.log\s*\(',
+             r'\bexport\s+(default|function|const|class)',
+             r'\bimport\s+.*\s+from\s+[\'"]'
+         ],
+         'typescript': [
+             r'\binterface\s+\w+\s*{',
+             r'\btype\s+\w+\s*=',
+             r'\blet\s+\w+:\s+\w+',
+             r'\bfunction\s+\w+\s*\(.*:\s*\w+\)',
+             r'\bimport\s+.*\s+from\s+[\'"]',
+             r'\bexport\s+(default|function|const|class)'
+         ],
+         'python': [
+             r'\bdef\s+\w+\(',
+             r'\bclass\s+\w+\s*:',
+             r'\bimport\s+\w+',
+             r'\bfrom\s+\w+\s+import\s+\w+',
+             r'\bif\s+__name__\s*==\s*[\'"]__main__[\'"]',
+             r'@\w+',
+             r'\blambda\s+'
+         ],
+         'html': [
+             r'<!DOCTYPE\s+html>',
+             r'<html[^>]*>',
+             r'<head>',
+             r'<body>',
+             r'<div[^>]*>',
+             r'<script[^>]*>',
+             r'</\w+>'
+         ],
+         'php': [
+             r'<\?php',
+             r'\$\w+\s*=',
+             r'echo\s+["\']',
+             r'->\w+\(',
+             r'function\s+\w+\s*\(',
+             r'\bclass\s+\w+\s*{'
+         ],
+         'bash': [
+             r'#!/bin/bash',
+             r'\becho\s+["\']',
+             r'\bif\s+\[\[?',
+             r'\bthen\b',
+             r'\bfi\b',
+             r'\bfor\s+\w+\s+in\b',
+             r'\bdo\b',
+             r'\bdone\b'
+         ]
+     }
+     text = str(text)
+     scores = {lang: sum(bool(re.search(p, text)) for p in pats) for lang, pats in patterns.items()}
+     max_score = max(scores.values(), default=0)
+
+     if max_score == 0:
+         return 'neither'
+
+     likely = [lang for lang, score in scores.items() if score == max_score]
+     return likely[0] if len(likely) == 1 else 'uncertain'
+
+ def search_code(code_languages, parts):
+     return [data for datas in parts for data in make_list(datas)
+             if detect_language_from_text(data) in code_languages]
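A quick sanity check of the detector above, as a usage sketch (hypothetical inputs; assumes the function is re-exported from abstract_utilities.parse_utils, per the file list):

    from abstract_utilities.parse_utils import detect_language_from_text

    print(detect_language_from_text("def main():\n    pass"))         # 'python'
    print(detect_language_from_text("const x = 1;\nconsole.log(x)"))  # 'javascript'
    print(detect_language_from_text("plain prose"))                   # 'neither'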
+ def get_token_encoder(model_name: str = "gpt-4", encoding_name: str = None):
+     """
+     Retrieves the encoder for a given model or encoding name.
+
+     Args:
+         model_name (str): The name of the model. Defaults to "gpt-4".
+         encoding_name (str, optional): The encoding name to use. If not provided, it defaults based on the model.
+
+     Returns:
+         Encoder: A tiktoken encoder object.
+     """
+     import tiktoken  # imported lazily so the module does not hard-require tiktoken
+     if encoding_name:
+         return tiktoken.get_encoding(encoding_name)
+     else:
+         return tiktoken.encoding_for_model(model_name)
+
+ def num_tokens_from_string(string: str, model_name: str = "gpt-4", encoding_name: str = None) -> int:
+     """
+     Returns the number of tokens in a text string.
+
+     Args:
+         string (str): The input text.
+         model_name (str, optional): The model name to determine encoding if encoding_name is not specified. Defaults to "gpt-4".
+         encoding_name (str, optional): The encoding name to use. If not specified, uses model-based encoding.
+
+     Returns:
+         int: The count of tokens.
+     """
+     encoding = get_token_encoder(model_name, encoding_name)
+     num_tokens = len(encoding.encode(str(string)))
+     return num_tokens
+
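For example, gating a prompt on its token budget (assumes the tiktoken package is installed):

    from abstract_utilities.parse_utils import num_tokens_from_string

    prompt = "Hello world"
    if num_tokens_from_string(prompt, model_name="gpt-4") <= 8000:
        print("fits in the context window")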
+ def infer_tab_size(file_path):
+     if not os.path.isfile(file_path):
+         write_to_file(file_path=file_path, contents='\t')
+     with open(file_path, 'r') as file:
+         for line in file:
+             if '\t' in line:
+                 return len(line) - len(line.lstrip())  # The length of indentation
+     return 4  # Default if no tab found
+
+ def get_blocks(data, delim='\n'):
+     if isinstance(data, list):
+         return data, None
+     if isinstance(data, tuple):
+         data, delim = data[0], data[-1]
+     return data.split(delim), delim
+
+ def get_indent_levels(text):
+     tab_size, indent_list = infer_tab_size('config.txt'), [0]
+     for line in text.split('\n'):
+         indent = 0
+         for char in line:
+             if char in [' ', '\t']:
+                 indent += tab_size if char == '\t' else 1
+             else:
+                 break
+         if indent not in indent_list:
+             indent_list.append(indent)
+     return indent_list
+
+ def get_code_blocks(data, indent_level=0):
+     blocks = [[]]
+     lines, delim = get_blocks(data, '\n')
+     delim = delim or '\n'  # get_blocks returns None for list input; fall back to newline
+     for line in lines:
+         beginning = ''
+         for char in line:
+             if char in ['', ' ', '\n', '\t']:
+                 beginning += char
+             else:
+                 break
+         if len(beginning) == indent_level:
+             blocks[-1] = delim.join(blocks[-1])
+             blocks.append([line])
+         else:
+             blocks[-1].append(line)
+     blocks[-1] = delim.join(blocks[-1])
+     return blocks, delim
+
+ def chunk_any_to_tokens(data, max_tokens, model_name="gpt-4", encoding_name=None, delimiter='\n\n', reverse=False):
+     if isinstance(data, list):
+         blocks = data
+     else:
+         blocks, delimiter = get_blocks(data, delimiter)
+     delimiter = delimiter or '\n\n'  # guard against a None delimiter from get_blocks or callers
+
+     if reverse:
+         blocks = reversed(blocks)
+
+     chunks = []
+     current_chunk = []
+
+     for block in blocks:
+         if num_tokens_from_string(delimiter.join(current_chunk + [block]), model_name, encoding_name) <= max_tokens:
+             current_chunk.append(block)
+         else:
+             if current_chunk:
+                 chunks.append(delimiter.join(current_chunk))
+             current_chunk = [block]
+
+     if current_chunk:
+         chunks.append(delimiter.join(current_chunk))  # join with the delimiter, matching the loop above
+
+     return chunks
+
+ def chunk_data_by_type(data, max_tokens, chunk_type=None, model_name="gpt-4", encoding_name=None, reverse=False):
+     delimiter = None
+     if chunk_type == "URL":
+         delimiter = None
+         blocks = re.split(r'<h[1-6].*?>.*?</h[1-6]>', data)
+     elif chunk_type == "SOUP":
+         delimiter = None
+         blocks = data
+     elif chunk_type == "DOCUMENT":
+         delimiter = "."
+         blocks = data.split(delimiter)
+     elif chunk_type == "CODE":
+         return chunk_source_code(data, max_tokens, model_name, encoding_name, reverse=reverse)
+     elif chunk_type == "TEXT":
+         return chunk_text_by_tokens(data, max_tokens, model_name, encoding_name, reverse=reverse)
+     else:
+         delimiter = "\n\n"
+         blocks = data.split(delimiter)
+
+     return chunk_any_to_tokens(blocks, max_tokens, model_name, encoding_name, delimiter, reverse=reverse)
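A minimal dispatch sketch for the router above (hypothetical input; chunk_type=None falls through to paragraph splitting on blank lines):

    from abstract_utilities.parse_utils import chunk_data_by_type

    long_text = "\n\n".join(f"Paragraph {i} lorem ipsum." for i in range(100))
    chunks = chunk_data_by_type(long_text, max_tokens=200)
    print(len(chunks))  # several chunks, each at most 200 tokens here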
+ def chunk_by_language_context(
+     text: str,
+     max_tokens: int = 1000,
+     model_name: str = "gpt-4",
+     encoding_name: str = None,
+     overlap: int = 0,
+     verbose: bool = False,
+     reverse: bool = False
+ ) -> List[str]:
+     """
+     Detects language and applies the chunking strategy best suited to that context.
+     """
+     task_params = get_class_inputs(ChunkParams,
+                                    max_tokens=max_tokens,
+                                    model_name=model_name,
+                                    encoding_name=encoding_name,
+                                    overlap=overlap,
+                                    verbose=verbose,
+                                    reverse=reverse
+                                    )
+     language = detect_language_from_text(text)
+
+     if verbose:
+         print(f"Detected language: {language}")
+
+     if language == 'python':
+         return chunk_source_code(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, task_params.reverse)
+     elif language in ('javascript', 'typescript'):
+         return chunk_by_braces(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, open='{', close='}', verbose=task_params.verbose)
+     elif language in ('html', 'php'):
+         return chunk_html_by_tag_blocks(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, verbose=task_params.verbose)
+     else:
+         return strict_token_chunking(text, task_params.max_tokens, task_params.model_name, task_params.encoding_name, task_params.overlap, task_params.verbose)
+ def chunk_html_by_tag_blocks(
+     html: str,
+     max_tokens: int = 1000,
+     model_name: str = "gpt-4",
+     encoding_name: str = None,
+     tags: List[str] = None,
+     verbose: bool = False
+ ) -> List[str]:
+     """
+     Chunks HTML using BeautifulSoup, grouping by selected tag blocks (e.g., div, section).
+     Each chunk is ≤ max_tokens.
+
+     Args:
+         html (str): HTML input.
+         max_tokens (int): Max tokens per chunk.
+         model_name (str): Token model name.
+         encoding_name (str): Optional encoding override.
+         tags (List[str]): Tags to treat as top-level chunks.
+         verbose (bool): Print debug info.
+
+     Returns:
+         List[str]: List of HTML chunks.
+     """
+     task_params = get_class_inputs(ChunkParams,
+                                    max_tokens=max_tokens,
+                                    model_name=model_name,
+                                    encoding_name=encoding_name,
+                                    verbose=verbose,
+                                    tags=tags
+                                    )
+     tags = tags or ["section", "article", "div", "form", "main"]
+     soup = BeautifulSoup(html, "html.parser")
+
+     # Build the encoder via the module's own helper
+     encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)
+
+     def count_tokens(text):
+         return len(encoding.encode(text))
+
+     chunks = []
+     current_chunk = ""
+     current_tokens = 0
+
+     for element in soup.find_all(tags, recursive=True):
+         element_html = str(element)
+         element_tokens = count_tokens(element_html)
+
+         if element_tokens > task_params.max_tokens:
+             # Include the element directly if it's too large
+             chunks.append(element_html)
+             continue
+
+         if current_tokens + element_tokens <= task_params.max_tokens:
+             current_chunk += "\n" + element_html
+             current_tokens += element_tokens
+         else:
+             chunks.append(current_chunk)
+             current_chunk = element_html
+             current_tokens = element_tokens
+
+     if current_chunk:
+         chunks.append(current_chunk)
+
+     if task_params.verbose:
+         print(f"Chunked into {len(chunks)} HTML segments")
+
+     return chunks
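A usage sketch for the HTML path (assumes beautifulsoup4 and tiktoken are installed; note that nested matching tags are each visited by find_all):

    from abstract_utilities.parse_utils import chunk_html_by_tag_blocks

    html = "<html><body>" + "".join(f"<div>Block {i}</div>" for i in range(50)) + "</body></html>"
    for chunk in chunk_html_by_tag_blocks(html, max_tokens=100, verbose=True):
        print(len(chunk), "chars")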
+ def chunk_text_by_tokens(prompt_data,
+                          max_tokens,
+                          model_name="gpt-4",
+                          encoding_name=None,
+                          reverse=False):
+     task_params = get_class_inputs(ChunkParams,
+                                    max_tokens=max_tokens,
+                                    model_name=model_name,
+                                    encoding_name=encoding_name,
+                                    reverse=reverse
+                                    )
+     sentences = prompt_data.split("\n")
+     if task_params.reverse:
+         sentences = reversed(sentences)
+
+     chunks = []
+     current_chunk = ""
+     current_chunk_tokens = 0
+
+     for sentence in sentences:
+         sentence_tokens = num_tokens_from_string(sentence, task_params.model_name, task_params.encoding_name)
+
+         if current_chunk_tokens + sentence_tokens <= task_params.max_tokens:
+             current_chunk += "\n" + sentence if current_chunk else sentence
+             current_chunk_tokens += sentence_tokens
+         else:
+             chunks.append(current_chunk)
+             current_chunk = sentence
+             current_chunk_tokens = sentence_tokens
+
+     if current_chunk:
+         chunks.append(current_chunk)
+
+     return chunks
+ def chunk_by_braces(text,
+                     max_tokens,
+                     model_name,
+                     encoding_name,
+                     open='{',
+                     close='}',
+                     verbose=False):
+     """
+     Chunks code using balanced brace logic (useful for JS, PHP, etc.).
+     """
+     task_params = get_class_inputs(ChunkParams,
+                                    max_tokens=max_tokens,
+                                    model_name=model_name,
+                                    encoding_name=encoding_name,
+                                    verbose=verbose
+                                    )
+     encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)
+
+     tokens = encoding.encode(text)
+     decoded = encoding.decode(tokens)
+
+     stack = []
+     chunks = []
+     buffer = ''
+     for char in decoded:
+         buffer += char
+         if char == open:
+             stack.append(char)
+         elif char == close and stack:
+             stack.pop()
+
+         if not stack and len(encoding.encode(buffer)) >= task_params.max_tokens:
+             chunks.append(buffer)
+             buffer = ''
+
+     if buffer.strip():
+         chunks.append(buffer)
+
+     if task_params.verbose:
+         print(f"Chunked into {len(chunks)} brace-delimited segments")
+
+     return chunks
+
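And a sketch of the brace-balanced path used for JavaScript/TypeScript (hypothetical source string; model_name and encoding_name have no defaults here):

    from abstract_utilities.parse_utils import chunk_by_braces

    js = "function a() { return 1; }\n" * 200
    chunks = chunk_by_braces(js, 150, "gpt-4", None)
    print(len(chunks))  # pieces split only at brace-balanced boundaries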
+ def extract_python_blocks(source_code: str, reverse: bool = False) -> List[str]:
+     """
+     Extracts top-level function and class definitions (including decorators) from Python source.
+
+     Args:
+         source_code (str): Python code to analyze.
+         reverse (bool): Whether to extract blocks in reverse order.
+
+     Returns:
+         List[str]: List of code blocks (functions or classes).
+     """
+     reverse = reverse or False
+     func_pattern = re.compile(r'^\s*def\s+\w+\s*\(.*\)\s*:', re.MULTILINE)
+     class_pattern = re.compile(r'^\s*class\s+\w+\s*(\(.*\))?\s*:', re.MULTILINE)
+     decorator_pattern = re.compile(r'^\s*@\w+', re.MULTILINE)
+
+     lines = source_code.splitlines()
+     if reverse:
+         lines = list(reversed(lines))
+
+     blocks = []
+     current_block = []
+     pending_decorators = []
+
+     for line in lines:
+         # Hold decorators so they attach to the def/class that follows them
+         # (in reverse mode the reversed order already places them after the def).
+         if not reverse and decorator_pattern.match(line):
+             pending_decorators.append(line)
+             continue
+         if func_pattern.match(line) or class_pattern.match(line):
+             if current_block:
+                 if reverse:
+                     blocks.append("\n".join(reversed(current_block)))
+                 else:
+                     blocks.append("\n".join(current_block))
+             current_block = pending_decorators + [line]
+             pending_decorators = []
+         else:
+             current_block.append(line)
+
+     current_block.extend(pending_decorators)
+     if current_block:
+         if reverse:
+             blocks.append("\n".join(reversed(current_block)))
+         else:
+             blocks.append("\n".join(current_block))
+
+     return list(reversed(blocks)) if reverse else blocks
+ def chunk_source_code(source_code: str,
+                       max_tokens: int,
+                       model_name="gpt-4",
+                       encoding_name=None,
+                       reverse=False):
+     task_params = get_class_inputs(ChunkParams,
+                                    max_tokens=max_tokens,
+                                    model_name=model_name,
+                                    encoding_name=encoding_name,
+                                    reverse=reverse
+                                    )
+     encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)
+
+     def token_count(text): return len(encoding.encode(text))
+
+     chunks = []
+     current_chunk = []
+     current_tokens = 0
+
+     blocks = extract_python_blocks(source_code, task_params.reverse)
+
+     for block in blocks:
+         block_tokens = token_count(block)
+         if block_tokens > task_params.max_tokens:
+             chunks.append(block)  # too big, include as is
+         elif current_tokens + block_tokens <= task_params.max_tokens:
+             current_chunk.append(block)
+             current_tokens += block_tokens
+         else:
+             chunks.append("\n\n".join(current_chunk))
+             current_chunk = [block]
+             current_tokens = block_tokens
+
+     if current_chunk:
+         chunks.append("\n\n".join(current_chunk))
+
+     return chunks
+
+ def strict_token_chunking(data: str,
+                           max_tokens: int,
+                           model_name: str = "gpt-4",
+                           encoding_name: str = None,
+                           overlap: int = 0,
+                           verbose: bool = False) -> List[str]:
+     """
+     Improved chunking method for descriptive summarization. This version uses paragraph-based boundaries
+     and ensures token limits are respected. It preserves semantic coherence better than line-based splits.
+
+     Args:
+         data (str): The full input text.
+         max_tokens (int): Maximum number of tokens per chunk.
+         model_name (str): Model name for tokenization.
+         encoding_name (str): Optional encoding override.
+         overlap (int): Accepted for API compatibility; not currently applied.
+         verbose (bool): Print debug info.
+
+     Returns:
+         List[str]: List of token-bound text chunks.
+     """
+     # Normalize the call parameters, then build the token encoder
+     task_params = get_class_inputs(ChunkParams,
+                                    max_tokens=max_tokens,
+                                    model_name=model_name,
+                                    encoding_name=encoding_name,
+                                    overlap=overlap,
+                                    verbose=verbose
+                                    )
+     encoding = get_token_encoder(task_params.model_name, task_params.encoding_name)
+
+     def count_tokens(text):
+         return len(encoding.encode(text))
+
+     paragraphs = re.split(r"\n\s*\n", data.strip())  # split on paragraph gaps
+     chunks = []
+     current_chunk = []
+
+     for paragraph in paragraphs:
+         trial_chunk = "\n\n".join(current_chunk + [paragraph])
+         if count_tokens(trial_chunk) <= task_params.max_tokens:
+             current_chunk.append(paragraph)
+         else:
+             if current_chunk:
+                 chunks.append("\n\n".join(current_chunk))
+             current_chunk = [paragraph]
+
+     if current_chunk:
+         chunks.append("\n\n".join(current_chunk))
+
+     if task_params.verbose:
+         print(f"Chunked into {len(chunks)} paragraph-based segments")
+
+     return chunks
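Taken together, a plausible end-to-end call into this module (the re-export path is an assumption based on the file list above):

    from abstract_utilities.parse_utils import chunk_by_language_context

    with open("some_module.py") as f:  # hypothetical input file
        source = f.read()
    for i, chunk in enumerate(chunk_by_language_context(source, max_tokens=800, verbose=True)):
        print(f"--- chunk {i}: {len(chunk)} chars ---")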
abstract_utilities/path_utils/__init__.py
@@ -0,0 +1,2 @@
+ from .imports import *
+ from .path_utils import *
abstract_utilities/path_utils/imports/__init__.py
@@ -0,0 +1,2 @@
+ from .imports import *
+ from .module_imports import *
abstract_utilities/path_utils/imports/imports.py
@@ -0,0 +1 @@
+ from ...imports import os, shlex
abstract_utilities/path_utils/imports/module_imports.py
@@ -0,0 +1,6 @@
+ from ...string_utils import eatAll
+ from ...list_utils import make_list
+ from ...type_utils import get_media_exts, is_media_type, MIME_TYPES
+ from ...safe_utils import safe_join
+ from ...class_utils import get_caller_path, get_caller_dir
+