symbolicai 0.20.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +96 -64
- symai/backend/base.py +93 -80
- symai/backend/engines/drawing/engine_bfl.py +12 -11
- symai/backend/engines/drawing/engine_gpt_image.py +108 -87
- symai/backend/engines/embedding/engine_llama_cpp.py +25 -28
- symai/backend/engines/embedding/engine_openai.py +3 -5
- symai/backend/engines/execute/engine_python.py +6 -5
- symai/backend/engines/files/engine_io.py +74 -67
- symai/backend/engines/imagecaptioning/engine_blip2.py +3 -3
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +54 -38
- symai/backend/engines/index/engine_pinecone.py +23 -24
- symai/backend/engines/index/engine_vectordb.py +16 -14
- symai/backend/engines/lean/engine_lean4.py +38 -34
- symai/backend/engines/neurosymbolic/__init__.py +41 -13
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +262 -182
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +263 -191
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +53 -49
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +212 -211
- symai/backend/engines/neurosymbolic/engine_groq.py +87 -63
- symai/backend/engines/neurosymbolic/engine_huggingface.py +21 -24
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +117 -48
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +256 -229
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +270 -150
- symai/backend/engines/ocr/engine_apilayer.py +6 -8
- symai/backend/engines/output/engine_stdout.py +1 -4
- symai/backend/engines/search/engine_openai.py +7 -7
- symai/backend/engines/search/engine_perplexity.py +5 -5
- symai/backend/engines/search/engine_serpapi.py +12 -14
- symai/backend/engines/speech_to_text/engine_local_whisper.py +20 -27
- symai/backend/engines/symbolic/engine_wolframalpha.py +3 -3
- symai/backend/engines/text_to_speech/engine_openai.py +5 -7
- symai/backend/engines/text_vision/engine_clip.py +7 -11
- symai/backend/engines/userinput/engine_console.py +3 -3
- symai/backend/engines/webscraping/engine_requests.py +81 -48
- symai/backend/mixin/__init__.py +13 -0
- symai/backend/mixin/anthropic.py +4 -2
- symai/backend/mixin/deepseek.py +2 -0
- symai/backend/mixin/google.py +2 -0
- symai/backend/mixin/openai.py +11 -3
- symai/backend/settings.py +83 -16
- symai/chat.py +101 -78
- symai/collect/__init__.py +7 -1
- symai/collect/dynamic.py +77 -69
- symai/collect/pipeline.py +35 -27
- symai/collect/stats.py +75 -63
- symai/components.py +198 -169
- symai/constraints.py +15 -12
- symai/core.py +698 -359
- symai/core_ext.py +32 -34
- symai/endpoints/api.py +80 -73
- symai/extended/.DS_Store +0 -0
- symai/extended/__init__.py +46 -12
- symai/extended/api_builder.py +11 -8
- symai/extended/arxiv_pdf_parser.py +13 -12
- symai/extended/bibtex_parser.py +2 -3
- symai/extended/conversation.py +101 -90
- symai/extended/document.py +17 -10
- symai/extended/file_merger.py +18 -13
- symai/extended/graph.py +18 -13
- symai/extended/html_style_template.py +2 -4
- symai/extended/interfaces/blip_2.py +1 -2
- symai/extended/interfaces/clip.py +1 -2
- symai/extended/interfaces/console.py +7 -1
- symai/extended/interfaces/dall_e.py +1 -1
- symai/extended/interfaces/flux.py +1 -1
- symai/extended/interfaces/gpt_image.py +1 -1
- symai/extended/interfaces/input.py +1 -1
- symai/extended/interfaces/llava.py +0 -1
- symai/extended/interfaces/naive_vectordb.py +7 -8
- symai/extended/interfaces/naive_webscraping.py +1 -1
- symai/extended/interfaces/ocr.py +1 -1
- symai/extended/interfaces/pinecone.py +6 -5
- symai/extended/interfaces/serpapi.py +1 -1
- symai/extended/interfaces/terminal.py +2 -3
- symai/extended/interfaces/tts.py +1 -1
- symai/extended/interfaces/whisper.py +1 -1
- symai/extended/interfaces/wolframalpha.py +1 -1
- symai/extended/metrics/__init__.py +11 -1
- symai/extended/metrics/similarity.py +11 -13
- symai/extended/os_command.py +17 -16
- symai/extended/packages/__init__.py +29 -3
- symai/extended/packages/symdev.py +19 -16
- symai/extended/packages/sympkg.py +12 -9
- symai/extended/packages/symrun.py +21 -19
- symai/extended/repo_cloner.py +11 -10
- symai/extended/seo_query_optimizer.py +1 -2
- symai/extended/solver.py +20 -23
- symai/extended/summarizer.py +4 -3
- symai/extended/taypan_interpreter.py +10 -12
- symai/extended/vectordb.py +99 -82
- symai/formatter/__init__.py +9 -1
- symai/formatter/formatter.py +12 -16
- symai/formatter/regex.py +62 -63
- symai/functional.py +176 -122
- symai/imports.py +136 -127
- symai/interfaces.py +56 -27
- symai/memory.py +14 -13
- symai/misc/console.py +49 -39
- symai/misc/loader.py +5 -3
- symai/models/__init__.py +17 -1
- symai/models/base.py +269 -181
- symai/models/errors.py +0 -1
- symai/ops/__init__.py +32 -22
- symai/ops/measures.py +11 -15
- symai/ops/primitives.py +348 -228
- symai/post_processors.py +32 -28
- symai/pre_processors.py +39 -41
- symai/processor.py +6 -4
- symai/prompts.py +59 -45
- symai/server/huggingface_server.py +23 -20
- symai/server/llama_cpp_server.py +7 -5
- symai/shell.py +3 -4
- symai/shellsv.py +499 -375
- symai/strategy.py +517 -287
- symai/symbol.py +111 -116
- symai/utils.py +42 -36
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/METADATA +4 -2
- symbolicai-1.0.0.dist-info/RECORD +163 -0
- symbolicai-0.20.2.dist-info/RECORD +0 -162
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/WHEEL +0 -0
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/top_level.txt +0 -0
symai/formatter/regex.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
|
-
import
|
|
2
|
+
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
from ..utils import UserMessage
|
|
4
5
|
|
|
5
6
|
# Define variables for magic numbers
|
|
6
7
|
MAX_HEADING_LENGTH = 7
|
|
@@ -33,13 +34,13 @@ LOOKAHEAD_RANGE = 100 # Number of characters to look ahead for a sentence bound
|
|
|
33
34
|
|
|
34
35
|
# Define emoji ranges
|
|
35
36
|
def generate_emoji_pattern(file_name):
|
|
36
|
-
current_dir =
|
|
37
|
-
file_path =
|
|
37
|
+
current_dir = Path(__file__).resolve().parent
|
|
38
|
+
file_path = current_dir / file_name
|
|
38
39
|
|
|
39
40
|
emoji_codes = set()
|
|
40
41
|
|
|
41
42
|
try:
|
|
42
|
-
with open(
|
|
43
|
+
with file_path.open(encoding='utf-8') as file:
|
|
43
44
|
for line in file:
|
|
44
45
|
# Skip comments and empty lines
|
|
45
46
|
if line.strip() and not line.startswith('#'):
|
|
@@ -61,10 +62,10 @@ def generate_emoji_pattern(file_name):
|
|
|
61
62
|
# pass
|
|
62
63
|
|
|
63
64
|
except FileNotFoundError:
|
|
64
|
-
|
|
65
|
+
UserMessage(f"Error: File '{file_name}' not found in {current_dir}")
|
|
65
66
|
return None
|
|
66
67
|
except Exception as e:
|
|
67
|
-
|
|
68
|
+
UserMessage(f"Error reading file: {e}")
|
|
68
69
|
return None
|
|
69
70
|
|
|
70
71
|
# Sort the emoji codes
|
|
@@ -78,9 +79,7 @@ def generate_emoji_pattern(file_name):
|
|
|
78
79
|
else:
|
|
79
80
|
pattern_parts.append(''.join(f'\\U{ord(c):08x}' for c in code))
|
|
80
81
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
return emoji_pattern
|
|
82
|
+
return '(?:' + '|'.join(pattern_parts) + ')'
|
|
84
83
|
|
|
85
84
|
# Usage
|
|
86
85
|
file_name = 'emoji.pytxt'
|
|
@@ -88,74 +87,74 @@ EMOJI_PATTERN = generate_emoji_pattern(file_name)
|
|
|
88
87
|
|
|
89
88
|
# Define the regex pattern
|
|
90
89
|
CHUNK_REGEX = re.compile(
|
|
91
|
-
r"("
|
|
90
|
+
r"("
|
|
92
91
|
# 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)
|
|
93
|
-
fr"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}|\w[^\r\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\r?\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}|<h[1-6][^>]{{0,{MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)[^\r\n]{{1,{MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\r?\n|$))"
|
|
94
|
-
"|"
|
|
92
|
+
fr"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}|\w[^\r\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\r?\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}|<h[1-6][^>]{{0,{MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)[^\r\n]{{1,{MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\r?\n|$))"
|
|
93
|
+
"|"
|
|
95
94
|
# New pattern for citations
|
|
96
|
-
fr"(?:\[[0-9]+\][^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}})"
|
|
97
|
-
"|"
|
|
95
|
+
fr"(?:\[[0-9]+\][^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}})"
|
|
96
|
+
"|"
|
|
98
97
|
# 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints)
|
|
99
|
-
|
|
100
|
-
fr"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
101
|
-
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
102
|
-
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
103
|
-
fr"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
104
|
-
|
|
105
|
-
fr"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
106
|
-
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
107
|
-
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
108
|
-
fr"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
109
|
-
fr"){{0,{MAX_NESTED_LIST_ITEMS}}}"
|
|
110
|
-
fr"(?:\r?\n[ \t]{{4,{MAX_LIST_INDENT_SPACES}}}(?:[-*+•]|\d{{1,3}}\.|\[[ xX]\])[ \t]+"
|
|
111
|
-
fr"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
112
|
-
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
113
|
-
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
114
|
-
fr"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
115
|
-
fr"){{0,{MAX_NESTED_LIST_ITEMS}}})"
|
|
116
|
-
")"
|
|
98
|
+
r"(?:(?:^|\r?\n)[ \t]{0,3}(?:[-*+•]|\d{1,3}\.|\[[ xX]\])[ \t]+"
|
|
99
|
+
fr"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
100
|
+
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
101
|
+
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
102
|
+
fr"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
103
|
+
r"(?:(?:\r?\n[ \t]{2,5}(?:[-*+•]|\d{1,3}\.|\[[ xX]\])[ \t]+"
|
|
104
|
+
fr"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
105
|
+
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
106
|
+
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
107
|
+
fr"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
108
|
+
fr"){{0,{MAX_NESTED_LIST_ITEMS}}}"
|
|
109
|
+
fr"(?:\r?\n[ \t]{{4,{MAX_LIST_INDENT_SPACES}}}(?:[-*+•]|\d{{1,3}}\.|\[[ xX]\])[ \t]+"
|
|
110
|
+
fr"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
111
|
+
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
112
|
+
fr"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
113
|
+
fr"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
114
|
+
fr"){{0,{MAX_NESTED_LIST_ITEMS}}})"
|
|
115
|
+
")"
|
|
117
116
|
# 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints)
|
|
118
|
-
fr"(?:(?:^>(?:>|\s{{2,}}){{0,2}}(?:(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))\\r?\\n?){{1,{MAX_BLOCKQUOTE_LINES}}})"
|
|
119
|
-
"|"
|
|
117
|
+
fr"(?:(?:^>(?:>|\s{{2,}}){{0,2}}(?:(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))\\r?\\n?){{1,{MAX_BLOCKQUOTE_LINES}}})"
|
|
118
|
+
"|"
|
|
120
119
|
# 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints)
|
|
121
|
-
fr"(?:(?:^|\r?\n)(?:```|~~~)(?:\w{{0,{MAX_CODE_LANGUAGE_LENGTH}}})?\r?\n[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:```|~~~)\r?\n?"
|
|
122
|
-
fr"|(?:(?:^|\r?\n)(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}(?:\r?\n(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}){{0,{MAX_INDENTED_CODE_LINES}}}\r?\n?)"
|
|
123
|
-
fr"|(?:<pre>(?:<code>)?[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:</code>)?</pre>))"
|
|
124
|
-
"|"
|
|
120
|
+
fr"(?:(?:^|\r?\n)(?:```|~~~)(?:\w{{0,{MAX_CODE_LANGUAGE_LENGTH}}})?\r?\n[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:```|~~~)\r?\n?"
|
|
121
|
+
fr"|(?:(?:^|\r?\n)(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}(?:\r?\n(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}){{0,{MAX_INDENTED_CODE_LINES}}}\r?\n?)"
|
|
122
|
+
fr"|(?:<pre>(?:<code>)?[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:</code>)?</pre>))"
|
|
123
|
+
"|"
|
|
125
124
|
# 5. Tables (Markdown, grid tables, and HTML tables, with length constraints)
|
|
126
|
-
fr"(?:(?:^|\r?\n)(?:\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|(?:\r?\n\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\|){{0,1}}(?:\r?\n\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|){{0,{MAX_TABLE_ROWS}}}"
|
|
127
|
-
fr"|<table>[\s\S]{{0,{MAX_HTML_TABLE_LENGTH}}}?</table>))"
|
|
128
|
-
"|"
|
|
125
|
+
fr"(?:(?:^|\r?\n)(?:\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|(?:\r?\n\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\|){{0,1}}(?:\r?\n\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|){{0,{MAX_TABLE_ROWS}}}"
|
|
126
|
+
fr"|<table>[\s\S]{{0,{MAX_HTML_TABLE_LENGTH}}}?</table>))"
|
|
127
|
+
"|"
|
|
129
128
|
# 6. Horizontal rules (Markdown and HTML hr tag)
|
|
130
|
-
fr"(?:^(?:[-*_]){{{MIN_HORIZONTAL_RULE_LENGTH},}}\s*$|<hr\s*/?>\r?\n)"
|
|
131
|
-
"|"
|
|
129
|
+
fr"(?:^(?:[-*_]){{{MIN_HORIZONTAL_RULE_LENGTH},}}\s*$|<hr\s*/?>\r?\n)"
|
|
130
|
+
"|"
|
|
132
131
|
# 7. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints)
|
|
133
|
-
fr"(?:^(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}>)?(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))(?:</[a-zA-Z]+>)?(?:\r?\n|$))"
|
|
134
|
-
"|"
|
|
132
|
+
fr"(?:^(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}>)?(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))(?:</[a-zA-Z]+>)?(?:\r?\n|$))"
|
|
133
|
+
"|"
|
|
135
134
|
# 8. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)
|
|
136
|
-
fr"(?:(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?=[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
137
|
-
"|"
|
|
135
|
+
fr"(?:(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?=[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
136
|
+
"|"
|
|
138
137
|
# 9. Quoted text, parenthetical phrases, or bracketed content (with length constraints)
|
|
139
|
-
r"(?:"
|
|
140
|
-
fr'(?<!\w)\"\"\"[^\""]{{0,{MAX_QUOTED_TEXT_LENGTH}}}\"\"\"(?!\w)'
|
|
141
|
-
|
|
142
|
-
fr"|\([^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\([^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\)[^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\)"
|
|
143
|
-
fr"|\[[^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\[[^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\][^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\]"
|
|
144
|
-
fr"|\$[^\r\n$]{{0,{MAX_MATH_INLINE_LENGTH}}}\$"
|
|
145
|
-
fr"|`[^`\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}`"
|
|
146
|
-
r")"
|
|
147
|
-
"|"
|
|
138
|
+
r"(?:"
|
|
139
|
+
fr'(?<!\w)\"\"\"[^\""]{{0,{MAX_QUOTED_TEXT_LENGTH}}}\"\"\"(?!\w)'
|
|
140
|
+
r"|(?<!\w)['\"`][^\r\n]{{0,{MAX_QUOTED_TEXT_LENGTH}}}['\"`](?!\w)"
|
|
141
|
+
fr"|\([^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\([^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\)[^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\)"
|
|
142
|
+
fr"|\[[^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\[[^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\][^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\]"
|
|
143
|
+
fr"|\$[^\r\n$]{{0,{MAX_MATH_INLINE_LENGTH}}}\$"
|
|
144
|
+
fr"|`[^`\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}`"
|
|
145
|
+
r")"
|
|
146
|
+
"|"
|
|
148
147
|
# 10. Paragraphs (with length constraints)
|
|
149
|
-
fr"(?:(?:^|\r?\n\r?\n)(?:<p>)?(?:(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))(?:</p>)?(?=\r?\n\r?\n|$))"
|
|
150
|
-
"|"
|
|
148
|
+
fr"(?:(?:^|\r?\n\r?\n)(?:<p>)?(?:(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))(?:</p>)?(?=\r?\n\r?\n|$))"
|
|
149
|
+
"|"
|
|
151
150
|
# 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints)
|
|
152
|
-
fr"(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}(?:>[\s\S]{{0,{MAX_HTML_TAG_CONTENT_LENGTH}}}?</[a-zA-Z]+>|\s*/>))"
|
|
153
|
-
"|"
|
|
151
|
+
fr"(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}(?:>[\s\S]{{0,{MAX_HTML_TAG_CONTENT_LENGTH}}}?</[a-zA-Z]+>|\s*/>))"
|
|
152
|
+
"|"
|
|
154
153
|
# 12. LaTeX-style math expressions (inline and block, with length constraints)
|
|
155
|
-
fr"(?:(?:\$\$[\s\S]{{0,{MAX_MATH_BLOCK_LENGTH}}}?\$\$)|(?:\$[^\$\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}\$))"
|
|
156
|
-
"|"
|
|
154
|
+
fr"(?:(?:\$\$[\s\S]{{0,{MAX_MATH_BLOCK_LENGTH}}}?\$\$)|(?:\$[^\$\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}\$))"
|
|
155
|
+
"|"
|
|
157
156
|
# 13. Fallback for any remaining content (with length constraints)
|
|
158
|
-
fr"(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
157
|
+
fr"(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
159
158
|
")",
|
|
160
159
|
re.MULTILINE | re.UNICODE
|
|
161
160
|
)
|