symbolicai 0.21.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +269 -173
- symai/backend/base.py +123 -110
- symai/backend/engines/drawing/engine_bfl.py +45 -44
- symai/backend/engines/drawing/engine_gpt_image.py +112 -97
- symai/backend/engines/embedding/engine_llama_cpp.py +63 -52
- symai/backend/engines/embedding/engine_openai.py +25 -21
- symai/backend/engines/execute/engine_python.py +19 -18
- symai/backend/engines/files/engine_io.py +104 -95
- symai/backend/engines/imagecaptioning/engine_blip2.py +28 -24
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +102 -79
- symai/backend/engines/index/engine_pinecone.py +124 -97
- symai/backend/engines/index/engine_qdrant.py +1011 -0
- symai/backend/engines/index/engine_vectordb.py +84 -56
- symai/backend/engines/lean/engine_lean4.py +96 -52
- symai/backend/engines/neurosymbolic/__init__.py +41 -13
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +330 -248
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +329 -264
- symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +118 -88
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +344 -299
- symai/backend/engines/neurosymbolic/engine_groq.py +173 -115
- symai/backend/engines/neurosymbolic/engine_huggingface.py +114 -84
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +144 -118
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +415 -307
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +394 -231
- symai/backend/engines/ocr/engine_apilayer.py +23 -27
- symai/backend/engines/output/engine_stdout.py +10 -13
- symai/backend/engines/{webscraping → scrape}/engine_requests.py +101 -54
- symai/backend/engines/search/engine_openai.py +100 -88
- symai/backend/engines/search/engine_parallel.py +665 -0
- symai/backend/engines/search/engine_perplexity.py +44 -45
- symai/backend/engines/search/engine_serpapi.py +37 -34
- symai/backend/engines/speech_to_text/engine_local_whisper.py +54 -51
- symai/backend/engines/symbolic/engine_wolframalpha.py +15 -9
- symai/backend/engines/text_to_speech/engine_openai.py +20 -26
- symai/backend/engines/text_vision/engine_clip.py +39 -37
- symai/backend/engines/userinput/engine_console.py +5 -6
- symai/backend/mixin/__init__.py +13 -0
- symai/backend/mixin/anthropic.py +48 -38
- symai/backend/mixin/deepseek.py +6 -5
- symai/backend/mixin/google.py +7 -4
- symai/backend/mixin/groq.py +2 -4
- symai/backend/mixin/openai.py +140 -110
- symai/backend/settings.py +87 -20
- symai/chat.py +216 -123
- symai/collect/__init__.py +7 -1
- symai/collect/dynamic.py +80 -70
- symai/collect/pipeline.py +67 -51
- symai/collect/stats.py +161 -109
- symai/components.py +707 -360
- symai/constraints.py +24 -12
- symai/core.py +1857 -1233
- symai/core_ext.py +83 -80
- symai/endpoints/api.py +166 -104
- symai/extended/.DS_Store +0 -0
- symai/extended/__init__.py +46 -12
- symai/extended/api_builder.py +29 -21
- symai/extended/arxiv_pdf_parser.py +23 -14
- symai/extended/bibtex_parser.py +9 -6
- symai/extended/conversation.py +156 -126
- symai/extended/document.py +50 -30
- symai/extended/file_merger.py +57 -14
- symai/extended/graph.py +51 -32
- symai/extended/html_style_template.py +18 -14
- symai/extended/interfaces/blip_2.py +2 -3
- symai/extended/interfaces/clip.py +4 -3
- symai/extended/interfaces/console.py +9 -1
- symai/extended/interfaces/dall_e.py +4 -2
- symai/extended/interfaces/file.py +2 -0
- symai/extended/interfaces/flux.py +4 -2
- symai/extended/interfaces/gpt_image.py +16 -7
- symai/extended/interfaces/input.py +2 -1
- symai/extended/interfaces/llava.py +1 -2
- symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +4 -3
- symai/extended/interfaces/naive_vectordb.py +9 -10
- symai/extended/interfaces/ocr.py +5 -3
- symai/extended/interfaces/openai_search.py +2 -0
- symai/extended/interfaces/parallel.py +30 -0
- symai/extended/interfaces/perplexity.py +2 -0
- symai/extended/interfaces/pinecone.py +12 -9
- symai/extended/interfaces/python.py +2 -0
- symai/extended/interfaces/serpapi.py +3 -1
- symai/extended/interfaces/terminal.py +2 -4
- symai/extended/interfaces/tts.py +3 -2
- symai/extended/interfaces/whisper.py +3 -2
- symai/extended/interfaces/wolframalpha.py +2 -1
- symai/extended/metrics/__init__.py +11 -1
- symai/extended/metrics/similarity.py +14 -13
- symai/extended/os_command.py +39 -29
- symai/extended/packages/__init__.py +29 -3
- symai/extended/packages/symdev.py +51 -43
- symai/extended/packages/sympkg.py +41 -35
- symai/extended/packages/symrun.py +63 -50
- symai/extended/repo_cloner.py +14 -12
- symai/extended/seo_query_optimizer.py +15 -13
- symai/extended/solver.py +116 -91
- symai/extended/summarizer.py +12 -10
- symai/extended/taypan_interpreter.py +17 -18
- symai/extended/vectordb.py +122 -92
- symai/formatter/__init__.py +9 -1
- symai/formatter/formatter.py +51 -47
- symai/formatter/regex.py +70 -69
- symai/functional.py +325 -176
- symai/imports.py +190 -147
- symai/interfaces.py +57 -28
- symai/memory.py +45 -35
- symai/menu/screen.py +28 -19
- symai/misc/console.py +66 -56
- symai/misc/loader.py +8 -5
- symai/models/__init__.py +17 -1
- symai/models/base.py +395 -236
- symai/models/errors.py +1 -2
- symai/ops/__init__.py +32 -22
- symai/ops/measures.py +24 -25
- symai/ops/primitives.py +1149 -731
- symai/post_processors.py +58 -50
- symai/pre_processors.py +86 -82
- symai/processor.py +21 -13
- symai/prompts.py +764 -685
- symai/server/huggingface_server.py +135 -49
- symai/server/llama_cpp_server.py +21 -11
- symai/server/qdrant_server.py +206 -0
- symai/shell.py +100 -42
- symai/shellsv.py +700 -492
- symai/strategy.py +630 -346
- symai/symbol.py +368 -322
- symai/utils.py +100 -78
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +22 -10
- symbolicai-1.1.0.dist-info/RECORD +168 -0
- symbolicai-0.21.0.dist-info/RECORD +0 -162
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0
symai/formatter/regex.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
|
-
import
|
|
2
|
+
from pathlib import Path
|
|
3
3
|
|
|
4
|
+
from ..utils import UserMessage
|
|
4
5
|
|
|
5
6
|
# Define variables for magic numbers
|
|
6
7
|
MAX_HEADING_LENGTH = 7
|
|
@@ -31,19 +32,20 @@ MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
|
|
|
31
32
|
MAX_HTML_TAG_CONTENT_LENGTH = 1000
|
|
32
33
|
LOOKAHEAD_RANGE = 100 # Number of characters to look ahead for a sentence boundary
|
|
33
34
|
|
|
35
|
+
|
|
34
36
|
# Define emoji ranges
|
|
35
37
|
def generate_emoji_pattern(file_name):
|
|
36
|
-
current_dir =
|
|
37
|
-
file_path =
|
|
38
|
+
current_dir = Path(__file__).resolve().parent
|
|
39
|
+
file_path = current_dir / file_name
|
|
38
40
|
|
|
39
41
|
emoji_codes = set()
|
|
40
42
|
|
|
41
43
|
try:
|
|
42
|
-
with open(
|
|
44
|
+
with file_path.open(encoding="utf-8") as file:
|
|
43
45
|
for line in file:
|
|
44
46
|
# Skip comments and empty lines
|
|
45
|
-
if line.strip() and not line.startswith(
|
|
46
|
-
fields = line.strip().split(
|
|
47
|
+
if line.strip() and not line.startswith("#"):
|
|
48
|
+
fields = line.strip().split(";")
|
|
47
49
|
unicode_codes = fields[0].strip().split()
|
|
48
50
|
|
|
49
51
|
if len(unicode_codes) == 1:
|
|
@@ -51,7 +53,7 @@ def generate_emoji_pattern(file_name):
|
|
|
51
53
|
emoji_codes.add(chr(int(unicode_codes[0], 16)))
|
|
52
54
|
elif len(unicode_codes) > 1:
|
|
53
55
|
# Sequence of Unicode characters
|
|
54
|
-
emoji_sequence =
|
|
56
|
+
emoji_sequence = "".join(chr(int(code, 16)) for code in unicode_codes)
|
|
55
57
|
emoji_codes.add(emoji_sequence)
|
|
56
58
|
|
|
57
59
|
# We could also process vendor-specific codes here if needed
|
|
@@ -61,10 +63,10 @@ def generate_emoji_pattern(file_name):
|
|
|
61
63
|
# pass
|
|
62
64
|
|
|
63
65
|
except FileNotFoundError:
|
|
64
|
-
|
|
66
|
+
UserMessage(f"Error: File '{file_name}' not found in {current_dir}")
|
|
65
67
|
return None
|
|
66
68
|
except Exception as e:
|
|
67
|
-
|
|
69
|
+
UserMessage(f"Error reading file: {e}")
|
|
68
70
|
return None
|
|
69
71
|
|
|
70
72
|
# Sort the emoji codes
|
|
@@ -74,88 +76,87 @@ def generate_emoji_pattern(file_name):
|
|
|
74
76
|
pattern_parts = []
|
|
75
77
|
for code in sorted_codes:
|
|
76
78
|
if len(code) == 1:
|
|
77
|
-
pattern_parts.append(f
|
|
79
|
+
pattern_parts.append(f"\\U{ord(code):08x}")
|
|
78
80
|
else:
|
|
79
|
-
pattern_parts.append(
|
|
81
|
+
pattern_parts.append("".join(f"\\U{ord(c):08x}" for c in code))
|
|
80
82
|
|
|
81
|
-
|
|
83
|
+
return "(?:" + "|".join(pattern_parts) + ")"
|
|
82
84
|
|
|
83
|
-
return emoji_pattern
|
|
84
85
|
|
|
85
86
|
# Usage
|
|
86
|
-
file_name =
|
|
87
|
+
file_name = "emoji.pytxt"
|
|
87
88
|
EMOJI_PATTERN = generate_emoji_pattern(file_name)
|
|
88
89
|
|
|
89
90
|
# Define the regex pattern
|
|
90
91
|
CHUNK_REGEX = re.compile(
|
|
91
|
-
r"("
|
|
92
|
+
r"("
|
|
92
93
|
# 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)
|
|
93
|
-
|
|
94
|
-
"|"
|
|
94
|
+
rf"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}|\w[^\r\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\r?\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}|<h[1-6][^>]{{0,{MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)[^\r\n]{{1,{MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\r?\n|$))"
|
|
95
|
+
"|"
|
|
95
96
|
# New pattern for citations
|
|
96
|
-
|
|
97
|
-
"|"
|
|
97
|
+
rf"(?:\[[0-9]+\][^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}})"
|
|
98
|
+
"|"
|
|
98
99
|
# 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
")"
|
|
100
|
+
r"(?:(?:^|\r?\n)[ \t]{0,3}(?:[-*+•]|\d{1,3}\.|\[[ xX]\])[ \t]+"
|
|
101
|
+
rf"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
102
|
+
rf"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
103
|
+
rf"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
104
|
+
rf"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
105
|
+
r"(?:(?:\r?\n[ \t]{2,5}(?:[-*+•]|\d{1,3}\.|\[[ xX]\])[ \t]+"
|
|
106
|
+
rf"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
107
|
+
rf"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
108
|
+
rf"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
109
|
+
rf"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
110
|
+
rf"){{0,{MAX_NESTED_LIST_ITEMS}}}"
|
|
111
|
+
rf"(?:\r?\n[ \t]{{4,{MAX_LIST_INDENT_SPACES}}}(?:[-*+•]|\d{{1,3}}\.|\[[ xX]\])[ \t]+"
|
|
112
|
+
rf"(?:(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|"
|
|
113
|
+
rf"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[\r\n]|$))|"
|
|
114
|
+
rf"(?:\b[^\r\n]{{1,{MAX_LIST_ITEM_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})"
|
|
115
|
+
rf"(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
116
|
+
rf"){{0,{MAX_NESTED_LIST_ITEMS}}})"
|
|
117
|
+
")"
|
|
117
118
|
# 3. Block quotes (including nested quotes and citations, up to three levels, with length constraints)
|
|
118
|
-
|
|
119
|
-
"|"
|
|
119
|
+
rf"(?:(?:^>(?:>|\s{{2,}}){{0,2}}(?:(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[\r\n]|$))|(?:\b[^\r\n]{{0,{MAX_BLOCKQUOTE_LINE_LENGTH}}}\b(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))\\r?\\n?){{1,{MAX_BLOCKQUOTE_LINES}}})"
|
|
120
|
+
"|"
|
|
120
121
|
# 4. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
"|"
|
|
122
|
+
rf"(?:(?:^|\r?\n)(?:```|~~~)(?:\w{{0,{MAX_CODE_LANGUAGE_LENGTH}}})?\r?\n[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:```|~~~)\r?\n?"
|
|
123
|
+
rf"|(?:(?:^|\r?\n)(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}(?:\r?\n(?: {{4}}|\t)[^\r\n]{{0,{MAX_LIST_ITEM_LENGTH}}}){{0,{MAX_INDENTED_CODE_LINES}}}\r?\n?)"
|
|
124
|
+
rf"|(?:<pre>(?:<code>)?[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:</code>)?</pre>))"
|
|
125
|
+
"|"
|
|
125
126
|
# 5. Tables (Markdown, grid tables, and HTML tables, with length constraints)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
"|"
|
|
127
|
+
rf"(?:(?:^|\r?\n)(?:\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|(?:\r?\n\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\|){{0,1}}(?:\r?\n\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|){{0,{MAX_TABLE_ROWS}}}"
|
|
128
|
+
rf"|<table>[\s\S]{{0,{MAX_HTML_TABLE_LENGTH}}}?</table>))"
|
|
129
|
+
"|"
|
|
129
130
|
# 6. Horizontal rules (Markdown and HTML hr tag)
|
|
130
|
-
|
|
131
|
-
"|"
|
|
131
|
+
rf"(?:^(?:[-*_]){{{MIN_HORIZONTAL_RULE_LENGTH},}}\s*$|<hr\s*/?>\r?\n)"
|
|
132
|
+
"|"
|
|
132
133
|
# 7. Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints)
|
|
133
|
-
|
|
134
|
-
"|"
|
|
134
|
+
rf"(?:^(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}>)?(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))(?:</[a-zA-Z]+>)?(?:\r?\n|$))"
|
|
135
|
+
"|"
|
|
135
136
|
# 8. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)
|
|
136
|
-
|
|
137
|
-
"|"
|
|
137
|
+
rf"(?:(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_SENTENCE_LENGTH}}}(?=[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
138
|
+
"|"
|
|
138
139
|
# 9. Quoted text, parenthetical phrases, or bracketed content (with length constraints)
|
|
139
|
-
r"(?:"
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
r")"
|
|
147
|
-
"|"
|
|
140
|
+
r"(?:"
|
|
141
|
+
rf'(?<!\w)\"\"\"[^\""]{{0,{MAX_QUOTED_TEXT_LENGTH}}}\"\"\"(?!\w)'
|
|
142
|
+
r"|(?<!\w)['\"`][^\r\n]{{0,{MAX_QUOTED_TEXT_LENGTH}}}['\"`](?!\w)"
|
|
143
|
+
rf"|\([^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\([^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\)[^\r\n(){{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\)"
|
|
144
|
+
rf"|\[[^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}(?:\[[^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}\][^\r\n\[\]{{0,{MAX_PARENTHETICAL_CONTENT_LENGTH}}}){{0,{MAX_NESTED_PARENTHESES}}}\]"
|
|
145
|
+
rf"|\$[^\r\n$]{{0,{MAX_MATH_INLINE_LENGTH}}}\$"
|
|
146
|
+
rf"|`[^`\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}`"
|
|
147
|
+
r")"
|
|
148
|
+
"|"
|
|
148
149
|
# 10. Paragraphs (with length constraints)
|
|
149
|
-
|
|
150
|
-
"|"
|
|
150
|
+
rf"(?:(?:^|\r?\n\r?\n)(?:<p>)?(?:(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))(?:</p>)?(?=\r?\n\r?\n|$))"
|
|
151
|
+
"|"
|
|
151
152
|
# 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints)
|
|
152
|
-
|
|
153
|
-
"|"
|
|
153
|
+
rf"(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}(?:>[\s\S]{{0,{MAX_HTML_TAG_CONTENT_LENGTH}}}?</[a-zA-Z]+>|\s*/>))"
|
|
154
|
+
"|"
|
|
154
155
|
# 12. LaTeX-style math expressions (inline and block, with length constraints)
|
|
155
|
-
|
|
156
|
-
"|"
|
|
156
|
+
rf"(?:(?:\$\$[\s\S]{{0,{MAX_MATH_BLOCK_LENGTH}}}?\$\$)|(?:\$[^\$\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}\$))"
|
|
157
|
+
"|"
|
|
157
158
|
# 13. Fallback for any remaining content (with length constraints)
|
|
158
|
-
|
|
159
|
+
rf"(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.{{3}}|\u2026\u2047-\u2049|{EMOJI_PATTERN})(?=\s|$))?))"
|
|
159
160
|
")",
|
|
160
|
-
re.MULTILINE | re.UNICODE
|
|
161
|
+
re.MULTILINE | re.UNICODE,
|
|
161
162
|
)
|