chatgpt-md-converter 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/PKG-INFO +1 -1
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/converters.py +2 -1
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/formatters.py +14 -6
- chatgpt_md_converter-0.2.0/chatgpt_md_converter/telegram_formatter.py +100 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/PKG-INFO +1 -1
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/setup.py +1 -1
- chatgpt_md_converter-0.2.0/tests/test_parser.py +658 -0
- chatgpt_md_converter-0.1.1/chatgpt_md_converter/telegram_formatter.py +0 -56
- chatgpt_md_converter-0.1.1/tests/test_parser.py +0 -256
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/LICENSE +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/__init__.py +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/extractors.py +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/helpers.py +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/SOURCES.txt +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
- {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/setup.cfg +0 -0
{chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/converters.py
RENAMED
|
@@ -16,6 +16,7 @@ def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
|
|
|
16
16
|
Splits the text by markdown tag and replaces it with the specified HTML tag.
|
|
17
17
|
"""
|
|
18
18
|
tag_pattern = re.compile(
|
|
19
|
-
r"{}(.*?){}".format(re.escape(md_tag), re.escape(md_tag)),
|
|
19
|
+
r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
|
|
20
|
+
re.DOTALL,
|
|
20
21
|
)
|
|
21
22
|
return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)
|
{chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/formatters.py
RENAMED
|
@@ -1,9 +1,6 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
1
|
def combine_blockquotes(text: str) -> str:
|
|
5
2
|
"""
|
|
6
|
-
Combines multiline blockquotes into a single blockquote.
|
|
3
|
+
Combines multiline blockquotes into a single blockquote while keeping the \n characters.
|
|
7
4
|
"""
|
|
8
5
|
lines = text.split("\n")
|
|
9
6
|
combined_lines = []
|
|
@@ -17,7 +14,7 @@ def combine_blockquotes(text: str) -> str:
|
|
|
17
14
|
else:
|
|
18
15
|
if in_blockquote:
|
|
19
16
|
combined_lines.append(
|
|
20
|
-
"<blockquote>" + "
|
|
17
|
+
"<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
|
|
21
18
|
)
|
|
22
19
|
blockquote_lines = []
|
|
23
20
|
in_blockquote = False
|
|
@@ -25,7 +22,18 @@ def combine_blockquotes(text: str) -> str:
|
|
|
25
22
|
|
|
26
23
|
if in_blockquote:
|
|
27
24
|
combined_lines.append(
|
|
28
|
-
"<blockquote>" + "
|
|
25
|
+
"<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
|
|
29
26
|
)
|
|
30
27
|
|
|
31
28
|
return "\n".join(combined_lines)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def fix_asterisk_equations(text: str) -> str:
|
|
32
|
+
"""
|
|
33
|
+
Replaces numeric expressions with '*' in them with '×'
|
|
34
|
+
to avoid accidental italic formatting.
|
|
35
|
+
e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
|
|
36
|
+
"""
|
|
37
|
+
import re
|
|
38
|
+
eq_pattern = re.compile(r'(\d+)\s*\*\s*(\d+)')
|
|
39
|
+
return eq_pattern.sub(r'\1×\2', text)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from .converters import convert_html_chars, split_by_tag
|
|
4
|
+
from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
|
|
5
|
+
from .formatters import combine_blockquotes
|
|
6
|
+
from .helpers import remove_blockquote_escaping
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def extract_inline_code_snippets(text: str):
|
|
10
|
+
"""
|
|
11
|
+
Extracts inline code (single-backtick content) from the text,
|
|
12
|
+
replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
|
|
13
|
+
This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
|
|
14
|
+
"""
|
|
15
|
+
placeholders = []
|
|
16
|
+
code_snippets = {}
|
|
17
|
+
inline_code_pattern = re.compile(r"`([^`]+)`")
|
|
18
|
+
|
|
19
|
+
def replacer(match):
|
|
20
|
+
snippet = match.group(1)
|
|
21
|
+
placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
|
|
22
|
+
placeholders.append(placeholder)
|
|
23
|
+
code_snippets[placeholder] = snippet
|
|
24
|
+
return placeholder
|
|
25
|
+
|
|
26
|
+
new_text = inline_code_pattern.sub(replacer, text)
|
|
27
|
+
return new_text, code_snippets
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def telegram_format(text: str) -> str:
|
|
31
|
+
"""
|
|
32
|
+
Converts markdown in the provided text to HTML supported by Telegram.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
# Step 0: Combine blockquotes
|
|
36
|
+
text = combine_blockquotes(text)
|
|
37
|
+
|
|
38
|
+
# Step 1: Convert HTML reserved symbols
|
|
39
|
+
text = convert_html_chars(text)
|
|
40
|
+
|
|
41
|
+
# Step 2: Extract and convert triple-backtick code blocks first
|
|
42
|
+
output, triple_code_blocks = extract_and_convert_code_blocks(text)
|
|
43
|
+
|
|
44
|
+
# Step 2.5: Extract inline code snippets (single backticks) so they won't be parsed as italics, etc.
|
|
45
|
+
output, inline_code_snippets = extract_inline_code_snippets(output)
|
|
46
|
+
|
|
47
|
+
# Step 3: Escape HTML special characters in the output text (for non-code parts)
|
|
48
|
+
# We do NOT want to escape what's inside placeholders here, only what's outside code placeholders.
|
|
49
|
+
output = output.replace("<", "&lt;").replace(">", "&gt;")
|
|
50
|
+
|
|
51
|
+
# Convert headings (H1-H6)
|
|
52
|
+
output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
|
|
53
|
+
|
|
54
|
+
# Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
|
|
55
|
+
output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
|
|
56
|
+
|
|
57
|
+
# Remove this old inline code replacement — now handled by extract_inline_code_snippets()
|
|
58
|
+
# output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
|
|
59
|
+
|
|
60
|
+
# Nested Bold and Italic
|
|
61
|
+
output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
|
|
62
|
+
output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
|
|
63
|
+
|
|
64
|
+
# Process markdown for bold (**), underline (__), strikethrough (~~)
|
|
65
|
+
output = split_by_tag(output, "**", "b")
|
|
66
|
+
output = split_by_tag(output, "__", "u")
|
|
67
|
+
output = split_by_tag(output, "~~", "s")
|
|
68
|
+
|
|
69
|
+
# Custom approach for single-asterisk italic
|
|
70
|
+
italic_pattern = re.compile(
|
|
71
|
+
r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])",
|
|
72
|
+
re.DOTALL
|
|
73
|
+
)
|
|
74
|
+
output = italic_pattern.sub(r"<i>\1</i>", output)
|
|
75
|
+
|
|
76
|
+
# Process single underscore-based italic
|
|
77
|
+
output = split_by_tag(output, "_", "i")
|
|
78
|
+
|
|
79
|
+
# Remove storage links (Vector storage placeholders like 【4:0†source】)
|
|
80
|
+
output = re.sub(r"【[^】]+】", "", output)
|
|
81
|
+
|
|
82
|
+
# Convert Markdown links/images to <a href="">…</a>
|
|
83
|
+
link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
|
|
84
|
+
output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
|
|
85
|
+
|
|
86
|
+
# Step 3.5: Reinsert inline code snippets, escaping special chars in code content
|
|
87
|
+
for placeholder, snippet in inline_code_snippets.items():
|
|
88
|
+
escaped_snippet = snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
|
89
|
+
output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
|
|
90
|
+
|
|
91
|
+
# Step 4: Reinsert the converted triple-backtick code blocks
|
|
92
|
+
output = reinsert_code_blocks(output, triple_code_blocks)
|
|
93
|
+
|
|
94
|
+
# Step 5: Remove blockquote escaping
|
|
95
|
+
output = remove_blockquote_escaping(output)
|
|
96
|
+
|
|
97
|
+
# Clean up multiple consecutive newlines, but preserve intentional spacing
|
|
98
|
+
output = re.sub(r"\n{3,}", "\n\n", output)
|
|
99
|
+
|
|
100
|
+
return output.strip()
|
|
@@ -0,0 +1,658 @@
|
|
|
1
|
+
from chatgpt_md_converter.extractors import ensure_closing_delimiters
|
|
2
|
+
from chatgpt_md_converter.telegram_formatter import telegram_format
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_split_by_tag_bold():
|
|
6
|
+
text = "This is **bold** text"
|
|
7
|
+
assert telegram_format(text) == "This is <b>bold</b> text"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_telegram_format_italic():
|
|
11
|
+
text = "This is _italic_ text"
|
|
12
|
+
output = telegram_format(text)
|
|
13
|
+
assert output == "This is <i>italic</i> text"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_telegram_format_italic_star():
|
|
17
|
+
text = "This is *italic* text"
|
|
18
|
+
output = telegram_format(text)
|
|
19
|
+
assert output == "This is <i>italic</i> text"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_triple_backticks_with_language():
|
|
23
|
+
input_text = "```python\nprint('Hello, world!')\n```"
|
|
24
|
+
expected_output = (
|
|
25
|
+
"<pre><code class=\"language-python\">print('Hello, world!')\n</code></pre>"
|
|
26
|
+
)
|
|
27
|
+
output = telegram_format(input_text)
|
|
28
|
+
assert (
|
|
29
|
+
output == expected_output
|
|
30
|
+
), "Failed converting triple backticks with language to <pre><code> tags"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_bold_and_underline_conversion():
|
|
34
|
+
input_text = "This is **bold** and this is __underline__."
|
|
35
|
+
expected_output = "This is <b>bold</b> and this is <u>underline</u>."
|
|
36
|
+
output = telegram_format(input_text)
|
|
37
|
+
assert output == expected_output, "Failed converting ** and __ to <b> and <u> tags"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_escaping_special_characters():
|
|
41
|
+
input_text = "Avoid using < or > in your HTML."
|
|
42
|
+
expected_output = "Avoid using &lt; or &gt; in your HTML."
|
|
43
|
+
output = telegram_format(input_text)
|
|
44
|
+
assert output == expected_output, "Failed escaping < and > characters"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_nested_markdown_syntax():
|
|
48
|
+
input_text = "This is **bold and _italic_** text."
|
|
49
|
+
expected_output = "This is <b>bold and <i>italic</i></b> text."
|
|
50
|
+
output = telegram_format(input_text)
|
|
51
|
+
assert output == expected_output, "Failed handling nested markdown syntax"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_combination_of_markdown_elements():
|
|
55
|
+
input_text = """
|
|
56
|
+
# Heading
|
|
57
|
+
This is a test of **bold**, __underline__, and `inline code`.
|
|
58
|
+
- Item 1
|
|
59
|
+
* Item 2
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
for i in range(3):
|
|
63
|
+
print(i)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
[Link](http://example.com)
|
|
67
|
+
"""
|
|
68
|
+
expected_output = """
|
|
69
|
+
<b>Heading</b>
|
|
70
|
+
This is a test of <b>bold</b>, <u>underline</u>, and <code>inline code</code>.
|
|
71
|
+
• Item 1
|
|
72
|
+
• Item 2
|
|
73
|
+
|
|
74
|
+
<pre><code class="language-python">for i in range(3):
|
|
75
|
+
print(i)
|
|
76
|
+
</code></pre>
|
|
77
|
+
|
|
78
|
+
<a href="http://example.com">Link</a>
|
|
79
|
+
"""
|
|
80
|
+
output = telegram_format(input_text)
|
|
81
|
+
assert (
|
|
82
|
+
output.strip() == expected_output.strip()
|
|
83
|
+
), "Failed combining multiple markdown elements into HTML"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_nested_bold_within_italic():
|
|
87
|
+
input_text = "This is *__bold within italic__* text."
|
|
88
|
+
expected_output = "This is <i><u>bold within italic</u></i> text."
|
|
89
|
+
output = telegram_format(input_text)
|
|
90
|
+
assert (
|
|
91
|
+
output == expected_output
|
|
92
|
+
), "Failed converting nested bold within italic markdown to HTML"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_italic_within_bold():
|
|
96
|
+
input_text = "This is **bold and _italic_ together**."
|
|
97
|
+
expected_output = "This is <b>bold and <i>italic</i> together</b>."
|
|
98
|
+
output = telegram_format(input_text)
|
|
99
|
+
assert (
|
|
100
|
+
output == expected_output
|
|
101
|
+
), "Failed converting italic within bold markdown to HTML"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_inline_code_within_bold_text():
|
|
105
|
+
input_text = "This is **bold and `inline code` together**."
|
|
106
|
+
expected_output = "This is <b>bold and <code>inline code</code> together</b>."
|
|
107
|
+
output = telegram_format(input_text)
|
|
108
|
+
assert output == expected_output, "Failed handling inline code within bold text"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_mixed_formatting_tags_with_lists_and_links():
|
|
112
|
+
input_text = """
|
|
113
|
+
- This is a list item with **bold**, __underline__, and [a link](http://example.com)
|
|
114
|
+
- Another item with ***bold and italic*** text
|
|
115
|
+
"""
|
|
116
|
+
expected_output = """
|
|
117
|
+
• This is a list item with <b>bold</b>, <u>underline</u>, and <a href="http://example.com">a link</a>
|
|
118
|
+
• Another item with <b><i>bold and italic</i></b> text
|
|
119
|
+
"""
|
|
120
|
+
output = telegram_format(input_text)
|
|
121
|
+
assert (
|
|
122
|
+
output.strip() == expected_output.strip()
|
|
123
|
+
), "Failed handling mixed formatting tags with lists and links"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def test_special_characters_within_code_blocks():
|
|
127
|
+
input_text = "Here is a code block: ```<script>alert('Hello')</script>```"
|
|
128
|
+
expected_output = "Here is a code block: <pre><code>&lt;script&gt;alert('Hello')&lt;/script&gt;</code></pre>"
|
|
129
|
+
output = telegram_format(input_text)
|
|
130
|
+
assert (
|
|
131
|
+
output == expected_output
|
|
132
|
+
), "Failed escaping special characters within code blocks"
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def test_code_block_within_bold_text():
|
|
136
|
+
input_text = "This is **bold with a `code block` inside**."
|
|
137
|
+
expected_output = "This is <b>bold with a <code>code block</code> inside</b>."
|
|
138
|
+
output = telegram_format(input_text)
|
|
139
|
+
assert output == expected_output, "Failed handling code block within bold text"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_triple_backticks_with_nested_markdown():
|
|
143
|
+
input_text = "```python\n**bold text** and __underline__ in code block```"
|
|
144
|
+
expected_output = '<pre><code class="language-python">**bold text** and __underline__ in code block</code></pre>'
|
|
145
|
+
output = telegram_format(input_text)
|
|
146
|
+
assert (
|
|
147
|
+
output == expected_output
|
|
148
|
+
), "Failed handling markdown within triple backtick code blocks"
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def test_unmatched_code_delimiters():
|
|
152
|
+
input_text = "This has an `unmatched code delimiter."
|
|
153
|
+
expected_output = "This has an <code>unmatched code delimiter.</code>"
|
|
154
|
+
output = telegram_format(input_text)
|
|
155
|
+
assert output == expected_output, "Failed handling unmatched code delimiters"
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def test_preformatted_block_with_unusual_language_specification():
|
|
159
|
+
input_text = "```weirdLang\nSome weirdLang code\n```"
|
|
160
|
+
expected_output = (
|
|
161
|
+
'<pre><code class="language-weirdLang">Some weirdLang code\n</code></pre>'
|
|
162
|
+
)
|
|
163
|
+
output = telegram_format(input_text)
|
|
164
|
+
assert (
|
|
165
|
+
output == expected_output
|
|
166
|
+
), "Failed handling preformatted block with unusual language specification"
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def test_inline_code_within_lists():
|
|
170
|
+
input_text = """
|
|
171
|
+
- List item with `code`
|
|
172
|
+
* Another `code` item
|
|
173
|
+
"""
|
|
174
|
+
expected_output = """
|
|
175
|
+
• List item with <code>code</code>
|
|
176
|
+
• Another <code>code</code> item
|
|
177
|
+
"""
|
|
178
|
+
output = telegram_format(input_text)
|
|
179
|
+
assert (
|
|
180
|
+
output.strip() == expected_output.strip()
|
|
181
|
+
), "Failed handling inline code within lists"
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def test_vector_storage_links_trim():
|
|
185
|
+
input_text = """
|
|
186
|
+
- List item with `code`
|
|
187
|
+
* Another `code` item【4:0†source】
|
|
188
|
+
"""
|
|
189
|
+
expected_output = """
|
|
190
|
+
• List item with <code>code</code>
|
|
191
|
+
• Another <code>code</code> item
|
|
192
|
+
"""
|
|
193
|
+
output = telegram_format(input_text)
|
|
194
|
+
assert output.strip() == expected_output.strip(), "Failed trim storage links"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def test_strikethrough_conversion():
|
|
198
|
+
input_text = "This is ~~strikethrough~~ text."
|
|
199
|
+
expected_output = "This is <s>strikethrough</s> text."
|
|
200
|
+
output = telegram_format(input_text)
|
|
201
|
+
assert output == expected_output, "Failed converting ~~ to <s> tags"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def test_blockquote_conversion():
|
|
205
|
+
input_text = "> This is a blockquote."
|
|
206
|
+
expected_output = "<blockquote>This is a blockquote.</blockquote>"
|
|
207
|
+
output = telegram_format(input_text)
|
|
208
|
+
assert output == expected_output, "Failed converting > to <blockquote> tags"
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def test_inline_url_conversion():
|
|
212
|
+
input_text = "[example](http://example.com)"
|
|
213
|
+
expected_output = '<a href="http://example.com">example</a>'
|
|
214
|
+
output = telegram_format(input_text)
|
|
215
|
+
assert output == expected_output, "Failed converting [text](URL) to <a> tags"
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def test_inline_mention_conversion():
|
|
219
|
+
input_text = "[User](tg://user?id=123456789)"
|
|
220
|
+
expected_output = '<a href="tg://user?id=123456789">User</a>'
|
|
221
|
+
output = telegram_format(input_text)
|
|
222
|
+
assert (
|
|
223
|
+
output == expected_output
|
|
224
|
+
), "Failed converting [text](tg://user?id=ID) to <a> tags"
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def test_escaping_ampersand():
|
|
228
|
+
input_text = "Use & in your HTML."
|
|
229
|
+
expected_output = "Use &amp; in your HTML."
|
|
230
|
+
output = telegram_format(input_text)
|
|
231
|
+
assert output == expected_output, "Failed escaping & character"
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def test_pre_and_code_tags_with_html_entities():
|
|
235
|
+
input_text = "```html\n<div>Content</div>\n```"
|
|
236
|
+
expected_output = (
|
|
237
|
+
'<pre><code class="language-html">&lt;div&gt;Content&lt;/div&gt;\n</code></pre>'
|
|
238
|
+
)
|
|
239
|
+
output = telegram_format(input_text)
|
|
240
|
+
assert (
|
|
241
|
+
output == expected_output
|
|
242
|
+
), "Failed handling pre and code tags with HTML entities"
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def test_code_with_multiple_lines():
|
|
246
|
+
input_text = "```\ndef example():\n return 'example'\n```"
|
|
247
|
+
expected_output = "<pre><code>def example():\n return 'example'\n</code></pre>"
|
|
248
|
+
output = telegram_format(input_text)
|
|
249
|
+
assert output == expected_output, "Failed handling code with multiple lines"
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def test_combined_formatting_with_lists():
|
|
253
|
+
input_text = """
|
|
254
|
+
- **Bold** list item
|
|
255
|
+
- _Italic_ list item
|
|
256
|
+
- `Code` list item
|
|
257
|
+
"""
|
|
258
|
+
expected_output = """
|
|
259
|
+
• <b>Bold</b> list item
|
|
260
|
+
• <i>Italic</i> list item
|
|
261
|
+
• <code>Code</code> list item
|
|
262
|
+
"""
|
|
263
|
+
output = telegram_format(input_text)
|
|
264
|
+
assert (
|
|
265
|
+
output.strip() == expected_output.strip()
|
|
266
|
+
), "Failed handling combined formatting with lists"
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def test_md_large_example():
|
|
270
|
+
input_text = """
|
|
271
|
+
1. **Headings:**
|
|
272
|
+
# H1 Heading
|
|
273
|
+
## H2 Heading
|
|
274
|
+
### H3 Heading
|
|
275
|
+
#### H4 Heading
|
|
276
|
+
##### H5 Heading
|
|
277
|
+
###### H6 Heading
|
|
278
|
+
|
|
279
|
+
2. **Emphasis:**
|
|
280
|
+
|
|
281
|
+
*Italic text* or _Italic text_
|
|
282
|
+
|
|
283
|
+
**Bold text** or __Underline text__
|
|
284
|
+
|
|
285
|
+
***Bold and italic text*** or ___Underline and italic text___
|
|
286
|
+
|
|
287
|
+
3. **Lists:**
|
|
288
|
+
- **Unordered List:**
|
|
289
|
+
|
|
290
|
+
- Item 1
|
|
291
|
+
- Item 2
|
|
292
|
+
- Subitem 1
|
|
293
|
+
- Subitem 2
|
|
294
|
+
|
|
295
|
+
- **Ordered List:**
|
|
296
|
+
|
|
297
|
+
1. First item
|
|
298
|
+
2. Second item
|
|
299
|
+
1. Subitem 1
|
|
300
|
+
2. Subitem 2
|
|
301
|
+
|
|
302
|
+
4. **Links:**
|
|
303
|
+
|
|
304
|
+
[OpenAI](https://www.openai.com)
|
|
305
|
+
|
|
306
|
+
5. **Images:**
|
|
307
|
+
|
|
308
|
+
![Alt text for image](URL_to_image)
|
|
309
|
+
![Alt text for image](URL_to_імедж)
|
|
310
|
+
|
|
311
|
+
6. **Blockquotes:**
|
|
312
|
+
|
|
313
|
+
> This is a blockquote.
|
|
314
|
+
> It can span multiple lines.
|
|
315
|
+
|
|
316
|
+
7. **Inline Code:**
|
|
317
|
+
|
|
318
|
+
Here is some `inline code`.
|
|
319
|
+
|
|
320
|
+
8. **Code Blocks:**
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
def example_function():
|
|
324
|
+
print("Hello World")
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
9. **Tables:**
|
|
328
|
+
|
|
329
|
+
| Header 1 | Header 2 |
|
|
330
|
+
|----------|----------|
|
|
331
|
+
| Row 1 Col 1 | Row 1 Col 2 |
|
|
332
|
+
| Row 2 Col 1 | Row 2 Col 2 |
|
|
333
|
+
|
|
334
|
+
10. **Horizontal Rule:**
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
"""
|
|
338
|
+
expected_output = """
|
|
339
|
+
1. <b>Headings:</b>
|
|
340
|
+
<b>H1 Heading</b>
|
|
341
|
+
<b>H2 Heading</b>
|
|
342
|
+
<b>H3 Heading</b>
|
|
343
|
+
<b>H4 Heading</b>
|
|
344
|
+
<b>H5 Heading</b>
|
|
345
|
+
<b>H6 Heading</b>
|
|
346
|
+
|
|
347
|
+
2. <b>Emphasis:</b>
|
|
348
|
+
|
|
349
|
+
<i>Italic text</i> or <i>Italic text</i>
|
|
350
|
+
|
|
351
|
+
<b>Bold text</b> or <u>Underline text</u>
|
|
352
|
+
|
|
353
|
+
<b><i>Bold and italic text</i></b> or <u><i>Underline and italic text</i></u>
|
|
354
|
+
|
|
355
|
+
3. <b>Lists:</b>
|
|
356
|
+
• <b>Unordered List:</b>
|
|
357
|
+
|
|
358
|
+
• Item 1
|
|
359
|
+
• Item 2
|
|
360
|
+
• Subitem 1
|
|
361
|
+
• Subitem 2
|
|
362
|
+
|
|
363
|
+
• <b>Ordered List:</b>
|
|
364
|
+
|
|
365
|
+
1. First item
|
|
366
|
+
2. Second item
|
|
367
|
+
1. Subitem 1
|
|
368
|
+
2. Subitem 2
|
|
369
|
+
|
|
370
|
+
4. <b>Links:</b>
|
|
371
|
+
|
|
372
|
+
<a href="https://www.openai.com">OpenAI</a>
|
|
373
|
+
|
|
374
|
+
5. <b>Images:</b>
|
|
375
|
+
|
|
376
|
+
<a href="URL_to_image">Alt text for image</a>
|
|
377
|
+
<a href="URL_to_імедж">Alt text for image</a>
|
|
378
|
+
|
|
379
|
+
6. <b>Blockquotes:</b>
|
|
380
|
+
|
|
381
|
+
<blockquote>This is a blockquote.
|
|
382
|
+
It can span multiple lines.</blockquote>
|
|
383
|
+
|
|
384
|
+
7. <b>Inline Code:</b>
|
|
385
|
+
|
|
386
|
+
Here is some <code>inline code</code>.
|
|
387
|
+
|
|
388
|
+
8. <b>Code Blocks:</b>
|
|
389
|
+
|
|
390
|
+
<pre><code class="language-python">def example_function():
|
|
391
|
+
print("Hello World")
|
|
392
|
+
</code></pre>
|
|
393
|
+
|
|
394
|
+
9. <b>Tables:</b>
|
|
395
|
+
|
|
396
|
+
| Header 1 | Header 2 |
|
|
397
|
+
|----------|----------|
|
|
398
|
+
| Row 1 Col 1 | Row 1 Col 2 |
|
|
399
|
+
| Row 2 Col 1 | Row 2 Col 2 |
|
|
400
|
+
|
|
401
|
+
10. <b>Horizontal Rule:</b>
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
"""
|
|
405
|
+
output = telegram_format(input_text)
|
|
406
|
+
assert (
|
|
407
|
+
output.strip() == expected_output.strip()
|
|
408
|
+
), "Failed handling large markdown example"
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def test_unclosed_single_backtick():
|
|
412
|
+
"""Test that a single unclosed backtick is properly handled"""
|
|
413
|
+
text = "Here is some `code without closing"
|
|
414
|
+
result = ensure_closing_delimiters(text)
|
|
415
|
+
assert result == "Here is some `code without closing`"
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def test_unclosed_triple_backtick():
|
|
419
|
+
"""Test that unclosed triple backticks are properly handled"""
|
|
420
|
+
text = "Here is some ```code without closing"
|
|
421
|
+
result = ensure_closing_delimiters(text)
|
|
422
|
+
assert result == "Here is some ```code without closing```"
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def test_bracket_link_with_additional_text():
|
|
426
|
+
"""
|
|
427
|
+
Ensures that text like '[OtherText] [Title](Link)' doesn't
|
|
428
|
+
merge 'OtherText' and 'Title' into the <a> tag text.
|
|
429
|
+
"""
|
|
430
|
+
input_text = "[OtherText] [Title](https://example.com)"
|
|
431
|
+
output = telegram_format(input_text)
|
|
432
|
+
expected_output = '[OtherText] <a href="https://example.com">Title</a>'
|
|
433
|
+
assert output == expected_output, f"Output was: {output}"
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def test_heading_formatting_with_newlines():
|
|
437
|
+
"""
|
|
438
|
+
Checks that headings #, ##, etc. are properly wrapped in <b> tags.
|
|
439
|
+
"""
|
|
440
|
+
input_text = """# Heading1
|
|
441
|
+
Some text
|
|
442
|
+
## Heading2
|
|
443
|
+
More text
|
|
444
|
+
"""
|
|
445
|
+
output = telegram_format(input_text)
|
|
446
|
+
lines = output.splitlines()
|
|
447
|
+
|
|
448
|
+
assert "<b>Heading1</b>" in output
|
|
449
|
+
assert "<b>Heading2</b>" in output
|
|
450
|
+
assert lines[0] == "<b>Heading1</b>"
|
|
451
|
+
assert lines[1] == "Some text"
|
|
452
|
+
assert lines[2] == "<b>Heading2</b>"
|
|
453
|
+
assert lines[3] == "More text"
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def test_list_formatting_with_newlines():
|
|
457
|
+
"""
|
|
458
|
+
Checks that list items (starting with '-' or '*') become bullet points,
|
|
459
|
+
each on its own line with proper spacing.
|
|
460
|
+
"""
|
|
461
|
+
input_text = """- Item one
|
|
462
|
+
- Item two
|
|
463
|
+
* Item three
|
|
464
|
+
Some text
|
|
465
|
+
- Item four"""
|
|
466
|
+
output = telegram_format(input_text)
|
|
467
|
+
lines = [line.strip() for line in output.splitlines() if line.strip()]
|
|
468
|
+
|
|
469
|
+
assert "• Item one" in lines
|
|
470
|
+
assert "• Item two" in lines
|
|
471
|
+
assert "• Item three" in lines
|
|
472
|
+
assert "• Item four" in lines
|
|
473
|
+
assert "Some text" in lines
|
|
474
|
+
|
|
475
|
+
bullet_lines = [line for line in lines if line.startswith("•")]
|
|
476
|
+
assert len(bullet_lines) == 4
|
|
477
|
+
assert bullet_lines[0] == "• Item one"
|
|
478
|
+
assert bullet_lines[1] == "• Item two"
|
|
479
|
+
assert bullet_lines[2] == "• Item three"
|
|
480
|
+
assert bullet_lines[3] == "• Item four"
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def test_preserve_other_brackets():
|
|
484
|
+
"""
|
|
485
|
+
Ensures that other bracketed text not forming a valid link is preserved literally.
|
|
486
|
+
"""
|
|
487
|
+
input_text = "Look at [this], but [not a link] something else."
|
|
488
|
+
output = telegram_format(input_text)
|
|
489
|
+
assert "[this]" in output
|
|
490
|
+
assert "[not a link]" in output
|
|
491
|
+
assert "<a href=" not in output
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def test_link_with_nested_brackets():
|
|
495
|
+
"""Test that links with nested brackets in the text are handled correctly"""
|
|
496
|
+
input_text = "[Link [with brackets]](https://example.com)"
|
|
497
|
+
output = telegram_format(input_text)
|
|
498
|
+
expected_output = '<a href="https://example.com">Link [with brackets]</a>'
|
|
499
|
+
assert output == expected_output, f"Output was: {output}"
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def test_link_with_spaces():
|
|
503
|
+
"""Test that links with spaces are handled correctly"""
|
|
504
|
+
input_text = "[OtherText] [Title](Link)"
|
|
505
|
+
output = telegram_format(input_text)
|
|
506
|
+
expected_output = '[OtherText] <a href="Link">Title</a>'
|
|
507
|
+
assert output == expected_output, f"Output was: {output}"
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def test_ukrainian_bullet_points():
|
|
511
|
+
input_text = """Звісно, ось список цікавих речей у форматі Markdown:
|
|
512
|
+
|
|
513
|
+
* **Парадокс кота Шредінгера:** Чи може кіт бути одночасно живим і мертвим? 🤔
|
|
514
|
+
* **Ефект метелика:** Маленька зміна може мати великі наслідки. 🦋
|
|
515
|
+
* **Теорія струн:** Чи є наш всесвіт просто вібрацією струн? 🎶
|
|
516
|
+
* **Темна матерія та темна енергія:** Що складає 95% всесвіту? 🌌
|
|
517
|
+
* **Квантова заплутаність:** Чи можуть два об'єкти бути зв'язані на відстані? 🔗
|
|
518
|
+
* **Соліпсизм:** Чи існує щось, крім моєї свідомості? 🤨
|
|
519
|
+
* **Парадокс Фермі:** Де всі інші інопланетяни? 👽
|
|
520
|
+
* **Симуляційна гіпотеза:** Чи живемо ми в симуляції? 💻
|
|
521
|
+
* **Ефект Даннінга-Крюгера:** Чому некомпетентні люди переоцінюють себе? 🤓
|
|
522
|
+
* **Когнітивні спотворення:** Як наш мозок обманює нас? 🤯
|
|
523
|
+
"""
|
|
524
|
+
|
|
525
|
+
expected_output = """Звісно, ось список цікавих речей у форматі Markdown:
|
|
526
|
+
|
|
527
|
+
• <b>Парадокс кота Шредінгера:</b> Чи може кіт бути одночасно живим і мертвим? 🤔
|
|
528
|
+
• <b>Ефект метелика:</b> Маленька зміна може мати великі наслідки. 🦋
|
|
529
|
+
• <b>Теорія струн:</b> Чи є наш всесвіт просто вібрацією струн? 🎶
|
|
530
|
+
• <b>Темна матерія та темна енергія:</b> Що складає 95% всесвіту? 🌌
|
|
531
|
+
• <b>Квантова заплутаність:</b> Чи можуть два об'єкти бути зв'язані на відстані? 🔗
|
|
532
|
+
• <b>Соліпсизм:</b> Чи існує щось, крім моєї свідомості? 🤨
|
|
533
|
+
• <b>Парадокс Фермі:</b> Де всі інші інопланетяни? 👽
|
|
534
|
+
• <b>Симуляційна гіпотеза:</b> Чи живемо ми в симуляції? 💻
|
|
535
|
+
• <b>Ефект Даннінга-Крюгера:</b> Чому некомпетентні люди переоцінюють себе? 🤓
|
|
536
|
+
• <b>Когнітивні спотворення:</b> Як наш мозок обманює нас? 🤯
|
|
537
|
+
"""
|
|
538
|
+
|
|
539
|
+
output = telegram_format(input_text)
|
|
540
|
+
print(output)
|
|
541
|
+
assert output.strip() == expected_output.strip()
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def test_asterisk_in_equations():
|
|
545
|
+
"""Test that asterisks in mathematical equations are not converted to italic"""
|
|
546
|
+
test_cases = [
|
|
547
|
+
("2 * 2 = 4", "2 * 2 = 4"),
|
|
548
|
+
("x*y + z = 10", "x*y + z = 10"),
|
|
549
|
+
("a * b * c", "a * b * c"),
|
|
550
|
+
("2*x + 3*y = z", "2*x + 3*y = z"),
|
|
551
|
+
("This is *italic* but 2 * 2 is not", "This is <i>italic</i> but 2 * 2 is not"),
|
|
552
|
+
("5 * x + *emphasized* text", "5 * x + <i>emphasized</i> text"),
|
|
553
|
+
]
|
|
554
|
+
|
|
555
|
+
for input_text, expected_output in test_cases:
|
|
556
|
+
output = telegram_format(input_text)
|
|
557
|
+
assert (
|
|
558
|
+
output == expected_output
|
|
559
|
+
), f"Failed on input: {input_text}, got: {output}"
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def test_complex_equations_with_asterisk():
|
|
563
|
+
"""Test more complex mathematical expressions with asterisks"""
|
|
564
|
+
input_text = """The formula is:
|
|
565
|
+
f(x) = 2*x + 3*y
|
|
566
|
+
g(x) = x * (y + z)
|
|
567
|
+
This is *italic* text with equation 2 * 2 = 4
|
|
568
|
+
"""
|
|
569
|
+
expected_output = """The formula is:
|
|
570
|
+
f(x) = 2*x + 3*y
|
|
571
|
+
g(x) = x * (y + z)
|
|
572
|
+
This is <i>italic</i> text with equation 2 * 2 = 4"""
|
|
573
|
+
|
|
574
|
+
output = telegram_format(input_text)
|
|
575
|
+
assert output.strip() == expected_output.strip(), f"Output was: {output}"
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
# ----------------------------------------------------------------------------------------
|
|
579
|
+
# New, more comprehensive and edge-case test methods begin here
|
|
580
|
+
# ----------------------------------------------------------------------------------------
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def test_empty_string():
|
|
584
|
+
"""Check behavior with an empty string."""
|
|
585
|
+
input_text = ""
|
|
586
|
+
output = telegram_format(input_text)
|
|
587
|
+
assert output == ""
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def test_spaces_only():
|
|
591
|
+
"""Check behavior with a string that has only spaces."""
|
|
592
|
+
input_text = " "
|
|
593
|
+
output = telegram_format(input_text)
|
|
594
|
+
# Should either remain blank or just be those spaces (strip() might remove them)
|
|
595
|
+
assert output.strip() == ""
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def test_asterisk_in_parentheses():
|
|
599
|
+
"""Edge case with asterisk in parentheses."""
|
|
600
|
+
input_text = "(2*3) is an equation, but *italic* text is separate."
|
|
601
|
+
expected_output = "(2*3) is an equation, but <i>italic</i> text is separate."
|
|
602
|
+
output = telegram_format(input_text)
|
|
603
|
+
assert output == expected_output
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def test_underscore_in_non_italic_context():
|
|
607
|
+
"""Edge case with underscores that should not convert to italic."""
|
|
608
|
+
input_text = "This_variable should remain, but _italic_ should convert."
|
|
609
|
+
expected_output = "This_variable should remain, but <i>italic</i> should convert."
|
|
610
|
+
output = telegram_format(input_text)
|
|
611
|
+
assert output == expected_output
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def test_code_block_mixed_with_unescaped_html():
|
|
615
|
+
"""Ensure code block remains escaped but outside text is processed normally."""
|
|
616
|
+
input_text = """
|
|
617
|
+
Some <div>stuff</div> here.
|
|
618
|
+
```
|
|
619
|
+
<html><body>Unescaped?</body></html>
|
|
620
|
+
```
|
|
621
|
+
More text with *italic*.
|
|
622
|
+
"""
|
|
623
|
+
expected_output = """
|
|
624
|
+
Some &lt;div&gt;stuff&lt;/div&gt; here.
|
|
625
|
+
<pre><code>&lt;html&gt;&lt;body&gt;Unescaped?&lt;/body&gt;&lt;/html&gt;
|
|
626
|
+
</code></pre>
|
|
627
|
+
More text with <i>italic</i>.
|
|
628
|
+
"""
|
|
629
|
+
output = telegram_format(input_text)
|
|
630
|
+
assert output.strip() == expected_output.strip()
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def test_equation_with_asterisks_and_italics_combined():
|
|
634
|
+
"""More advanced check: combine equations and true italics side by side."""
|
|
635
|
+
input_text = "2*x + 3*y = 10, and *italic* is separate."
|
|
636
|
+
expected_output = "2*x + 3*y = 10, and <i>italic</i> is separate."
|
|
637
|
+
output = telegram_format(input_text)
|
|
638
|
+
assert output == expected_output
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def test_inline_code_with_asterisk_and_underscore():
|
|
642
|
+
"""Ensure that `*` and `_` inside inline code are not interpreted as markdown."""
|
|
643
|
+
input_text = "Here is `code_with_*_asterisk` outside of `code_with__underscore__`"
|
|
644
|
+
expected_output = "Here is <code>code_with_*_asterisk</code> outside of <code>code_with__underscore__</code>"
|
|
645
|
+
output = telegram_format(input_text)
|
|
646
|
+
assert output == expected_output
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
def test_heading_followed_by_equation():
|
|
650
|
+
"""Check heading usage right before an equation line."""
|
|
651
|
+
input_text = """# MyHeading
|
|
652
|
+
2*x + y = 4
|
|
653
|
+
"""
|
|
654
|
+
# Heading should become <b>MyHeading</b>, equation line remains as is
|
|
655
|
+
expected_output = """<b>MyHeading</b>
|
|
656
|
+
2*x + y = 4"""
|
|
657
|
+
output = telegram_format(input_text)
|
|
658
|
+
assert output.strip() == expected_output.strip(), f"Got: {output}"
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from .converters import convert_html_chars, split_by_tag
|
|
3
|
-
from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
|
|
4
|
-
from .formatters import combine_blockquotes
|
|
5
|
-
from .helpers import remove_blockquote_escaping
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def telegram_format(text: str) -> str:
|
|
9
|
-
"""
|
|
10
|
-
Converts markdown in the provided text to HTML supported by Telegram.
|
|
11
|
-
"""
|
|
12
|
-
# Step 0: Combine blockquotes
|
|
13
|
-
text = combine_blockquotes(text)
|
|
14
|
-
|
|
15
|
-
# Step 1: Convert HTML reserved symbols
|
|
16
|
-
text = convert_html_chars(text)
|
|
17
|
-
|
|
18
|
-
# Step 2: Extract and convert code blocks first
|
|
19
|
-
output, code_blocks = extract_and_convert_code_blocks(text)
|
|
20
|
-
|
|
21
|
-
# Step 3: Escape HTML special characters in the output text
|
|
22
|
-
output = output.replace("<", "&lt;").replace(">", "&gt;")
|
|
23
|
-
|
|
24
|
-
# Inline code
|
|
25
|
-
output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
|
|
26
|
-
|
|
27
|
-
# Nested Bold and Italic
|
|
28
|
-
output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
|
|
29
|
-
|
|
30
|
-
# Process markdown formatting tags (bold, underline, italic, strikethrough)
|
|
31
|
-
# and convert them to their respective HTML tags
|
|
32
|
-
output = split_by_tag(output, "**", "b")
|
|
33
|
-
output = split_by_tag(output, "__", "u")
|
|
34
|
-
output = split_by_tag(output, "_", "i")
|
|
35
|
-
output = split_by_tag(output, "*", "i")
|
|
36
|
-
output = split_by_tag(output, "~~", "s")
|
|
37
|
-
|
|
38
|
-
# Remove storage links
|
|
39
|
-
output = re.sub(r"【[^】]+】", "", output)
|
|
40
|
-
|
|
41
|
-
# Convert links
|
|
42
|
-
output = re.sub(r"\[(.*?)\]\((.*?)\)", r'<a href="\2">\1</a>', output)
|
|
43
|
-
|
|
44
|
-
# Convert lists
|
|
45
|
-
output = re.sub(r"^\s*[\-\*] (.+)", r"• \1", output, flags=re.MULTILINE)
|
|
46
|
-
|
|
47
|
-
# Convert headings
|
|
48
|
-
output = re.sub(r"^\s*#+ (.+)", r"<b>\1</b>", output, flags=re.MULTILINE)
|
|
49
|
-
|
|
50
|
-
# Step 4: Reinsert the converted HTML code blocks
|
|
51
|
-
output = reinsert_code_blocks(output, code_blocks)
|
|
52
|
-
|
|
53
|
-
# Step 5: Remove blockquote escaping
|
|
54
|
-
output = remove_blockquote_escaping(output)
|
|
55
|
-
|
|
56
|
-
return output
|
|
@@ -1,256 +0,0 @@
|
|
|
1
|
-
from chatgpt_md_converter.telegram_formatter import telegram_format
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def test_split_by_tag_bold():
|
|
5
|
-
text = "This is **bold** text"
|
|
6
|
-
assert telegram_format(text) == "This is <b>bold</b> text"
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def test_telegram_format_italic():
|
|
10
|
-
text = "This is _italic_ text"
|
|
11
|
-
output = telegram_format(text)
|
|
12
|
-
assert output == "This is <i>italic</i> text"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def test_telegram_format_italic_star():
|
|
16
|
-
text = "This is *italic* text"
|
|
17
|
-
output = telegram_format(text)
|
|
18
|
-
assert output == "This is <i>italic</i> text"
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def test_triple_backticks_with_language():
|
|
22
|
-
input_text = "```python\nprint('Hello, world!')\n```"
|
|
23
|
-
expected_output = (
|
|
24
|
-
"<pre><code class=\"language-python\">print('Hello, world!')\n</code></pre>"
|
|
25
|
-
)
|
|
26
|
-
output = telegram_format(input_text)
|
|
27
|
-
assert (
|
|
28
|
-
output == expected_output
|
|
29
|
-
), "Failed converting triple backticks with language to <pre><code> tags"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def test_bold_and_underline_conversion():
|
|
33
|
-
input_text = "This is **bold** and this is __underline__."
|
|
34
|
-
expected_output = "This is <b>bold</b> and this is <u>underline</u>."
|
|
35
|
-
output = telegram_format(input_text)
|
|
36
|
-
assert output == expected_output, "Failed converting ** and __ to <b> and <u> tags"
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def test_escaping_special_characters():
|
|
40
|
-
input_text = "Avoid using < or > in your HTML."
|
|
41
|
-
expected_output = "Avoid using &lt; or &gt; in your HTML."
|
|
42
|
-
output = telegram_format(input_text)
|
|
43
|
-
assert output == expected_output, "Failed escaping < and > characters"
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def test_nested_markdown_syntax():
|
|
47
|
-
input_text = "This is **bold and _italic_** text."
|
|
48
|
-
expected_output = "This is <b>bold and <i>italic</i></b> text."
|
|
49
|
-
output = telegram_format(input_text)
|
|
50
|
-
assert output == expected_output, "Failed handling nested markdown syntax"
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def test_combination_of_markdown_elements():
|
|
54
|
-
input_text = """
|
|
55
|
-
# Heading
|
|
56
|
-
This is a test of **bold**, __underline__, and `inline code`.
|
|
57
|
-
- Item 1
|
|
58
|
-
* Item 2
|
|
59
|
-
|
|
60
|
-
```python
|
|
61
|
-
for i in range(3):
|
|
62
|
-
print(i)
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
[Link](http://example.com)
|
|
66
|
-
"""
|
|
67
|
-
expected_output = """<b>Heading</b>\nThis is a test of <b>bold</b>, <u>underline</u>, and <code>inline code</code>.\n• Item 1\n• Item 2\n\n<pre><code class="language-python">for i in range(3):\n print(i)\n</code></pre>\n\n<a href="http://example.com">Link</a>\n"""
|
|
68
|
-
output = telegram_format(input_text)
|
|
69
|
-
assert (
|
|
70
|
-
output.strip() == expected_output.strip()
|
|
71
|
-
), "Failed combining multiple markdown elements into HTML"
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def test_nested_bold_within_italic():
|
|
75
|
-
input_text = "This is *__bold within italic__* text."
|
|
76
|
-
expected_output = "This is <i><u>bold within italic</u></i> text."
|
|
77
|
-
output = telegram_format(input_text)
|
|
78
|
-
assert (
|
|
79
|
-
output == expected_output
|
|
80
|
-
), "Failed converting nested bold within italic markdown to HTML"
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def test_italic_within_bold():
|
|
84
|
-
input_text = "This is **bold and _italic_ together**."
|
|
85
|
-
expected_output = "This is <b>bold and <i>italic</i> together</b>."
|
|
86
|
-
output = telegram_format(input_text)
|
|
87
|
-
assert (
|
|
88
|
-
output == expected_output
|
|
89
|
-
), "Failed converting italic within bold markdown to HTML"
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def test_inline_code_within_bold_text():
|
|
93
|
-
input_text = "This is **bold and `inline code` together**."
|
|
94
|
-
expected_output = "This is <b>bold and <code>inline code</code> together</b>."
|
|
95
|
-
output = telegram_format(input_text)
|
|
96
|
-
assert output == expected_output, "Failed handling inline code within bold text"
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def test_mixed_formatting_tags_with_lists_and_links():
|
|
100
|
-
input_text = """
|
|
101
|
-
- This is a list item with **bold**, __underline__, and [a link](http://example.com)
|
|
102
|
-
- Another item with ***bold and italic*** text
|
|
103
|
-
"""
|
|
104
|
-
expected_output = """
|
|
105
|
-
• This is a list item with <b>bold</b>, <u>underline</u>, and <a href="http://example.com">a link</a>
|
|
106
|
-
• Another item with <b><i>bold and italic</i></b> text
|
|
107
|
-
"""
|
|
108
|
-
output = telegram_format(input_text)
|
|
109
|
-
assert (
|
|
110
|
-
output.strip() == expected_output.strip()
|
|
111
|
-
), "Failed handling mixed formatting tags with lists and links"
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def test_special_characters_within_code_blocks():
|
|
115
|
-
input_text = "Here is a code block: ```<script>alert('Hello')</script>```"
|
|
116
|
-
expected_output = "Here is a code block: <pre><code>&lt;script&gt;alert('Hello')&lt;/script&gt;</code></pre>"
|
|
117
|
-
output = telegram_format(input_text)
|
|
118
|
-
assert (
|
|
119
|
-
output == expected_output
|
|
120
|
-
), "Failed escaping special characters within code blocks"
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
def test_code_block_within_bold_text():
|
|
124
|
-
input_text = "This is **bold with a `code block` inside**."
|
|
125
|
-
expected_output = "This is <b>bold with a <code>code block</code> inside</b>."
|
|
126
|
-
output = telegram_format(input_text)
|
|
127
|
-
assert output == expected_output, "Failed handling code block within bold text"
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def test_triple_backticks_with_nested_markdown():
|
|
131
|
-
input_text = "```python\n**bold text** and __underline__ in code block```"
|
|
132
|
-
# Expecting the markdown syntax to be ignored within the code block
|
|
133
|
-
expected_output = '<pre><code class="language-python">**bold text** and __underline__ in code block</code></pre>'
|
|
134
|
-
output = telegram_format(input_text)
|
|
135
|
-
assert (
|
|
136
|
-
output == expected_output
|
|
137
|
-
), "Failed handling markdown within triple backtick code blocks"
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def test_unmatched_code_delimiters():
|
|
141
|
-
input_text = "This has an `unmatched code delimiter."
|
|
142
|
-
# Expecting original input as output due to the unmatched delimiter
|
|
143
|
-
expected_output = "This has an <code>unmatched code delimiter.</code>"
|
|
144
|
-
output = telegram_format(input_text)
|
|
145
|
-
assert output == expected_output, "Failed handling unmatched code delimiters"
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def test_preformatted_block_with_unusual_language_specification():
|
|
149
|
-
input_text = "```weirdLang\nSome weirdLang code\n```"
|
|
150
|
-
expected_output = (
|
|
151
|
-
'<pre><code class="language-weirdLang">Some weirdLang code\n</code></pre>'
|
|
152
|
-
)
|
|
153
|
-
output = telegram_format(input_text)
|
|
154
|
-
assert (
|
|
155
|
-
output == expected_output
|
|
156
|
-
), "Failed handling preformatted block with unusual language specification"
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def test_inline_code_within_lists():
|
|
160
|
-
input_text = """
|
|
161
|
-
- List item with `code`
|
|
162
|
-
* Another `code` item
|
|
163
|
-
"""
|
|
164
|
-
expected_output = """
|
|
165
|
-
• List item with <code>code</code>
|
|
166
|
-
• Another <code>code</code> item
|
|
167
|
-
"""
|
|
168
|
-
output = telegram_format(input_text)
|
|
169
|
-
assert (
|
|
170
|
-
output.strip() == expected_output.strip()
|
|
171
|
-
), "Failed handling inline code within lists"
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def test_vector_storage_links_trim():
|
|
175
|
-
input_text = """
|
|
176
|
-
- List item with `code`
|
|
177
|
-
* Another `code` item【4:0†source】
|
|
178
|
-
"""
|
|
179
|
-
expected_output = """
|
|
180
|
-
• List item with <code>code</code>
|
|
181
|
-
• Another <code>code</code> item
|
|
182
|
-
"""
|
|
183
|
-
output = telegram_format(input_text)
|
|
184
|
-
assert output.strip() == expected_output.strip(), "Failed trim storage links"
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
def test_strikethrough_conversion():
|
|
188
|
-
input_text = "This is ~~strikethrough~~ text."
|
|
189
|
-
expected_output = "This is <s>strikethrough</s> text."
|
|
190
|
-
output = telegram_format(input_text)
|
|
191
|
-
assert output == expected_output, "Failed converting ~~ to <s> tags"
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def test_blockquote_conversion():
|
|
195
|
-
input_text = "> This is a blockquote."
|
|
196
|
-
expected_output = "<blockquote>This is a blockquote.</blockquote>"
|
|
197
|
-
output = telegram_format(input_text)
|
|
198
|
-
assert output == expected_output, "Failed converting > to <blockquote> tags"
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
def test_inline_url_conversion():
|
|
202
|
-
input_text = "[example](http://example.com)"
|
|
203
|
-
expected_output = '<a href="http://example.com">example</a>'
|
|
204
|
-
output = telegram_format(input_text)
|
|
205
|
-
assert output == expected_output, "Failed converting [text](URL) to <a> tags"
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
def test_inline_mention_conversion():
|
|
209
|
-
input_text = "[User](tg://user?id=123456789)"
|
|
210
|
-
expected_output = '<a href="tg://user?id=123456789">User</a>'
|
|
211
|
-
output = telegram_format(input_text)
|
|
212
|
-
assert (
|
|
213
|
-
output == expected_output
|
|
214
|
-
), "Failed converting [text](tg://user?id=ID) to <a> tags"
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
def test_escaping_ampersand():
|
|
218
|
-
input_text = "Use & in your HTML."
|
|
219
|
-
expected_output = "Use &amp; in your HTML."
|
|
220
|
-
output = telegram_format(input_text)
|
|
221
|
-
assert output == expected_output, "Failed escaping & character"
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
def test_pre_and_code_tags_with_html_entities():
|
|
225
|
-
input_text = "```html\n<div>Content</div>\n```"
|
|
226
|
-
expected_output = (
|
|
227
|
-
'<pre><code class="language-html">&lt;div&gt;Content&lt;/div&gt;\n</code></pre>'
|
|
228
|
-
)
|
|
229
|
-
output = telegram_format(input_text)
|
|
230
|
-
assert (
|
|
231
|
-
output == expected_output
|
|
232
|
-
), "Failed handling pre and code tags with HTML entities"
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
def test_code_with_multiple_lines():
|
|
236
|
-
input_text = "```\ndef example():\n return 'example'\n```"
|
|
237
|
-
expected_output = "<pre><code>def example():\n return 'example'\n</code></pre>"
|
|
238
|
-
output = telegram_format(input_text)
|
|
239
|
-
assert output == expected_output, "Failed handling code with multiple lines"
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
def test_combined_formatting_with_lists():
|
|
243
|
-
input_text = """
|
|
244
|
-
- **Bold** list item
|
|
245
|
-
- _Italic_ list item
|
|
246
|
-
- `Code` list item
|
|
247
|
-
"""
|
|
248
|
-
expected_output = """
|
|
249
|
-
• <b>Bold</b> list item
|
|
250
|
-
• <i>Italic</i> list item
|
|
251
|
-
• <code>Code</code> list item
|
|
252
|
-
"""
|
|
253
|
-
output = telegram_format(input_text)
|
|
254
|
-
assert (
|
|
255
|
-
output.strip() == expected_output.strip()
|
|
256
|
-
), "Failed handling combined formatting with lists"
|
|
File without changes
|
|
File without changes
|
{chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/extractors.py
RENAMED
|
File without changes
|
|
File without changes
|
{chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|