chatgpt-md-converter 0.1.1__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/PKG-INFO +1 -1
  2. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/converters.py +2 -1
  3. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/formatters.py +14 -6
  4. chatgpt_md_converter-0.2.0/chatgpt_md_converter/telegram_formatter.py +100 -0
  5. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/PKG-INFO +1 -1
  6. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/setup.py +1 -1
  7. chatgpt_md_converter-0.2.0/tests/test_parser.py +658 -0
  8. chatgpt_md_converter-0.1.1/chatgpt_md_converter/telegram_formatter.py +0 -56
  9. chatgpt_md_converter-0.1.1/tests/test_parser.py +0 -256
  10. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/LICENSE +0 -0
  11. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/__init__.py +0 -0
  12. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/extractors.py +0 -0
  13. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/helpers.py +0 -0
  14. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/SOURCES.txt +0 -0
  15. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
  16. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
  17. {chatgpt_md_converter-0.1.1 → chatgpt_md_converter-0.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: chatgpt_md_converter
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/Latand/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -16,6 +16,7 @@ def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
16
16
  Splits the text by markdown tag and replaces it with the specified HTML tag.
17
17
  """
18
18
  tag_pattern = re.compile(
19
- r"{}(.*?){}".format(re.escape(md_tag), re.escape(md_tag)), re.DOTALL
19
+ r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
20
+ re.DOTALL,
20
21
  )
21
22
  return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)
@@ -1,9 +1,6 @@
1
- import re
2
-
3
-
4
1
  def combine_blockquotes(text: str) -> str:
5
2
  """
6
- Combines multiline blockquotes into a single blockquote.
3
+ Combines multiline blockquotes into a single blockquote while keeping the \n characters.
7
4
  """
8
5
  lines = text.split("\n")
9
6
  combined_lines = []
@@ -17,7 +14,7 @@ def combine_blockquotes(text: str) -> str:
17
14
  else:
18
15
  if in_blockquote:
19
16
  combined_lines.append(
20
- "<blockquote>" + " ".join(blockquote_lines) + "</blockquote>"
17
+ "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
21
18
  )
22
19
  blockquote_lines = []
23
20
  in_blockquote = False
@@ -25,7 +22,18 @@ def combine_blockquotes(text: str) -> str:
25
22
 
26
23
  if in_blockquote:
27
24
  combined_lines.append(
28
- "<blockquote>" + " ".join(blockquote_lines) + "</blockquote>"
25
+ "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
29
26
  )
30
27
 
31
28
  return "\n".join(combined_lines)
29
+
30
+
31
def fix_asterisk_equations(text: str) -> str:
    """
    Replaces '*' between numeric operands with '×'
    to avoid accidental italic formatting.
    e.g. '6*8' -> '6×8', '6 * 8' -> '6×8', '2*3*4' -> '2×3×4'

    Only spaces and tabs around the '*' are absorbed, so a list bullet that
    starts the line after a number is not mistaken for a multiplication.
    Asterisks next to non-digits (e.g. 'x*y', '2*x') are left unchanged.

    :param text: Text to scan for numeric multiplications.
    :return: The text with every digit-'*'-digit occurrence rewritten to '×'.
    """
    # Local import keeps the function self-contained.
    import re

    # The right operand is only asserted with a lookahead, not consumed, so it
    # can serve as the left operand of the next match in a chained product.
    # [ \t]* (rather than \s*) keeps the match on a single line.
    eq_pattern = re.compile(r"(\d)[ \t]*\*[ \t]*(?=\d)")
    return eq_pattern.sub(r"\1×", text)
@@ -0,0 +1,100 @@
1
+ import re
2
+
3
+ from .converters import convert_html_chars, split_by_tag
4
+ from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
5
+ from .formatters import combine_blockquotes
6
+ from .helpers import remove_blockquote_escaping
7
+
8
+
9
def extract_inline_code_snippets(text: str):
    """
    Extracts inline code (single-backtick content) from the text,
    replacing each occurrence with a placeholder.
    This ensures characters like '*' or '_' inside inline code won't be
    interpreted as Markdown by later processing.

    :param text: Text that may contain `inline code` spans.
    :return: A tuple ``(new_text, code_snippets)`` where ``new_text`` has every
        non-empty backtick span replaced by ``INLINECODEPLACEHOLDER<n>``
        (n counts from 0 in order of appearance) and ``code_snippets`` maps
        each placeholder to the raw text that was between the backticks.
        An unpaired backtick is left in the text as-is.
    """
    code_snippets = {}
    inline_code_pattern = re.compile(r"`([^`]+)`")

    def replacer(match):
        # The number of snippets stored so far is the next free index.
        placeholder = f"INLINECODEPLACEHOLDER{len(code_snippets)}"
        code_snippets[placeholder] = match.group(1)
        return placeholder

    new_text = inline_code_pattern.sub(replacer, text)
    return new_text, code_snippets
28
+
29
+
30
def telegram_format(text: str) -> str:
    """
    Converts markdown in the provided text to HTML supported by Telegram.

    Code (triple-backtick blocks and single-backtick spans) is swapped out for
    placeholders before any markdown rule runs and is put back at the end, so
    markup characters inside code are never reinterpreted.

    :param text: Markdown text.
    :return: The converted text with surrounding whitespace stripped and runs
        of three or more newlines collapsed to two.
    """

    # Step 0: Combine blockquotes
    text = combine_blockquotes(text)

    # Step 1: Convert HTML reserved symbols
    text = convert_html_chars(text)

    # Step 2: Extract and convert triple-backtick code blocks first
    output, triple_code_blocks = extract_and_convert_code_blocks(text)

    # Step 2.5: Extract inline code snippets (single backticks) so they won't be parsed as italics, etc.
    output, inline_code_snippets = extract_inline_code_snippets(output)

    # Step 3: Escape HTML special characters in the output text (for non-code parts)
    # Code content sits behind placeholders at this point and is not touched.
    output = output.replace("<", "&lt;").replace(">", "&gt;")

    # Convert headings (H1-H6)
    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)

    # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)

    # Nested Bold and Italic
    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)

    # Process markdown for bold (**), underline (__), strikethrough (~~)
    output = split_by_tag(output, "**", "b")
    output = split_by_tag(output, "__", "u")
    output = split_by_tag(output, "~~", "s")

    # Custom approach for single-asterisk italic:
    # the opening '*' must not follow an ASCII letter/digit and must be followed
    # by non-whitespace; the closing '*' must follow non-whitespace and must not
    # be followed by an ASCII letter/digit. This keeps 'x*y' and '2 * 2' intact.
    italic_pattern = re.compile(
        r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])",
        re.DOTALL,
    )
    output = italic_pattern.sub(r"<i>\1</i>", output)

    # Process single underscore-based italic
    output = split_by_tag(output, "_", "i")

    # Remove storage links (Vector storage placeholders like 【4:0†source】)
    output = re.sub(r"【[^】]+】", "", output)

    # Convert Markdown links/images to <a href="">…</a>
    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)

    # Step 3.5: Reinsert inline code snippets, escaping special chars in code content.
    # Longest placeholders go first: 'INLINECODEPLACEHOLDER1' is a prefix of
    # 'INLINECODEPLACEHOLDER10', so replacing in insertion order would corrupt
    # every placeholder numbered 10 and above.
    # NOTE(review): the text already went through convert_html_chars in Step 1;
    # if that helper escapes '&', '<' and '>', snippets are escaped a second
    # time here — confirm against the helper.
    for placeholder, snippet in sorted(
        inline_code_snippets.items(), key=lambda item: len(item[0]), reverse=True
    ):
        escaped_snippet = snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")

    # Step 4: Reinsert the converted triple-backtick code blocks
    output = reinsert_code_blocks(output, triple_code_blocks)

    # Step 5: Remove blockquote escaping
    output = remove_blockquote_escaping(output)

    # Clean up multiple consecutive newlines, but preserve intentional spacing
    output = re.sub(r"\n{3,}", "\n\n", output)

    return output.strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: chatgpt_md_converter
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/Latand/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -2,7 +2,7 @@ from setuptools import setup
2
2
 
3
3
  setup(
4
4
  name="chatgpt_md_converter",
5
- version="0.1.1",
5
+ version="0.2.0",
6
6
  author="Kostiantyn Kriuchkov",
7
7
  author_email="latand666@gmail.com",
8
8
  description="A package for converting markdown to HTML for chat Telegram bots",
@@ -0,0 +1,658 @@
1
+ from chatgpt_md_converter.extractors import ensure_closing_delimiters
2
+ from chatgpt_md_converter.telegram_formatter import telegram_format
3
+
4
+
5
+ def test_split_by_tag_bold():
6
+ text = "This is **bold** text"
7
+ assert telegram_format(text) == "This is <b>bold</b> text"
8
+
9
+
10
+ def test_telegram_format_italic():
11
+ text = "This is _italic_ text"
12
+ output = telegram_format(text)
13
+ assert output == "This is <i>italic</i> text"
14
+
15
+
16
+ def test_telegram_format_italic_star():
17
+ text = "This is *italic* text"
18
+ output = telegram_format(text)
19
+ assert output == "This is <i>italic</i> text"
20
+
21
+
22
+ def test_triple_backticks_with_language():
23
+ input_text = "```python\nprint('Hello, world!')\n```"
24
+ expected_output = (
25
+ "<pre><code class=\"language-python\">print('Hello, world!')\n</code></pre>"
26
+ )
27
+ output = telegram_format(input_text)
28
+ assert (
29
+ output == expected_output
30
+ ), "Failed converting triple backticks with language to <pre><code> tags"
31
+
32
+
33
+ def test_bold_and_underline_conversion():
34
+ input_text = "This is **bold** and this is __underline__."
35
+ expected_output = "This is <b>bold</b> and this is <u>underline</u>."
36
+ output = telegram_format(input_text)
37
+ assert output == expected_output, "Failed converting ** and __ to <b> and <u> tags"
38
+
39
+
40
+ def test_escaping_special_characters():
41
+ input_text = "Avoid using < or > in your HTML."
42
+ expected_output = "Avoid using &lt; or &gt; in your HTML."
43
+ output = telegram_format(input_text)
44
+ assert output == expected_output, "Failed escaping < and > characters"
45
+
46
+
47
+ def test_nested_markdown_syntax():
48
+ input_text = "This is **bold and _italic_** text."
49
+ expected_output = "This is <b>bold and <i>italic</i></b> text."
50
+ output = telegram_format(input_text)
51
+ assert output == expected_output, "Failed handling nested markdown syntax"
52
+
53
+
54
+ def test_combination_of_markdown_elements():
55
+ input_text = """
56
+ # Heading
57
+ This is a test of **bold**, __underline__, and `inline code`.
58
+ - Item 1
59
+ * Item 2
60
+
61
+ ```python
62
+ for i in range(3):
63
+ print(i)
64
+ ```
65
+
66
+ [Link](http://example.com)
67
+ """
68
+ expected_output = """
69
+ <b>Heading</b>
70
+ This is a test of <b>bold</b>, <u>underline</u>, and <code>inline code</code>.
71
+ • Item 1
72
+ • Item 2
73
+
74
+ <pre><code class="language-python">for i in range(3):
75
+ print(i)
76
+ </code></pre>
77
+
78
+ <a href="http://example.com">Link</a>
79
+ """
80
+ output = telegram_format(input_text)
81
+ assert (
82
+ output.strip() == expected_output.strip()
83
+ ), "Failed combining multiple markdown elements into HTML"
84
+
85
+
86
+ def test_nested_bold_within_italic():
87
+ input_text = "This is *__bold within italic__* text."
88
+ expected_output = "This is <i><u>bold within italic</u></i> text."
89
+ output = telegram_format(input_text)
90
+ assert (
91
+ output == expected_output
92
+ ), "Failed converting nested bold within italic markdown to HTML"
93
+
94
+
95
+ def test_italic_within_bold():
96
+ input_text = "This is **bold and _italic_ together**."
97
+ expected_output = "This is <b>bold and <i>italic</i> together</b>."
98
+ output = telegram_format(input_text)
99
+ assert (
100
+ output == expected_output
101
+ ), "Failed converting italic within bold markdown to HTML"
102
+
103
+
104
+ def test_inline_code_within_bold_text():
105
+ input_text = "This is **bold and `inline code` together**."
106
+ expected_output = "This is <b>bold and <code>inline code</code> together</b>."
107
+ output = telegram_format(input_text)
108
+ assert output == expected_output, "Failed handling inline code within bold text"
109
+
110
+
111
+ def test_mixed_formatting_tags_with_lists_and_links():
112
+ input_text = """
113
+ - This is a list item with **bold**, __underline__, and [a link](http://example.com)
114
+ - Another item with ***bold and italic*** text
115
+ """
116
+ expected_output = """
117
+ • This is a list item with <b>bold</b>, <u>underline</u>, and <a href="http://example.com">a link</a>
118
+ • Another item with <b><i>bold and italic</i></b> text
119
+ """
120
+ output = telegram_format(input_text)
121
+ assert (
122
+ output.strip() == expected_output.strip()
123
+ ), "Failed handling mixed formatting tags with lists and links"
124
+
125
+
126
+ def test_special_characters_within_code_blocks():
127
+ input_text = "Here is a code block: ```<script>alert('Hello')</script>```"
128
+ expected_output = "Here is a code block: <pre><code>&lt;script&gt;alert('Hello')&lt;/script&gt;</code></pre>"
129
+ output = telegram_format(input_text)
130
+ assert (
131
+ output == expected_output
132
+ ), "Failed escaping special characters within code blocks"
133
+
134
+
135
+ def test_code_block_within_bold_text():
136
+ input_text = "This is **bold with a `code block` inside**."
137
+ expected_output = "This is <b>bold with a <code>code block</code> inside</b>."
138
+ output = telegram_format(input_text)
139
+ assert output == expected_output, "Failed handling code block within bold text"
140
+
141
+
142
+ def test_triple_backticks_with_nested_markdown():
143
+ input_text = "```python\n**bold text** and __underline__ in code block```"
144
+ expected_output = '<pre><code class="language-python">**bold text** and __underline__ in code block</code></pre>'
145
+ output = telegram_format(input_text)
146
+ assert (
147
+ output == expected_output
148
+ ), "Failed handling markdown within triple backtick code blocks"
149
+
150
+
151
+ def test_unmatched_code_delimiters():
152
+ input_text = "This has an `unmatched code delimiter."
153
+ expected_output = "This has an <code>unmatched code delimiter.</code>"
154
+ output = telegram_format(input_text)
155
+ assert output == expected_output, "Failed handling unmatched code delimiters"
156
+
157
+
158
+ def test_preformatted_block_with_unusual_language_specification():
159
+ input_text = "```weirdLang\nSome weirdLang code\n```"
160
+ expected_output = (
161
+ '<pre><code class="language-weirdLang">Some weirdLang code\n</code></pre>'
162
+ )
163
+ output = telegram_format(input_text)
164
+ assert (
165
+ output == expected_output
166
+ ), "Failed handling preformatted block with unusual language specification"
167
+
168
+
169
+ def test_inline_code_within_lists():
170
+ input_text = """
171
+ - List item with `code`
172
+ * Another `code` item
173
+ """
174
+ expected_output = """
175
+ • List item with <code>code</code>
176
+ • Another <code>code</code> item
177
+ """
178
+ output = telegram_format(input_text)
179
+ assert (
180
+ output.strip() == expected_output.strip()
181
+ ), "Failed handling inline code within lists"
182
+
183
+
184
+ def test_vector_storage_links_trim():
185
+ input_text = """
186
+ - List item with `code`
187
+ * Another `code` item【4:0†source】
188
+ """
189
+ expected_output = """
190
+ • List item with <code>code</code>
191
+ • Another <code>code</code> item
192
+ """
193
+ output = telegram_format(input_text)
194
+ assert output.strip() == expected_output.strip(), "Failed trim storage links"
195
+
196
+
197
+ def test_strikethrough_conversion():
198
+ input_text = "This is ~~strikethrough~~ text."
199
+ expected_output = "This is <s>strikethrough</s> text."
200
+ output = telegram_format(input_text)
201
+ assert output == expected_output, "Failed converting ~~ to <s> tags"
202
+
203
+
204
+ def test_blockquote_conversion():
205
+ input_text = "> This is a blockquote."
206
+ expected_output = "<blockquote>This is a blockquote.</blockquote>"
207
+ output = telegram_format(input_text)
208
+ assert output == expected_output, "Failed converting > to <blockquote> tags"
209
+
210
+
211
+ def test_inline_url_conversion():
212
+ input_text = "[example](http://example.com)"
213
+ expected_output = '<a href="http://example.com">example</a>'
214
+ output = telegram_format(input_text)
215
+ assert output == expected_output, "Failed converting [text](URL) to <a> tags"
216
+
217
+
218
+ def test_inline_mention_conversion():
219
+ input_text = "[User](tg://user?id=123456789)"
220
+ expected_output = '<a href="tg://user?id=123456789">User</a>'
221
+ output = telegram_format(input_text)
222
+ assert (
223
+ output == expected_output
224
+ ), "Failed converting [text](tg://user?id=ID) to <a> tags"
225
+
226
+
227
+ def test_escaping_ampersand():
228
+ input_text = "Use & in your HTML."
229
+ expected_output = "Use &amp; in your HTML."
230
+ output = telegram_format(input_text)
231
+ assert output == expected_output, "Failed escaping & character"
232
+
233
+
234
+ def test_pre_and_code_tags_with_html_entities():
235
+ input_text = "```html\n<div>Content</div>\n```"
236
+ expected_output = (
237
+ '<pre><code class="language-html">&lt;div&gt;Content&lt;/div&gt;\n</code></pre>'
238
+ )
239
+ output = telegram_format(input_text)
240
+ assert (
241
+ output == expected_output
242
+ ), "Failed handling pre and code tags with HTML entities"
243
+
244
+
245
+ def test_code_with_multiple_lines():
246
+ input_text = "```\ndef example():\n return 'example'\n```"
247
+ expected_output = "<pre><code>def example():\n return 'example'\n</code></pre>"
248
+ output = telegram_format(input_text)
249
+ assert output == expected_output, "Failed handling code with multiple lines"
250
+
251
+
252
+ def test_combined_formatting_with_lists():
253
+ input_text = """
254
+ - **Bold** list item
255
+ - _Italic_ list item
256
+ - `Code` list item
257
+ """
258
+ expected_output = """
259
+ • <b>Bold</b> list item
260
+ • <i>Italic</i> list item
261
+ • <code>Code</code> list item
262
+ """
263
+ output = telegram_format(input_text)
264
+ assert (
265
+ output.strip() == expected_output.strip()
266
+ ), "Failed handling combined formatting with lists"
267
+
268
+
269
+ def test_md_large_example():
270
+ input_text = """
271
+ 1. **Headings:**
272
+ # H1 Heading
273
+ ## H2 Heading
274
+ ### H3 Heading
275
+ #### H4 Heading
276
+ ##### H5 Heading
277
+ ###### H6 Heading
278
+
279
+ 2. **Emphasis:**
280
+
281
+ *Italic text* or _Italic text_
282
+
283
+ **Bold text** or __Underline text__
284
+
285
+ ***Bold and italic text*** or ___Underline and italic text___
286
+
287
+ 3. **Lists:**
288
+ - **Unordered List:**
289
+
290
+ - Item 1
291
+ - Item 2
292
+ - Subitem 1
293
+ - Subitem 2
294
+
295
+ - **Ordered List:**
296
+
297
+ 1. First item
298
+ 2. Second item
299
+ 1. Subitem 1
300
+ 2. Subitem 2
301
+
302
+ 4. **Links:**
303
+
304
+ [OpenAI](https://www.openai.com)
305
+
306
+ 5. **Images:**
307
+
308
+ ![Alt text for image](URL_to_image)
309
+ ![Alt text for image](URL_to_імедж)
310
+
311
+ 6. **Blockquotes:**
312
+
313
+ > This is a blockquote.
314
+ > It can span multiple lines.
315
+
316
+ 7. **Inline Code:**
317
+
318
+ Here is some `inline code`.
319
+
320
+ 8. **Code Blocks:**
321
+
322
+ ```python
323
+ def example_function():
324
+ print("Hello World")
325
+ ```
326
+
327
+ 9. **Tables:**
328
+
329
+ | Header 1 | Header 2 |
330
+ |----------|----------|
331
+ | Row 1 Col 1 | Row 1 Col 2 |
332
+ | Row 2 Col 1 | Row 2 Col 2 |
333
+
334
+ 10. **Horizontal Rule:**
335
+
336
+ ---
337
+ """
338
+ expected_output = """
339
+ 1. <b>Headings:</b>
340
+ <b>H1 Heading</b>
341
+ <b>H2 Heading</b>
342
+ <b>H3 Heading</b>
343
+ <b>H4 Heading</b>
344
+ <b>H5 Heading</b>
345
+ <b>H6 Heading</b>
346
+
347
+ 2. <b>Emphasis:</b>
348
+
349
+ <i>Italic text</i> or <i>Italic text</i>
350
+
351
+ <b>Bold text</b> or <u>Underline text</u>
352
+
353
+ <b><i>Bold and italic text</i></b> or <u><i>Underline and italic text</i></u>
354
+
355
+ 3. <b>Lists:</b>
356
+ • <b>Unordered List:</b>
357
+
358
+ • Item 1
359
+ • Item 2
360
+ • Subitem 1
361
+ • Subitem 2
362
+
363
+ • <b>Ordered List:</b>
364
+
365
+ 1. First item
366
+ 2. Second item
367
+ 1. Subitem 1
368
+ 2. Subitem 2
369
+
370
+ 4. <b>Links:</b>
371
+
372
+ <a href="https://www.openai.com">OpenAI</a>
373
+
374
+ 5. <b>Images:</b>
375
+
376
+ <a href="URL_to_image">Alt text for image</a>
377
+ <a href="URL_to_імедж">Alt text for image</a>
378
+
379
+ 6. <b>Blockquotes:</b>
380
+
381
+ <blockquote>This is a blockquote.
382
+ It can span multiple lines.</blockquote>
383
+
384
+ 7. <b>Inline Code:</b>
385
+
386
+ Here is some <code>inline code</code>.
387
+
388
+ 8. <b>Code Blocks:</b>
389
+
390
+ <pre><code class="language-python">def example_function():
391
+ print("Hello World")
392
+ </code></pre>
393
+
394
+ 9. <b>Tables:</b>
395
+
396
+ | Header 1 | Header 2 |
397
+ |----------|----------|
398
+ | Row 1 Col 1 | Row 1 Col 2 |
399
+ | Row 2 Col 1 | Row 2 Col 2 |
400
+
401
+ 10. <b>Horizontal Rule:</b>
402
+
403
+ ---
404
+ """
405
+ output = telegram_format(input_text)
406
+ assert (
407
+ output.strip() == expected_output.strip()
408
+ ), "Failed handling large markdown example"
409
+
410
+
411
+ def test_unclosed_single_backtick():
412
+ """Test that a single unclosed backtick is properly handled"""
413
+ text = "Here is some `code without closing"
414
+ result = ensure_closing_delimiters(text)
415
+ assert result == "Here is some `code without closing`"
416
+
417
+
418
+ def test_unclosed_triple_backtick():
419
+ """Test that unclosed triple backticks are properly handled"""
420
+ text = "Here is some ```code without closing"
421
+ result = ensure_closing_delimiters(text)
422
+ assert result == "Here is some ```code without closing```"
423
+
424
+
425
+ def test_bracket_link_with_additional_text():
426
+ """
427
+ Ensures that text like '[OtherText] [Title](Link)' doesn't
428
+ merge 'OtherText' and 'Title' into the <a> tag text.
429
+ """
430
+ input_text = "[OtherText] [Title](https://example.com)"
431
+ output = telegram_format(input_text)
432
+ expected_output = '[OtherText] <a href="https://example.com">Title</a>'
433
+ assert output == expected_output, f"Output was: {output}"
434
+
435
+
436
+ def test_heading_formatting_with_newlines():
437
+ """
438
+ Checks that headings #, ##, etc. are properly wrapped in <b> tags.
439
+ """
440
+ input_text = """# Heading1
441
+ Some text
442
+ ## Heading2
443
+ More text
444
+ """
445
+ output = telegram_format(input_text)
446
+ lines = output.splitlines()
447
+
448
+ assert "<b>Heading1</b>" in output
449
+ assert "<b>Heading2</b>" in output
450
+ assert lines[0] == "<b>Heading1</b>"
451
+ assert lines[1] == "Some text"
452
+ assert lines[2] == "<b>Heading2</b>"
453
+ assert lines[3] == "More text"
454
+
455
+
456
+ def test_list_formatting_with_newlines():
457
+ """
458
+ Checks that list items (starting with '-' or '*') become bullet points,
459
+ each on its own line with proper spacing.
460
+ """
461
+ input_text = """- Item one
462
+ - Item two
463
+ * Item three
464
+ Some text
465
+ - Item four"""
466
+ output = telegram_format(input_text)
467
+ lines = [line.strip() for line in output.splitlines() if line.strip()]
468
+
469
+ assert "• Item one" in lines
470
+ assert "• Item two" in lines
471
+ assert "• Item three" in lines
472
+ assert "• Item four" in lines
473
+ assert "Some text" in lines
474
+
475
+ bullet_lines = [line for line in lines if line.startswith("•")]
476
+ assert len(bullet_lines) == 4
477
+ assert bullet_lines[0] == "• Item one"
478
+ assert bullet_lines[1] == "• Item two"
479
+ assert bullet_lines[2] == "• Item three"
480
+ assert bullet_lines[3] == "• Item four"
481
+
482
+
483
+ def test_preserve_other_brackets():
484
+ """
485
+ Ensures that other bracketed text not forming a valid link is preserved literally.
486
+ """
487
+ input_text = "Look at [this], but [not a link] something else."
488
+ output = telegram_format(input_text)
489
+ assert "[this]" in output
490
+ assert "[not a link]" in output
491
+ assert "<a href=" not in output
492
+
493
+
494
+ def test_link_with_nested_brackets():
495
+ """Test that links with nested brackets in the text are handled correctly"""
496
+ input_text = "[Link [with brackets]](https://example.com)"
497
+ output = telegram_format(input_text)
498
+ expected_output = '<a href="https://example.com">Link [with brackets]</a>'
499
+ assert output == expected_output, f"Output was: {output}"
500
+
501
+
502
+ def test_link_with_spaces():
503
+ """Test that links with spaces are handled correctly"""
504
+ input_text = "[OtherText] [Title](Link)"
505
+ output = telegram_format(input_text)
506
+ expected_output = '[OtherText] <a href="Link">Title</a>'
507
+ assert output == expected_output, f"Output was: {output}"
508
+
509
+
510
+ def test_ukrainian_bullet_points():
511
+ input_text = """Звісно, ось список цікавих речей у форматі Markdown:
512
+
513
+ * **Парадокс кота Шредінгера:** Чи може кіт бути одночасно живим і мертвим? 🤔
514
+ * **Ефект метелика:** Маленька зміна може мати великі наслідки. 🦋
515
+ * **Теорія струн:** Чи є наш всесвіт просто вібрацією струн? 🎶
516
+ * **Темна матерія та темна енергія:** Що складає 95% всесвіту? 🌌
517
+ * **Квантова заплутаність:** Чи можуть два об'єкти бути зв'язані на відстані? 🔗
518
+ * **Соліпсизм:** Чи існує щось, крім моєї свідомості? 🤨
519
+ * **Парадокс Фермі:** Де всі інші інопланетяни? 👽
520
+ * **Симуляційна гіпотеза:** Чи живемо ми в симуляції? 💻
521
+ * **Ефект Даннінга-Крюгера:** Чому некомпетентні люди переоцінюють себе? 🤓
522
+ * **Когнітивні спотворення:** Як наш мозок обманює нас? 🤯
523
+ """
524
+
525
+ expected_output = """Звісно, ось список цікавих речей у форматі Markdown:
526
+
527
+ • <b>Парадокс кота Шредінгера:</b> Чи може кіт бути одночасно живим і мертвим? 🤔
528
+ • <b>Ефект метелика:</b> Маленька зміна може мати великі наслідки. 🦋
529
+ • <b>Теорія струн:</b> Чи є наш всесвіт просто вібрацією струн? 🎶
530
+ • <b>Темна матерія та темна енергія:</b> Що складає 95% всесвіту? 🌌
531
+ • <b>Квантова заплутаність:</b> Чи можуть два об'єкти бути зв'язані на відстані? 🔗
532
+ • <b>Соліпсизм:</b> Чи існує щось, крім моєї свідомості? 🤨
533
+ • <b>Парадокс Фермі:</b> Де всі інші інопланетяни? 👽
534
+ • <b>Симуляційна гіпотеза:</b> Чи живемо ми в симуляції? 💻
535
+ • <b>Ефект Даннінга-Крюгера:</b> Чому некомпетентні люди переоцінюють себе? 🤓
536
+ • <b>Когнітивні спотворення:</b> Як наш мозок обманює нас? 🤯
537
+ """
538
+
539
+ output = telegram_format(input_text)
540
+ print(output)
541
+ assert output.strip() == expected_output.strip()
542
+
543
+
544
+ def test_asterisk_in_equations():
545
+ """Test that asterisks in mathematical equations are not converted to italic"""
546
+ test_cases = [
547
+ ("2 * 2 = 4", "2 * 2 = 4"),
548
+ ("x*y + z = 10", "x*y + z = 10"),
549
+ ("a * b * c", "a * b * c"),
550
+ ("2*x + 3*y = z", "2*x + 3*y = z"),
551
+ ("This is *italic* but 2 * 2 is not", "This is <i>italic</i> but 2 * 2 is not"),
552
+ ("5 * x + *emphasized* text", "5 * x + <i>emphasized</i> text"),
553
+ ]
554
+
555
+ for input_text, expected_output in test_cases:
556
+ output = telegram_format(input_text)
557
+ assert (
558
+ output == expected_output
559
+ ), f"Failed on input: {input_text}, got: {output}"
560
+
561
+
562
+ def test_complex_equations_with_asterisk():
563
+ """Test more complex mathematical expressions with asterisks"""
564
+ input_text = """The formula is:
565
+ f(x) = 2*x + 3*y
566
+ g(x) = x * (y + z)
567
+ This is *italic* text with equation 2 * 2 = 4
568
+ """
569
+ expected_output = """The formula is:
570
+ f(x) = 2*x + 3*y
571
+ g(x) = x * (y + z)
572
+ This is <i>italic</i> text with equation 2 * 2 = 4"""
573
+
574
+ output = telegram_format(input_text)
575
+ assert output.strip() == expected_output.strip(), f"Output was: {output}"
576
+
577
+
578
+ # ----------------------------------------------------------------------------------------
579
+ # New, more comprehensive and edge-case test methods begin here
580
+ # ----------------------------------------------------------------------------------------
581
+
582
+
583
+ def test_empty_string():
584
+ """Check behavior with an empty string."""
585
+ input_text = ""
586
+ output = telegram_format(input_text)
587
+ assert output == ""
588
+
589
+
590
+ def test_spaces_only():
591
+ """Check behavior with a string that has only spaces."""
592
+ input_text = " "
593
+ output = telegram_format(input_text)
594
+ # Should either remain blank or just be those spaces (strip() might remove them)
595
+ assert output.strip() == ""
596
+
597
+
598
+ def test_asterisk_in_parentheses():
599
+ """Edge case with asterisk in parentheses."""
600
+ input_text = "(2*3) is an equation, but *italic* text is separate."
601
+ expected_output = "(2*3) is an equation, but <i>italic</i> text is separate."
602
+ output = telegram_format(input_text)
603
+ assert output == expected_output
604
+
605
+
606
+ def test_underscore_in_non_italic_context():
607
+ """Edge case with underscores that should not convert to italic."""
608
+ input_text = "This_variable should remain, but _italic_ should convert."
609
+ expected_output = "This_variable should remain, but <i>italic</i> should convert."
610
+ output = telegram_format(input_text)
611
+ assert output == expected_output
612
+
613
+
614
+ def test_code_block_mixed_with_unescaped_html():
615
+ """Ensure code block remains escaped but outside text is processed normally."""
616
+ input_text = """
617
+ Some <div>stuff</div> here.
618
+ ```
619
+ <html><body>Unescaped?</body></html>
620
+ ```
621
+ More text with *italic*.
622
+ """
623
+ expected_output = """
624
+ Some &lt;div&gt;stuff&lt;/div&gt; here.
625
+ <pre><code>&lt;html&gt;&lt;body&gt;Unescaped?&lt;/body&gt;&lt;/html&gt;
626
+ </code></pre>
627
+ More text with <i>italic</i>.
628
+ """
629
+ output = telegram_format(input_text)
630
+ assert output.strip() == expected_output.strip()
631
+
632
+
633
+ def test_equation_with_asterisks_and_italics_combined():
634
+ """More advanced check: combine equations and true italics side by side."""
635
+ input_text = "2*x + 3*y = 10, and *italic* is separate."
636
+ expected_output = "2*x + 3*y = 10, and <i>italic</i> is separate."
637
+ output = telegram_format(input_text)
638
+ assert output == expected_output
639
+
640
+
641
+ def test_inline_code_with_asterisk_and_underscore():
642
+ """Ensure that `*` and `_` inside inline code are not interpreted as markdown."""
643
+ input_text = "Here is `code_with_*_asterisk` outside of `code_with__underscore__`"
644
+ expected_output = "Here is <code>code_with_*_asterisk</code> outside of <code>code_with__underscore__</code>"
645
+ output = telegram_format(input_text)
646
+ assert output == expected_output
647
+
648
+
649
+ def test_heading_followed_by_equation():
650
+ """Check heading usage right before an equation line."""
651
+ input_text = """# MyHeading
652
+ 2*x + y = 4
653
+ """
654
+ # Heading should become <b>MyHeading</b>, equation line remains as is
655
+ expected_output = """<b>MyHeading</b>
656
+ 2*x + y = 4"""
657
+ output = telegram_format(input_text)
658
+ assert output.strip() == expected_output.strip(), f"Got: {output}"
@@ -1,56 +0,0 @@
1
- import re
2
- from .converters import convert_html_chars, split_by_tag
3
- from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
4
- from .formatters import combine_blockquotes
5
- from .helpers import remove_blockquote_escaping
6
-
7
-
8
- def telegram_format(text: str) -> str:
9
- """
10
- Converts markdown in the provided text to HTML supported by Telegram.
11
- """
12
- # Step 0: Combine blockquotes
13
- text = combine_blockquotes(text)
14
-
15
- # Step 1: Convert HTML reserved symbols
16
- text = convert_html_chars(text)
17
-
18
- # Step 2: Extract and convert code blocks first
19
- output, code_blocks = extract_and_convert_code_blocks(text)
20
-
21
- # Step 3: Escape HTML special characters in the output text
22
- output = output.replace("<", "&lt;").replace(">", "&gt;")
23
-
24
- # Inline code
25
- output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
26
-
27
- # Nested Bold and Italic
28
- output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
29
-
30
- # Process markdown formatting tags (bold, underline, italic, strikethrough)
31
- # and convert them to their respective HTML tags
32
- output = split_by_tag(output, "**", "b")
33
- output = split_by_tag(output, "__", "u")
34
- output = split_by_tag(output, "_", "i")
35
- output = split_by_tag(output, "*", "i")
36
- output = split_by_tag(output, "~~", "s")
37
-
38
- # Remove storage links
39
- output = re.sub(r"【[^】]+】", "", output)
40
-
41
- # Convert links
42
- output = re.sub(r"\[(.*?)\]\((.*?)\)", r'<a href="\2">\1</a>', output)
43
-
44
- # Convert lists
45
- output = re.sub(r"^\s*[\-\*] (.+)", r"• \1", output, flags=re.MULTILINE)
46
-
47
- # Convert headings
48
- output = re.sub(r"^\s*#+ (.+)", r"<b>\1</b>", output, flags=re.MULTILINE)
49
-
50
- # Step 4: Reinsert the converted HTML code blocks
51
- output = reinsert_code_blocks(output, code_blocks)
52
-
53
- # Step 5: Remove blockquote escaping
54
- output = remove_blockquote_escaping(output)
55
-
56
- return output
@@ -1,256 +0,0 @@
1
- from chatgpt_md_converter.telegram_formatter import telegram_format
2
-
3
-
4
- def test_split_by_tag_bold():
5
- text = "This is **bold** text"
6
- assert telegram_format(text) == "This is <b>bold</b> text"
7
-
8
-
9
- def test_telegram_format_italic():
10
- text = "This is _italic_ text"
11
- output = telegram_format(text)
12
- assert output == "This is <i>italic</i> text"
13
-
14
-
15
- def test_telegram_format_italic_star():
16
- text = "This is *italic* text"
17
- output = telegram_format(text)
18
- assert output == "This is <i>italic</i> text"
19
-
20
-
21
- def test_triple_backticks_with_language():
22
- input_text = "```python\nprint('Hello, world!')\n```"
23
- expected_output = (
24
- "<pre><code class=\"language-python\">print('Hello, world!')\n</code></pre>"
25
- )
26
- output = telegram_format(input_text)
27
- assert (
28
- output == expected_output
29
- ), "Failed converting triple backticks with language to <pre><code> tags"
30
-
31
-
32
- def test_bold_and_underline_conversion():
33
- input_text = "This is **bold** and this is __underline__."
34
- expected_output = "This is <b>bold</b> and this is <u>underline</u>."
35
- output = telegram_format(input_text)
36
- assert output == expected_output, "Failed converting ** and __ to <b> and <u> tags"
37
-
38
-
39
- def test_escaping_special_characters():
40
- input_text = "Avoid using < or > in your HTML."
41
- expected_output = "Avoid using &lt; or &gt; in your HTML."
42
- output = telegram_format(input_text)
43
- assert output == expected_output, "Failed escaping < and > characters"
44
-
45
-
46
- def test_nested_markdown_syntax():
47
- input_text = "This is **bold and _italic_** text."
48
- expected_output = "This is <b>bold and <i>italic</i></b> text."
49
- output = telegram_format(input_text)
50
- assert output == expected_output, "Failed handling nested markdown syntax"
51
-
52
-
53
- def test_combination_of_markdown_elements():
54
- input_text = """
55
- # Heading
56
- This is a test of **bold**, __underline__, and `inline code`.
57
- - Item 1
58
- * Item 2
59
-
60
- ```python
61
- for i in range(3):
62
- print(i)
63
- ```
64
-
65
- [Link](http://example.com)
66
- """
67
- expected_output = """<b>Heading</b>\nThis is a test of <b>bold</b>, <u>underline</u>, and <code>inline code</code>.\n• Item 1\n• Item 2\n\n<pre><code class="language-python">for i in range(3):\n print(i)\n</code></pre>\n\n<a href="http://example.com">Link</a>\n"""
68
- output = telegram_format(input_text)
69
- assert (
70
- output.strip() == expected_output.strip()
71
- ), "Failed combining multiple markdown elements into HTML"
72
-
73
-
74
- def test_nested_bold_within_italic():
75
- input_text = "This is *__bold within italic__* text."
76
- expected_output = "This is <i><u>bold within italic</u></i> text."
77
- output = telegram_format(input_text)
78
- assert (
79
- output == expected_output
80
- ), "Failed converting nested bold within italic markdown to HTML"
81
-
82
-
83
- def test_italic_within_bold():
84
- input_text = "This is **bold and _italic_ together**."
85
- expected_output = "This is <b>bold and <i>italic</i> together</b>."
86
- output = telegram_format(input_text)
87
- assert (
88
- output == expected_output
89
- ), "Failed converting italic within bold markdown to HTML"
90
-
91
-
92
- def test_inline_code_within_bold_text():
93
- input_text = "This is **bold and `inline code` together**."
94
- expected_output = "This is <b>bold and <code>inline code</code> together</b>."
95
- output = telegram_format(input_text)
96
- assert output == expected_output, "Failed handling inline code within bold text"
97
-
98
-
99
- def test_mixed_formatting_tags_with_lists_and_links():
100
- input_text = """
101
- - This is a list item with **bold**, __underline__, and [a link](http://example.com)
102
- - Another item with ***bold and italic*** text
103
- """
104
- expected_output = """
105
- • This is a list item with <b>bold</b>, <u>underline</u>, and <a href="http://example.com">a link</a>
106
- • Another item with <b><i>bold and italic</i></b> text
107
- """
108
- output = telegram_format(input_text)
109
- assert (
110
- output.strip() == expected_output.strip()
111
- ), "Failed handling mixed formatting tags with lists and links"
112
-
113
-
114
- def test_special_characters_within_code_blocks():
115
- input_text = "Here is a code block: ```<script>alert('Hello')</script>```"
116
- expected_output = "Here is a code block: <pre><code>&lt;script&gt;alert('Hello')&lt;/script&gt;</code></pre>"
117
- output = telegram_format(input_text)
118
- assert (
119
- output == expected_output
120
- ), "Failed escaping special characters within code blocks"
121
-
122
-
123
- def test_code_block_within_bold_text():
124
- input_text = "This is **bold with a `code block` inside**."
125
- expected_output = "This is <b>bold with a <code>code block</code> inside</b>."
126
- output = telegram_format(input_text)
127
- assert output == expected_output, "Failed handling code block within bold text"
128
-
129
-
130
- def test_triple_backticks_with_nested_markdown():
131
- input_text = "```python\n**bold text** and __underline__ in code block```"
132
- # Expecting the markdown syntax to be ignored within the code block
133
- expected_output = '<pre><code class="language-python">**bold text** and __underline__ in code block</code></pre>'
134
- output = telegram_format(input_text)
135
- assert (
136
- output == expected_output
137
- ), "Failed handling markdown within triple backtick code blocks"
138
-
139
-
140
- def test_unmatched_code_delimiters():
141
- input_text = "This has an `unmatched code delimiter."
142
- # Expecting original input as output due to the unmatched delimiter
143
- expected_output = "This has an <code>unmatched code delimiter.</code>"
144
- output = telegram_format(input_text)
145
- assert output == expected_output, "Failed handling unmatched code delimiters"
146
-
147
-
148
- def test_preformatted_block_with_unusual_language_specification():
149
- input_text = "```weirdLang\nSome weirdLang code\n```"
150
- expected_output = (
151
- '<pre><code class="language-weirdLang">Some weirdLang code\n</code></pre>'
152
- )
153
- output = telegram_format(input_text)
154
- assert (
155
- output == expected_output
156
- ), "Failed handling preformatted block with unusual language specification"
157
-
158
-
159
- def test_inline_code_within_lists():
160
- input_text = """
161
- - List item with `code`
162
- * Another `code` item
163
- """
164
- expected_output = """
165
- • List item with <code>code</code>
166
- • Another <code>code</code> item
167
- """
168
- output = telegram_format(input_text)
169
- assert (
170
- output.strip() == expected_output.strip()
171
- ), "Failed handling inline code within lists"
172
-
173
-
174
- def test_vector_storage_links_trim():
175
- input_text = """
176
- - List item with `code`
177
- * Another `code` item【4:0†source】
178
- """
179
- expected_output = """
180
- • List item with <code>code</code>
181
- • Another <code>code</code> item
182
- """
183
- output = telegram_format(input_text)
184
- assert output.strip() == expected_output.strip(), "Failed trim storage links"
185
-
186
-
187
- def test_strikethrough_conversion():
188
- input_text = "This is ~~strikethrough~~ text."
189
- expected_output = "This is <s>strikethrough</s> text."
190
- output = telegram_format(input_text)
191
- assert output == expected_output, "Failed converting ~~ to <s> tags"
192
-
193
-
194
- def test_blockquote_conversion():
195
- input_text = "> This is a blockquote."
196
- expected_output = "<blockquote>This is a blockquote.</blockquote>"
197
- output = telegram_format(input_text)
198
- assert output == expected_output, "Failed converting > to <blockquote> tags"
199
-
200
-
201
- def test_inline_url_conversion():
202
- input_text = "[example](http://example.com)"
203
- expected_output = '<a href="http://example.com">example</a>'
204
- output = telegram_format(input_text)
205
- assert output == expected_output, "Failed converting [text](URL) to <a> tags"
206
-
207
-
208
- def test_inline_mention_conversion():
209
- input_text = "[User](tg://user?id=123456789)"
210
- expected_output = '<a href="tg://user?id=123456789">User</a>'
211
- output = telegram_format(input_text)
212
- assert (
213
- output == expected_output
214
- ), "Failed converting [text](tg://user?id=ID) to <a> tags"
215
-
216
-
217
- def test_escaping_ampersand():
218
- input_text = "Use & in your HTML."
219
- expected_output = "Use &amp; in your HTML."
220
- output = telegram_format(input_text)
221
- assert output == expected_output, "Failed escaping & character"
222
-
223
-
224
- def test_pre_and_code_tags_with_html_entities():
225
- input_text = "```html\n<div>Content</div>\n```"
226
- expected_output = (
227
- '<pre><code class="language-html">&lt;div&gt;Content&lt;/div&gt;\n</code></pre>'
228
- )
229
- output = telegram_format(input_text)
230
- assert (
231
- output == expected_output
232
- ), "Failed handling pre and code tags with HTML entities"
233
-
234
-
235
- def test_code_with_multiple_lines():
236
- input_text = "```\ndef example():\n return 'example'\n```"
237
- expected_output = "<pre><code>def example():\n return 'example'\n</code></pre>"
238
- output = telegram_format(input_text)
239
- assert output == expected_output, "Failed handling code with multiple lines"
240
-
241
-
242
- def test_combined_formatting_with_lists():
243
- input_text = """
244
- - **Bold** list item
245
- - _Italic_ list item
246
- - `Code` list item
247
- """
248
- expected_output = """
249
- • <b>Bold</b> list item
250
- • <i>Italic</i> list item
251
- • <code>Code</code> list item
252
- """
253
- output = telegram_format(input_text)
254
- assert (
255
- output.strip() == expected_output.strip()
256
- ), "Failed handling combined formatting with lists"