chatgpt-md-converter 0.4.0b2__tar.gz → 0.4.0b4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41):
  1. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/PKG-INFO +1 -1
  2. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/code_blocks.py +3 -5
  3. chatgpt_md_converter-0.4.0b4/chatgpt_md_converter/telegram_markdown/html_escape.py +40 -0
  4. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/renderer.py +8 -5
  5. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/PKG-INFO +1 -1
  6. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/SOURCES.txt +2 -0
  7. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/setup.py +1 -1
  8. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_parser.py +16 -0
  9. chatgpt_md_converter-0.4.0b4/tests/test_pre_escaped_entities.py +141 -0
  10. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/LICENSE +0 -0
  11. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/README.md +0 -0
  12. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/__init__.py +0 -0
  13. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/escaping.py +0 -0
  14. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/handlers.py +0 -0
  15. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/renderer.py +0 -0
  16. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/state.py +0 -0
  17. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/tree.py +0 -0
  18. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_splitter.py +0 -0
  19. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_to_markdown.py +0 -0
  20. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/__init__.py +0 -0
  21. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/entity.py +0 -0
  22. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/__init__.py +0 -0
  23. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/blockquotes.py +0 -0
  24. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/headings.py +0 -0
  25. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/inline.py +0 -0
  26. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/links.py +0 -0
  27. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/parser.py +0 -0
  28. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/utf16.py +0 -0
  29. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_formatter.py +0 -0
  30. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/__init__.py +0 -0
  31. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/inline.py +0 -0
  32. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/postprocess.py +0 -0
  33. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/preprocess.py +0 -0
  34. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
  35. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
  36. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/setup.cfg +0 -0
  37. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_entities.py +0 -0
  38. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_html_to_markdown_inline_spacing.py +0 -0
  39. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_roundtrip_markdown.py +0 -0
  40. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_splitter.py +0 -0
  41. {chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_telegram_api.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.4.0b2
3
+ Version: 0.4.0b4
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -2,6 +2,8 @@
2
2
 
3
3
  import re
4
4
 
5
+ from .html_escape import escape_code_content
6
+
5
7
  _CODE_BLOCK_RE = re.compile(
6
8
  r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
7
9
  flags=re.DOTALL,
@@ -62,11 +64,7 @@ def extract_and_convert_code_blocks(text: str):
62
64
  def _replacement(match: re.Match[str]) -> tuple[str, str]:
63
65
  language = match.group("lang") or ""
64
66
  code_content = match.group("code")
65
- escaped = (
66
- code_content.replace("&", "&amp;")
67
- .replace("<", "&lt;")
68
- .replace(">", "&gt;")
69
- )
67
+ escaped = escape_code_content(code_content)
70
68
  placeholder = f"CODEBLOCKPLACEHOLDER_{len(placeholders)}_"
71
69
  placeholders.append(placeholder)
72
70
  if language:
@@ -0,0 +1,40 @@
1
+ """HTML escaping utilities for code content.
2
+
3
+ LLMs sometimes pre-escape HTML entities (&lt; &gt; &amp; &quot;) in
4
+ markdown code blocks and inline code. We unescape first, then
5
+ re-escape exactly once to avoid double-escaping like &amp;lt;.
6
+ """
7
+
8
+ import re
9
+
10
+ _HTML_ENTITY_RE = re.compile(r"&(?:lt|gt|amp|quot|apos|#\d+|#x[\da-fA-F]+);")
11
+
12
+
13
+ def _is_pre_escaped(text: str) -> bool:
14
+ """Return True if the text contains any HTML character references."""
15
+ return bool(_HTML_ENTITY_RE.search(text))
16
+
17
+
18
+ def _unescape_html(text: str) -> str:
19
+ """Unescape common HTML character references to their literal chars."""
20
+ text = text.replace("&amp;", "&")
21
+ text = text.replace("&lt;", "<")
22
+ text = text.replace("&gt;", ">")
23
+ text = text.replace("&quot;", '"')
24
+ text = text.replace("&apos;", "'")
25
+ return text
26
+
27
+
28
+ def escape_code_content(text: str) -> str:
29
+ """Escape code content for Telegram HTML, handling pre-escaped input.
30
+
31
+ If the input already contains HTML entities (from LLM pre-escaping),
32
+ unescape them first so we produce a single level of escaping.
33
+ """
34
+ if _is_pre_escaped(text):
35
+ text = _unescape_html(text)
36
+ return (
37
+ text.replace("&", "&amp;")
38
+ .replace("<", "&lt;")
39
+ .replace(">", "&gt;")
40
+ )
@@ -5,6 +5,7 @@ from __future__ import annotations
5
5
  import re
6
6
 
7
7
  from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
8
+ from .html_escape import escape_code_content
8
9
  from .inline import (apply_custom_italic, convert_html_chars,
9
10
  extract_inline_code_snippets, split_by_tag)
10
11
  from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
@@ -34,15 +35,17 @@ def telegram_format(text: str) -> str:
34
35
 
35
36
  output = re.sub(r"【[^】]+】", "", output)
36
37
 
38
+ # Handle Telegram custom emoji before generic links
39
+ # ![emoji](tg://emoji?id=123) -> <tg-emoji emoji-id="123">emoji</tg-emoji>
40
+ emoji_pattern = r"!\[([^\]]*)\]\(tg://emoji\?id=(\d+)\)"
41
+ output = re.sub(emoji_pattern, r'<tg-emoji emoji-id="\2">\1</tg-emoji>', output)
42
+
43
+ # Handle all links including images (! prefix is stripped for non-emoji images)
37
44
  link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
38
45
  output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
39
46
 
40
47
  for placeholder, snippet in inline_snippets.items():
41
- escaped = (
42
- snippet.replace("&", "&amp;")
43
- .replace("<", "&lt;")
44
- .replace(">", "&gt;")
45
- )
48
+ escaped = escape_code_content(snippet)
46
49
  output = output.replace(placeholder, f"<code>{escaped}</code>")
47
50
 
48
51
  output = reinsert_code_blocks(output, block_map)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.4.0b2
3
+ Version: 0.4.0b4
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -25,6 +25,7 @@ chatgpt_md_converter/telegram_entities/extractors/inline.py
25
25
  chatgpt_md_converter/telegram_entities/extractors/links.py
26
26
  chatgpt_md_converter/telegram_markdown/__init__.py
27
27
  chatgpt_md_converter/telegram_markdown/code_blocks.py
28
+ chatgpt_md_converter/telegram_markdown/html_escape.py
28
29
  chatgpt_md_converter/telegram_markdown/inline.py
29
30
  chatgpt_md_converter/telegram_markdown/postprocess.py
30
31
  chatgpt_md_converter/telegram_markdown/preprocess.py
@@ -32,6 +33,7 @@ chatgpt_md_converter/telegram_markdown/renderer.py
32
33
  tests/test_entities.py
33
34
  tests/test_html_to_markdown_inline_spacing.py
34
35
  tests/test_parser.py
36
+ tests/test_pre_escaped_entities.py
35
37
  tests/test_roundtrip_markdown.py
36
38
  tests/test_splitter.py
37
39
  tests/test_telegram_api.py
@@ -2,7 +2,7 @@ from setuptools import setup
2
2
 
3
3
  setup(
4
4
  name="chatgpt_md_converter",
5
- version="0.4.0b2",
5
+ version="0.4.0b4",
6
6
  author="Kostiantyn Kriuchkov",
7
7
  author_email="latand666@gmail.com",
8
8
  description="A package for converting markdown to HTML for chat Telegram bots",
@@ -1038,3 +1038,19 @@ def test_inline_code_with_escaped_backtick_trailing_text():
1038
1038
  expected_output = "Escaped \\*asterisks\\* and <code>code with \\</code> backtick`"
1039
1039
  output = telegram_format(input_text)
1040
1040
  assert output == expected_output
1041
+
1042
+
1043
+ def test_custom_emoji_conversion():
1044
+ """Test that custom emoji markdown is converted to tg-emoji HTML tag."""
1045
+ input_text = "Hello ![❤️](tg://emoji?id=5226457415154701085) world"
1046
+ expected_output = 'Hello <tg-emoji emoji-id="5226457415154701085">❤️</tg-emoji> world'
1047
+ output = telegram_format(input_text)
1048
+ assert output == expected_output, "Failed converting custom emoji to <tg-emoji> tag"
1049
+
1050
+
1051
+ def test_custom_emoji_with_regular_link():
1052
+ """Test that custom emoji and regular links are both handled correctly."""
1053
+ input_text = "Emoji ![👍](tg://emoji?id=5368324170671202286) and [link](https://example.com)"
1054
+ expected_output = 'Emoji <tg-emoji emoji-id="5368324170671202286">👍</tg-emoji> and <a href="https://example.com">link</a>'
1055
+ output = telegram_format(input_text)
1056
+ assert output == expected_output, "Failed handling emoji and link together"
@@ -0,0 +1,141 @@
1
+ """Tests for handling pre-escaped HTML entities from LLMs.
2
+
3
+ LLMs sometimes output &lt; &gt; &amp; instead of < > & in markdown code
4
+ blocks and inline code. The formatter should normalize these to avoid
5
+ double-escaping (e.g. &amp;lt; in the final HTML).
6
+ """
7
+
8
+ import pytest
9
+
10
+ from chatgpt_md_converter import html_to_telegram_markdown, telegram_format
11
+
12
+
13
+ class TestInlineCodePreEscaped:
14
+ """Inline code with pre-escaped HTML entities."""
15
+
16
+ def test_pre_escaped_angle_brackets(self):
17
+ # LLM wrote &lt; instead of < in inline code
18
+ md_escaped = "Use `&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;` for custom emoji"
19
+ md_correct = 'Use `<tg-emoji emoji-id="ID">⭐</tg-emoji>` for custom emoji'
20
+
21
+ html_escaped = telegram_format(md_escaped)
22
+ html_correct = telegram_format(md_correct)
23
+
24
+ assert html_escaped == html_correct
25
+ assert "&amp;lt;" not in html_escaped
26
+ assert "&lt;tg-emoji" in html_escaped # single escape only
27
+
28
+ def test_pre_escaped_ampersand(self):
29
+ md_escaped = "Query: `a &amp; b`"
30
+ md_correct = "Query: `a & b`"
31
+
32
+ html_escaped = telegram_format(md_escaped)
33
+ html_correct = telegram_format(md_correct)
34
+
35
+ assert html_escaped == html_correct
36
+ assert "&amp;amp;" not in html_escaped
37
+
38
+ def test_pre_escaped_mixed(self):
39
+ md_escaped = "`&lt;div class=&quot;test&quot;&gt;hello&lt;/div&gt;`"
40
+ md_correct = '`<div class="test">hello</div>`'
41
+
42
+ html_escaped = telegram_format(md_escaped)
43
+ html_correct = telegram_format(md_correct)
44
+
45
+ assert html_escaped == html_correct
46
+
47
+ def test_no_double_escaping_gt_lt(self):
48
+ md = "`x &lt; y &gt; z`"
49
+ html = telegram_format(md)
50
+
51
+ assert "&amp;lt;" not in html
52
+ assert "&amp;gt;" not in html
53
+ assert "<code>x &lt; y &gt; z</code>" in html
54
+
55
+
56
+ class TestCodeBlockPreEscaped:
57
+ """Fenced code blocks with pre-escaped HTML entities."""
58
+
59
+ def test_pre_escaped_html_code_block(self):
60
+ md_escaped = "```html\n&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;\n```"
61
+ md_correct = '```html\n<tg-emoji emoji-id="ID">⭐</tg-emoji>\n```'
62
+
63
+ html_escaped = telegram_format(md_escaped)
64
+ html_correct = telegram_format(md_correct)
65
+
66
+ assert html_escaped == html_correct
67
+ assert "&amp;lt;" not in html_escaped
68
+
69
+ def test_pre_escaped_ampersand_code_block(self):
70
+ md_escaped = "```\na &amp; b\n```"
71
+ md_correct = "```\na & b\n```"
72
+
73
+ html_escaped = telegram_format(md_escaped)
74
+ html_correct = telegram_format(md_correct)
75
+
76
+ assert html_escaped == html_correct
77
+
78
+ def test_pre_escaped_mixed_code_block(self):
79
+ md_escaped = "```xml\n&lt;root attr=&quot;val&quot;&gt;\n &lt;child/&gt;\n&lt;/root&gt;\n```"
80
+ md_correct = '```xml\n<root attr="val">\n <child/>\n</root>\n```'
81
+
82
+ html_escaped = telegram_format(md_escaped)
83
+ html_correct = telegram_format(md_correct)
84
+
85
+ assert html_escaped == html_correct
86
+
87
+
88
+ class TestRoundTripPreEscaped:
89
+ """Round-trip: pre-escaped input should normalize to the same as clean input."""
90
+
91
+ def test_inline_code_round_trip(self):
92
+ md_escaped = "Use `&lt;b&gt;bold&lt;/b&gt;` tag"
93
+ md_correct = "Use `<b>bold</b>` tag"
94
+
95
+ html1 = telegram_format(md_escaped)
96
+ md1 = html_to_telegram_markdown(html1)
97
+ html2 = telegram_format(md1)
98
+
99
+ html_ref = telegram_format(md_correct)
100
+ md_ref = html_to_telegram_markdown(html_ref)
101
+
102
+ # After one round-trip, both should converge
103
+ assert md1 == md_ref
104
+ assert html1 == html_ref
105
+ assert html2 == html_ref
106
+
107
+ def test_code_block_round_trip(self):
108
+ md_escaped = "```\n&lt;div&gt;test&lt;/div&gt;\n```"
109
+ md_correct = "```\n<div>test</div>\n```"
110
+
111
+ html1 = telegram_format(md_escaped)
112
+ md1 = html_to_telegram_markdown(html1)
113
+ html2 = telegram_format(md1)
114
+
115
+ html_ref = telegram_format(md_correct)
116
+ md_ref = html_to_telegram_markdown(html_ref)
117
+
118
+ assert md1 == md_ref
119
+ assert html1 == html_ref
120
+ assert html2 == html_ref
121
+
122
+ def test_real_world_tg_emoji_case(self):
123
+ """The exact scenario from production: LLM pre-escapes tg-emoji tags."""
124
+ md_input = (
125
+ "В `input_message_content` ти віддаєш _готовий текст_:\n"
126
+ "- HTML: `&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;`\n"
127
+ "- MarkdownV2: `![⭐](tg://emoji?id=ID)`"
128
+ )
129
+ html = telegram_format(md_input)
130
+
131
+ # Should NOT have double-escaped entities
132
+ assert "&amp;lt;" not in html
133
+ assert "&amp;gt;" not in html
134
+
135
+ # Should have proper single-escaped entities in <code> tags
136
+ assert '<code>&lt;tg-emoji emoji-id="ID"&gt;⭐&lt;/tg-emoji&gt;</code>' in html
137
+
138
+ # Round-trip should be stable
139
+ md_back = html_to_telegram_markdown(html)
140
+ html2 = telegram_format(md_back)
141
+ assert html == html2