PyPI - chatgpt-md-converter - Version diff - 0.4.0b2.tar.gz → 0.4.0b4.tar.gz - Mend

chatgpt-md-converter 0.4.0b2.tar.gz → 0.4.0b4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatgpt_md_converter
-Version: 0.4.0b2
+Version: 0.4.0b4
 Summary: A package for converting markdown to HTML for chat Telegram bots
 Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
 Author: Kostiantyn Kriuchkov

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/code_blocks.py RENAMED Viewed

@@ -2,6 +2,8 @@
 import re
+from .html_escape import escape_code_content
 _CODE_BLOCK_RE = re.compile(
     r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
     flags=re.DOTALL,
@@ -62,11 +64,7 @@ def extract_and_convert_code_blocks(text: str):
     def _replacement(match: re.Match[str]) -> tuple[str, str]:
         language = match.group("lang") or ""
         code_content = match.group("code")
-        escaped = (
-            code_content.replace("&", "&amp;")
-            .replace("<", "&lt;")
-            .replace(">", "&gt;")
-        )
+        escaped = escape_code_content(code_content)
         placeholder = f"CODEBLOCKPLACEHOLDER_{len(placeholders)}_"
         placeholders.append(placeholder)
         if language:

chatgpt_md_converter-0.4.0b4/chatgpt_md_converter/telegram_markdown/html_escape.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""HTML escaping utilities for code content.
+LLMs sometimes pre-escape HTML entities (&lt; &gt; &amp; &quot;) in
+markdown code blocks and inline code. We unescape first, then
+re-escape exactly once to avoid double-escaping like &amp;lt;.
+"""
+import re
+_HTML_ENTITY_RE = re.compile(r"&(?:lt|gt|amp|quot|apos|#\d+|#x[\da-fA-F]+);")
+def _is_pre_escaped(text: str) -> bool:
+    """Return True if the text contains any HTML character references."""
+    return bool(_HTML_ENTITY_RE.search(text))
+def _unescape_html(text: str) -> str:
+    """Unescape common HTML character references to their literal chars."""
+    text = text.replace("&amp;", "&")
+    text = text.replace("&lt;", "<")
+    text = text.replace("&gt;", ">")
+    text = text.replace("&quot;", '"')
+    text = text.replace("&apos;", "'")
+    return text
+def escape_code_content(text: str) -> str:
+    """Escape code content for Telegram HTML, handling pre-escaped input.
+    If the input already contains HTML entities (from LLM pre-escaping),
+    unescape them first so we produce a single level of escaping.
+    """
+    if _is_pre_escaped(text):
+        text = _unescape_html(text)
+    return (
+        text.replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+    )

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/renderer.py RENAMED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import re
 from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
+from .html_escape import escape_code_content
 from .inline import (apply_custom_italic, convert_html_chars,
                      extract_inline_code_snippets, split_by_tag)
 from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
@@ -34,15 +35,17 @@ def telegram_format(text: str) -> str:
     output = re.sub(r"【[^】]+】", "", output)
+    # Handle Telegram custom emoji before generic links
+    # ![emoji](tg://emoji?id=123) -> <tg-emoji emoji-id="123">emoji</tg-emoji>
+    emoji_pattern = r"!\[([^\]]*)\]\(tg://emoji\?id=(\d+)\)"
+    output = re.sub(emoji_pattern, r'<tg-emoji emoji-id="\2">\1</tg-emoji>', output)
+    # Handle all links including images (! prefix is stripped for non-emoji images)
     link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
     output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
     for placeholder, snippet in inline_snippets.items():
-        escaped = (
-            snippet.replace("&", "&amp;")
-            .replace("<", "&lt;")
-            .replace(">", "&gt;")
-        )
+        escaped = escape_code_content(snippet)
         output = output.replace(placeholder, f"<code>{escaped}</code>")
     output = reinsert_code_blocks(output, block_map)

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatgpt_md_converter
-Version: 0.4.0b2
+Version: 0.4.0b4
 Summary: A package for converting markdown to HTML for chat Telegram bots
 Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
 Author: Kostiantyn Kriuchkov

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/SOURCES.txt RENAMED Viewed

@@ -25,6 +25,7 @@ chatgpt_md_converter/telegram_entities/extractors/inline.py
 chatgpt_md_converter/telegram_entities/extractors/links.py
 chatgpt_md_converter/telegram_markdown/__init__.py
 chatgpt_md_converter/telegram_markdown/code_blocks.py
+chatgpt_md_converter/telegram_markdown/html_escape.py
 chatgpt_md_converter/telegram_markdown/inline.py
 chatgpt_md_converter/telegram_markdown/postprocess.py
 chatgpt_md_converter/telegram_markdown/preprocess.py
@@ -32,6 +33,7 @@ chatgpt_md_converter/telegram_markdown/renderer.py
 tests/test_entities.py
 tests/test_html_to_markdown_inline_spacing.py
 tests/test_parser.py
+tests/test_pre_escaped_entities.py
 tests/test_roundtrip_markdown.py
 tests/test_splitter.py
 tests/test_telegram_api.py

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup
 setup(
     name="chatgpt_md_converter",
-    version="0.4.0b2",
+    version="0.4.0b4",
     author="Kostiantyn Kriuchkov",
     author_email="latand666@gmail.com",
     description="A package for converting markdown to HTML for chat Telegram bots",

{chatgpt_md_converter-0.4.0b2 → chatgpt_md_converter-0.4.0b4}/tests/test_parser.py RENAMED Viewed

@@ -1038,3 +1038,19 @@ def test_inline_code_with_escaped_backtick_trailing_text():
     expected_output = "Escaped \\*asterisks\\* and <code>code with \\</code> backtick`"
     output = telegram_format(input_text)
     assert output == expected_output
+def test_custom_emoji_conversion():
+    """Test that custom emoji markdown is converted to tg-emoji HTML tag."""
+    input_text = "Hello ![❤️](tg://emoji?id=5226457415154701085) world"
+    expected_output = 'Hello <tg-emoji emoji-id="5226457415154701085">❤️</tg-emoji> world'
+    output = telegram_format(input_text)
+    assert output == expected_output, "Failed converting custom emoji to <tg-emoji> tag"
+def test_custom_emoji_with_regular_link():
+    """Test that custom emoji and regular links are both handled correctly."""
+    input_text = "Emoji ![👍](tg://emoji?id=5368324170671202286) and [link](https://example.com)"
+    expected_output = 'Emoji <tg-emoji emoji-id="5368324170671202286">👍</tg-emoji> and <a href="https://example.com">link</a>'
+    output = telegram_format(input_text)
+    assert output == expected_output, "Failed handling emoji and link together"

chatgpt_md_converter-0.4.0b4/tests/test_pre_escaped_entities.py ADDED Viewed

@@ -0,0 +1,141 @@
+"""Tests for handling pre-escaped HTML entities from LLMs.
+LLMs sometimes output &lt; &gt; &amp; instead of < > & in markdown code
+blocks and inline code. The formatter should normalize these to avoid
+double-escaping (e.g. &amp;lt; in the final HTML).
+"""
+import pytest
+from chatgpt_md_converter import html_to_telegram_markdown, telegram_format
+class TestInlineCodePreEscaped:
+    """Inline code with pre-escaped HTML entities."""
+    def test_pre_escaped_angle_brackets(self):
+        # LLM wrote &lt; instead of < in inline code
+        md_escaped = "Use `&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;` for custom emoji"
+        md_correct = 'Use `<tg-emoji emoji-id="ID">⭐</tg-emoji>` for custom emoji'
+        html_escaped = telegram_format(md_escaped)
+        html_correct = telegram_format(md_correct)
+        assert html_escaped == html_correct
+        assert "&amp;lt;" not in html_escaped
+        assert "&lt;tg-emoji" in html_escaped  # single escape only
+    def test_pre_escaped_ampersand(self):
+        md_escaped = "Query: `a &amp; b`"
+        md_correct = "Query: `a & b`"
+        html_escaped = telegram_format(md_escaped)
+        html_correct = telegram_format(md_correct)
+        assert html_escaped == html_correct
+        assert "&amp;amp;" not in html_escaped
+    def test_pre_escaped_mixed(self):
+        md_escaped = "`&lt;div class=&quot;test&quot;&gt;hello&lt;/div&gt;`"
+        md_correct = '`<div class="test">hello</div>`'
+        html_escaped = telegram_format(md_escaped)
+        html_correct = telegram_format(md_correct)
+        assert html_escaped == html_correct
+    def test_no_double_escaping_gt_lt(self):
+        md = "`x &lt; y &gt; z`"
+        html = telegram_format(md)
+        assert "&amp;lt;" not in html
+        assert "&amp;gt;" not in html
+        assert "<code>x &lt; y &gt; z</code>" in html
+class TestCodeBlockPreEscaped:
+    """Fenced code blocks with pre-escaped HTML entities."""
+    def test_pre_escaped_html_code_block(self):
+        md_escaped = "```html\n&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;\n```"
+        md_correct = '```html\n<tg-emoji emoji-id="ID">⭐</tg-emoji>\n```'
+        html_escaped = telegram_format(md_escaped)
+        html_correct = telegram_format(md_correct)
+        assert html_escaped == html_correct
+        assert "&amp;lt;" not in html_escaped
+    def test_pre_escaped_ampersand_code_block(self):
+        md_escaped = "```\na &amp; b\n```"
+        md_correct = "```\na & b\n```"
+        html_escaped = telegram_format(md_escaped)
+        html_correct = telegram_format(md_correct)
+        assert html_escaped == html_correct
+    def test_pre_escaped_mixed_code_block(self):
+        md_escaped = "```xml\n&lt;root attr=&quot;val&quot;&gt;\n  &lt;child/&gt;\n&lt;/root&gt;\n```"
+        md_correct = '```xml\n<root attr="val">\n  <child/>\n</root>\n```'
+        html_escaped = telegram_format(md_escaped)
+        html_correct = telegram_format(md_correct)
+        assert html_escaped == html_correct
+class TestRoundTripPreEscaped:
+    """Round-trip: pre-escaped input should normalize to the same as clean input."""
+    def test_inline_code_round_trip(self):
+        md_escaped = "Use `&lt;b&gt;bold&lt;/b&gt;` tag"
+        md_correct = "Use `<b>bold</b>` tag"
+        html1 = telegram_format(md_escaped)
+        md1 = html_to_telegram_markdown(html1)
+        html2 = telegram_format(md1)
+        html_ref = telegram_format(md_correct)
+        md_ref = html_to_telegram_markdown(html_ref)
+        # After one round-trip, both should converge
+        assert md1 == md_ref
+        assert html1 == html_ref
+        assert html2 == html_ref
+    def test_code_block_round_trip(self):
+        md_escaped = "```\n&lt;div&gt;test&lt;/div&gt;\n```"
+        md_correct = "```\n<div>test</div>\n```"
+        html1 = telegram_format(md_escaped)
+        md1 = html_to_telegram_markdown(html1)
+        html2 = telegram_format(md1)
+        html_ref = telegram_format(md_correct)
+        md_ref = html_to_telegram_markdown(html_ref)
+        assert md1 == md_ref
+        assert html1 == html_ref
+        assert html2 == html_ref
+    def test_real_world_tg_emoji_case(self):
+        """The exact scenario from production: LLM pre-escapes tg-emoji tags."""
+        md_input = (
+            "В `input_message_content` ти віддаєш _готовий текст_:\n"
+            "- HTML: `&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;`\n"
+            "- MarkdownV2: `![⭐](tg://emoji?id=ID)`"
+        )
+        html = telegram_format(md_input)
+        # Should NOT have double-escaped entities
+        assert "&amp;lt;" not in html
+        assert "&amp;gt;" not in html
+        # Should have proper single-escaped entities in <code> tags
+        assert '<code>&lt;tg-emoji emoji-id="ID"&gt;⭐&lt;/tg-emoji&gt;</code>' in html
+        # Round-trip should be stable
+        md_back = html_to_telegram_markdown(html)
+        html2 = telegram_format(md_back)
+        assert html == html2