chatgpt-md-converter 0.4.0b3__tar.gz → 0.4.0b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/PKG-INFO +1 -1
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/code_blocks.py +3 -5
- chatgpt_md_converter-0.4.0b4/chatgpt_md_converter/telegram_markdown/html_escape.py +40 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/renderer.py +2 -5
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/PKG-INFO +1 -1
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/SOURCES.txt +2 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/setup.py +1 -1
- chatgpt_md_converter-0.4.0b4/tests/test_pre_escaped_entities.py +141 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/LICENSE +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/README.md +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/__init__.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/escaping.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/handlers.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/renderer.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/state.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_markdown/tree.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_splitter.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_to_markdown.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/__init__.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/entity.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/__init__.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/blockquotes.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/headings.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/inline.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/extractors/links.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/parser.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_entities/utf16.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_formatter.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/__init__.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/inline.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/postprocess.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/telegram_markdown/preprocess.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/setup.cfg +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_entities.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_html_to_markdown_inline_spacing.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_parser.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_roundtrip_markdown.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_splitter.py +0 -0
- {chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_telegram_api.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.4.0b3
|
|
3
|
+
Version: 0.4.0b4
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
5
|
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
4
|
|
|
5
|
+
from .html_escape import escape_code_content
|
|
6
|
+
|
|
5
7
|
_CODE_BLOCK_RE = re.compile(
|
|
6
8
|
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
|
|
7
9
|
flags=re.DOTALL,
|
|
@@ -62,11 +64,7 @@ def extract_and_convert_code_blocks(text: str):
|
|
|
62
64
|
def _replacement(match: re.Match[str]) -> tuple[str, str]:
|
|
63
65
|
language = match.group("lang") or ""
|
|
64
66
|
code_content = match.group("code")
|
|
65
|
-
escaped = (
|
|
66
|
-
code_content.replace("&", "&amp;")
|
|
67
|
-
.replace("<", "&lt;")
|
|
68
|
-
.replace(">", "&gt;")
|
|
69
|
-
)
|
|
67
|
+
escaped = escape_code_content(code_content)
|
|
70
68
|
placeholder = f"CODEBLOCKPLACEHOLDER_{len(placeholders)}_"
|
|
71
69
|
placeholders.append(placeholder)
|
|
72
70
|
if language:
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""HTML escaping utilities for code content.
|
|
2
|
+
|
|
3
|
+
LLMs sometimes pre-escape HTML entities (&lt; &gt; &amp; &quot;) in
|
|
4
|
+
markdown code blocks and inline code. We unescape first, then
|
|
5
|
+
re-escape exactly once to avoid double-escaping like &amp;lt;.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
_HTML_ENTITY_RE = re.compile(r"&(?:lt|gt|amp|quot|apos|#\d+|#x[\da-fA-F]+);")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _is_pre_escaped(text: str) -> bool:
|
|
14
|
+
"""Return True if the text contains any HTML character references."""
|
|
15
|
+
return bool(_HTML_ENTITY_RE.search(text))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _unescape_html(text: str) -> str:
|
|
19
|
+
"""Unescape common HTML character references to their literal chars."""
|
|
20
|
+
text = text.replace("&amp;", "&")
|
|
21
|
+
text = text.replace("&lt;", "<")
|
|
22
|
+
text = text.replace("&gt;", ">")
|
|
23
|
+
text = text.replace("&quot;", '"')
|
|
24
|
+
text = text.replace("'", "'")
|
|
25
|
+
return text
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def escape_code_content(text: str) -> str:
|
|
29
|
+
"""Escape code content for Telegram HTML, handling pre-escaped input.
|
|
30
|
+
|
|
31
|
+
If the input already contains HTML entities (from LLM pre-escaping),
|
|
32
|
+
unescape them first so we produce a single level of escaping.
|
|
33
|
+
"""
|
|
34
|
+
if _is_pre_escaped(text):
|
|
35
|
+
text = _unescape_html(text)
|
|
36
|
+
return (
|
|
37
|
+
text.replace("&", "&amp;")
|
|
38
|
+
.replace("<", "&lt;")
|
|
39
|
+
.replace(">", "&gt;")
|
|
40
|
+
)
|
|
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
import re
|
|
6
6
|
|
|
7
7
|
from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
|
|
8
|
+
from .html_escape import escape_code_content
|
|
8
9
|
from .inline import (apply_custom_italic, convert_html_chars,
|
|
9
10
|
extract_inline_code_snippets, split_by_tag)
|
|
10
11
|
from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
|
|
@@ -44,11 +45,7 @@ def telegram_format(text: str) -> str:
|
|
|
44
45
|
output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
|
|
45
46
|
|
|
46
47
|
for placeholder, snippet in inline_snippets.items():
|
|
47
|
-
escaped = (
|
|
48
|
-
snippet.replace("&", "&amp;")
|
|
49
|
-
.replace("<", "&lt;")
|
|
50
|
-
.replace(">", "&gt;")
|
|
51
|
-
)
|
|
48
|
+
escaped = escape_code_content(snippet)
|
|
52
49
|
output = output.replace(placeholder, f"<code>{escaped}</code>")
|
|
53
50
|
|
|
54
51
|
output = reinsert_code_blocks(output, block_map)
|
{chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.4.0b3
|
|
3
|
+
Version: 0.4.0b4
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
5
|
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
@@ -25,6 +25,7 @@ chatgpt_md_converter/telegram_entities/extractors/inline.py
|
|
|
25
25
|
chatgpt_md_converter/telegram_entities/extractors/links.py
|
|
26
26
|
chatgpt_md_converter/telegram_markdown/__init__.py
|
|
27
27
|
chatgpt_md_converter/telegram_markdown/code_blocks.py
|
|
28
|
+
chatgpt_md_converter/telegram_markdown/html_escape.py
|
|
28
29
|
chatgpt_md_converter/telegram_markdown/inline.py
|
|
29
30
|
chatgpt_md_converter/telegram_markdown/postprocess.py
|
|
30
31
|
chatgpt_md_converter/telegram_markdown/preprocess.py
|
|
@@ -32,6 +33,7 @@ chatgpt_md_converter/telegram_markdown/renderer.py
|
|
|
32
33
|
tests/test_entities.py
|
|
33
34
|
tests/test_html_to_markdown_inline_spacing.py
|
|
34
35
|
tests/test_parser.py
|
|
36
|
+
tests/test_pre_escaped_entities.py
|
|
35
37
|
tests/test_roundtrip_markdown.py
|
|
36
38
|
tests/test_splitter.py
|
|
37
39
|
tests/test_telegram_api.py
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Tests for handling pre-escaped HTML entities from LLMs.
|
|
2
|
+
|
|
3
|
+
LLMs sometimes output &lt; &gt; &amp; instead of < > & in markdown code
|
|
4
|
+
blocks and inline code. The formatter should normalize these to avoid
|
|
5
|
+
double-escaping (e.g. &amp;lt; in the final HTML).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
from chatgpt_md_converter import html_to_telegram_markdown, telegram_format
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TestInlineCodePreEscaped:
|
|
14
|
+
"""Inline code with pre-escaped HTML entities."""
|
|
15
|
+
|
|
16
|
+
def test_pre_escaped_angle_brackets(self):
|
|
17
|
+
# LLM wrote &lt; instead of < in inline code
|
|
18
|
+
md_escaped = "Use `&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;` for custom emoji"
|
|
19
|
+
md_correct = 'Use `<tg-emoji emoji-id="ID">⭐</tg-emoji>` for custom emoji'
|
|
20
|
+
|
|
21
|
+
html_escaped = telegram_format(md_escaped)
|
|
22
|
+
html_correct = telegram_format(md_correct)
|
|
23
|
+
|
|
24
|
+
assert html_escaped == html_correct
|
|
25
|
+
assert "&amp;lt;" not in html_escaped
|
|
26
|
+
assert "&lt;tg-emoji" in html_escaped # single escape only
|
|
27
|
+
|
|
28
|
+
def test_pre_escaped_ampersand(self):
|
|
29
|
+
md_escaped = "Query: `a &amp; b`"
|
|
30
|
+
md_correct = "Query: `a & b`"
|
|
31
|
+
|
|
32
|
+
html_escaped = telegram_format(md_escaped)
|
|
33
|
+
html_correct = telegram_format(md_correct)
|
|
34
|
+
|
|
35
|
+
assert html_escaped == html_correct
|
|
36
|
+
assert "&amp;amp;" not in html_escaped
|
|
37
|
+
|
|
38
|
+
def test_pre_escaped_mixed(self):
|
|
39
|
+
md_escaped = "`&lt;div class=&quot;test&quot;&gt;hello&lt;/div&gt;`"
|
|
40
|
+
md_correct = '`<div class="test">hello</div>`'
|
|
41
|
+
|
|
42
|
+
html_escaped = telegram_format(md_escaped)
|
|
43
|
+
html_correct = telegram_format(md_correct)
|
|
44
|
+
|
|
45
|
+
assert html_escaped == html_correct
|
|
46
|
+
|
|
47
|
+
def test_no_double_escaping_gt_lt(self):
|
|
48
|
+
md = "`x &lt; y &gt; z`"
|
|
49
|
+
html = telegram_format(md)
|
|
50
|
+
|
|
51
|
+
assert "&amp;lt;" not in html
|
|
52
|
+
assert "&amp;gt;" not in html
|
|
53
|
+
assert "<code>x &lt; y &gt; z</code>" in html
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TestCodeBlockPreEscaped:
|
|
57
|
+
"""Fenced code blocks with pre-escaped HTML entities."""
|
|
58
|
+
|
|
59
|
+
def test_pre_escaped_html_code_block(self):
|
|
60
|
+
md_escaped = "```html\n&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;\n```"
|
|
61
|
+
md_correct = '```html\n<tg-emoji emoji-id="ID">⭐</tg-emoji>\n```'
|
|
62
|
+
|
|
63
|
+
html_escaped = telegram_format(md_escaped)
|
|
64
|
+
html_correct = telegram_format(md_correct)
|
|
65
|
+
|
|
66
|
+
assert html_escaped == html_correct
|
|
67
|
+
assert "&amp;lt;" not in html_escaped
|
|
68
|
+
|
|
69
|
+
def test_pre_escaped_ampersand_code_block(self):
|
|
70
|
+
md_escaped = "```\na &amp; b\n```"
|
|
71
|
+
md_correct = "```\na & b\n```"
|
|
72
|
+
|
|
73
|
+
html_escaped = telegram_format(md_escaped)
|
|
74
|
+
html_correct = telegram_format(md_correct)
|
|
75
|
+
|
|
76
|
+
assert html_escaped == html_correct
|
|
77
|
+
|
|
78
|
+
def test_pre_escaped_mixed_code_block(self):
|
|
79
|
+
md_escaped = "```xml\n&lt;root attr=&quot;val&quot;&gt;\n &lt;child/&gt;\n&lt;/root&gt;\n```"
|
|
80
|
+
md_correct = '```xml\n<root attr="val">\n <child/>\n</root>\n```'
|
|
81
|
+
|
|
82
|
+
html_escaped = telegram_format(md_escaped)
|
|
83
|
+
html_correct = telegram_format(md_correct)
|
|
84
|
+
|
|
85
|
+
assert html_escaped == html_correct
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TestRoundTripPreEscaped:
|
|
89
|
+
"""Round-trip: pre-escaped input should normalize to the same as clean input."""
|
|
90
|
+
|
|
91
|
+
def test_inline_code_round_trip(self):
|
|
92
|
+
md_escaped = "Use `&lt;b&gt;bold&lt;/b&gt;` tag"
|
|
93
|
+
md_correct = "Use `<b>bold</b>` tag"
|
|
94
|
+
|
|
95
|
+
html1 = telegram_format(md_escaped)
|
|
96
|
+
md1 = html_to_telegram_markdown(html1)
|
|
97
|
+
html2 = telegram_format(md1)
|
|
98
|
+
|
|
99
|
+
html_ref = telegram_format(md_correct)
|
|
100
|
+
md_ref = html_to_telegram_markdown(html_ref)
|
|
101
|
+
|
|
102
|
+
# After one round-trip, both should converge
|
|
103
|
+
assert md1 == md_ref
|
|
104
|
+
assert html1 == html_ref
|
|
105
|
+
assert html2 == html_ref
|
|
106
|
+
|
|
107
|
+
def test_code_block_round_trip(self):
|
|
108
|
+
md_escaped = "```\n&lt;div&gt;test&lt;/div&gt;\n```"
|
|
109
|
+
md_correct = "```\n<div>test</div>\n```"
|
|
110
|
+
|
|
111
|
+
html1 = telegram_format(md_escaped)
|
|
112
|
+
md1 = html_to_telegram_markdown(html1)
|
|
113
|
+
html2 = telegram_format(md1)
|
|
114
|
+
|
|
115
|
+
html_ref = telegram_format(md_correct)
|
|
116
|
+
md_ref = html_to_telegram_markdown(html_ref)
|
|
117
|
+
|
|
118
|
+
assert md1 == md_ref
|
|
119
|
+
assert html1 == html_ref
|
|
120
|
+
assert html2 == html_ref
|
|
121
|
+
|
|
122
|
+
def test_real_world_tg_emoji_case(self):
|
|
123
|
+
"""The exact scenario from production: LLM pre-escapes tg-emoji tags."""
|
|
124
|
+
md_input = (
|
|
125
|
+
"В `input_message_content` ти віддаєш _готовий текст_:\n"
|
|
126
|
+
"- HTML: `&lt;tg-emoji emoji-id=\"ID\"&gt;⭐&lt;/tg-emoji&gt;`\n"
|
|
127
|
+
"- MarkdownV2: ``"
|
|
128
|
+
)
|
|
129
|
+
html = telegram_format(md_input)
|
|
130
|
+
|
|
131
|
+
# Should NOT have double-escaped entities
|
|
132
|
+
assert "&amp;lt;" not in html
|
|
133
|
+
assert "&amp;gt;" not in html
|
|
134
|
+
|
|
135
|
+
# Should have proper single-escaped entities in <code> tags
|
|
136
|
+
assert '<code>&lt;tg-emoji emoji-id="ID"&gt;⭐&lt;/tg-emoji&gt;</code>' in html
|
|
137
|
+
|
|
138
|
+
# Round-trip should be stable
|
|
139
|
+
md_back = html_to_telegram_markdown(html)
|
|
140
|
+
html2 = telegram_format(md_back)
|
|
141
|
+
assert html == html2
|
|
File without changes
|
|
File without changes
|
{chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/chatgpt_md_converter/html_splitter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{chatgpt_md_converter-0.4.0b3 → chatgpt_md_converter-0.4.0b4}/tests/test_roundtrip_markdown.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|