chatgpt-md-converter 0.3.6__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20) hide show
  1. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/PKG-INFO +2 -2
  2. chatgpt_md_converter-0.3.8/README.md +147 -0
  3. chatgpt_md_converter-0.3.8/chatgpt_md_converter/__init__.py +4 -0
  4. chatgpt_md_converter-0.3.8/chatgpt_md_converter/extractors.py +95 -0
  5. chatgpt_md_converter-0.3.8/chatgpt_md_converter/html_splitter.py +239 -0
  6. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/PKG-INFO +2 -2
  7. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/SOURCES.txt +4 -1
  8. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/setup.py +3 -3
  9. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/tests/test_parser.py +205 -0
  10. chatgpt_md_converter-0.3.8/tests/test_splitter.py +103 -0
  11. chatgpt_md_converter-0.3.6/chatgpt_md_converter/__init__.py +0 -3
  12. chatgpt_md_converter-0.3.6/chatgpt_md_converter/extractors.py +0 -61
  13. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/LICENSE +0 -0
  14. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/converters.py +0 -0
  15. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/formatters.py +0 -0
  16. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/helpers.py +0 -0
  17. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/telegram_formatter.py +0 -0
  18. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
  19. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
  20. {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/setup.cfg +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.6
3
+ Version: 0.3.8
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
- Home-page: https://github.com/Latand/formatter-chatgpt-telegram
5
+ Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
7
7
  Author-email: latand666@gmail.com
8
8
  Classifier: Programming Language :: Python :: 3
@@ -0,0 +1,147 @@
1
+ # ChatGPT Markdown to Telegram HTML Parser
2
+
3
+ ## Overview
4
+
5
+ This project provides a solution for converting Telegram-style Markdown formatted text into HTML markup supported by the Telegram Bot API, specifically tailored for use in ChatGPT bots developed with the OpenAI API. It includes features for handling various Markdown elements and ensures proper tag closure, making it suitable for streaming mode applications.
6
+
7
+ ## Features
8
+
9
+ - Converts Telegram-style Markdown syntax to Telegram-compatible HTML
10
+ - Supports text styling:
11
+ - Bold: `**text**` → `<b>text</b>`
12
+ - Italic: `*text*` or `_text_` → `<i>text</i>`
13
+ - Underline: `__text__` → `<u>text</u>`
14
+ - Strikethrough: `~~text~~` → `<s>text</s>`
15
+ - Spoiler: `||text||` → `<span class="tg-spoiler">text</span>`
16
+ - Inline code: `` `code` `` → `<code>code</code>`
17
+ - Handles nested text styling
18
+ - Converts links: `[text](URL)` → `<a href="URL">text</a>`
19
+ - Processes code blocks with language specification
20
+ - Supports blockquotes:
21
+ - Regular blockquotes: `> text` → `<blockquote>text</blockquote>`
22
+ - Expandable blockquotes: `**> text` → `<blockquote expandable>text</blockquote>`
23
+ - Automatically appends missing closing delimiters for code blocks
24
+ - Escapes HTML special characters to prevent unwanted HTML rendering
25
+
26
+ ## Usage
27
+
28
+ To use the Markdown to Telegram HTML Parser in your ChatGPT bot, integrate the provided Python functions into your bot's processing pipeline. Here is a brief overview of how to incorporate the parser:
29
+
30
+ 1. **Ensure Closing Delimiters**: Automatically appends missing closing delimiters for backticks to ensure proper parsing.
31
+
32
+ 2. **Extract and Convert Code Blocks**: Extracts Markdown code blocks, converts them to HTML `<pre><code>` format, and replaces them with placeholders to prevent formatting within code blocks.
33
+
34
+ 3. **Markdown to HTML Conversion**: Applies various regex substitutions and custom logic to convert supported Markdown formatting to Telegram-compatible HTML tags.
35
+
36
+ 4. **Reinsert Code Blocks**: Reinserts the previously extracted and converted code blocks back into the main text, replacing placeholders with the appropriate HTML content.
37
+
38
+ Simply call the `telegram_format(text: str) -> str` function with your Markdown-formatted text as input to receive the converted HTML output ready for use with the Telegram Bot API.
39
+
40
+ ## Installation
41
+
42
+ ```sh
43
+ pip install chatgpt-md-converter
44
+ ```
45
+
46
+ ## Example
47
+
48
+ ```python
49
+ from chatgpt_md_converter import telegram_format
50
+
51
+ # Basic formatting example
52
+ text = """
53
+ Here is some **bold**, __underline__, and `inline code`.
54
+ This is a ||spoiler text|| and *italic*.
55
+
56
+ Code example:
57
+ print('Hello, world!')
58
+ """
59
+
60
+ # Blockquotes example
61
+ blockquote_text = """
62
+ > Regular blockquote
63
+ > Multiple lines
64
+
65
+ **> Expandable blockquote
66
+ > Hidden by default
67
+ > Multiple lines
68
+ """
69
+
70
+ formatted_text = telegram_format(text)
71
+ formatted_blockquote = telegram_format(blockquote_text)
72
+
73
+ print(formatted_text)
74
+ print(formatted_blockquote)
75
+ ```
76
+
77
+ ### Output:
78
+
79
+ ```
80
+ Here is some <b>bold</b>, <u>underline</u>, and <code>inline code</code>.
81
+ This is a <span class="tg-spoiler">spoiler text</span> and <i>italic</i>.
82
+
83
+ Code example:
84
+ print('Hello, world!')
85
+
86
+ <blockquote>Regular blockquote
87
+ Multiple lines</blockquote>
88
+
89
+ <blockquote expandable>Expandable blockquote
90
+ Hidden by default
91
+ Multiple lines</blockquote>
92
+ ```
93
+
94
+ ## Requirements
95
+
96
+ - Python 3.x
97
+ - No external libraries required (uses built-in `re` module for regex operations)
98
+
99
+ ## Contribution
100
+
101
+ Feel free to contribute to this project by submitting pull requests or opening issues for bugs, feature requests, or improvements.
102
+
103
+ ## Prompting LLMs for Telegram-Specific Formatting
104
+
105
+ > **Note**:
106
+ > Since standard Markdown doesn't include Telegram-specific features like spoilers (`||text||`) and expandable blockquotes (`**> text`), you'll need to explicitly instruct LLMs to use these formats. Here's a suggested prompt addition to include in your system message or initial instructions:
107
+
108
+ ````
109
+ When formatting your responses for Telegram, please use these special formatting conventions:
110
+
111
+ 1. For content that should be hidden as a spoiler (revealed only when users click):
112
+ Use: ||spoiler content here||
113
+ Example: This is visible, but ||this is hidden until clicked||.
114
+
115
+ 2. For lengthy explanations or optional content that should be collapsed:
116
+ Use: **> Expandable section title
117
+
118
+ > Content line 1
119
+ > Content line 2
120
+ > (Each line of the expandable blockquote should start with ">")
121
+
122
+ 3. Continue using standard markdown for other formatting:
123
+ - **bold text**
124
+ - *italic text*
125
+ - __underlined text__
126
+ - ~~strikethrough~~
127
+ - `inline code`
128
+ - ```code blocks```
129
+ - [link text](URL)
130
+
131
+ Apply spoilers for:
132
+
133
+ - Solution reveals
134
+ - Potential plot spoilers
135
+ - Sensitive information
136
+ - Surprising facts
137
+
138
+ Use expandable blockquotes for:
139
+
140
+ - Detailed explanations
141
+ - Long examples
142
+ - Optional reading
143
+ - Technical details
144
+ - Additional context not needed by all users
145
+ ````
146
+
147
+ You can add this prompt to your system message when initializing your ChatGPT interactions to ensure the model properly formats content for optimal display in Telegram.
@@ -0,0 +1,4 @@
1
+ from .telegram_formatter import telegram_format
2
+ from .html_splitter import split_html_for_telegram
3
+
4
+ __all__ = ["telegram_format", "split_html_for_telegram"]
@@ -0,0 +1,95 @@
1
+ import re
2
+
3
+
4
def ensure_closing_delimiters(text: str) -> str:
    """Append the closing backtick delimiters that ``text`` is missing.

    Three repairs are applied, in this order:

    1. Fences. The text is scanned line by line. A line that, once stripped,
       is three or more backticks optionally followed by a word opens a
       fence; the next line whose stripped form ends with the same backtick
       run closes it. Lines in between are never treated as openers, so
       fence-like strings inside a block are ignored. If a fence is still
       open at the end, a newline (when the text does not already end with
       one) and the fence are appended.
    2. Triple backticks. Complete fenced blocks are removed from a scratch
       copy; if an odd number of "```" remains, "```" is appended.
    3. Single backticks. The same check is repeated for "`".

    Returns the text, possibly extended; nothing is ever removed from it.
    """
    fenced_block = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )
    fence_opener = r"^(?P<fence>`{3,})(?P<lang>\w+)?$"

    pending = None  # backtick run of the fence currently open, if any
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if pending is not None:
            # Inside a block only the matching closing run matters.
            if candidate.endswith(pending):
                pending = None
            continue
        opened = re.match(fence_opener, candidate)
        if opened:
            pending = opened.group("fence")

    if pending is not None:
        separator = "" if text.endswith("\n") else "\n"
        text = f"{text}{separator}{pending}"

    # Only backticks outside complete fenced blocks take part in the counts.
    if fenced_block.sub("", text).count("```") % 2:
        text += "```"

    # Recomputed on purpose: the previous step may have completed a block.
    if fenced_block.sub("", text).count("`") % 2:
        text += "`"

    return text
44
+
45
+
46
def extract_and_convert_code_blocks(text: str):
    """Replace fenced code blocks with placeholders and build their HTML.

    The text is first passed through ``ensure_closing_delimiters`` so that
    unterminated blocks are closed. Every fenced block (three or more
    backticks, optional language word, body, the same backtick run) is then
    swapped for ``CODEBLOCKPLACEHOLDER<n>``, where ``n`` counts blocks from 0
    in order of appearance.

    Returns
    -------
    tuple[str, dict]
        The text with placeholders substituted, and a dict mapping each
        placeholder to ``<pre><code>...</code></pre>`` HTML. The ``<code>``
        tag carries ``class="language-<lang>"`` when a language was given;
        ``&``, ``<`` and ``>`` in the body are escaped.
    """
    text = ensure_closing_delimiters(text)
    code_blocks = {}

    code_block_pattern = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )

    def replacer(match):
        language = match.group("lang") or ""

        # "&" must be escaped first, otherwise the entities produced for
        # "<" and ">" would be escaped a second time.
        escaped_content = (
            match.group("code")
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

        placeholder = f"CODEBLOCKPLACEHOLDER{len(code_blocks)}"
        if language:
            html_code_block = (
                f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
            )
        else:
            html_code_block = f"<pre><code>{escaped_content}</code></pre>"
        code_blocks[placeholder] = html_code_block
        return placeholder

    # One positional pass: each match is replaced exactly where it was found
    # instead of being searched for again in the partially rewritten text.
    modified_text = code_block_pattern.sub(replacer, text)

    return modified_text, code_blocks
87
+
88
+
89
def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
    """Put the HTML code blocks back in place of their placeholders.

    All placeholders are substituted in a single left-to-right pass, so HTML
    that has just been inserted is never scanned again: code that happens to
    contain placeholder-like text stays intact. Longer placeholders are tried
    first, so ``CODEBLOCKPLACEHOLDER1`` cannot consume the beginning of
    ``CODEBLOCKPLACEHOLDER10``.

    Only the first occurrence of each placeholder is replaced; later
    occurrences are left as they are. ``code_blocks`` is not modified.
    """
    if not code_blocks:
        return text

    # Working copy: an entry is removed once used, which keeps the
    # "first occurrence only" behaviour without touching the caller's dict.
    pending = dict(code_blocks)
    placeholder_re = re.compile(
        "|".join(re.escape(key) for key in sorted(pending, key=len, reverse=True))
    )

    def restore(match):
        found = match.group(0)
        return pending.pop(found, found)

    return placeholder_re.sub(restore, text)
@@ -0,0 +1,239 @@
1
+ import re
2
+ from html.parser import HTMLParser
3
+
4
+ MAX_LENGTH = 4096
5
+ MIN_LENGTH = 500
6
+
7
+
8
class HTMLTagTracker(HTMLParser):
    """Track which formatting tags are still open in the HTML fed so far.

    ``open_tags`` holds ``(tag, attrs)`` pairs in opening order, where
    ``attrs`` is the attribute list handed over by ``HTMLParser``. Only tags
    listed in ``_TRACKED_TAGS`` are recorded; start tags of any other name
    are ignored.
    """

    _TRACKED_TAGS = frozenset({
        "b", "i", "u", "s", "code", "pre", "a", "span", "blockquote",
        "strong", "em", "ins", "strike", "del", "tg-spoiler", "tg-emoji",
    })

    def __init__(self):
        super().__init__()
        self.open_tags = []

    def handle_starttag(self, tag, attrs):
        """Record ``tag`` as open when it is one of the tracked tags."""
        if tag in self._TRACKED_TAGS:
            self.open_tags.append((tag, attrs))

    def handle_endtag(self, tag):
        """Forget the most recently opened tag named ``tag``, if any."""
        for index in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[index][0] == tag:
                del self.open_tags[index]
                break

    def get_open_tags_html(self):
        """Return start tags that reopen every open tag, outermost first.

        Attributes without a value (``<blockquote expandable>``) are written
        as the bare name. Attribute values are escaped again because the
        parser hands them over unescaped; without that a value containing
        ``&`` or ``"`` would produce a broken tag.
        """
        from html import escape

        parts = []
        for tag, attrs in self.open_tags:
            rendered = [
                name if value is None else f'{name}="{escape(value, quote=True)}"'
                for name, value in attrs
            ]
            attr_str = " " + " ".join(rendered) if rendered else ""
            parts.append(f"<{tag}{attr_str}>")
        return "".join(parts)

    def get_closing_tags_html(self):
        """Return end tags that close every open tag, innermost first."""
        return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
38
+
39
+
40
def split_pre_block(pre_block: str, max_length: int) -> list[str]:
    """Split one ``<pre>`` block into several complete ``<pre>`` blocks.

    The content is divided at line boundaries and every piece is wrapped in
    the same opening and closing tags as the input, so each returned string
    is a self-contained block.

    Args:
        pre_block: Either ``<pre><code ...>...</code></pre>`` (attributes of
            the ``<code>`` tag are repeated on every piece) or a plain
            ``<pre>...</pre>``. The string is expected to start and end with
            these tags; for the plain form the first 5 and last 6 characters
            are dropped without being checked.
        max_length: Maximum length of a returned block, wrapping tags
            included.

    Returns:
        The blocks in order. A single line that does not fit on its own is
        returned unsplit in a block longer than ``max_length``. A block with
        empty content yields an empty list.
    """
    # language-aware: <pre><code class="language-python">...</code></pre>
    match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
    if match:
        attr, content = match.groups()
        opening, closing = f"<pre><code{attr}>", "</code></pre>"
    else:
        # regular <pre>...</pre>
        content = pre_block[5:-6]
        opening, closing = "<pre>", "</pre>"

    overhead = len(opening) + len(closing)
    chunks: list[str] = []
    buf = ""
    for line in content.splitlines(keepends=True):
        # Flush only a non-empty buffer: an over-long first line must not
        # produce an empty block in front of it.
        if buf and len(buf) + len(line) + overhead > max_length:
            chunks.append(f"{opening}{buf}{closing}")
            buf = ""
        buf += line
    if buf:
        chunks.append(f"{opening}{buf}{closing}")
    return chunks
84
+
85
+
86
+ def _is_only_tags(block: str) -> bool:
87
+ return bool(re.fullmatch(r'(?:\s*<[^>]+>\s*)+', block))
88
+
89
+
90
def _effective_length(content: str) -> int:
    """Return the length ``content`` is budgeted at when used as a chunk.

    The result is ``len(content)`` plus the length of the start tags and the
    end tags that ``HTMLTagTracker`` reports for the tags still open after
    parsing ``content``.

    NOTE(review): ``split_html_for_telegram`` passes content that already
    begins with the reopened prefix, so the start tags look like they are
    counted on top of it. Presumably a deliberate safety margin; confirm
    before changing.
    """
    tag_state = HTMLTagTracker()
    tag_state.feed(content)
    reopen_cost = len(tag_state.get_open_tags_html())
    close_cost = len(tag_state.get_closing_tags_html())
    return reopen_cost + len(content) + close_cost
94
+
95
+
96
def split_html_for_telegram(text: str, trim_empty_leading_lines: bool = False, max_length: int = MAX_LENGTH) -> list[str]:
    """Split long HTML-formatted text into Telegram-compatible chunks.

    Complete ``<pre>...</pre>`` blocks are cut line by line with
    ``split_pre_block``. The remaining text is cut at blank lines, ``<br>``
    tags and newlines. A piece that still does not fit is broken into tags,
    whitespace runs and words, and only a single token that is too long is
    cut at an arbitrary character position. Tags left open at the end of a
    chunk are closed there and reopened at the start of the next chunk.
    Finally, neighbouring chunks are merged for as long as they fit.

    Parameters
    ----------
    text: str
        Input HTML text.
    trim_empty_leading_lines: bool, optional
        If True, newline characters are stripped from the start of every
        returned chunk except the first.
    max_length: int, optional
        Maximum allowed length for a single chunk (must be >= ``MIN_LENGTH``).
        Defaults to ``MAX_LENGTH``.

    Returns
    -------
    list[str]
        List of HTML chunks.

    Raises
    ------
    ValueError
        If ``max_length`` is below ``MIN_LENGTH``, if a piece longer than
        ``max_length`` consists of tags only, or if not even one character of
        a piece fits into an empty chunk.
    """
    if max_length < MIN_LENGTH:
        raise ValueError("max_length should be at least %d" % MIN_LENGTH)

    pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
    parts = pattern.split(text)

    chunks: list[str] = []
    prefix = ""   # start tags reopened at the beginning of the chunk being built
    current = ""  # content collected for the chunk being built
    # Tokenizer for oversized pieces: tags and whitespace runs are separators
    # (kept in the result), everything between them is a word. Tags are
    # matched first so that the space inside '<a href="...">' never becomes a
    # split point. The previous pattern r"(\\s+)" matched a literal backslash
    # followed by "s", so whitespace splitting never took place.
    boundary_re = re.compile(r"(<[^>]+>|\s+)")

    def finalize():
        """Close the open tags, store the chunk and start a new one."""
        nonlocal current, prefix
        tracker = HTMLTagTracker()
        tracker.feed(prefix + current)
        chunks.append(prefix + current + tracker.get_closing_tags_html())
        prefix = tracker.get_open_tags_html()
        current = ""

    def split_on_boundaries(chunk: str) -> list[str] | None:
        """Return the tokens of ``chunk``, or None when it is one token."""
        tokens = [token for token in boundary_re.split(chunk) if token]
        return tokens if len(tokens) > 1 else None

    def fittable_prefix_length(chunk: str) -> int:
        """Binary-search how many leading characters of ``chunk`` still fit."""
        low, high = 1, len(chunk)
        best = 0
        while low <= high:
            mid = (low + high) // 2
            if _effective_length(prefix + current + chunk[:mid]) <= max_length:
                best = mid
                low = mid + 1
            else:
                high = mid - 1
        return best

    def append_piece(piece: str):
        """Add ``piece`` to the chunk being built, starting new chunks as needed."""
        nonlocal current

        while piece:
            if _effective_length(prefix + current + piece) <= max_length:
                current += piece
                return

            oversized = len(piece) > max_length
            if oversized and _is_only_tags(piece):
                raise ValueError("block contains only html tags")
            if not oversized and current:
                # The piece may fit once it has a chunk of its own.
                finalize()
                continue

            tokens = split_on_boundaries(piece)
            if tokens:
                for token in tokens:
                    append_piece(token)
                return

            # A single token that is too long: cut it wherever it fits.
            fitted = fittable_prefix_length(piece)
            if fitted == 0:
                if current:
                    finalize()
                    continue
                raise ValueError("unable to split content within max_length")

            current += piece[:fitted]
            piece = piece[fitted:]

            if piece:
                finalize()

    # re.split with one capturing group alternates plain text (even indexes)
    # with captured <pre> blocks (odd indexes). Using the position instead of
    # startswith("<pre>") keeps an unclosed <pre> in plain text from being
    # handed to split_pre_block, which would slice off its last characters.
    for index, part in enumerate(parts):
        if not part:
            continue
        if index % 2 == 1:
            for pre_chunk in split_pre_block(part, max_length=max_length):
                append_piece(pre_chunk)
            continue
        for block in re.split(r"(\n\s*\n|<br\s*/?>|\n)", part):
            if block:
                append_piece(block)

    if current:
        finalize()

    # Merge neighbouring chunks again while the result stays within the limit.
    merged: list[str] = []
    buf = ""
    for chunk in chunks:
        if len(buf) + len(chunk) <= max_length:
            buf += chunk
        else:
            if buf:
                merged.append(buf)
            buf = chunk.lstrip("\n") if trim_empty_leading_lines and merged else chunk
    if buf:
        merged.append(buf.lstrip("\n") if trim_empty_leading_lines and merged else buf)

    return merged
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.6
3
+ Version: 0.3.8
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
- Home-page: https://github.com/Latand/formatter-chatgpt-telegram
5
+ Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
7
7
  Author-email: latand666@gmail.com
8
8
  Classifier: Programming Language :: Python :: 3
@@ -1,13 +1,16 @@
1
1
  LICENSE
2
+ README.md
2
3
  setup.py
3
4
  chatgpt_md_converter/__init__.py
4
5
  chatgpt_md_converter/converters.py
5
6
  chatgpt_md_converter/extractors.py
6
7
  chatgpt_md_converter/formatters.py
7
8
  chatgpt_md_converter/helpers.py
9
+ chatgpt_md_converter/html_splitter.py
8
10
  chatgpt_md_converter/telegram_formatter.py
9
11
  chatgpt_md_converter.egg-info/PKG-INFO
10
12
  chatgpt_md_converter.egg-info/SOURCES.txt
11
13
  chatgpt_md_converter.egg-info/dependency_links.txt
12
14
  chatgpt_md_converter.egg-info/top_level.txt
13
- tests/test_parser.py
15
+ tests/test_parser.py
16
+ tests/test_splitter.py
@@ -2,13 +2,13 @@ from setuptools import setup
2
2
 
3
3
  setup(
4
4
  name="chatgpt_md_converter",
5
- version="0.3.6",
5
+ version="0.3.8",
6
6
  author="Kostiantyn Kriuchkov",
7
7
  author_email="latand666@gmail.com",
8
8
  description="A package for converting markdown to HTML for chat Telegram bots",
9
- long_description=open("README.MD").read(),
9
+ long_description=open("README.md").read(),
10
10
  long_description_content_type="text/markdown",
11
- url="https://github.com/Latand/formatter-chatgpt-telegram",
11
+ url="https://github.com/botfather-dev/formatter-chatgpt-telegram",
12
12
  classifiers=[
13
13
  "Programming Language :: Python :: 3",
14
14
  "License :: OSI Approved :: MIT License",
@@ -730,3 +730,208 @@ def test_ukrainian_text_with_inline_code():
730
730
  expected_output = """звісно, майстре тестування. ой та зрозуміло <code>&lt;LAUGH&gt;</code> що ти тут тестуєш."""
731
731
  output = telegram_format(input_text)
732
732
  assert output == expected_output, f"Output was: {output}"
733
+
734
+
735
+ def test_nested_code_fence_quadruple():
736
+ input_text = """````markdown
737
+ ```python
738
+ def hello_world():
739
+ print("Hello, World!")
740
+ ```
741
+ ````"""
742
+ expected_output = (
743
+ "<pre><code class=\"language-markdown\">```python\n"
744
+ "def hello_world():\n print(\"Hello, World!\")\n```\n</code></pre>"
745
+ )
746
+ output = telegram_format(input_text)
747
+ def show_output():
748
+ print(f"Expected was: \n\n{expected_output}\n\n")
749
+ print(f"output was: \n\n{output}")
750
+ assert output == expected_output, show_output()
751
+
752
+
753
+ def test_nested_code_fence_quadruple_no_lang():
754
+ input_text = """````
755
+ ```python
756
+ print('hi')
757
+ ```
758
+ ````"""
759
+ expected_output = (
760
+ "<pre><code>```python\nprint('hi')\n```\n</code></pre>"
761
+ )
762
+ output = telegram_format(input_text)
763
+ def show_output():
764
+ print(f"Expected was: \n\n{expected_output}\n\n")
765
+ print(f"output was: \n\n{output}")
766
+ assert output == expected_output, show_output()
767
+
768
+
769
+ def test_nested_code_fence_five_backticks():
770
+ input_text = """`````markdown
771
+ ````python
772
+ print(1)
773
+ ````
774
+ `````"""
775
+ expected_output = (
776
+ "<pre><code class=\"language-markdown\">````python\nprint(1)\n````\n</code></pre>"
777
+ )
778
+ output = telegram_format(input_text)
779
+ def show_output():
780
+ print(f"Expected was: \n\n{expected_output}\n\n")
781
+ print(f"output was: \n\n{output}")
782
+ assert output == expected_output, show_output()
783
+
784
+
785
+ def test_nested_code_fence_five_backticks_with_inner_triple():
786
+ input_text = """`````markdown
787
+ ````python
788
+ print("hello world ```")
789
+ ````
790
+ `````"""
791
+ expected_output = (
792
+ "<pre><code class=\"language-markdown\">````python\n"
793
+ "print(\"hello world ```\")\n````\n</code></pre>"
794
+ )
795
+ output = telegram_format(input_text)
796
+ def show_output():
797
+ print(f"Expected was: \n\n{expected_output}\n\n")
798
+ print(f"output was: \n\n{output}")
799
+ assert output == expected_output, show_output()
800
+
801
+
802
+ def test_nested_code_fence_six_backticks():
803
+ input_text = """``````markdown
804
+ `````python
805
+ print('hi')
806
+ `````
807
+ ``````"""
808
+ expected_output = """<pre><code class=\"language-markdown\">`````python
809
+ print('hi')
810
+ `````
811
+ </code></pre>"""
812
+ output = telegram_format(input_text)
813
+ def show_output():
814
+ print(f"Expected was: \n\n{expected_output}\n\n")
815
+ print(f"output was: \n\n{output}")
816
+ assert output == expected_output, show_output()
817
+
818
+
819
+ def test_nested_code_fence_plain_text():
820
+ input_text = """
821
+ ````markdown
822
+ ```
823
+ hello
824
+ ```
825
+ ````"""
826
+ expected_output = """<pre><code class=\"language-markdown\">```
827
+ hello
828
+ ```
829
+ </code></pre>"""
830
+ output = telegram_format(input_text)
831
+ def show_output():
832
+ print(f"Expected was: \n\n{expected_output}\n\n")
833
+ print(f"output was: \n\n{output}")
834
+ assert output == expected_output, show_output()
835
+
836
+
837
+
838
+
839
+
840
+ def test_expensive_nested_code_five_fence_plain_text():
841
+ input_text = """
842
+ `````markdown
843
+ ````
844
+ ```python
845
+ print("hello world ```")
846
+ ```
847
+ `````"""
848
+
849
+ expected_output = """<pre><code class=\"language-markdown\">````
850
+ ```python
851
+ print("hello world ```")
852
+ ```
853
+ </code></pre>"""
854
+ output = telegram_format(input_text)
855
+ def show_output():
856
+ print(f"Expected was: \n\n{expected_output}\n\n")
857
+ print(f"output was: \n\n{output}")
858
+ assert output == expected_output, show_output()
859
+
860
+ def test_another_expensive_nested_code_five_fence_plain_text():
861
+ input_text = """`````markdown
862
+ ````python
863
+ print("hello world ```"')
864
+ ```
865
+ `````"""
866
+
867
+ expected_output = """<pre><code class=\"language-markdown\">````python
868
+ print("hello world ```"')
869
+ ```
870
+ </code></pre>"""
871
+ output = telegram_format(input_text)
872
+ def show_output():
873
+ print(f"Expected was: \n\n{expected_output}\n\n")
874
+ print(f"output was: \n\n{output}")
875
+ assert output == expected_output, show_output()
876
+
877
+ def test_hard_level_nested_code_five_fence_plain_text():
878
+ input_text = """`````markdown
879
+ ````python
880
+ print("hello world ```"')
881
+ ````
882
+ `````
883
+ ```python
884
+ print("Some another text")""" # That's where closing the second block of python code is missing.
885
+
886
+ expected_output = """<pre><code class="language-markdown">````python
887
+ print("hello world ```"')
888
+ ````
889
+ </code></pre>
890
+ <pre><code class="language-python">print("Some another text")
891
+ </code></pre>""" # But the code block is still closed correctly.
892
+
893
+ output = telegram_format(input_text)
894
+ def show_output():
895
+ print(f"Expected was: \n\n{expected_output}\n\n")
896
+ print(f"output was: \n\n{output}")
897
+ assert output == expected_output, show_output()
898
+
899
+ def test_hard_level_nested_code_five_fence_plain_text_2():
900
+ input_text = """`````markdown
901
+ ````python
902
+ print("hello world ```"')
903
+ `````
904
+ ```python
905
+ print("Some another text")""" # That's where closing the second block of python code is missing.
906
+
907
+ expected_output = """<pre><code class="language-markdown">````python
908
+ print("hello world ```"')
909
+ </code></pre>
910
+ <pre><code class="language-python">print("Some another text")
911
+ </code></pre>""" # But the code block is still closed correctly.
912
+
913
+ output = telegram_format(input_text)
914
+ def show_output():
915
+ print(f"Expected was: \n\n{expected_output}\n\n")
916
+ print(f"output was: \n\n{output}")
917
+ assert output == expected_output, show_output()
918
+
919
+ def test_some_new():
920
+ input_text = """
921
+ ``````markdown
922
+ `````
923
+ ````python
924
+ print("hello world ```")
925
+ ```
926
+ """ # Markdown code wasn't closed
927
+
928
+ expected_output = """<pre><code class=\"language-markdown\">`````
929
+ ````python
930
+ print("hello world ```")
931
+ ```
932
+ </code></pre>""" # But after closed correctly
933
+ output = telegram_format(input_text)
934
+ def show_output():
935
+ print(f"Expected was: \n\n{expected_output}\n\n")
936
+ print(f"output was: \n\n{output}")
937
+ assert output == expected_output, show_output()
@@ -0,0 +1,103 @@
1
+ import re
2
+
3
+ import pytest
4
+
5
+ from chatgpt_md_converter.html_splitter import (MIN_LENGTH,
6
+ split_html_for_telegram)
7
+
8
+ from . import html_examples
9
+
10
+
11
+ def test_html_splitter():
12
+ chunks = split_html_for_telegram(html_examples.input_text)
13
+ valid_chunks = [
14
+ html_examples.valid_chunk_1,
15
+ html_examples.valid_chunk_2,
16
+ html_examples.valid_chunk_3,
17
+ ]
18
+ for index, chunk in enumerate(chunks):
19
+ assert chunk == valid_chunks[index], (
20
+ f"expected: \n\n{valid_chunks[index]} \n\n got: \n\n{chunk}"
21
+ )
22
+
23
+ def test_html_splitter__remove_leading_brakes():
24
+ chunks = split_html_for_telegram(html_examples.input_text, trim_empty_leading_lines=True)
25
+ valid_chunks = [
26
+ html_examples.valid_chunk_1,
27
+ html_examples.valid_chunk_2,
28
+ html_examples.valid_chunk_3_remove_leading_brakes,
29
+ ]
30
+ for index, chunk in enumerate(chunks):
31
+ assert chunk == valid_chunks[index], (
32
+ f"expected: \n\n{valid_chunks[index]} \n\n got: \n\n{chunk}"
33
+ )
34
+
35
+ def test_html_splitter_max_length_550():
36
+ chunks = split_html_for_telegram(
37
+ html_examples.long_code_input, max_length=550, trim_empty_leading_lines=True
38
+ )
39
+
40
+ def load_expected_chunks_550():
41
+ raw = re.split(r"END\n?", html_examples.expected_550)
42
+ chunks = []
43
+ for part in raw:
44
+ if not part.strip():
45
+ continue
46
+ lines = part.splitlines()
47
+ chunks.append("\n".join(lines[1:]))
48
+ return chunks
49
+
50
+ valid_chunks = load_expected_chunks_550()
51
+ for index, chunk in enumerate(chunks):
52
+ assert chunk == valid_chunks[index], (
53
+ f"expected: \n\n{valid_chunks[index]} \n\n got: \n\n{chunk}"
54
+ )
55
+ assert len(chunk) <= 550
56
+
57
+ def test_split_html_respects_max_length_by_words():
58
+ text = "<b>" + "<i>word</i> " * 100 + "</b>"
59
+ chunks = split_html_for_telegram(text, max_length=550)
60
+ assert len(chunks) > 1
61
+ for chunk in chunks:
62
+ assert len(chunk) <= 550
63
+ assert chunk.startswith("<b>")
64
+ assert chunk.endswith("</b>")
65
+ assert chunk.count("<i>") == chunk.count("</i>")
66
+
67
+
68
+ def test_split_html_only_tags_raises():
69
+ text = "<b></b>" * 200
70
+ with pytest.raises(ValueError):
71
+ split_html_for_telegram(text, max_length=600)
72
+
73
+
74
+ def test_split_html_min_length_enforced():
75
+ with pytest.raises(ValueError):
76
+ split_html_for_telegram("hello", max_length=MIN_LENGTH - 1)
77
+
78
+
79
+ def test_split_html_long_word_exceeds_limit():
80
+ text = "a" * 600
81
+ chunks = split_html_for_telegram(text, max_length=550)
82
+ assert chunks == ["a" * 550, "a" * 50]
83
+
84
+
85
+ LONG_TEXT = "<b><i>" + "word " * 96 + "word!" + "</i></b>"
86
+
87
+ SHORT_TEXT = "<u>" + "another " * 9 + "another" + "</u>"
88
+
89
+
90
+ def test_split_html_keeps_newline_without_trim():
91
+ text = LONG_TEXT + "\n\n" + SHORT_TEXT
92
+ chunks = split_html_for_telegram(text, max_length=500, trim_empty_leading_lines=False)
93
+ assert chunks[0] == LONG_TEXT
94
+ assert chunks[1].startswith("\n")
95
+ assert chunks[1].endswith(SHORT_TEXT)
96
+ assert chunks[1].lstrip("\n").startswith("<u>")
97
+ assert chunks[1].lstrip("\n").endswith("</u>")
98
+
99
+
100
+ def test_split_html_trims_leading_newline_on_new_chunk():
101
+ text = LONG_TEXT + "\n\n" + SHORT_TEXT
102
+ chunks = split_html_for_telegram(text, max_length=500, trim_empty_leading_lines=True)
103
+ assert chunks == [LONG_TEXT, SHORT_TEXT]
@@ -1,3 +0,0 @@
1
- from .telegram_formatter import telegram_format
2
-
3
- __all__ = ["telegram_format"]
@@ -1,61 +0,0 @@
1
- import re
2
-
3
-
4
- def ensure_closing_delimiters(text: str) -> str:
5
- """
6
- Ensures that if an opening ` or ``` is found without a matching closing delimiter,
7
- the missing delimiter is appended to the end of the text.
8
- """
9
- # For triple backticks
10
- if text.count("```") % 2 != 0:
11
- text += "```"
12
- # For single backticks
13
- if text.count("`") % 2 != 0:
14
- text += "`"
15
- return text
16
-
17
-
18
- def extract_and_convert_code_blocks(text: str):
19
- """
20
- Extracts code blocks from the text, converting them to HTML <pre><code> format,
21
- and replaces them with placeholders. Also ensures closing delimiters for unmatched blocks.
22
- """
23
- text = ensure_closing_delimiters(text)
24
- placeholders = []
25
- code_blocks = {}
26
-
27
- def replacer(match):
28
- language = match.group(1) if match.group(1) else ""
29
- code_content = match.group(3)
30
-
31
- # Properly escape HTML entities in code content
32
- escaped_content = (
33
- code_content.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
34
- )
35
-
36
- placeholder = f"CODEBLOCKPLACEHOLDER{len(placeholders)}"
37
- placeholders.append(placeholder)
38
- if not language:
39
- html_code_block = f"<pre><code>{escaped_content}</code></pre>"
40
- else:
41
- html_code_block = (
42
- f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
43
- )
44
- return (placeholder, html_code_block)
45
-
46
- modified_text = text
47
- for match in re.finditer(r"```(\w*)?(\n)?(.*?)```", text, flags=re.DOTALL):
48
- placeholder, html_code_block = replacer(match)
49
- code_blocks[placeholder] = html_code_block
50
- modified_text = modified_text.replace(match.group(0), placeholder, 1)
51
-
52
- return modified_text, code_blocks
53
-
54
-
55
- def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
56
- """
57
- Reinserts HTML code blocks into the text, replacing their placeholders.
58
- """
59
- for placeholder, html_code_block in code_blocks.items():
60
- text = text.replace(placeholder, html_code_block, 1)
61
- return text