chatgpt-md-converter 0.3.6__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/PKG-INFO +2 -2
- chatgpt_md_converter-0.3.8/README.md +147 -0
- chatgpt_md_converter-0.3.8/chatgpt_md_converter/__init__.py +4 -0
- chatgpt_md_converter-0.3.8/chatgpt_md_converter/extractors.py +95 -0
- chatgpt_md_converter-0.3.8/chatgpt_md_converter/html_splitter.py +239 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/PKG-INFO +2 -2
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/SOURCES.txt +4 -1
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/setup.py +3 -3
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/tests/test_parser.py +205 -0
- chatgpt_md_converter-0.3.8/tests/test_splitter.py +103 -0
- chatgpt_md_converter-0.3.6/chatgpt_md_converter/__init__.py +0 -3
- chatgpt_md_converter-0.3.6/chatgpt_md_converter/extractors.py +0 -61
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/LICENSE +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/converters.py +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/formatters.py +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/helpers.py +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/telegram_formatter.py +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
- {chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/setup.cfg +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.8
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
7
7
|
Author-email: latand666@gmail.com
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# ChatGPT Markdown to Telegram HTML Parser
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
This project provides a solution for converting Telegram-style Markdown formatted text into HTML markup supported by the Telegram Bot API, specifically tailored for use in ChatGPT bots developed with the OpenAI API. It includes features for handling various Markdown elements and ensures proper tag closure, making it suitable for streaming mode applications.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Converts Telegram-style Markdown syntax to Telegram-compatible HTML
|
|
10
|
+
- Supports text styling:
|
|
11
|
+
- Bold: `**text**` → `<b>text</b>`
|
|
12
|
+
- Italic: `*text*` or `_text_` → `<i>text</i>`
|
|
13
|
+
- Underline: `__text__` → `<u>text</u>`
|
|
14
|
+
- Strikethrough: `~~text~~` → `<s>text</s>`
|
|
15
|
+
- Spoiler: `||text||` → `<span class="tg-spoiler">text</span>`
|
|
16
|
+
- Inline code: `` `code` `` → `<code>code</code>`
|
|
17
|
+
- Handles nested text styling
|
|
18
|
+
- Converts links: `[text](URL)` → `<a href="URL">text</a>`
|
|
19
|
+
- Processes code blocks with language specification
|
|
20
|
+
- Supports blockquotes:
|
|
21
|
+
- Regular blockquotes: `> text` → `<blockquote>text</blockquote>`
|
|
22
|
+
- Expandable blockquotes: `**> text` → `<blockquote expandable>text</blockquote>`
|
|
23
|
+
- Automatically appends missing closing delimiters for code blocks
|
|
24
|
+
- Escapes HTML special characters to prevent unwanted HTML rendering
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
To use the Markdown to Telegram HTML Parser in your ChatGPT bot, integrate the provided Python functions into your bot's processing pipeline. Here is a brief overview of how to incorporate the parser:
|
|
29
|
+
|
|
30
|
+
1. **Ensure Closing Delimiters**: Automatically appends missing closing delimiters for backticks to ensure proper parsing.
|
|
31
|
+
|
|
32
|
+
2. **Extract and Convert Code Blocks**: Extracts Markdown code blocks, converts them to HTML `<pre><code>` format, and replaces them with placeholders to prevent formatting within code blocks.
|
|
33
|
+
|
|
34
|
+
3. **Markdown to HTML Conversion**: Applies various regex substitutions and custom logic to convert supported Markdown formatting to Telegram-compatible HTML tags.
|
|
35
|
+
|
|
36
|
+
4. **Reinsert Code Blocks**: Reinserts the previously extracted and converted code blocks back into the main text, replacing placeholders with the appropriate HTML content.
|
|
37
|
+
|
|
38
|
+
Simply call the `telegram_format(text: str) -> str` function with your Markdown-formatted text as input to receive the converted HTML output ready for use with the Telegram Bot API.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```sh
|
|
43
|
+
pip install chatgpt-md-converter
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Example
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from chatgpt_md_converter import telegram_format
|
|
50
|
+
|
|
51
|
+
# Basic formatting example
|
|
52
|
+
text = """
|
|
53
|
+
Here is some **bold**, __underline__, and `inline code`.
|
|
54
|
+
This is a ||spoiler text|| and *italic*.
|
|
55
|
+
|
|
56
|
+
Code example:
|
|
57
|
+
print('Hello, world!')
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
# Blockquotes example
|
|
61
|
+
blockquote_text = """
|
|
62
|
+
> Regular blockquote
|
|
63
|
+
> Multiple lines
|
|
64
|
+
|
|
65
|
+
**> Expandable blockquote
|
|
66
|
+
> Hidden by default
|
|
67
|
+
> Multiple lines
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
formatted_text = telegram_format(text)
|
|
71
|
+
formatted_blockquote = telegram_format(blockquote_text)
|
|
72
|
+
|
|
73
|
+
print(formatted_text)
|
|
74
|
+
print(formatted_blockquote)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Output:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Here is some <b>bold</b>, <u>underline</u>, and <code>inline code</code>.
|
|
81
|
+
This is a <span class="tg-spoiler">spoiler text</span> and <i>italic</i>.
|
|
82
|
+
|
|
83
|
+
Code example:
|
|
84
|
+
print('Hello, world!')
|
|
85
|
+
|
|
86
|
+
<blockquote>Regular blockquote
|
|
87
|
+
Multiple lines</blockquote>
|
|
88
|
+
|
|
89
|
+
<blockquote expandable>Expandable blockquote
|
|
90
|
+
Hidden by default
|
|
91
|
+
Multiple lines</blockquote>
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Requirements
|
|
95
|
+
|
|
96
|
+
- Python 3.x
|
|
97
|
+
- No external libraries required (uses only the Python standard library: `re` for regex operations and `html.parser` for tracking open tags when splitting long messages)
|
|
98
|
+
|
|
99
|
+
## Contribution
|
|
100
|
+
|
|
101
|
+
Feel free to contribute to this project by submitting pull requests or opening issues for bugs, feature requests, or improvements.
|
|
102
|
+
|
|
103
|
+
## Prompting LLMs for Telegram-Specific Formatting
|
|
104
|
+
|
|
105
|
+
> **Note**:
|
|
106
|
+
> Since standard Markdown doesn't include Telegram-specific features like spoilers (`||text||`) and expandable blockquotes (`**> text`), you'll need to explicitly instruct LLMs to use these formats. Here's a suggested prompt addition to include in your system message or initial instructions:
|
|
107
|
+
|
|
108
|
+
````
|
|
109
|
+
When formatting your responses for Telegram, please use these special formatting conventions:
|
|
110
|
+
|
|
111
|
+
1. For content that should be hidden as a spoiler (revealed only when users click):
|
|
112
|
+
Use: ||spoiler content here||
|
|
113
|
+
Example: This is visible, but ||this is hidden until clicked||.
|
|
114
|
+
|
|
115
|
+
2. For lengthy explanations or optional content that should be collapsed:
|
|
116
|
+
Use: **> Expandable section title
|
|
117
|
+
|
|
118
|
+
> Content line 1
|
|
119
|
+
> Content line 2
|
|
120
|
+
> (Each line of the expandable blockquote should start with ">")
|
|
121
|
+
|
|
122
|
+
3. Continue using standard markdown for other formatting:
|
|
123
|
+
- **bold text**
|
|
124
|
+
- *italic text*
|
|
125
|
+
- __underlined text__
|
|
126
|
+
- ~~strikethrough~~
|
|
127
|
+
- `inline code`
|
|
128
|
+
- ```code blocks```
|
|
129
|
+
- [link text](URL)
|
|
130
|
+
|
|
131
|
+
Apply spoilers for:
|
|
132
|
+
|
|
133
|
+
- Solution reveals
|
|
134
|
+
- Potential plot spoilers
|
|
135
|
+
- Sensitive information
|
|
136
|
+
- Surprising facts
|
|
137
|
+
|
|
138
|
+
Use expandable blockquotes for:
|
|
139
|
+
|
|
140
|
+
- Detailed explanations
|
|
141
|
+
- Long examples
|
|
142
|
+
- Optional reading
|
|
143
|
+
- Technical details
|
|
144
|
+
- Additional context not needed by all users
|
|
145
|
+
````
|
|
146
|
+
|
|
147
|
+
You can add this prompt to your system message when initializing your ChatGPT interactions to ensure the model properly formats content for optimal display in Telegram.
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def ensure_closing_delimiters(text: str) -> str:
    """Return *text* with any missing closing backtick delimiters appended.

    Three repairs are applied in order:

    1. A fenced block opened on its own line (three or more backticks,
       optionally followed by a language word) that is never closed gets a
       matching fence appended on a new line.
    2. An odd number of triple backticks outside complete fenced blocks gets
       one more triple backtick appended.
    3. An odd number of single backticks outside complete fenced blocks gets
       one more backtick appended.
    """
    fenced_block = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )
    fence_opener = re.compile(r"^(?P<fence>`{3,})(?P<lang>\w+)?$")

    # While a fence is pending, every line is plain content until a line
    # ending with that exact fence shows up; this lets shorter fence-like
    # strings live inside a longer fenced block.
    pending = None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if pending is not None:
            if candidate.endswith(pending):
                pending = None
            continue
        opened = fence_opener.match(candidate)
        if opened:
            pending = opened.group("fence")

    # Close a fence that was left open, on its own line.
    if pending is not None:
        separator = "" if text.endswith("\n") else "\n"
        text = f"{text}{separator}{pending}"

    # Balance stray triple backticks that are not part of a complete fence.
    if fenced_block.sub("", text).count("```") % 2:
        text += "```"

    # Balance single backticks outside fenced blocks (re-evaluated because
    # the previous step may have completed another fenced block).
    if fenced_block.sub("", text).count("`") % 2:
        text += "`"

    return text
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def extract_and_convert_code_blocks(text: str):
    """Replace fenced code blocks with placeholders and build their HTML.

    Missing closing delimiters are first repaired with
    ``ensure_closing_delimiters``. Each fenced block is then swapped for a
    ``CODEBLOCKPLACEHOLDER<n>`` token so later Markdown processing cannot
    touch the code.

    Returns:
        A ``(modified_text, code_blocks)`` tuple, where ``modified_text`` is
        the input with every fenced block replaced by its placeholder and
        ``code_blocks`` maps each placeholder to a
        ``<pre><code>...</code></pre>`` string. The ``<code>`` tag carries a
        ``class="language-<lang>"`` attribute when the fence named a language.
    """
    # Local import: this module does not import ``html`` at file level.
    from html import escape

    text = ensure_closing_delimiters(text)
    code_blocks = {}

    def replacer(match):
        language = match.group("lang") or ""
        # Escape &, < and > so the code is shown literally instead of being
        # parsed as markup; quotes are left untouched.
        escaped_content = escape(match.group("code"), quote=False)

        placeholder = f"CODEBLOCKPLACEHOLDER{len(code_blocks)}"
        class_attr = f' class="language-{language}"' if language else ""
        code_blocks[placeholder] = (
            f"<pre><code{class_attr}>{escaped_content}</code></pre>"
        )
        return placeholder

    code_block_pattern = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )
    # ``sub`` substitutes each match at its own position, so a block can
    # never be confused with identical text elsewhere in the document.
    modified_text = code_block_pattern.sub(replacer, text)

    return modified_text, code_blocks
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
    """Put the HTML code blocks back in place of their placeholders.

    Placeholders are processed in the dictionary's iteration order and only
    the first occurrence of each placeholder is substituted.
    """
    restored = text
    for token in code_blocks:
        restored = restored.replace(token, code_blocks[token], 1)
    return restored
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from html.parser import HTMLParser
|
|
3
|
+
|
|
4
|
+
MAX_LENGTH = 4096
|
|
5
|
+
MIN_LENGTH = 500
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HTMLTagTracker(HTMLParser):
    """Track which formatting tags are still open after feeding HTML.

    Feed a fragment with ``feed()``; afterwards ``get_closing_tags_html()``
    returns the closing tags needed to balance the fragment and
    ``get_open_tags_html()`` returns the opening tags needed to resume the
    same formatting in a following fragment.
    """

    # Only these tags are tracked; any other start tag is ignored.
    TRACKED_TAGS = frozenset({
        "b", "i", "u", "s", "code", "pre", "a", "span", "blockquote",
        "strong", "em", "ins", "strike", "del", "tg-spoiler", "tg-emoji",
    })

    def __init__(self):
        super().__init__()
        # Stack of (tag, attrs) pairs, outermost first.
        self.open_tags = []

    def handle_starttag(self, tag, attrs):
        """Remember a tracked tag, together with its attributes, as open."""
        if tag in self.TRACKED_TAGS:
            self.open_tags.append((tag, attrs))

    def handle_endtag(self, tag):
        """Drop the most recently opened tag with the same name, if any."""
        for i in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[i][0] == tag:
                del self.open_tags[i]
                break

    def get_open_tags_html(self):
        """Return the opening tags (outermost first) that are still open."""
        # Local import: this module does not import ``html`` at file level.
        from html import escape

        parts = []
        for tag, attrs in self.open_tags:
            rendered = []
            for name, value in attrs:
                if value is None:
                    # Valueless attribute such as <blockquote expandable>:
                    # emit it bare rather than as expandable="None".
                    rendered.append(f" {name}")
                else:
                    # HTMLParser hands over attribute values already
                    # unescaped, so escape them again before re-emitting.
                    rendered.append(f' {name}="{escape(value, quote=True)}"')
            parts.append(f"<{tag}{''.join(rendered)}>")
        return "".join(parts)

    def get_closing_tags_html(self):
        """Return closing tags for every open tag, innermost first."""
        return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def split_pre_block(pre_block: str, max_length: int) -> list[str]:
    """Split one ``<pre>`` block into smaller, individually wrapped blocks.

    The block content is divided on line boundaries and every resulting chunk
    is wrapped in the same opening and closing tags as the input, so a
    ``<pre><code class="language-x">`` block yields chunks with that same
    wrapper and a plain ``<pre>`` block yields plain ``<pre>`` chunks.

    Args:
        pre_block: A complete ``<pre>...</pre>`` or
            ``<pre><code ...>...</code></pre>`` block.
        max_length: Target maximum length of each chunk, wrapper included.

    Returns:
        The wrapped chunks in order. A block with no content yields an empty
        list. A single line that is longer than ``max_length`` is not broken
        up here and is returned as one oversized chunk.
    """
    # Language-aware form: <pre><code class="language-python">...</code></pre>
    match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
    if match:
        attr, content = match.groups()
        opening, closing = f"<pre><code{attr}>", "</code></pre>"
    else:
        # Regular <pre>...</pre>
        opening, closing = "<pre>", "</pre>"
        content = pre_block[len(opening):-len(closing)]

    # Room left for content once the wrapper tags are accounted for.
    budget = max_length - len(opening) - len(closing)

    chunks, buf = [], ""
    for line in content.splitlines(keepends=True):
        # Flush only a non-empty buffer; otherwise a first line that is too
        # long on its own would produce an empty wrapped chunk.
        if buf and len(buf) + len(line) > budget:
            chunks.append(f"{opening}{buf}{closing}")
            buf = ""
        buf += line
    if buf:
        chunks.append(f"{opening}{buf}{closing}")
    return chunks
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _is_only_tags(block: str) -> bool:
|
|
87
|
+
return bool(re.fullmatch(r'(?:\s*<[^>]+>\s*)+', block))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _effective_length(content: str) -> int:
    """Return the length budget *content* needs as a standalone chunk.

    This is the length of *content* itself plus the length of the opening
    tags and of the closing tags that ``HTMLTagTracker`` reports for
    whatever *content* leaves open.
    """
    probe = HTMLTagTracker()
    probe.feed(content)
    reopen_cost = len(probe.get_open_tags_html())
    close_cost = len(probe.get_closing_tags_html())
    return reopen_cost + len(content) + close_cost
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def split_html_for_telegram(
    text: str,
    trim_empty_leading_lines: bool = False,
    max_length: int = MAX_LENGTH,
) -> list[str]:
    """Split long HTML-formatted text into Telegram-compatible chunks.

    ``<pre>`` blocks are split line by line via ``split_pre_block``; the
    remaining text is split on blank lines, ``<br>`` tags and newlines. A
    piece that still does not fit is broken into tags, whitespace runs and
    words, and as a last resort cut at the longest prefix that fits. Tags
    left open at the end of a chunk are closed there and reopened at the
    start of the next chunk. Adjacent chunks are finally merged again while
    their combined length stays within ``max_length``.

    Parameters
    ----------
    text: str
        Input HTML text.
    trim_empty_leading_lines: bool, optional
        If True, removes leading newline characters from every chunk after
        the first one.
    max_length: int, optional
        Maximum allowed length for a single chunk (must be >= ``MIN_LENGTH``).
        Defaults to ``MAX_LENGTH``.

    Returns
    -------
    list[str]
        List of HTML chunks.

    Raises
    ------
    ValueError
        If ``max_length`` is below ``MIN_LENGTH``, if a piece longer than
        ``max_length`` consists only of tags, or if no prefix of a piece can
        be made to fit.
    """
    if max_length < MIN_LENGTH:
        raise ValueError("max_length should be at least %d" % MIN_LENGTH)

    pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
    parts = pattern.split(text)

    chunks: list[str] = []
    prefix = ""   # tags reopened at the start of the chunk being built
    current = ""  # content accumulated for the chunk being built
    # A token is a whole tag or a whitespace run. Matching tags first keeps
    # a tag with attributes (e.g. <a href="...">) from being split at the
    # space inside it.
    token_re = re.compile(r"(<[^>]+>|\s+)")

    def finalize():
        """Close the chunk being built and carry its open tags forward."""
        nonlocal current, prefix
        tracker = HTMLTagTracker()
        tracker.feed(prefix + current)
        chunks.append(prefix + current + tracker.get_closing_tags_html())
        prefix = tracker.get_open_tags_html()
        current = ""

    def split_into_tokens(chunk: str) -> list[str] | None:
        """Return tags, whitespace runs and the text between them, or None."""
        tokens = [token for token in token_re.split(chunk) if token]
        return tokens if len(tokens) > 1 else None

    def fittable_prefix_length(chunk: str) -> int:
        """Binary-search the longest prefix of *chunk* that still fits."""
        low, high = 1, len(chunk)
        best = 0
        while low <= high:
            mid = (low + high) // 2
            if _effective_length(prefix + current + chunk[:mid]) <= max_length:
                best = mid
                low = mid + 1
            else:
                high = mid - 1
        return best

    def append_piece(piece: str):
        """Add *piece* to the current chunk, starting new chunks as needed."""
        nonlocal current

        while piece:
            if _effective_length(prefix + current + piece) <= max_length:
                current += piece
                return

            oversized = len(piece) > max_length
            if oversized and _is_only_tags(piece):
                raise ValueError("block contains only html tags")

            # The piece could fit in a chunk of its own: flush and retry.
            if current and not oversized:
                finalize()
                continue

            tokens = split_into_tokens(piece)
            if tokens:
                for token in tokens:
                    append_piece(token)
                return

            # A single indivisible token: cut it at the longest fitting prefix.
            fitted = fittable_prefix_length(piece)
            if fitted == 0:
                if current:
                    finalize()
                    continue
                raise ValueError("unable to split content within max_length")

            current += piece[:fitted]
            piece = piece[fitted:]

            if piece:
                finalize()

    for index, part in enumerate(parts):
        if not part:
            continue
        # With one capturing group, ``split`` puts the matched <pre> blocks
        # at odd indices. Checking the position rather than the text keeps an
        # unterminated "<pre>..." segment away from split_pre_block, which
        # would strip its last characters as if they were a closing tag.
        if index % 2 == 1:
            for pre_chunk in split_pre_block(part, max_length=max_length):
                append_piece(pre_chunk)
            continue
        for block in re.split(r"(\n\s*\n|<br\s*/?>|\n)", part):
            if block:
                append_piece(block)

    if current:
        finalize()

    # Merge neighbouring chunks back together while they still fit.
    merged: list[str] = []
    buf = ""
    for chunk in chunks:
        if len(buf) + len(chunk) <= max_length:
            buf += chunk
        else:
            if buf:
                merged.append(buf)
            buf = chunk.lstrip("\n") if trim_empty_leading_lines and merged else chunk
    if buf:
        merged.append(buf.lstrip("\n") if trim_empty_leading_lines and merged else buf)

    return merged
|
{chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/PKG-INFO
RENAMED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.8
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
7
7
|
Author-email: latand666@gmail.com
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
{chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter.egg-info/SOURCES.txt
RENAMED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
LICENSE
|
|
2
|
+
README.md
|
|
2
3
|
setup.py
|
|
3
4
|
chatgpt_md_converter/__init__.py
|
|
4
5
|
chatgpt_md_converter/converters.py
|
|
5
6
|
chatgpt_md_converter/extractors.py
|
|
6
7
|
chatgpt_md_converter/formatters.py
|
|
7
8
|
chatgpt_md_converter/helpers.py
|
|
9
|
+
chatgpt_md_converter/html_splitter.py
|
|
8
10
|
chatgpt_md_converter/telegram_formatter.py
|
|
9
11
|
chatgpt_md_converter.egg-info/PKG-INFO
|
|
10
12
|
chatgpt_md_converter.egg-info/SOURCES.txt
|
|
11
13
|
chatgpt_md_converter.egg-info/dependency_links.txt
|
|
12
14
|
chatgpt_md_converter.egg-info/top_level.txt
|
|
13
|
-
tests/test_parser.py
|
|
15
|
+
tests/test_parser.py
|
|
16
|
+
tests/test_splitter.py
|
|
@@ -2,13 +2,13 @@ from setuptools import setup
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="chatgpt_md_converter",
|
|
5
|
-
version="0.3.
|
|
5
|
+
version="0.3.8",
|
|
6
6
|
author="Kostiantyn Kriuchkov",
|
|
7
7
|
author_email="latand666@gmail.com",
|
|
8
8
|
description="A package for converting markdown to HTML for chat Telegram bots",
|
|
9
|
-
long_description=open("README.
|
|
9
|
+
long_description=open("README.md").read(),
|
|
10
10
|
long_description_content_type="text/markdown",
|
|
11
|
-
url="https://github.com/
|
|
11
|
+
url="https://github.com/botfather-dev/formatter-chatgpt-telegram",
|
|
12
12
|
classifiers=[
|
|
13
13
|
"Programming Language :: Python :: 3",
|
|
14
14
|
"License :: OSI Approved :: MIT License",
|
|
@@ -730,3 +730,208 @@ def test_ukrainian_text_with_inline_code():
|
|
|
730
730
|
expected_output = """звісно, майстре тестування. ой та зрозуміло <code><LAUGH></code> що ти тут тестуєш."""
|
|
731
731
|
output = telegram_format(input_text)
|
|
732
732
|
assert output == expected_output, f"Output was: {output}"
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def _assert_telegram_format(input_text, expected_output):
    """Format *input_text* and compare it with *expected_output*.

    Both texts are included in the assertion message, so a failure shows
    them directly instead of a ``None`` message.
    """
    output = telegram_format(input_text)
    assert output == expected_output, (
        f"Expected was: \n\n{expected_output}\n\noutput was: \n\n{output}"
    )


def test_nested_code_fence_quadruple():
    # NOTE(review): the indentation before print(...) inside both literals
    # was reconstructed as four spaces — confirm against the repository.
    input_text = """````markdown
```python
def hello_world():
    print("Hello, World!")
```
````"""
    expected_output = (
        "<pre><code class=\"language-markdown\">```python\n"
        "def hello_world():\n    print(\"Hello, World!\")\n```\n</code></pre>"
    )
    _assert_telegram_format(input_text, expected_output)


def test_nested_code_fence_quadruple_no_lang():
    input_text = """````
```python
print('hi')
```
````"""
    expected_output = (
        "<pre><code>```python\nprint('hi')\n```\n</code></pre>"
    )
    _assert_telegram_format(input_text, expected_output)


def test_nested_code_fence_five_backticks():
    input_text = """`````markdown
````python
print(1)
````
`````"""
    expected_output = (
        "<pre><code class=\"language-markdown\">````python\nprint(1)\n````\n</code></pre>"
    )
    _assert_telegram_format(input_text, expected_output)


def test_nested_code_fence_five_backticks_with_inner_triple():
    input_text = """`````markdown
````python
print("hello world ```")
````
`````"""
    expected_output = (
        "<pre><code class=\"language-markdown\">````python\n"
        "print(\"hello world ```\")\n````\n</code></pre>"
    )
    _assert_telegram_format(input_text, expected_output)


def test_nested_code_fence_six_backticks():
    input_text = """``````markdown
`````python
print('hi')
`````
``````"""
    expected_output = """<pre><code class=\"language-markdown\">`````python
print('hi')
`````
</code></pre>"""
    _assert_telegram_format(input_text, expected_output)


def test_nested_code_fence_plain_text():
    input_text = """
````markdown
```
hello
```
````"""
    expected_output = """<pre><code class=\"language-markdown\">```
hello
```
</code></pre>"""
    _assert_telegram_format(input_text, expected_output)


def test_expensive_nested_code_five_fence_plain_text():
    input_text = """
`````markdown
````
```python
print("hello world ```")
```
`````"""

    expected_output = """<pre><code class=\"language-markdown\">````
```python
print("hello world ```")
```
</code></pre>"""
    _assert_telegram_format(input_text, expected_output)


def test_another_expensive_nested_code_five_fence_plain_text():
    input_text = """`````markdown
````python
print("hello world ```"')
```
`````"""

    expected_output = """<pre><code class=\"language-markdown\">````python
print("hello world ```"')
```
</code></pre>"""
    _assert_telegram_format(input_text, expected_output)


def test_hard_level_nested_code_five_fence_plain_text():
    # The second (python) block below is deliberately left unclosed.
    input_text = """`````markdown
````python
print("hello world ```"')
````
`````
```python
print("Some another text")"""

    # The formatter is still expected to close that block correctly.
    expected_output = """<pre><code class="language-markdown">````python
print("hello world ```"')
````
</code></pre>
<pre><code class="language-python">print("Some another text")
</code></pre>"""

    _assert_telegram_format(input_text, expected_output)


def test_hard_level_nested_code_five_fence_plain_text_2():
    # The second (python) block below is deliberately left unclosed.
    input_text = """`````markdown
````python
print("hello world ```"')
`````
```python
print("Some another text")"""

    # The formatter is still expected to close that block correctly.
    expected_output = """<pre><code class="language-markdown">````python
print("hello world ```"')
</code></pre>
<pre><code class="language-python">print("Some another text")
</code></pre>"""

    _assert_telegram_format(input_text, expected_output)


def test_some_new():
    # The outer markdown block is deliberately left unclosed.
    input_text = """
``````markdown
`````
````python
print("hello world ```")
```
"""

    # The formatter is still expected to close it correctly.
    expected_output = """<pre><code class=\"language-markdown\">`````
````python
print("hello world ```")
```
</code></pre>"""
    _assert_telegram_format(input_text, expected_output)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from chatgpt_md_converter.html_splitter import (MIN_LENGTH,
|
|
6
|
+
split_html_for_telegram)
|
|
7
|
+
|
|
8
|
+
from . import html_examples
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_html_splitter():
    """Splitting ``html_examples.input_text`` must yield the reference chunks.

    The chunk count is asserted explicitly: iterating only over the returned
    chunks would let an empty or truncated result pass without any comparison.
    """
    chunks = split_html_for_telegram(html_examples.input_text)
    valid_chunks = [
        html_examples.valid_chunk_1,
        html_examples.valid_chunk_2,
        html_examples.valid_chunk_3,
    ]
    assert len(chunks) == len(valid_chunks), (
        f"expected {len(valid_chunks)} chunks, got {len(chunks)}"
    )
    for expected, chunk in zip(valid_chunks, chunks):
        assert chunk == expected, (
            f"expected: \n\n{expected} \n\n got: \n\n{chunk}"
        )
|
|
22
|
+
|
|
23
|
+
def test_html_splitter__remove_leading_brakes():
    """Same input as ``test_html_splitter`` but with ``trim_empty_leading_lines=True``.

    Only the third reference chunk differs from the untrimmed variant. The
    chunk count is asserted explicitly so that an empty or truncated result
    cannot pass without any comparison being made.
    """
    chunks = split_html_for_telegram(
        html_examples.input_text, trim_empty_leading_lines=True
    )
    valid_chunks = [
        html_examples.valid_chunk_1,
        html_examples.valid_chunk_2,
        html_examples.valid_chunk_3_remove_leading_brakes,
    ]
    assert len(chunks) == len(valid_chunks), (
        f"expected {len(valid_chunks)} chunks, got {len(chunks)}"
    )
    for expected, chunk in zip(valid_chunks, chunks):
        assert chunk == expected, (
            f"expected: \n\n{expected} \n\n got: \n\n{chunk}"
        )
|
|
34
|
+
|
|
35
|
+
def test_html_splitter_max_length_550():
    """Split ``html_examples.long_code_input`` with a 550-character limit.

    Every produced chunk must equal the corresponding reference chunk parsed
    from ``html_examples.expected_550`` and must not exceed 550 characters.
    The chunk count is asserted explicitly so that an empty or truncated
    result cannot pass without any comparison being made.
    """
    chunks = split_html_for_telegram(
        html_examples.long_code_input, max_length=550, trim_empty_leading_lines=True
    )

    def load_expected_chunks_550():
        # The fixture is cut at every "END" marker (plus an optional newline);
        # blank sections are ignored and the first line of each remaining
        # section is dropped (presumably a label line — confirm in html_examples).
        sections = re.split(r"END\n?", html_examples.expected_550)
        return [
            "\n".join(section.splitlines()[1:])
            for section in sections
            if section.strip()
        ]

    valid_chunks = load_expected_chunks_550()
    assert len(chunks) == len(valid_chunks), (
        f"expected {len(valid_chunks)} chunks, got {len(chunks)}"
    )
    for expected, chunk in zip(valid_chunks, chunks):
        assert chunk == expected, (
            f"expected: \n\n{expected} \n\n got: \n\n{chunk}"
        )
        assert len(chunk) <= 550
|
|
56
|
+
|
|
57
|
+
def test_split_html_respects_max_length_by_words():
    """A long bold run of italic words is split into several well-formed chunks.

    Each chunk must fit the limit, be wrapped in <b>...</b>, and contain as
    many opening <i> tags as closing ones.
    """
    limit = 550
    markup = "<b>" + "<i>word</i> " * 100 + "</b>"

    pieces = split_html_for_telegram(markup, max_length=limit)

    assert len(pieces) > 1
    for piece in pieces:
        assert len(piece) <= limit
        assert piece.startswith("<b>")
        assert piece.endswith("</b>")
        opened, closed = piece.count("<i>"), piece.count("</i>")
        assert opened == closed
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_split_html_only_tags_raises():
    """Input made solely of empty <b></b> pairs is expected to raise ValueError."""
    only_tags = "<b></b>" * 200
    with pytest.raises(ValueError):
        split_html_for_telegram(only_tags, max_length=600)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_split_html_min_length_enforced():
    """A ``max_length`` one below ``MIN_LENGTH`` is expected to raise ValueError."""
    too_small = MIN_LENGTH - 1
    with pytest.raises(ValueError):
        split_html_for_telegram("hello", max_length=too_small)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_split_html_long_word_exceeds_limit():
    """A single 600-character word is cut into a 550- and a 50-character chunk."""
    limit, remainder = 550, 50
    long_word = "a" * (limit + remainder)

    chunks = split_html_for_telegram(long_word, max_length=limit)

    assert chunks == ["a" * limit, "a" * remainder]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# 97 space-separated "word"s, the last followed by "!", wrapped in <b><i>...</i></b>.
LONG_TEXT = "<b><i>" + " ".join(["word"] * 97) + "!</i></b>"
|
|
86
|
+
|
|
87
|
+
# Ten space-separated "another"s wrapped in <u>...</u>.
SHORT_TEXT = "<u>" + " ".join(["another"] * 10) + "</u>"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_split_html_keeps_newline_without_trim():
    """With trimming disabled, the second chunk keeps a leading newline.

    The first chunk must be exactly LONG_TEXT; the second must start with a
    newline and, once leading newlines are stripped, be the <u>...</u> text.
    """
    combined = f"{LONG_TEXT}\n\n{SHORT_TEXT}"
    chunks = split_html_for_telegram(
        combined, max_length=500, trim_empty_leading_lines=False
    )

    head = chunks[0]
    assert head == LONG_TEXT

    tail = chunks[1]
    assert tail.startswith("\n")
    assert tail.endswith(SHORT_TEXT)

    stripped_tail = tail.lstrip("\n")
    assert stripped_tail.startswith("<u>")
    assert stripped_tail.endswith("</u>")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_split_html_trims_leading_newline_on_new_chunk():
    """With trimming enabled, the blank lines between the two texts disappear."""
    combined = f"{LONG_TEXT}\n\n{SHORT_TEXT}"
    expected = [LONG_TEXT, SHORT_TEXT]

    chunks = split_html_for_telegram(
        combined, max_length=500, trim_empty_leading_lines=True
    )

    assert chunks == expected
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def ensure_closing_delimiters(text: str) -> str:
    """
    Append a closing delimiter when a backtick delimiter is left unbalanced.

    Triple backticks are checked first; single backticks are then counted on
    the possibly extended text. A delimiter is appended whenever its number of
    occurrences is odd.
    """
    # Order matters: the single-backtick count must see any fence appended
    # by the triple-backtick pass.
    for delimiter in ("```", "`"):
        occurrences = text.count(delimiter)
        if occurrences % 2:
            text = text + delimiter
    return text
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def extract_and_convert_code_blocks(text: str):
    """
    Extract fenced code blocks and replace each with a unique placeholder.

    Unbalanced backtick delimiters are closed first via
    ``ensure_closing_delimiters``. Every triple-backtick block is converted to
    ``<pre><code>`` HTML (with a ``language-...`` class when a language tag
    follows the opening fence) and its content is HTML-escaped.

    Returns:
        A ``(modified_text, code_blocks)`` tuple: the text with each block
        replaced by ``CODEBLOCKPLACEHOLDER<n>`` and a dict mapping each
        placeholder to its HTML.
    """
    text = ensure_closing_delimiters(text)
    code_blocks = {}

    def replacer(match):
        language = match.group(1) or ""
        code_content = match.group(3)

        # Escape "&" first so the entities produced for "<" and ">" are not
        # escaped a second time.
        escaped_content = (
            code_content.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

        # Placeholders are numbered in match order; the dict size is the
        # number of blocks already stored.
        placeholder = f"CODEBLOCKPLACEHOLDER{len(code_blocks)}"
        if not language:
            html_code_block = f"<pre><code>{escaped_content}</code></pre>"
        else:
            html_code_block = (
                f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
            )
        return (placeholder, html_code_block)

    modified_text = text
    for match in re.finditer(r"```(\w*)?(\n)?(.*?)```", text, flags=re.DOTALL):
        placeholder, html_code_block = replacer(match)
        code_blocks[placeholder] = html_code_block
        # Replace only the first remaining occurrence of this exact block text.
        modified_text = modified_text.replace(match.group(0), placeholder, 1)

    return modified_text, code_blocks
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
    """
    Put the stored HTML code blocks back in place of their placeholders.

    Placeholders are handled in the dict's iteration order and only the first
    occurrence of each one is replaced.
    """
    restored = text
    for marker in code_blocks:
        restored = restored.replace(marker, code_blocks[marker], 1)
    return restored
|
|
File without changes
|
{chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/converters.py
RENAMED
|
File without changes
|
{chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/formatters.py
RENAMED
|
File without changes
|
|
File without changes
|
{chatgpt_md_converter-0.3.6 → chatgpt_md_converter-0.3.8}/chatgpt_md_converter/telegram_formatter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|