chatgpt-md-converter 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatgpt_md_converter/__init__.py +2 -1
- chatgpt_md_converter/extractors.py +47 -13
- chatgpt_md_converter/html_splitter.py +239 -0
- {chatgpt_md_converter-0.3.6.dist-info → chatgpt_md_converter-0.3.8.dist-info}/METADATA +2 -2
- chatgpt_md_converter-0.3.8.dist-info/RECORD +12 -0
- {chatgpt_md_converter-0.3.6.dist-info → chatgpt_md_converter-0.3.8.dist-info}/WHEEL +1 -1
- chatgpt_md_converter-0.3.6.dist-info/RECORD +0 -11
- {chatgpt_md_converter-0.3.6.dist-info → chatgpt_md_converter-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {chatgpt_md_converter-0.3.6.dist-info → chatgpt_md_converter-0.3.8.dist-info}/top_level.txt +0 -0
chatgpt_md_converter/__init__.py
CHANGED
|
@@ -2,16 +2,44 @@ import re
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def ensure_closing_delimiters(text: str) -> str:
    """Return ``text`` with unterminated backtick delimiters closed.

    Three passes run in order:

    1. A fenced block left open (a line made only of three or more backticks
       plus an optional language tag, never followed by a line ending with
       the same fence) gets that fence appended on a line of its own.
    2. If the text outside complete fenced blocks holds an odd number of
       triple backticks, a closing triple is appended.
    3. If the text outside complete fenced blocks holds an odd number of
       single backticks, one closing backtick is appended.
    """
    fenced_block = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )

    # Walk the lines remembering the fence that is currently open. While one
    # is open, only a line ending with that very fence closes it, so
    # fence-like strings inside a code block are treated as plain text.
    pending = None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if pending is not None:
            if candidate.endswith(pending):
                pending = None
            continue
        opener = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", candidate)
        if opener:
            pending = opener.group("fence")

    if pending is not None:
        # Close the dangling block on a line of its own.
        separator = "" if text.endswith("\n") else "\n"
        text = text + separator + pending

    # Triple backticks that are not part of a complete fence must pair up.
    if fenced_block.sub("", text).count("```") % 2:
        text += "```"

    # Strip fences again (the text may just have changed) and pair up the
    # remaining single backticks.
    if fenced_block.sub("", text).count("`") % 2:
        text += "`"

    return text
|
|
16
44
|
|
|
17
45
|
|
|
@@ -25,8 +53,8 @@ def extract_and_convert_code_blocks(text: str):
|
|
|
25
53
|
code_blocks = {}
|
|
26
54
|
|
|
27
55
|
def replacer(match):
|
|
28
|
-
language = match.group(
|
|
29
|
-
code_content = match.group(
|
|
56
|
+
language = match.group("lang") if match.group("lang") else ""
|
|
57
|
+
code_content = match.group("code")
|
|
30
58
|
|
|
31
59
|
# Properly escape HTML entities in code content
|
|
32
60
|
escaped_content = (
|
|
@@ -44,8 +72,14 @@ def extract_and_convert_code_blocks(text: str):
|
|
|
44
72
|
return (placeholder, html_code_block)
|
|
45
73
|
|
|
46
74
|
modified_text = text
|
|
47
|
-
|
|
48
|
-
|
|
75
|
+
code_block_pattern = re.compile(
|
|
76
|
+
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
|
|
77
|
+
flags=re.DOTALL,
|
|
78
|
+
)
|
|
79
|
+
for match in code_block_pattern.finditer(text):
|
|
80
|
+
placeholder, html_code_block = replacer(
|
|
81
|
+
match
|
|
82
|
+
)
|
|
49
83
|
code_blocks[placeholder] = html_code_block
|
|
50
84
|
modified_text = modified_text.replace(match.group(0), placeholder, 1)
|
|
51
85
|
|
|
@@ -58,4 +92,4 @@ def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
|
|
|
58
92
|
"""
|
|
59
93
|
for placeholder, html_code_block in code_blocks.items():
|
|
60
94
|
text = text.replace(placeholder, html_code_block, 1)
|
|
61
|
-
return text
|
|
95
|
+
return text
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from html.parser import HTMLParser
|
|
3
|
+
|
|
4
|
+
MAX_LENGTH = 4096
|
|
5
|
+
MIN_LENGTH = 500
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HTMLTagTracker(HTMLParser):
    """Track which formatting tags are still open after parsing some HTML.

    After :meth:`feed`, ``open_tags`` holds one ``(tag, attrs)`` pair for
    every tracked start tag that has not been closed yet, in opening order.
    ``attrs`` is the list of ``(name, value)`` pairs that ``HTMLParser``
    passed to :meth:`handle_starttag`.
    """

    # Only start tags with these names are remembered; others are ignored.
    _TRACKED_TAGS = frozenset({
        "b", "i", "u", "s", "code", "pre", "a", "span", "blockquote",
        "strong", "em", "ins", "strike", "del", "tg-spoiler", "tg-emoji",
    })

    def __init__(self):
        super().__init__()
        self.open_tags = []

    def handle_starttag(self, tag, attrs):
        """Remember ``tag`` (with its attributes) if it is a tracked tag."""
        if tag in self._TRACKED_TAGS:
            self.open_tags.append((tag, attrs))

    def handle_endtag(self, tag):
        """Forget the most recently opened tag named ``tag``, if any."""
        for index in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[index][0] == tag:
                del self.open_tags[index]
                break

    def get_open_tags_html(self):
        """Return markup that re-opens every still-open tag, outermost first.

        Attribute values are re-escaped because ``HTMLParser`` hands them
        over already unescaped; writing them back verbatim would corrupt the
        tag whenever a value contains ``&``, ``<`` or ``"``. Attributes
        without a value (reported as ``None``) are emitted as bare names.
        """
        # Local import so the block works without touching the module's imports.
        from html import escape

        parts = []
        for tag, attrs in self.open_tags:
            rendered = []
            for name, value in attrs or ():
                if value is None:
                    rendered.append(name)
                else:
                    rendered.append(f'{name}="{escape(value, quote=True)}"')
            attr_str = " " + " ".join(rendered) if rendered else ""
            parts.append(f"<{tag}{attr_str}>")
        return "".join(parts)

    def get_closing_tags_html(self):
        """Return markup that closes every still-open tag, innermost first."""
        return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def split_pre_block(pre_block: str, max_length) -> list[str]:
    """Split one ``<pre>`` block into several self-contained ``<pre>`` blocks.

    The block content is cut on line boundaries only. Every returned chunk is
    wrapped in the same opening and closing tags as the input, so each chunk
    is valid on its own.

    Args:
        pre_block (str): Either ``<pre><code ...>...</code></pre>`` (the
            ``<code>`` attributes, e.g. a language class, are preserved on
            every chunk) or a plain ``<pre>...</pre>`` block.
        max_length: Target maximum length of a chunk, wrapper tags included.

    Returns:
        list[str]: The chunks in order. Empty when the block has no content.
        A single line that is longer than ``max_length`` is not cut; it is
        returned as one oversized chunk.
    """
    # language-aware: <pre><code class="language-python">...</code></pre>
    match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
    if match:
        attr, content = match.groups()
        opening, closing = f"<pre><code{attr}>", "</code></pre>"
    else:
        # regular <pre>...</pre>: strip the 5-char opener and 6-char closer
        content = pre_block[5:-6]
        opening, closing = "<pre>", "</pre>"

    overhead = len(opening) + len(closing)
    chunks: list[str] = []
    pending: list[str] = []
    pending_len = 0
    for line in content.splitlines(keepends=True):
        # Flush only when something is buffered; otherwise an over-long first
        # line would produce an empty wrapper chunk.
        if pending and pending_len + len(line) + overhead > max_length:
            chunks.append(opening + "".join(pending) + closing)
            pending, pending_len = [], 0
        pending.append(line)
        pending_len += len(line)
    if pending:
        chunks.append(opening + "".join(pending) + closing)
    return chunks
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _is_only_tags(block: str) -> bool:
|
|
87
|
+
return bool(re.fullmatch(r'(?:\s*<[^>]+>\s*)+', block))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _effective_length(content: str) -> int:
    """Return the length budget ``content`` needs once its tags are balanced.

    The result is ``len(content)`` plus the length of the markup that would
    re-open the tags still open after parsing ``content``, plus the length of
    the markup that would close them.
    """
    parser = HTMLTagTracker()
    parser.feed(content)
    reopening_cost = len(parser.get_open_tags_html())
    closing_cost = len(parser.get_closing_tags_html())
    return reopening_cost + len(content) + closing_cost
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def split_html_for_telegram(text: str, trim_empty_leading_lines: bool = False, max_length: int = MAX_LENGTH) -> list[str]:
    """Split long HTML-formatted text into Telegram-compatible chunks.

    ``<pre>`` blocks are split on line boundaries by ``split_pre_block``.
    Other text is split on blank lines, ``<br>`` tags and newlines first,
    then between tags and whitespace-separated words, and only as a last
    resort inside a word. Tags still open at a cut are closed at the end of
    the chunk and re-opened at the start of the next one.

    Parameters
    ----------
    text: str
        Input HTML text.
    trim_empty_leading_lines: bool, optional
        If True, removes ``\\n`` symbols from the start of every chunk except
        the first one.
    max_length: int, optional
        Maximum allowed length for a single chunk (must be >= ``MIN_LENGTH = 500``).
        Default = 4096 (symbols)

    Returns
    -------
    list[str]
        List of HTML chunks.

    Raises
    ------
    ValueError
        If ``max_length`` is below ``MIN_LENGTH``, if a piece longer than
        ``max_length`` consists only of tags, or if no part of a piece can be
        fitted into an empty chunk.
    """

    if max_length < MIN_LENGTH:
        raise ValueError("max_length should be at least %d" % MIN_LENGTH)

    pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
    parts = pattern.split(text)

    chunks: list[str] = []
    prefix = ""   # markup re-opening the tags carried over from the previous chunk
    current = ""  # body of the chunk being built (without ``prefix``)
    # NOTE: this used to be r"(\\s+)", which matches a literal backslash
    # followed by "s" and therefore never split on whitespace.
    whitespace_re = re.compile(r"(\s+)")
    tag_re = re.compile(r"(<[^>]+>)")

    def finalize():
        """Close the open tags, emit the chunk and carry the tags over."""
        nonlocal current, prefix
        tracker = HTMLTagTracker()
        tracker.feed(prefix + current)
        chunk = prefix + current + tracker.get_closing_tags_html()
        chunks.append(chunk)
        prefix = tracker.get_open_tags_html()
        current = ""

    def split_into_units(chunk: str) -> list[str] | None:
        """Split ``chunk`` into tags, words and whitespace runs.

        Tags are kept whole so that whitespace inside a tag's attributes is
        never used as a split point. Returns None when there is nothing to
        split (at most one unit).
        """
        units: list[str] = []
        # With one capturing group, odd indices of the split are the tags.
        for index, segment in enumerate(tag_re.split(chunk)):
            if not segment:
                continue
            if index % 2:
                units.append(segment)
            else:
                units.extend(unit for unit in whitespace_re.split(segment) if unit)
        return units if len(units) > 1 else None

    def fittable_prefix_length(chunk: str) -> int:
        """Binary-search the longest prefix of ``chunk`` fitting the current chunk."""
        low, high = 1, len(chunk)
        best = 0
        while low <= high:
            mid = (low + high) // 2
            if _effective_length(prefix + current + chunk[:mid]) <= max_length:
                best = mid
                low = mid + 1
            else:
                high = mid - 1
        return best

    def append_piece(piece: str):
        """Add ``piece`` to the current chunk, starting new chunks as needed."""
        nonlocal current, prefix

        while piece:
            if _effective_length(prefix + current + piece) <= max_length:
                current += piece
                return

            oversized = len(piece) > max_length
            if oversized and _is_only_tags(piece):
                raise ValueError("block contains only html tags")
            if not oversized and current:
                # The piece may fit once it gets a chunk of its own.
                finalize()
                continue

            units = split_into_units(piece)
            if units:
                for unit in units:
                    append_piece(unit)
                return

            # Atomic piece (a single word or tag): cut it where it fits.
            fitted = fittable_prefix_length(piece)
            if fitted == 0:
                if current:
                    finalize()
                    continue
                raise ValueError("unable to split content within max_length")

            current += piece[:fitted]
            piece = piece[fitted:]
            if piece:
                finalize()

    for part in parts:
        if not part:
            continue
        if part.startswith("<pre>"):
            for pre_chunk in split_pre_block(part, max_length=max_length):
                append_piece(pre_chunk)
            continue
        for block in re.split(r"(\n\s*\n|<br\s*/?>|\n)", part):
            if block:
                append_piece(block)

    if current:
        finalize()

    # Greedily glue neighbouring chunks back together while they still fit.
    merged: list[str] = []
    buf = ""
    for chunk in chunks:
        if len(buf) + len(chunk) <= max_length:
            buf += chunk
        else:
            if buf:
                merged.append(buf)
            buf = chunk.lstrip("\n") if trim_empty_leading_lines and merged else chunk
    if buf:
        merged.append(buf.lstrip("\n") if trim_empty_leading_lines and merged else buf)

    return merged
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.8
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
7
7
|
Author-email: latand666@gmail.com
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
chatgpt_md_converter/__init__.py,sha256=HF8fLq9o1A4HMDjPWCQ43NSby_L29Zgd4S_g3ORyyCA,157
|
|
2
|
+
chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
|
|
3
|
+
chatgpt_md_converter/extractors.py,sha256=k1oRlocn0K4OyU3-k2mrhKanKNdU-664t1CTcf8hYdE,3212
|
|
4
|
+
chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
|
|
5
|
+
chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
|
|
6
|
+
chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
|
|
7
|
+
chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
|
|
8
|
+
chatgpt_md_converter-0.3.8.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
9
|
+
chatgpt_md_converter-0.3.8.dist-info/METADATA,sha256=ngfuia4mAfiHBySgX_hKii8ty1O9hOkCotqX9Fzidm4,5792
|
|
10
|
+
chatgpt_md_converter-0.3.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
chatgpt_md_converter-0.3.8.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
12
|
+
chatgpt_md_converter-0.3.8.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
|
|
2
|
-
chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
|
|
3
|
-
chatgpt_md_converter/extractors.py,sha256=WU38iAG-MANmilqR73gAvxqqXvx4JT8q3xrac_GRXGI,2071
|
|
4
|
-
chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
|
|
5
|
-
chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
|
|
6
|
-
chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
|
|
7
|
-
chatgpt_md_converter-0.3.6.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
8
|
-
chatgpt_md_converter-0.3.6.dist-info/METADATA,sha256=_cmiJutFIaPt17LD9VOf650BzsPjaBIkD-VUSYpVtJM,5785
|
|
9
|
-
chatgpt_md_converter-0.3.6.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
|
10
|
-
chatgpt_md_converter-0.3.6.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
11
|
-
chatgpt_md_converter-0.3.6.dist-info/RECORD,,
|
{chatgpt_md_converter-0.3.6.dist-info → chatgpt_md_converter-0.3.8.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|