chatgpt-md-converter 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatgpt_md_converter/extractors.py +45 -12
- chatgpt_md_converter/html_splitter.py +114 -0
- {chatgpt_md_converter-0.3.5.dist-info → chatgpt_md_converter-0.3.7.dist-info}/METADATA +2 -2
- chatgpt_md_converter-0.3.7.dist-info/RECORD +12 -0
- {chatgpt_md_converter-0.3.5.dist-info → chatgpt_md_converter-0.3.7.dist-info}/WHEEL +1 -1
- chatgpt_md_converter-0.3.5.dist-info/RECORD +0 -11
- {chatgpt_md_converter-0.3.5.dist-info → chatgpt_md_converter-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {chatgpt_md_converter-0.3.5.dist-info → chatgpt_md_converter-0.3.7.dist-info}/top_level.txt +0 -0
|
@@ -2,16 +2,43 @@ import re
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
def ensure_closing_delimiters(text: str) -> str:
    """Append missing closing backtick delimiters.

    Three checks run in order, each on the text with complete fenced code
    blocks removed so that backticks inside those blocks are not counted:

    1. a fence (three or more backticks, optionally followed by a language
       tag) standing alone on a line and never closed gets a matching fence
       appended on a new line;
    2. an odd number of remaining triple-backtick sequences gets a closing
       triple backtick appended;
    3. an odd number of remaining single backticks gets one backtick appended.

    Args:
        text: Markdown text that may contain unterminated code delimiters.

    Returns:
        The text with the missing closing delimiters appended; the input is
        returned unchanged when nothing is missing.
    """
    # A complete fenced block: an opening run of 3+ backticks, an optional
    # language tag, then the shortest span that ends with the same run.
    # [\s\S] already matches newlines, so no DOTALL flag is required.
    code_block_re = re.compile(r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?P=fence)")
    # A line that consists solely of a fence plus an optional language tag.
    fence_line_re = re.compile(r"^(?P<fence>`{3,})(?P<lang>\w+)?$")

    # Remove complete code blocks from consideration so inner backticks
    # don't affect delimiter balancing.
    cleaned = code_block_re.sub("", text)

    # Detect unclosed fences by tracking opening fence lengths.
    stack = []
    for line in cleaned.splitlines():
        fence_match = fence_line_re.match(line.strip())
        if not fence_match:
            continue
        fence = fence_match.group("fence")
        if stack and fence == stack[-1]:
            stack.pop()
        else:
            stack.append(fence)

    if stack:
        text += "\n" + stack[-1]
        # The appended fence may have completed a block; strip again.
        cleaned = code_block_re.sub("", text)

    # Balance triple backticks that are not part of a complete fence.
    if cleaned.count("```") % 2 != 0:
        text += "```"
        cleaned = code_block_re.sub("", text)

    # Balance single backticks outside fenced blocks.
    if cleaned.count("`") % 2 != 0:
        text += "`"

    return text
|
|
16
43
|
|
|
17
44
|
|
|
@@ -25,8 +52,8 @@ def extract_and_convert_code_blocks(text: str):
|
|
|
25
52
|
code_blocks = {}
|
|
26
53
|
|
|
27
54
|
def replacer(match):
|
|
28
|
-
language = match.group(
|
|
29
|
-
code_content = match.group(
|
|
55
|
+
language = match.group("lang") if match.group("lang") else ""
|
|
56
|
+
code_content = match.group("code")
|
|
30
57
|
|
|
31
58
|
# Properly escape HTML entities in code content
|
|
32
59
|
escaped_content = (
|
|
@@ -44,8 +71,14 @@ def extract_and_convert_code_blocks(text: str):
|
|
|
44
71
|
return (placeholder, html_code_block)
|
|
45
72
|
|
|
46
73
|
modified_text = text
|
|
47
|
-
|
|
48
|
-
|
|
74
|
+
code_block_pattern = re.compile(
|
|
75
|
+
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
|
|
76
|
+
flags=re.DOTALL,
|
|
77
|
+
)
|
|
78
|
+
for match in code_block_pattern.finditer(text):
|
|
79
|
+
placeholder, html_code_block = replacer(
|
|
80
|
+
match
|
|
81
|
+
)
|
|
49
82
|
code_blocks[placeholder] = html_code_block
|
|
50
83
|
modified_text = modified_text.replace(match.group(0), placeholder, 1)
|
|
51
84
|
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from html.parser import HTMLParser
|
|
3
|
+
|
|
4
|
+
MAX_LENGTH = 4096
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class HTMLTagTracker(HTMLParser):
    """Record which formatting tags are still open after parsing a fragment.

    Feed HTML with ``feed()``; afterwards ``open_tags`` lists the tracked
    tags that were opened but not closed, in opening order. The two
    ``get_*_html`` helpers render those tags as re-opening and closing markup.
    """

    # Only these tags are tracked; start tags of any other element are ignored.
    TRACKED_TAGS = frozenset(
        ("b", "i", "u", "s", "code", "pre", "a", "span", "blockquote")
    )

    def __init__(self):
        super().__init__()
        # (tag, attrs) pairs for tracked tags that are currently open,
        # oldest first. attrs is the list of (name, value) pairs that
        # HTMLParser passed to handle_starttag.
        self.open_tags = []

    def handle_starttag(self, tag, attrs):
        """Remember a tracked tag as open."""
        if tag in self.TRACKED_TAGS:
            self.open_tags.append((tag, attrs))

    def handle_endtag(self, tag):
        """Forget the most recently opened tag with this name, if any."""
        for index in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[index][0] == tag:
                del self.open_tags[index]
                break

    def get_open_tags_html(self):
        """Return markup that re-opens every currently open tag, oldest first."""
        # Imported here because the module only imports html.parser.
        from html import escape

        parts = []
        for tag, attrs in self.open_tags:
            # HTMLParser hands over attribute values already unescaped, so
            # they must be escaped again to produce valid markup. A value of
            # None means the attribute was written without a value.
            rendered = "".join(
                f" {name}"
                if value is None
                else f' {name}="{escape(value, quote=True)}"'
                for name, value in attrs
            )
            parts.append(f"<{tag}{rendered}>")
        return "".join(parts)

    def get_closing_tags_html(self):
        """Return closing tags for every open tag, innermost first."""
        return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _split_long_line(line: str, budget: int) -> list[str]:
    """Cut *line* into pieces of at most *budget* characters each."""
    pieces = []
    start = 0
    while len(line) - start > budget:
        end = start + budget
        # Do not cut through an entity such as "&amp;": when the last "&"
        # before the cut has no terminating ";" yet, cut in front of it.
        amp = line.rfind("&", start, end)
        if amp > start and ";" not in line[amp:end]:
            end = amp
        pieces.append(line[start:end])
        start = end
    pieces.append(line[start:])
    return pieces


def split_pre_block(pre_block: str, max_length: "int | None" = None) -> list[str]:
    """Split one ``<pre>`` block into several self-contained ``<pre>`` blocks.

    The content is divided on line boundaries and every piece is wrapped in
    the same tags as the input, so each returned chunk is valid on its own.
    A ``<pre><code ...>`` wrapper keeps its attributes (for example the
    ``class="language-..."`` marker) on every chunk.

    Args:
        pre_block: Text of the form ``<pre>...</pre>`` or
            ``<pre><code ...>...</code></pre>``.
        max_length: Maximum length of a returned chunk including its
            wrapping tags. Defaults to the module-level ``MAX_LENGTH``.

    Returns:
        The chunks in document order; an empty list when the block has no
        content.
    """
    limit = MAX_LENGTH if max_length is None else max_length

    # language-aware: <pre><code class="language-python">...</code></pre>
    match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
    if match:
        attr, content = match.groups()
        opening, closing = f"<pre><code{attr}>", "</code></pre>"
    else:
        # regular <pre>...</pre>
        content = pre_block[5:-6]
        opening, closing = "<pre>", "</pre>"

    # Characters left for content once the wrapper, attributes included, is
    # paid for. Never below 1 so that splitting always makes progress.
    budget = max(1, limit - len(opening) - len(closing))

    chunks = []
    buf = ""
    for line in content.splitlines(keepends=True):
        # A single line longer than the budget is cut into smaller pieces.
        for piece in _split_long_line(line, budget):
            # Only flush a non-empty buffer, so no empty chunk is produced.
            if buf and len(buf) + len(piece) > budget:
                chunks.append(f"{opening}{buf}{closing}")
                buf = ""
            buf += piece
    if buf:
        chunks.append(f"{opening}{buf}{closing}")
    return chunks
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def split_html_for_telegram(text: str) -> list[str]:
    """Split HTML into chunks of at most ``MAX_LENGTH`` characters.

    ``<pre>`` blocks are divided with ``split_pre_block``; all other markup
    is divided at blank lines, ``<br>`` tags and newlines. The resulting
    pieces are packed greedily into chunks. When a chunk boundary falls
    inside a tracked formatting tag, the chunk is ended with the matching
    closing tags and the following chunk starts by re-opening them, so every
    chunk is balanced on its own. Markup that fits into one chunk is returned
    without added tags.

    Regular-HTML parts between ``<pre>`` blocks that contain only whitespace
    are dropped, as are chunks whose body is only whitespace.

    A single piece that has no split point and is longer than the limit
    (or a full-size ``<pre>`` chunk nested inside open tags) is emitted as
    is and may exceed ``MAX_LENGTH``.

    Args:
        text: HTML text to split.

    Returns:
        The chunks in document order; an empty list when there is nothing
        but whitespace.
    """
    # "<pre><code ...>" blocks also start with "<pre>", so one alternative
    # covers both forms.
    pre_pattern = re.compile(r"(<pre>.*?</pre>)", re.DOTALL)
    block_pattern = re.compile(r"(\n\s*\n|<br\s*/?>|\n)")

    def track(open_tags, fragment):
        # Tracker seeded with the given open tags and advanced over fragment.
        tracker = HTMLTagTracker()
        tracker.open_tags = list(open_tags)
        if fragment:
            tracker.feed(fragment)
        return tracker

    # Step 1: cut the text into pieces that must not be divided further.
    pieces = []
    for part in pre_pattern.split(text):
        if not part.strip():
            continue
        # Requiring the closing tag keeps an unterminated "<pre>" out of
        # split_pre_block, which assumes a complete block.
        if part.startswith("<pre>") and part.endswith("</pre>"):
            pieces.extend(split_pre_block(part))
        else:
            pieces.extend(block for block in block_pattern.split(part) if block)

    # Step 2: pack pieces into chunks, balancing tags at each boundary.
    chunks = []
    body = ""
    at_start = track([], "")  # tags open where the current chunk begins
    at_end = at_start  # tags open after the current chunk body
    for piece in pieces:
        after_piece = track(at_end.open_tags, piece)
        projected = (
            len(at_start.get_open_tags_html())
            + len(body)
            + len(piece)
            + len(after_piece.get_closing_tags_html())
        )
        if body and projected > MAX_LENGTH:
            if body.strip():
                chunks.append(
                    at_start.get_open_tags_html()
                    + body
                    + at_end.get_closing_tags_html()
                )
            # The next chunk re-opens whatever was still open here.
            at_start = at_end
            body = piece
        else:
            body += piece
        at_end = after_piece

    if body.strip():
        chunks.append(
            at_start.get_open_tags_html() + body + at_end.get_closing_tags_html()
        )

    return chunks
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.7
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
7
7
|
Author-email: latand666@gmail.com
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
|
|
2
|
+
chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
|
|
3
|
+
chatgpt_md_converter/extractors.py,sha256=uThH9vnjlEwZowCbxvcZreMZUPqUEiuq0nbWva3K-CE,3023
|
|
4
|
+
chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
|
|
5
|
+
chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
|
|
6
|
+
chatgpt_md_converter/html_splitter.py,sha256=8ao4QU5PFDFCHMg8pj5kBqmxSOUO6RfzqQfk4o1F8ms,3897
|
|
7
|
+
chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
|
|
8
|
+
chatgpt_md_converter-0.3.7.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
9
|
+
chatgpt_md_converter-0.3.7.dist-info/METADATA,sha256=4gweCWqlv3a6pR6FJbf-ycCEToIjCRf2Ohnk5p81bwQ,5792
|
|
10
|
+
chatgpt_md_converter-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
+
chatgpt_md_converter-0.3.7.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
12
|
+
chatgpt_md_converter-0.3.7.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
|
|
2
|
-
chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
|
|
3
|
-
chatgpt_md_converter/extractors.py,sha256=WU38iAG-MANmilqR73gAvxqqXvx4JT8q3xrac_GRXGI,2071
|
|
4
|
-
chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
|
|
5
|
-
chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
|
|
6
|
-
chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
|
|
7
|
-
chatgpt_md_converter-0.3.5.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
8
|
-
chatgpt_md_converter-0.3.5.dist-info/METADATA,sha256=ly_GTnX933MbdUCZKKQAn3kQrICrRiCWvgaJ1dBgWCc,5785
|
|
9
|
-
chatgpt_md_converter-0.3.5.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
|
|
10
|
-
chatgpt_md_converter-0.3.5.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
11
|
-
chatgpt_md_converter-0.3.5.dist-info/RECORD,,
|
{chatgpt_md_converter-0.3.5.dist-info → chatgpt_md_converter-0.3.7.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|