chatgpt-md-converter 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
1
  from .telegram_formatter import telegram_format
2
+ from .html_splitter import split_html_for_telegram
2
3
 
3
- __all__ = ["telegram_format"]
4
+ __all__ = ["telegram_format", "split_html_for_telegram"]
@@ -2,16 +2,44 @@ import re
2
2
 
3
3
 
4
4
def ensure_closing_delimiters(text: str) -> str:
    """Append whatever backtick delimiters are needed to balance *text*.

    Three passes are made, in this order:

    1. Lines are scanned for a fence opener (three or more backticks,
       optionally followed by a language tag, alone on a line).  While a
       fence is open, only a line ending with that exact fence closes it;
       anything else is treated as code.  If a fence is still open at the
       end, a matching fence is appended on its own line.
    2. With complete fenced blocks removed, an odd number of triple
       backticks gets one more triple backtick appended.
    3. With complete fenced blocks removed again, an odd number of single
       backticks gets one more backtick appended.

    Returns the (possibly extended) text.
    """
    fenced_block_re = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )
    fence_opener_re = re.compile(r"^(?P<fence>`{3,})(?P<lang>\w+)?$")

    # Pass 1: line-based fence tracking.
    pending_fence = None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if pending_fence is not None:
            # Inside a block: only the same fence string can close it.
            if candidate.endswith(pending_fence):
                pending_fence = None
            continue
        opener = fence_opener_re.match(candidate)
        if opener:
            pending_fence = opener.group("fence")

    if pending_fence is not None:
        separator = "" if text.endswith("\n") else "\n"
        text = f"{text}{separator}{pending_fence}"

    # Pass 2: stray triple backticks outside complete fenced blocks.
    if fenced_block_re.sub("", text).count("```") % 2:
        text += "```"

    # Pass 3: stray single backticks; recomputed because pass 2 may have
    # changed the text.
    if fenced_block_re.sub("", text).count("`") % 2:
        text += "`"

    return text
16
44
 
17
45
 
@@ -25,8 +53,8 @@ def extract_and_convert_code_blocks(text: str):
25
53
  code_blocks = {}
26
54
 
27
55
  def replacer(match):
28
- language = match.group(1) if match.group(1) else ""
29
- code_content = match.group(3)
56
+ language = match.group("lang") if match.group("lang") else ""
57
+ code_content = match.group("code")
30
58
 
31
59
  # Properly escape HTML entities in code content
32
60
  escaped_content = (
@@ -44,8 +72,14 @@ def extract_and_convert_code_blocks(text: str):
44
72
  return (placeholder, html_code_block)
45
73
 
46
74
  modified_text = text
47
- for match in re.finditer(r"```(\w*)?(\n)?(.*?)```", text, flags=re.DOTALL):
48
- placeholder, html_code_block = replacer(match)
75
+ code_block_pattern = re.compile(
76
+ r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
77
+ flags=re.DOTALL,
78
+ )
79
+ for match in code_block_pattern.finditer(text):
80
+ placeholder, html_code_block = replacer(
81
+ match
82
+ )
49
83
  code_blocks[placeholder] = html_code_block
50
84
  modified_text = modified_text.replace(match.group(0), placeholder, 1)
51
85
 
@@ -58,4 +92,4 @@ def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
58
92
  """
59
93
  for placeholder, html_code_block in code_blocks.items():
60
94
  text = text.replace(placeholder, html_code_block, 1)
61
- return text
95
+ return text
@@ -0,0 +1,239 @@
1
+ import re
2
+ from html.parser import HTMLParser
3
+
4
+ MAX_LENGTH = 4096
5
+ MIN_LENGTH = 500
6
+
7
+
8
class HTMLTagTracker(HTMLParser):
    """Track which formatting tags are still open after feeding HTML.

    Only tags listed in ``_TRACKED_TAGS`` are recorded; every other start
    tag is ignored.  ``open_tags`` holds ``(tag, attrs)`` tuples in opening
    order, where ``attrs`` is the list of ``(name, value)`` pairs received
    from ``HTMLParser``.
    """

    _TRACKED_TAGS = frozenset({
        "b", "i", "u", "s", "code", "pre", "a", "span", "blockquote",
        "strong", "em", "ins", "strike", "del", "tg-spoiler", "tg-emoji",
    })

    def __init__(self):
        super().__init__()
        self.open_tags = []

    def handle_starttag(self, tag, attrs):
        """Remember a tracked tag together with its attributes."""
        if tag in self._TRACKED_TAGS:
            self.open_tags.append((tag, attrs))

    def handle_endtag(self, tag):
        """Drop the most recently opened tag with the same name, if any."""
        for i in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[i][0] == tag:
                del self.open_tags[i]
                break

    def get_open_tags_html(self):
        """Return markup that re-opens every currently open tag, in order."""
        # Local import keeps this class self-contained; the module only
        # imports ``html.parser`` at the top.
        from html import escape

        parts = []
        for tag, attrs in self.open_tags:
            rendered = []
            for name, value in attrs:
                if value is None:
                    # Valueless attribute (e.g. ``<blockquote expandable>``):
                    # HTMLParser reports its value as None.
                    rendered.append(name)
                else:
                    # HTMLParser hands over attribute values already
                    # unescaped, so they must be escaped again to survive
                    # characters such as ``&`` or ``"``.
                    rendered.append(f'{name}="{escape(value, quote=True)}"')
            attr_str = " " + " ".join(rendered) if rendered else ""
            parts.append(f"<{tag}{attr_str}>")
        return "".join(parts)

    def get_closing_tags_html(self):
        """Return markup that closes every open tag, innermost first."""
        return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
38
+
39
+
40
def split_pre_block(pre_block: str, max_length: int) -> list[str]:
    """Split one ``<pre>`` block into smaller, individually wrapped blocks.

    The inner content is divided on line boundaries and every piece is
    wrapped in the same markup as the input: ``<pre><code ...>`` /
    ``</code></pre>`` when the block matches that form (the ``<code>``
    attributes, e.g. a language class, are repeated on each piece), or plain
    ``<pre>`` / ``</pre>`` otherwise.

    Args:
        pre_block: A string that starts with ``<pre>`` and ends with
            ``</pre>``.
        max_length: Upper bound for the length of a chunk, wrapper markup
            included.

    Returns:
        The wrapped chunks in order.  A block with empty content yields an
        empty list.  Lines are never broken, so a single line longer than
        the budget is returned as one chunk that exceeds ``max_length``.
    """
    # language-aware: <pre><code class="language-python">...</code></pre>
    match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
    if match:
        attr, content = match.groups()
        opening, closing = f"<pre><code{attr}>", "</code></pre>"
    else:
        # regular <pre>...</pre>: strip the 5-char opener and 6-char closer
        content = pre_block[5:-6]
        opening, closing = "<pre>", "</pre>"

    overhead = len(opening) + len(closing)
    chunks, buf = [], ""
    for line in content.splitlines(keepends=True):
        # Flush only when something is buffered; otherwise an over-long
        # first line would produce an empty wrapper chunk.
        if buf and len(buf) + len(line) + overhead > max_length:
            chunks.append(f"{opening}{buf}{closing}")
            buf = ""
        buf += line
    if buf:
        chunks.append(f"{opening}{buf}{closing}")
    return chunks
84
+
85
+
86
+ def _is_only_tags(block: str) -> bool:
87
+ return bool(re.fullmatch(r'(?:\s*<[^>]+>\s*)+', block))
88
+
89
+
90
def _effective_length(content: str) -> int:
    """Return the length of *content* plus markup for its unclosed tags.

    The result is the sum of three lengths: the markup that would re-open
    every tag still open at the end of *content*, *content* itself, and the
    markup that would close those tags.
    """
    tag_state = HTMLTagTracker()
    tag_state.feed(content)
    reopen_markup = tag_state.get_open_tags_html()
    close_markup = tag_state.get_closing_tags_html()
    return len(reopen_markup) + len(content) + len(close_markup)
94
+
95
+
96
def split_html_for_telegram(text: str, trim_empty_leading_lines: bool = False, max_length: int = MAX_LENGTH) -> list[str]:
    """Split long HTML-formatted text into Telegram-compatible chunks.

    ``<pre>`` blocks are split line by line through ``split_pre_block``;
    the remaining text is split on blank lines, ``<br>`` tags and newlines.
    Pieces that still do not fit are divided on whitespace, then on tag
    boundaries, and finally at the longest prefix that fits.  Tags left open
    at the end of a chunk are closed there and re-opened at the start of the
    next one.

    Parameters
    ----------
    text: str
        Input HTML text.
    trim_empty_leading_lines: bool, optional
        If True, leading newline characters are stripped from every chunk
        except the first.
    max_length: int, optional
        Maximum allowed length for a single chunk (must be >= ``MIN_LENGTH``).
        Defaults to ``MAX_LENGTH``.

    Returns
    -------
    list[str]
        List of HTML chunks.

    Raises
    ------
    ValueError
        If ``max_length`` is below ``MIN_LENGTH``, if an over-long piece
        consists only of HTML tags, or if no part of a piece can be fitted
        into an empty chunk.
    """

    if max_length < MIN_LENGTH:
        raise ValueError("max_length should be at least %d" % MIN_LENGTH)

    pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
    parts = pattern.split(text)

    chunks: list[str] = []
    prefix = ""
    current = ""
    # Whitespace runs, kept as separate items by the capturing group.  The
    # lookahead rejects whitespace that sits inside a tag (followed by ">"
    # with no "<" or ">" in between), so a tag is never cut in two.
    whitespace_re = re.compile(r"(\s+)(?![^<>]*>)")
    tag_re = re.compile(r"(<[^>]+>)")

    def finalize():
        """Close the open tags, emit the chunk, and carry the tags over."""
        nonlocal current, prefix
        tracker = HTMLTagTracker()
        tracker.feed(prefix + current)
        chunk = prefix + current + tracker.get_closing_tags_html()
        chunks.append(chunk)
        # The next chunk starts by re-opening whatever was still open here.
        prefix = tracker.get_open_tags_html()
        current = ""

    def append_piece(piece: str):
        """Add *piece* to the current chunk, splitting it when necessary."""
        nonlocal current, prefix

        def split_on_whitespace(chunk: str) -> list[str] | None:
            found = [item for item in whitespace_re.split(chunk) if item]
            return found if len(found) > 1 else None

        def split_on_tags(chunk: str) -> list[str] | None:
            found = [item for item in tag_re.split(chunk) if item]
            return found if len(found) > 1 else None

        def fittable_prefix_length(chunk: str) -> int:
            # Binary search for the longest prefix of *chunk* that still
            # fits next to what is already buffered.
            low, high = 1, len(chunk)
            best = 0
            while low <= high:
                mid = (low + high) // 2
                if _effective_length(prefix + current + chunk[:mid]) <= max_length:
                    best = mid
                    low = mid + 1
                else:
                    high = mid - 1
            return best

        while piece:
            if _effective_length(prefix + current + piece) <= max_length:
                current += piece
                return

            oversized = len(piece) > max_length
            if oversized and _is_only_tags(piece):
                raise ValueError("block contains only html tags")

            if not oversized and current:
                # The piece would fit into a fresh chunk: flush and retry.
                finalize()
                continue

            # Prefer natural boundaries: whitespace first, then tags.
            sub_pieces = split_on_whitespace(piece) or split_on_tags(piece)
            if sub_pieces:
                for sub_piece in sub_pieces:
                    append_piece(sub_piece)
                return

            # No boundary available: cut at the longest prefix that fits.
            fitted = fittable_prefix_length(piece)
            if fitted == 0:
                if current:
                    finalize()
                    continue
                raise ValueError("unable to split content within max_length")

            current += piece[:fitted]
            piece = piece[fitted:]

            if piece:
                finalize()

    for part in parts:
        if not part:
            continue
        if part.startswith("<pre>"):
            for pre_chunk in split_pre_block(part, max_length=max_length):
                append_piece(pre_chunk)
            continue
        for block in re.split(r"(\n\s*\n|<br\s*/?>|\n)", part):
            if block:
                append_piece(block)

    if current:
        finalize()

    # Re-join neighbouring chunks when their combined length still fits.
    merged: list[str] = []
    buf = ""
    for chunk in chunks:
        if len(buf) + len(chunk) <= max_length:
            buf += chunk
        else:
            if buf:
                merged.append(buf)
            buf = chunk.lstrip("\n") if trim_empty_leading_lines and merged else chunk
    if buf:
        merged.append(buf.lstrip("\n") if trim_empty_leading_lines and merged else buf)

    return merged
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.6
3
+ Version: 0.3.8
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
- Home-page: https://github.com/Latand/formatter-chatgpt-telegram
5
+ Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
7
7
  Author-email: latand666@gmail.com
8
8
  Classifier: Programming Language :: Python :: 3
@@ -0,0 +1,12 @@
1
+ chatgpt_md_converter/__init__.py,sha256=HF8fLq9o1A4HMDjPWCQ43NSby_L29Zgd4S_g3ORyyCA,157
2
+ chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
+ chatgpt_md_converter/extractors.py,sha256=k1oRlocn0K4OyU3-k2mrhKanKNdU-664t1CTcf8hYdE,3212
4
+ chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
+ chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
+ chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
7
+ chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
8
+ chatgpt_md_converter-0.3.8.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
9
+ chatgpt_md_converter-0.3.8.dist-info/METADATA,sha256=ngfuia4mAfiHBySgX_hKii8ty1O9hOkCotqX9Fzidm4,5792
10
+ chatgpt_md_converter-0.3.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ chatgpt_md_converter-0.3.8.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
12
+ chatgpt_md_converter-0.3.8.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.0.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +0,0 @@
1
- chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
- chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
- chatgpt_md_converter/extractors.py,sha256=WU38iAG-MANmilqR73gAvxqqXvx4JT8q3xrac_GRXGI,2071
4
- chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
- chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
- chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
7
- chatgpt_md_converter-0.3.6.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
8
- chatgpt_md_converter-0.3.6.dist-info/METADATA,sha256=_cmiJutFIaPt17LD9VOf650BzsPjaBIkD-VUSYpVtJM,5785
9
- chatgpt_md_converter-0.3.6.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
10
- chatgpt_md_converter-0.3.6.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
11
- chatgpt_md_converter-0.3.6.dist-info/RECORD,,