chatgpt-md-converter 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
1
  from .telegram_formatter import telegram_format
2
+ from .html_splitter import split_html_for_telegram
2
3
 
3
- __all__ = ["telegram_format"]
4
+ __all__ = ["telegram_format", "split_html_for_telegram"]
@@ -2,31 +2,32 @@ import re
2
2
 
3
3
 
4
4
  def ensure_closing_delimiters(text: str) -> str:
5
- """Append missing closing backtick delimiters."""
5
+ # Append missing closing backtick delimiters.
6
6
 
7
7
  code_block_re = re.compile(
8
8
  r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
9
9
  flags=re.DOTALL,
10
10
  )
11
11
 
12
- # Remove complete code blocks from consideration so inner backticks
13
- # don't affect delimiter balancing.
14
- cleaned = code_block_re.sub("", text)
15
-
16
- # Detect unclosed fences by tracking opening fence lengths.
17
- stack = []
18
- for line in cleaned.splitlines():
19
- m = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", line.strip())
20
- if not m:
21
- continue
22
- fence = m.group("fence")
23
- if stack and fence == stack[-1]:
24
- stack.pop()
12
+ # Track an open fence. Once a fence is opened, everything until the same
13
+ # fence is encountered again is treated as plain text. This mimics how
14
+ # Markdown handles fences and allows fence-like strings inside code blocks.
15
+ open_fence = None
16
+ for line in text.splitlines():
17
+ stripped = line.strip()
18
+ if open_fence is None:
19
+ m = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
20
+ if m:
21
+ open_fence = m.group("fence")
25
22
  else:
26
- stack.append(fence)
23
+ if stripped.endswith(open_fence):
24
+ open_fence = None
27
25
 
28
- if stack:
29
- text += "\n" + stack[-1]
26
+ # If a fence was left open, append a matching closing fence.
27
+ if open_fence is not None:
28
+ if not text.endswith("\n"):
29
+ text += "\n"
30
+ text += open_fence
30
31
 
31
32
  cleaned_inline = code_block_re.sub("", text)
32
33
 
@@ -91,4 +92,4 @@ def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
91
92
  """
92
93
  for placeholder, html_code_block in code_blocks.items():
93
94
  text = text.replace(placeholder, html_code_block, 1)
94
- return text
95
+ return text
@@ -2,6 +2,7 @@ import re
2
2
  from html.parser import HTMLParser
3
3
 
4
4
  MAX_LENGTH = 4096
5
+ MIN_LENGTH = 500
5
6
 
6
7
 
7
8
  class HTMLTagTracker(HTMLParser):
@@ -11,7 +12,10 @@ class HTMLTagTracker(HTMLParser):
11
12
 
12
13
  def handle_starttag(self, tag, attrs):
13
14
  # saving tags
14
- if tag in ("b", "i", "u", "s", "code", "pre", "a", "span", "blockquote"):
15
+ if tag in (
16
+ "b", "i", "u", "s", "code", "pre", "a", "span", "blockquote",
17
+ "strong", "em", "ins", "strike", "del", "tg-spoiler", "tg-emoji"
18
+ ):
15
19
  self.open_tags.append((tag, attrs))
16
20
 
17
21
  def handle_endtag(self, tag):
@@ -33,15 +37,30 @@ class HTMLTagTracker(HTMLParser):
33
37
  return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
34
38
 
35
39
 
36
- def split_pre_block(pre_block: str) -> list[str]:
40
+ def split_pre_block(pre_block: str, max_length) -> list[str]:
41
+ """
42
+ Splits a single <pre> block into smaller <pre> blocks on line boundaries,
+ re-wrapping every chunk in the same <pre> or <pre><code ...> tags as the input.
44
+
45
+ Args:
46
+ pre_block (str): A complete <pre>...</pre> or <pre><code ...>...</code></pre> block.
47
+ max_length (int): Target maximum length of each resulting chunk, counted
48
+ together with the wrapping tags. Lines are never broken apart, so a
49
+ single line longer than the limit is kept whole in its own chunk.
50
+
51
+ Returns:
52
+ list[str]: A list of <pre> blocks, each wrapped in the same tags as the input.
53
+ """
54
+
37
55
  # language-aware: <pre><code class="language-python">...</code></pre>
38
56
  match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
39
57
  if match:
40
58
  attr, content = match.groups()
41
59
  lines = content.splitlines(keepends=True)
42
60
  chunks, buf = [], ""
61
+ overhead = len(f"<pre><code{attr}></code></pre>")
43
62
  for line in lines:
44
- if len(buf) + len(line) + len('<pre><code></code></pre>') > MAX_LENGTH:
63
+ if len(buf) + len(line) + overhead > max_length:
45
64
  chunks.append(f"<pre><code{attr}>{buf}</code></pre>")
46
65
  buf = ""
47
66
  buf += line
@@ -53,8 +72,9 @@ def split_pre_block(pre_block: str) -> list[str]:
53
72
  inner = pre_block[5:-6]
54
73
  lines = inner.splitlines(keepends=True)
55
74
  chunks, buf = [], ""
75
+ overhead = len('<pre></pre>')
56
76
  for line in lines:
57
- if len(buf) + len(line) + len('<pre></pre>') > MAX_LENGTH:
77
+ if len(buf) + len(line) + overhead > max_length:
58
78
  chunks.append(f"<pre>{buf}</pre>")
59
79
  buf = ""
60
80
  buf += line
@@ -63,52 +83,157 @@ def split_pre_block(pre_block: str) -> list[str]:
63
83
  return chunks
64
84
 
65
85
 
66
- def split_html_for_telegram(text: str) -> list[str]:
67
- chunks = []
86
+ def _is_only_tags(block: str) -> bool:
87
+ return bool(re.fullmatch(r'(?:\s*<[^>]+>\s*)+', block))
88
+
89
+
90
+ def _effective_length(content: str) -> int:
91
+ tracker = HTMLTagTracker()
92
+ tracker.feed(content)
93
+ return len(tracker.get_open_tags_html()) + len(content) + len(tracker.get_closing_tags_html())
94
+
95
+
96
+ def split_html_for_telegram(text: str, trim_empty_leading_lines: bool = False, max_length: int = MAX_LENGTH) -> list[str]:
97
+ """Split long HTML-formatted text into Telegram-compatible chunks.
98
+
99
+ Parameters
100
+ ----------
101
+ text: str
102
+ Input HTML text.
103
+ trim_empty_leading_lines: bool, optional
104
+ If True, removes `\n` symbols from start of chunks.
105
+ max_length: int, optional
106
+ Maximum allowed length for a single chunk (must be >= ``MIN_LENGTH = 500``).
107
+ Default = 4096 (symbols)
108
+
109
+ Returns
110
+ -------
111
+ list[str]
112
+ List of HTML chunks.
113
+ """
114
+
115
+ if max_length < MIN_LENGTH:
116
+ raise ValueError("max_length should be at least %d" % MIN_LENGTH)
117
+
68
118
  pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
69
119
  parts = pattern.split(text)
70
120
 
121
+ chunks: list[str] = []
122
+ prefix = ""
123
+ current = ""
124
+ whitespace_re = re.compile(r"(\\s+)")
125
+ tag_re = re.compile(r"(<[^>]+>)")
126
+
127
+ def finalize():
128
+ nonlocal current, prefix
129
+ tracker = HTMLTagTracker()
130
+ tracker.feed(prefix + current)
131
+ chunk = prefix + current + tracker.get_closing_tags_html()
132
+ chunks.append(chunk)
133
+ prefix = tracker.get_open_tags_html()
134
+ current = ""
135
+
136
+ def append_piece(piece: str):
137
+ nonlocal current, prefix
138
+
139
+ def split_on_whitespace(chunk: str) -> list[str] | None:
140
+ parts = [part for part in whitespace_re.split(chunk) if part]
141
+ if len(parts) <= 1:
142
+ return None
143
+ return parts
144
+
145
+ def split_on_tags(chunk: str) -> list[str] | None:
146
+ parts = [part for part in tag_re.split(chunk) if part]
147
+ if len(parts) <= 1:
148
+ return None
149
+ return parts
150
+
151
+ def fittable_prefix_length(chunk: str) -> int:
152
+ low, high = 1, len(chunk)
153
+ best = 0
154
+ while low <= high:
155
+ mid = (low + high) // 2
156
+ candidate = chunk[:mid]
157
+ if _effective_length(prefix + current + candidate) <= max_length:
158
+ best = mid
159
+ low = mid + 1
160
+ else:
161
+ high = mid - 1
162
+ return best
163
+
164
+ while piece:
165
+ if _effective_length(prefix + current + piece) <= max_length:
166
+ current += piece
167
+ return
168
+
169
+ if len(piece) > max_length:
170
+ if _is_only_tags(piece):
171
+ raise ValueError("block contains only html tags")
172
+ splitted = split_on_whitespace(piece)
173
+ if splitted:
174
+ for part in splitted:
175
+ append_piece(part)
176
+ return
177
+ tag_split = split_on_tags(piece)
178
+ if tag_split:
179
+ for part in tag_split:
180
+ append_piece(part)
181
+ return
182
+ elif current:
183
+ finalize()
184
+ continue
185
+ else:
186
+ splitted = split_on_whitespace(piece)
187
+ if splitted:
188
+ for part in splitted:
189
+ append_piece(part)
190
+ return
191
+ tag_split = split_on_tags(piece)
192
+ if tag_split:
193
+ for part in tag_split:
194
+ append_piece(part)
195
+ return
196
+
197
+ fitted = fittable_prefix_length(piece)
198
+ if fitted == 0:
199
+ if current:
200
+ finalize()
201
+ continue
202
+ raise ValueError("unable to split content within max_length")
203
+
204
+ current += piece[:fitted]
205
+ piece = piece[fitted:]
206
+
207
+ if piece:
208
+ finalize()
209
+
210
+
71
211
  for part in parts:
72
212
  if not part:
73
213
  continue
74
214
  if part.startswith("<pre>") or part.startswith("<pre><code"):
75
- pre_chunks = split_pre_block(part)
76
- chunks.extend(pre_chunks)
77
- else:
78
- # breaking down regular HTML
79
- tracker = HTMLTagTracker()
80
- current = ""
81
- blocks = re.split(r"(\n\s*\n|<br\s*/?>|\n)", part)
82
- for block in blocks:
83
- prospective = current + block
84
- if len(prospective) > MAX_LENGTH:
85
- tracker.feed(current)
86
- open_tags = tracker.get_open_tags_html()
87
- close_tags = tracker.get_closing_tags_html()
88
- chunks.append(open_tags + current + close_tags)
89
- current = block
90
- tracker = HTMLTagTracker()
91
- else:
92
- current = prospective
93
- if current.strip():
94
- tracker.feed(current)
95
- open_tags = tracker.get_open_tags_html()
96
- close_tags = tracker.get_closing_tags_html()
97
- chunks.append(open_tags + current + close_tags)
98
-
99
- # post-unification: combine chunks if they don't exceed the limit in total
100
- merged_chunks = []
215
+ pre_chunks = split_pre_block(part, max_length=max_length)
216
+ for pc in pre_chunks:
217
+ append_piece(pc)
218
+ continue
219
+ blocks = re.split(r"(\n\s*\n|<br\s*/?>|\n)", part)
220
+ for block in blocks:
221
+ if block:
222
+ append_piece(block)
223
+
224
+ if current:
225
+ finalize()
226
+
227
+ merged: list[str] = []
101
228
  buf = ""
102
229
  for chunk in chunks:
103
- # chunk = chunk.lstrip("\n") # removing leading line breaks
104
-
105
- if len(buf) + len(chunk) <= MAX_LENGTH:
230
+ if len(buf) + len(chunk) <= max_length:
106
231
  buf += chunk
107
232
  else:
108
233
  if buf:
109
- merged_chunks.append(buf)
110
- buf = chunk
234
+ merged.append(buf)
235
+ buf = chunk.lstrip("\n") if trim_empty_leading_lines and merged else chunk
111
236
  if buf:
112
- merged_chunks.append(buf)
237
+ merged.append(buf.lstrip("\n") if trim_empty_leading_lines and merged else buf)
113
238
 
114
- return merged_chunks
239
+ return merged
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.7
3
+ Version: 0.3.8
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -0,0 +1,12 @@
1
+ chatgpt_md_converter/__init__.py,sha256=HF8fLq9o1A4HMDjPWCQ43NSby_L29Zgd4S_g3ORyyCA,157
2
+ chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
+ chatgpt_md_converter/extractors.py,sha256=k1oRlocn0K4OyU3-k2mrhKanKNdU-664t1CTcf8hYdE,3212
4
+ chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
+ chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
+ chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
7
+ chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
8
+ chatgpt_md_converter-0.3.8.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
9
+ chatgpt_md_converter-0.3.8.dist-info/METADATA,sha256=ngfuia4mAfiHBySgX_hKii8ty1O9hOkCotqX9Fzidm4,5792
10
+ chatgpt_md_converter-0.3.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ chatgpt_md_converter-0.3.8.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
12
+ chatgpt_md_converter-0.3.8.dist-info/RECORD,,
@@ -1,12 +0,0 @@
1
- chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
- chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
- chatgpt_md_converter/extractors.py,sha256=uThH9vnjlEwZowCbxvcZreMZUPqUEiuq0nbWva3K-CE,3023
4
- chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
- chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
- chatgpt_md_converter/html_splitter.py,sha256=8ao4QU5PFDFCHMg8pj5kBqmxSOUO6RfzqQfk4o1F8ms,3897
7
- chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
8
- chatgpt_md_converter-0.3.7.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
9
- chatgpt_md_converter-0.3.7.dist-info/METADATA,sha256=4gweCWqlv3a6pR6FJbf-ycCEToIjCRf2Ohnk5p81bwQ,5792
10
- chatgpt_md_converter-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- chatgpt_md_converter-0.3.7.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
12
- chatgpt_md_converter-0.3.7.dist-info/RECORD,,