chatgpt-md-converter 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ from .html_splitter import split_html_for_telegram
2
+ from .html_to_markdown import html_to_telegram_markdown
1
3
  from .telegram_formatter import telegram_format
2
4
 
3
- __all__ = ["telegram_format"]
5
+ __all__ = ["telegram_format", "split_html_for_telegram", "html_to_telegram_markdown"]
@@ -0,0 +1,68 @@
1
+ """Shared escaping utilities for Telegram Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import html
6
+ import re
7
+
8
+ from .tree import Node
9
+
10
+ _SIMPLE_STAR_ITALIC = re.compile(
11
+ r"(?<!\\)(?<!\*)\*(?=[^\s])([^\*\n]+?)(?<!\s)\*(?![A-Za-z0-9\*])",
12
+ )
13
+
14
+
15
+ def _canonicalize_star_italics(text: str) -> str:
16
+ def _replace(match: re.Match[str]) -> str:
17
+ inner = match.group(1)
18
+ if "*" in inner or "_" in inner or "`" in inner:
19
+ return match.group(0)
20
+ return f"_{inner}_"
21
+
22
+ return _SIMPLE_STAR_ITALIC.sub(_replace, text)
23
+
24
+
25
+ def normalise_text(text: str) -> str:
26
+ if not text:
27
+ return ""
28
+ unescaped = html.unescape(text)
29
+ return unescaped.replace("\u00a0", " ")
30
+
31
+
32
+ def collect_text(node: Node) -> str:
33
+ if node.kind == "text":
34
+ return html.unescape(node.text)
35
+ parts: list[str] = []
36
+ for child in node.children:
37
+ if child.kind == "text":
38
+ parts.append(html.unescape(child.text))
39
+ elif child.kind == "element":
40
+ if child.tag.lower() == "br":
41
+ parts.append("\n")
42
+ else:
43
+ parts.append(collect_text(child))
44
+ return "".join(parts)
45
+
46
+
47
+ def escape_inline_code(text: str) -> str:
48
+ return text.replace("`", "\\`")
49
+
50
+
51
+ def escape_link_label(label: str) -> str:
52
+ escaped = label
53
+ for ch in "[]()":
54
+ escaped = escaped.replace(ch, f"\\{ch}")
55
+ return escaped
56
+
57
+
58
+ def escape_link_url(url: str) -> str:
59
+ return url.replace("\\", "\\\\").replace(")", "\\)")
60
+
61
+
62
+ def post_process(markdown: str) -> str:
63
+ text = re.sub(r"(^|\n)•\s", r"\1- ", markdown)
64
+ text = re.sub(r"\n{3,}", "\n\n", text)
65
+ text = text.replace("\r", "")
66
+ text = "\n".join(line.rstrip() for line in text.split("\n"))
67
+ text = _canonicalize_star_italics(text)
68
+ return text.strip()
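A small sketch of what the post-processing pass above does to a converted string; the sample text is illustrative.

```python
from chatgpt_md_converter.html_markdown.escaping import post_process

raw = "• first\n\n\n\n• second\nsome *emphasis* here   "
print(post_process(raw))
# Bullets become "- ", runs of blank lines collapse to one blank line,
# trailing whitespace is stripped, and simple *emphasis* is rewritten as _emphasis_.
```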
@@ -0,0 +1,155 @@
1
+ """Tag-specific renderers for Telegram Markdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, Dict
6
+
7
+ from .escaping import (collect_text, escape_inline_code, escape_link_label,
8
+ escape_link_url, normalise_text)
9
+ from .state import RenderState
10
+ from .tree import Node
11
+
12
+ InlineHandler = Callable[[Node, RenderState], str]
13
+
14
+
15
+ _INLINE_MARKERS: Dict[str, tuple[str, str]] = {
16
+ "u": ("__", "__"),
17
+ "ins": ("__", "__"),
18
+ "s": ("~~", "~~"),
19
+ "strike": ("~~", "~~"),
20
+ "del": ("~~", "~~"),
21
+ }
22
+
23
+
24
+ def render_nodes(nodes: list[Node], state: RenderState) -> str:
25
+ return "".join(render_node(node, state) for node in nodes)
26
+
27
+
28
+ def render_node(node: Node, state: RenderState) -> str:
29
+ if node.kind == "text":
30
+ return normalise_text(node.text)
31
+
32
+ handler = TAG_DISPATCH.get(node.tag.lower())
33
+ if handler:
34
+ return handler(node, state)
35
+ return render_nodes(node.children, state)
36
+
37
+
38
+ def _handle_bold(node: Node, state: RenderState) -> str:
39
+ inner_state = state.child(bold_depth=state.bold_depth + 1)
40
+ inner = render_nodes(node.children, inner_state)
41
+ return f"**{inner}**"
42
+
43
+
44
+ def _handle_italic(node: Node, state: RenderState) -> str:
45
+ depth = state.italic_depth
46
+ in_bold = state.bold_depth > 0 and depth == 0
47
+ marker = "_" if in_bold else ("*" if depth % 2 == 0 else "_")
48
+ inner_state = state.child(italic_depth=depth + 1)
49
+ inner = render_nodes(node.children, inner_state)
50
+ return f"{marker}{inner}{marker}"
51
+
52
+
53
+ def _handle_inline_marker(node: Node, state: RenderState) -> str:
54
+ marker_open, marker_close = _INLINE_MARKERS[node.tag.lower()]
55
+ inner = render_nodes(node.children, state)
56
+ return f"{marker_open}{inner}{marker_close}"
57
+
58
+
59
+ def _handle_spoiler(node: Node, state: RenderState) -> str:
60
+ inner = render_nodes(node.children, state)
61
+ return f"||{inner}||"
62
+
63
+
64
+ def _handle_code(node: Node, state: RenderState) -> str:
65
+ inner = collect_text(node)
66
+ return f"`{escape_inline_code(inner)}`"
67
+
68
+
69
+ def _handle_pre(node: Node, state: RenderState) -> str:
70
+ children = node.children
71
+ language: str | None = None
72
+ content_node: Node
73
+
74
+ if len(children) == 1 and children[0].kind == "element" and children[0].tag.lower() == "code":
75
+ content_node = children[0]
76
+ class_attr = content_node.attrs.get("class") or ""
77
+ for part in class_attr.split():
78
+ if part.startswith("language-"):
79
+ language = part.split("-", 1)[1]
80
+ break
81
+ else:
82
+ content_node = Node(kind="element", tag="__virtual__", children=children)
83
+
84
+ inner_text = collect_text(content_node)
85
+ fence = f"```{language}" if language else "```"
86
+ if language or "\n" in inner_text:
87
+ return f"{fence}\n{inner_text}```"
88
+ return f"{fence}{inner_text}```"
89
+
90
+
91
+ def _handle_link(node: Node, state: RenderState) -> str:
92
+ href = node.attrs.get("href", "") or ""
93
+ label = render_nodes(node.children, state)
94
+ if not label:
95
+ label = href
96
+
97
+ escaped_label = escape_link_label(label)
98
+ escaped_url = escape_link_url(href)
99
+
100
+ if href.startswith("tg://emoji?"):
101
+ return f"![{escaped_label}]({escaped_url})"
102
+ return f"[{escaped_label}]({escaped_url})"
103
+
104
+
105
+ def _handle_blockquote(node: Node, state: RenderState) -> str:
106
+ inner = render_nodes(node.children, state)
107
+ lines = inner.split("\n")
108
+ expandable = "expandable" in node.attrs
109
+ rendered: list[str] = []
110
+ for index, line in enumerate(lines):
111
+ prefix = "**>" if expandable and index == 0 else ">"
112
+ stripped = line.rstrip("\r")
113
+ if expandable:
114
+ rendered.append(prefix + stripped)
115
+ else:
116
+ rendered.append(f"{prefix} {stripped}" if stripped else prefix)
117
+ return "\n".join(rendered)
118
+
119
+
120
+ def _handle_tg_emoji(node: Node, state: RenderState) -> str:
121
+ emoji_id = node.attrs.get("emoji-id")
122
+ label = render_nodes(node.children, state)
123
+ if emoji_id:
124
+ href = f"tg://emoji?id={emoji_id}"
125
+ return f"![{escape_link_label(label)}]({href})"
126
+ return label
127
+
128
+
129
+ def _handle_span(node: Node, state: RenderState) -> str:
130
+ classes = (node.attrs.get("class") or "").split()
131
+ if any(cls == "tg-spoiler" for cls in classes):
132
+ return _handle_spoiler(node, state)
133
+ if any(cls == "tg-emoji" for cls in classes):
134
+ return render_nodes(node.children, state)
135
+ return render_nodes(node.children, state)
136
+
137
+
138
+ TAG_DISPATCH: Dict[str, Callable[[Node, RenderState], str]] = {
139
+ "b": _handle_bold,
140
+ "strong": _handle_bold,
141
+ "i": _handle_italic,
142
+ "em": _handle_italic,
143
+ "u": _handle_inline_marker,
144
+ "ins": _handle_inline_marker,
145
+ "s": _handle_inline_marker,
146
+ "strike": _handle_inline_marker,
147
+ "del": _handle_inline_marker,
148
+ "span": _handle_span,
149
+ "tg-spoiler": _handle_spoiler,
150
+ "code": _handle_code,
151
+ "pre": _handle_pre,
152
+ "a": _handle_link,
153
+ "blockquote": _handle_blockquote,
154
+ "tg-emoji": _handle_tg_emoji,
155
+ }
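A quick sketch of how the dispatch table above is exercised, building a `Node` by hand (the `Node` and `RenderState` definitions appear later in this diff):

```python
from chatgpt_md_converter.html_markdown.handlers import render_node
from chatgpt_md_converter.html_markdown.state import RenderState
from chatgpt_md_converter.html_markdown.tree import Node

# A <b> element with one text child, dispatched through TAG_DISPATCH["b"]
bold = Node(
    kind="element",
    tag="b",
    children=[Node(kind="text", text="hello")],
)
print(render_node(bold, RenderState()))  # -> **hello**
```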
@@ -0,0 +1,16 @@
1
+ """High-level HTML → Telegram Markdown renderer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List
6
+
7
+ from .escaping import post_process
8
+ from .handlers import render_nodes
9
+ from .state import RenderState
10
+ from .tree import Node, build_tree
11
+
12
+
13
+ def html_to_telegram_markdown(html_text: str) -> str:
14
+ nodes: List[Node] = build_tree(html_text)
15
+ markdown = render_nodes(nodes, RenderState())
16
+ return post_process(markdown)
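For illustration, a usage sketch of the renderer above; the expected output follows from the bold, italic and link handlers shown earlier in this diff.

```python
from chatgpt_md_converter.html_markdown.renderer import html_to_telegram_markdown

text = '<b>Bold</b>, <i>italic</i> and <a href="https://example.com">a link</a>'
print(html_to_telegram_markdown(text))
# -> **Bold**, _italic_ and [a link](https://example.com)
```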
@@ -0,0 +1,16 @@
1
+ """Rendering state for HTML → Telegram Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
+ @dataclass(frozen=True)
9
+ class RenderState:
10
+ bold_depth: int = 0
11
+ italic_depth: int = 0
12
+
13
+ def child(self, **updates: int) -> "RenderState":
14
+ data = {"bold_depth": self.bold_depth, "italic_depth": self.italic_depth}
15
+ data.update(updates)
16
+ return RenderState(**data)
@@ -0,0 +1,65 @@
1
+ """DOM-like tree construction for Telegram HTML fragments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from html.parser import HTMLParser
7
+ from typing import Dict, List, Optional
8
+
9
+
10
+ @dataclass
11
+ class Node:
12
+ kind: str # "text" or "element"
13
+ text: str = ""
14
+ tag: str = ""
15
+ attrs: Dict[str, Optional[str]] = field(default_factory=dict)
16
+ children: List["Node"] = field(default_factory=list)
17
+
18
+
19
+ class _HTMLTreeBuilder(HTMLParser):
20
+ SELF_CLOSING_TAGS = {"br"}
21
+
22
+ def __init__(self) -> None:
23
+ super().__init__(convert_charrefs=False)
24
+ self.root = Node(kind="element", tag="__root__")
25
+ self._stack: List[Node] = [self.root]
26
+
27
+ def handle_starttag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
28
+ if tag in self.SELF_CLOSING_TAGS:
29
+ if tag == "br":
30
+ self._stack[-1].children.append(Node(kind="text", text="\n"))
31
+ return
32
+ node = Node(kind="element", tag=tag, attrs=dict(attrs))
33
+ self._stack[-1].children.append(node)
34
+ self._stack.append(node)
35
+
36
+ def handle_endtag(self, tag: str) -> None:
37
+ for index in range(len(self._stack) - 1, 0, -1):
38
+ if self._stack[index].tag == tag:
39
+ del self._stack[index:]
40
+ return
41
+
42
+ def handle_startendtag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
43
+ if tag in self.SELF_CLOSING_TAGS:
44
+ self.handle_starttag(tag, attrs)
45
+ return
46
+ node = Node(kind="element", tag=tag, attrs=dict(attrs))
47
+ self._stack[-1].children.append(node)
48
+
49
+ def handle_data(self, data: str) -> None:
50
+ if data:
51
+ self._stack[-1].children.append(Node(kind="text", text=data))
52
+
53
+ def handle_entityref(self, name: str) -> None:
54
+ self.handle_data(f"&{name};")
55
+
56
+ def handle_charref(self, name: str) -> None:
57
+ self.handle_data(f"&#{name};")
58
+
59
+
60
+ def build_tree(html_text: str) -> List[Node]:
61
+ """Parse HTML and return the list of top-level nodes."""
62
+ builder = _HTMLTreeBuilder()
63
+ builder.feed(html_text)
64
+ builder.close()
65
+ return builder.root.children
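A short sketch of the tree builder in isolation; the printed shapes follow from the `Node` dataclass above.

```python
from chatgpt_md_converter.html_markdown.tree import build_tree

for node in build_tree("<b>Hi</b> there<br>bye"):
    print(node.kind, node.tag or repr(node.text))
# element b        (one text child: "Hi")
# text ' there'
# text '\n'        (<br> becomes a newline text node)
# text 'bye'
```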
@@ -2,6 +2,7 @@ import re
2
2
  from html.parser import HTMLParser
3
3
 
4
4
  MAX_LENGTH = 4096
5
+ MIN_LENGTH = 500
5
6
 
6
7
 
7
8
  class HTMLTagTracker(HTMLParser):
@@ -11,7 +12,10 @@ class HTMLTagTracker(HTMLParser):
11
12
 
12
13
  def handle_starttag(self, tag, attrs):
13
14
  # saving tags
14
- if tag in ("b", "i", "u", "s", "code", "pre", "a", "span", "blockquote"):
15
+ if tag in (
16
+ "b", "i", "u", "s", "code", "pre", "a", "span", "blockquote",
17
+ "strong", "em", "ins", "strike", "del", "tg-spoiler", "tg-emoji"
18
+ ):
15
19
  self.open_tags.append((tag, attrs))
16
20
 
17
21
  def handle_endtag(self, tag):
@@ -33,15 +37,30 @@ class HTMLTagTracker(HTMLParser):
33
37
  return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
34
38
 
35
39
 
36
- def split_pre_block(pre_block: str) -> list[str]:
40
+ def split_pre_block(pre_block: str, max_length) -> list[str]:
41
+ """
42
+ Splits long HTML-formatted text into chunks suitable for sending via Telegram,
43
+ preserving valid HTML tag nesting and handling <pre>/<code> blocks separately.
44
+
45
+ Args:
46
+ text (str): The input HTML-formatted string.
47
+ trim_leading_newlines (bool): If True, removes leading newline characters (`\\n`)
48
+ from each resulting chunk before sending. This is useful to avoid
49
+ unnecessary blank space at the beginning of messages in Telegram.
50
+
51
+ Returns:
52
+ list[str]: A list of HTML-formatted message chunks, each within Telegram's length limit.
53
+ """
54
+
37
55
  # language-aware: <pre><code class="language-python">...</code></pre>
38
56
  match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
39
57
  if match:
40
58
  attr, content = match.groups()
41
59
  lines = content.splitlines(keepends=True)
42
60
  chunks, buf = [], ""
61
+ overhead = len(f"<pre><code{attr}></code></pre>")
43
62
  for line in lines:
44
- if len(buf) + len(line) + len('<pre><code></code></pre>') > MAX_LENGTH:
63
+ if len(buf) + len(line) + overhead > max_length:
45
64
  chunks.append(f"<pre><code{attr}>{buf}</code></pre>")
46
65
  buf = ""
47
66
  buf += line
@@ -53,8 +72,9 @@ def split_pre_block(pre_block: str) -> list[str]:
53
72
  inner = pre_block[5:-6]
54
73
  lines = inner.splitlines(keepends=True)
55
74
  chunks, buf = [], ""
75
+ overhead = len('<pre></pre>')
56
76
  for line in lines:
57
- if len(buf) + len(line) + len('<pre></pre>') > MAX_LENGTH:
77
+ if len(buf) + len(line) + overhead > max_length:
58
78
  chunks.append(f"<pre>{buf}</pre>")
59
79
  buf = ""
60
80
  buf += line
@@ -63,52 +83,157 @@ def split_pre_block(pre_block: str) -> list[str]:
63
83
  return chunks
64
84
 
65
85
 
66
- def split_html_for_telegram(text: str) -> list[str]:
67
- chunks = []
86
+ def _is_only_tags(block: str) -> bool:
87
+ return bool(re.fullmatch(r'(?:\s*<[^>]+>\s*)+', block))
88
+
89
+
90
+ def _effective_length(content: str) -> int:
91
+ tracker = HTMLTagTracker()
92
+ tracker.feed(content)
93
+ return len(tracker.get_open_tags_html()) + len(content) + len(tracker.get_closing_tags_html())
94
+
95
+
96
+ def split_html_for_telegram(text: str, trim_empty_leading_lines: bool = False, max_length: int = MAX_LENGTH) -> list[str]:
97
+ """Split long HTML-formatted text into Telegram-compatible chunks.
98
+
99
+ Parameters
100
+ ----------
101
+ text: str
102
+ Input HTML text.
103
+ trim_empty_leading_lines: bool, optional
104
+ If True, removes leading `\n` characters from the start of each chunk.
105
+ max_length: int, optional
106
+ Maximum allowed length of a single chunk, in characters; must be at least ``MIN_LENGTH`` (500).
106
+ Defaults to ``MAX_LENGTH`` (4096).
108
+
109
+ Returns
110
+ -------
111
+ list[str]
112
+ List of HTML chunks.
113
+ """
114
+
115
+ if max_length < MIN_LENGTH:
116
+ raise ValueError("max_length should be at least %d" % MIN_LENGTH)
117
+
68
118
  pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
69
119
  parts = pattern.split(text)
70
120
 
121
+ chunks: list[str] = []
122
+ prefix = ""
123
+ current = ""
124
+ whitespace_re = re.compile(r"(\s+)")
125
+ tag_re = re.compile(r"(<[^>]+>)")
126
+
127
+ def finalize():
128
+ nonlocal current, prefix
129
+ tracker = HTMLTagTracker()
130
+ tracker.feed(prefix + current)
131
+ chunk = prefix + current + tracker.get_closing_tags_html()
132
+ chunks.append(chunk)
133
+ prefix = tracker.get_open_tags_html()
134
+ current = ""
135
+
136
+ def append_piece(piece: str):
137
+ nonlocal current, prefix
138
+
139
+ def split_on_whitespace(chunk: str) -> list[str] | None:
140
+ parts = [part for part in whitespace_re.split(chunk) if part]
141
+ if len(parts) <= 1:
142
+ return None
143
+ return parts
144
+
145
+ def split_on_tags(chunk: str) -> list[str] | None:
146
+ parts = [part for part in tag_re.split(chunk) if part]
147
+ if len(parts) <= 1:
148
+ return None
149
+ return parts
150
+
151
+ def fittable_prefix_length(chunk: str) -> int:
152
+ low, high = 1, len(chunk)
153
+ best = 0
154
+ while low <= high:
155
+ mid = (low + high) // 2
156
+ candidate = chunk[:mid]
157
+ if _effective_length(prefix + current + candidate) <= max_length:
158
+ best = mid
159
+ low = mid + 1
160
+ else:
161
+ high = mid - 1
162
+ return best
163
+
164
+ while piece:
165
+ if _effective_length(prefix + current + piece) <= max_length:
166
+ current += piece
167
+ return
168
+
169
+ if len(piece) > max_length:
170
+ if _is_only_tags(piece):
171
+ raise ValueError("block contains only html tags")
172
+ splitted = split_on_whitespace(piece)
173
+ if splitted:
174
+ for part in splitted:
175
+ append_piece(part)
176
+ return
177
+ tag_split = split_on_tags(piece)
178
+ if tag_split:
179
+ for part in tag_split:
180
+ append_piece(part)
181
+ return
182
+ elif current:
183
+ finalize()
184
+ continue
185
+ else:
186
+ splitted = split_on_whitespace(piece)
187
+ if splitted:
188
+ for part in splitted:
189
+ append_piece(part)
190
+ return
191
+ tag_split = split_on_tags(piece)
192
+ if tag_split:
193
+ for part in tag_split:
194
+ append_piece(part)
195
+ return
196
+
197
+ fitted = fittable_prefix_length(piece)
198
+ if fitted == 0:
199
+ if current:
200
+ finalize()
201
+ continue
202
+ raise ValueError("unable to split content within max_length")
203
+
204
+ current += piece[:fitted]
205
+ piece = piece[fitted:]
206
+
207
+ if piece:
208
+ finalize()
209
+
210
+
71
211
  for part in parts:
72
212
  if not part:
73
213
  continue
74
214
  if part.startswith("<pre>") or part.startswith("<pre><code"):
75
- pre_chunks = split_pre_block(part)
76
- chunks.extend(pre_chunks)
77
- else:
78
- # breaking down regular HTML
79
- tracker = HTMLTagTracker()
80
- current = ""
81
- blocks = re.split(r"(\n\s*\n|<br\s*/?>|\n)", part)
82
- for block in blocks:
83
- prospective = current + block
84
- if len(prospective) > MAX_LENGTH:
85
- tracker.feed(current)
86
- open_tags = tracker.get_open_tags_html()
87
- close_tags = tracker.get_closing_tags_html()
88
- chunks.append(open_tags + current + close_tags)
89
- current = block
90
- tracker = HTMLTagTracker()
91
- else:
92
- current = prospective
93
- if current.strip():
94
- tracker.feed(current)
95
- open_tags = tracker.get_open_tags_html()
96
- close_tags = tracker.get_closing_tags_html()
97
- chunks.append(open_tags + current + close_tags)
98
-
99
- # post-unification: combine chunks if they don't exceed the limit in total
100
- merged_chunks = []
215
+ pre_chunks = split_pre_block(part, max_length=max_length)
216
+ for pc in pre_chunks:
217
+ append_piece(pc)
218
+ continue
219
+ blocks = re.split(r"(\n\s*\n|<br\s*/?>|\n)", part)
220
+ for block in blocks:
221
+ if block:
222
+ append_piece(block)
223
+
224
+ if current:
225
+ finalize()
226
+
227
+ merged: list[str] = []
101
228
  buf = ""
102
229
  for chunk in chunks:
103
- # chunk = chunk.lstrip("\n") # removing leading line breaks
104
-
105
- if len(buf) + len(chunk) <= MAX_LENGTH:
230
+ if len(buf) + len(chunk) <= max_length:
106
231
  buf += chunk
107
232
  else:
108
233
  if buf:
109
- merged_chunks.append(buf)
110
- buf = chunk
234
+ merged.append(buf)
235
+ buf = chunk.lstrip("\n") if trim_empty_leading_lines and merged else chunk
111
236
  if buf:
112
- merged_chunks.append(buf)
237
+ merged.append(buf.lstrip("\n") if trim_empty_leading_lines and merged else buf)
113
238
 
114
- return merged_chunks
239
+ return merged
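A usage sketch of the splitter above with illustrative values; passing a `max_length` below `MIN_LENGTH` (500) raises `ValueError`.

```python
from chatgpt_md_converter import split_html_for_telegram

long_html = "<b>intro</b>\n\n" + "<i>paragraph</i>\n\n" * 400

chunks = split_html_for_telegram(
    long_html,
    trim_empty_leading_lines=True,
    max_length=1000,  # must be at least MIN_LENGTH (500)
)
# Each chunk stays within max_length and carries balanced open/close tags.
print(len(chunks), max(len(chunk) for chunk in chunks))
```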
@@ -0,0 +1,5 @@
1
+ """Backward-compatible entry point for HTML → Telegram Markdown."""
2
+
3
+ from .html_markdown.renderer import html_to_telegram_markdown
4
+
5
+ __all__ = ["html_to_telegram_markdown"]
@@ -1,99 +1,3 @@
1
- import re
1
+ from .telegram_markdown.renderer import telegram_format
2
2
 
3
- from .converters import convert_html_chars, split_by_tag
4
- from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
5
- from .formatters import combine_blockquotes
6
- from .helpers import remove_blockquote_escaping, remove_spoiler_escaping
7
-
8
-
9
- def extract_inline_code_snippets(text: str):
10
- """
11
- Extracts inline code (single-backtick content) from the text,
12
- replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
13
- This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
14
- """
15
- placeholders = []
16
- code_snippets = {}
17
- inline_code_pattern = re.compile(r"`([^`]+)`")
18
-
19
- def replacer(match):
20
- snippet = match.group(1)
21
- placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
22
- placeholders.append(placeholder)
23
- code_snippets[placeholder] = snippet
24
- return placeholder
25
-
26
- new_text = inline_code_pattern.sub(replacer, text)
27
- return new_text, code_snippets
28
-
29
-
30
- def telegram_format(text: str) -> str:
31
- """
32
- Converts markdown in the provided text to HTML supported by Telegram.
33
- """
34
-
35
- # Step 0: Combine blockquotes
36
- text = combine_blockquotes(text)
37
-
38
- # Step 1: Extract and convert triple-backtick code blocks first
39
- output, triple_code_blocks = extract_and_convert_code_blocks(text)
40
-
41
- # Step 2: Extract inline code snippets
42
- output, inline_code_snippets = extract_inline_code_snippets(output)
43
-
44
- # Step 3: Convert HTML reserved symbols in the text (not in code blocks)
45
- output = convert_html_chars(output)
46
-
47
- # Convert headings (H1-H6)
48
- output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
49
-
50
- # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
51
- output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
52
-
53
- # Nested Bold and Italic
54
- output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
55
- output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
56
-
57
- # Process markdown for bold (**), underline (__), strikethrough (~~), and spoiler (||)
58
- output = split_by_tag(output, "**", "b")
59
- output = split_by_tag(output, "__", "u")
60
- output = split_by_tag(output, "~~", "s")
61
- output = split_by_tag(output, "||", 'span class="tg-spoiler"')
62
-
63
- # Custom approach for single-asterisk italic
64
- italic_pattern = re.compile(
65
- r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])", re.DOTALL
66
- )
67
- output = italic_pattern.sub(r"<i>\1</i>", output)
68
-
69
- # Process single underscore-based italic
70
- output = split_by_tag(output, "_", "i")
71
-
72
- # Remove storage links (Vector storage placeholders like 【4:0†source】)
73
- output = re.sub(r"【[^】]+】", "", output)
74
-
75
- # Convert Markdown links/images to <a href="">…</a>
76
- link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
77
- output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
78
-
79
- # Step 4: Reinsert inline code snippets, applying HTML escaping to the content
80
- for placeholder, snippet in inline_code_snippets.items():
81
- # Apply HTML escaping to the content of inline code
82
- escaped_snippet = (
83
- snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
84
- )
85
- output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
86
-
87
- # Step 5: Reinsert the converted triple-backtick code blocks
88
- output = reinsert_code_blocks(output, triple_code_blocks)
89
-
90
- # Step 6: Remove blockquote escaping
91
- output = remove_blockquote_escaping(output)
92
-
93
- # Step 7: Remove spoiler tag escaping
94
- output = remove_spoiler_escaping(output)
95
-
96
- # Clean up multiple consecutive newlines, but preserve intentional spacing
97
- output = re.sub(r"\n{3,}", "\n\n", output)
98
-
99
- return output.strip()
3
+ __all__ = ['telegram_format']
@@ -0,0 +1,5 @@
1
+ """Modular Telegram Markdown → HTML conversion helpers."""
2
+
3
+ from .renderer import telegram_format
4
+
5
+ __all__ = ["telegram_format"]
@@ -0,0 +1,95 @@
1
+ """Code block extraction utilities for Telegram Markdown conversion."""
2
+
3
+ import re
4
+
5
+ _CODE_BLOCK_RE = re.compile(
6
+ r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
7
+ flags=re.DOTALL,
8
+ )
9
+
10
+
11
+
12
+ def _count_unescaped_backticks(text: str) -> int:
13
+ """Return the number of backticks not escaped by a backslash."""
14
+ count = 0
15
+ for index, char in enumerate(text):
16
+ if char != "`":
17
+ continue
18
+ backslashes = 0
19
+ j = index - 1
20
+ while j >= 0 and text[j] == '\\':
21
+ backslashes += 1
22
+ j -= 1
23
+ if backslashes % 2 == 0:
24
+ count += 1
25
+ return count
26
+
27
+ def ensure_closing_delimiters(text: str) -> str:
28
+ """Append any missing closing backtick fences for Markdown code blocks."""
29
+ open_fence = None
30
+ for line in text.splitlines():
31
+ stripped = line.strip()
32
+ if open_fence is None:
33
+ match = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
34
+ if match:
35
+ open_fence = match.group("fence")
36
+ else:
37
+ if stripped.endswith(open_fence):
38
+ open_fence = None
39
+
40
+ if open_fence is not None:
41
+ if not text.endswith("\n"):
42
+ text += "\n"
43
+ text += open_fence
44
+
45
+ cleaned_inline = _CODE_BLOCK_RE.sub("", text)
46
+ if cleaned_inline.count("```") % 2 != 0:
47
+ text += "```"
48
+
49
+ cleaned_inline = _CODE_BLOCK_RE.sub("", text)
50
+ if _count_unescaped_backticks(cleaned_inline) % 2 != 0:
51
+ text += "`"
52
+
53
+ return text
54
+
55
+
56
+ def extract_and_convert_code_blocks(text: str):
57
+ """Replace fenced code blocks with placeholders and return HTML renderings."""
58
+ text = ensure_closing_delimiters(text)
59
+ placeholders: list[str] = []
60
+ code_blocks: dict[str, str] = {}
61
+
62
+ def _replacement(match: re.Match[str]) -> tuple[str, str]:
63
+ language = match.group("lang") or ""
64
+ code_content = match.group("code")
65
+ escaped = (
66
+ code_content.replace("&", "&amp;")
67
+ .replace("<", "&lt;")
68
+ .replace(">", "&gt;")
69
+ )
70
+ placeholder = f"CODEBLOCKPLACEHOLDER{len(placeholders)}"
71
+ placeholders.append(placeholder)
72
+ if language:
73
+ html_block = f'<pre><code class="language-{language}">{escaped}</code></pre>'
74
+ else:
75
+ html_block = f"<pre><code>{escaped}</code></pre>"
76
+ return placeholder, html_block
77
+
78
+ modified = text
79
+ pattern = re.compile(
80
+ r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
81
+ flags=re.DOTALL,
82
+ )
83
+ for match in pattern.finditer(text):
84
+ placeholder, html_block = _replacement(match)
85
+ code_blocks[placeholder] = html_block
86
+ modified = modified.replace(match.group(0), placeholder, 1)
87
+
88
+ return modified, code_blocks
89
+
90
+
91
+ def reinsert_code_blocks(text: str, code_blocks: dict[str, str]) -> str:
92
+ """Insert rendered HTML code blocks back into their placeholders."""
93
+ for placeholder, html_block in code_blocks.items():
94
+ text = text.replace(placeholder, html_block, 1)
95
+ return text
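A sketch of the placeholder round-trip performed by the two helpers above; the snippet content is illustrative.

```python
from chatgpt_md_converter.telegram_markdown.code_blocks import (
    extract_and_convert_code_blocks,
    reinsert_code_blocks,
)

fence = "`" * 3
text = f"Before\n{fence}python\nprint('hi')\n{fence}\nAfter"

modified, blocks = extract_and_convert_code_blocks(text)
print(modified)  # Before\nCODEBLOCKPLACEHOLDER0\nAfter
print(reinsert_code_blocks(modified, blocks))
# The placeholder is replaced by the rendered
# <pre><code class="language-python">...</code></pre> block.
```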
@@ -0,0 +1,73 @@
1
+ """Inline text helpers for Telegram Markdown conversion."""
2
+
3
+ import re
4
+
5
+ _inline_code_pattern = re.compile(r"`([^`]+)`")
6
+
7
+ _BOLD_PATTERN = re.compile(r"(?<!\\)\*\*(?=\S)(.*?)(?<=\S)\*\*", re.DOTALL)
8
+ _UNDERLINE_PATTERN = re.compile(
9
+ r"(?<!\\)(?<![A-Za-z0-9_])__(?=\S)(.*?)(?<=\S)__(?![A-Za-z0-9_])",
10
+ re.DOTALL,
11
+ )
12
+ _ITALIC_UNDERSCORE_PATTERN = re.compile(
13
+ r"(?<!\\)(?<![A-Za-z0-9_])_(?=\S)(.*?)(?<=\S)_(?![A-Za-z0-9_])",
14
+ re.DOTALL,
15
+ )
16
+ _STRIKETHROUGH_PATTERN = re.compile(r"(?<!\\)~~(?=\S)(.*?)(?<=\S)~~", re.DOTALL)
17
+ _SPOILER_PATTERN = re.compile(r"(?<!\\)\|\|(?=\S)([^\n]*?)(?<=\S)\|\|")
18
+ _ITALIC_STAR_PATTERN = re.compile(
19
+ r"(?<![A-Za-z0-9\\])\*(?!\*)(?=[^\s])(.*?)(?<![\s\\])\*(?![A-Za-z0-9\\])",
20
+ re.DOTALL,
21
+ )
22
+
23
+ _PATTERN_MAP = {
24
+ "**": _BOLD_PATTERN,
25
+ "__": _UNDERLINE_PATTERN,
26
+ "_": _ITALIC_UNDERSCORE_PATTERN,
27
+ "~~": _STRIKETHROUGH_PATTERN,
28
+ "||": _SPOILER_PATTERN,
29
+ }
30
+
31
+
32
+ def convert_html_chars(text: str) -> str:
33
+ text = text.replace("&", "&amp;")
34
+ text = text.replace("<", "&lt;")
35
+ text = text.replace(">", "&gt;")
36
+ return text
37
+
38
+
39
+ def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
40
+ pattern = _PATTERN_MAP.get(md_tag)
41
+ if pattern is None:
42
+ escaped = re.escape(md_tag)
43
+ pattern = re.compile(
44
+ rf"(?<!\\){escaped}(?=\S)(.*?)(?<=\S){escaped}",
45
+ re.DOTALL,
46
+ )
47
+
48
+ def _wrap(match: re.Match[str]) -> str:
49
+ inner = match.group(1)
50
+ if html_tag == 'span class="tg-spoiler"':
51
+ return f'<span class="tg-spoiler">{inner}</span>'
52
+ return f"<{html_tag}>{inner}</{html_tag}>"
53
+
54
+ return pattern.sub(_wrap, out_text)
55
+
56
+
57
+ def extract_inline_code_snippets(text: str):
58
+ placeholders: list[str] = []
59
+ snippets: dict[str, str] = {}
60
+
61
+ def replacer(match: re.Match[str]) -> str:
62
+ snippet = match.group(1)
63
+ placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
64
+ placeholders.append(placeholder)
65
+ snippets[placeholder] = snippet
66
+ return placeholder
67
+
68
+ modified = _inline_code_pattern.sub(replacer, text)
69
+ return modified, snippets
70
+
71
+
72
+ def apply_custom_italic(text: str) -> str:
73
+ return _ITALIC_STAR_PATTERN.sub(r"<i>\1</i>", text)
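A sketch of the inline helpers above, applied in the same order the renderer uses; the toy string is illustrative.

```python
from chatgpt_md_converter.telegram_markdown.inline import (
    apply_custom_italic,
    convert_html_chars,
    split_by_tag,
)

text = convert_html_chars("a < b and **bold** ~~gone~~ *slanted*")
text = split_by_tag(text, "**", "b")
text = split_by_tag(text, "~~", "s")
text = apply_custom_italic(text)
print(text)
# -> a &lt; b and <b>bold</b> <s>gone</s> <i>slanted</i>
```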
@@ -1,25 +1,19 @@
1
+ """Post-processing helpers for Telegram Markdown conversion."""
2
+
3
+
1
4
  def remove_blockquote_escaping(output: str) -> str:
2
- """
3
- Removes the escaping from blockquote tags, including expandable blockquotes.
4
- """
5
- # Regular blockquotes
5
+ """Unescape blockquote tags produced during formatting."""
6
6
  output = output.replace("&lt;blockquote&gt;", "<blockquote>").replace(
7
7
  "&lt;/blockquote&gt;", "</blockquote>"
8
8
  )
9
-
10
- # Expandable blockquotes
11
9
  output = output.replace(
12
10
  "&lt;blockquote expandable&gt;", "<blockquote expandable>"
13
11
  ).replace("&lt;/blockquote&gt;", "</blockquote>")
14
-
15
12
  return output
16
13
 
17
14
 
18
15
  def remove_spoiler_escaping(output: str) -> str:
19
- """
20
- Ensures spoiler tags are correctly formatted (rather than being escaped).
21
- """
22
- # Fix any incorrectly escaped spoiler tags
16
+ """Ensure spoiler spans remain HTML tags, not escaped text."""
23
17
  output = output.replace(
24
18
  '&lt;span class="tg-spoiler"&gt;', '<span class="tg-spoiler">'
25
19
  )
@@ -0,0 +1,39 @@
1
+ """Pre-processing helpers for Telegram Markdown conversion."""
2
+
3
+
4
+ def combine_blockquotes(text: str) -> str:
5
+ """Collapse consecutive Markdown blockquote lines into Telegram HTML blocks."""
6
+ lines = text.split("\n")
7
+ combined_lines = []
8
+ blockquote_lines = []
9
+ in_blockquote = False
10
+ is_expandable = False
11
+
12
+ for line in lines:
13
+ if line.startswith("**>"):
14
+ in_blockquote = True
15
+ is_expandable = True
16
+ blockquote_lines.append(line[3:].strip())
17
+ elif line.startswith(">"):
18
+ if not in_blockquote:
19
+ in_blockquote = True
20
+ is_expandable = False
21
+ blockquote_lines.append(line[1:].strip())
22
+ else:
23
+ if in_blockquote:
24
+ combined_lines.append(_render_blockquote(blockquote_lines, is_expandable))
25
+ blockquote_lines = []
26
+ in_blockquote = False
27
+ is_expandable = False
28
+ combined_lines.append(line)
29
+
30
+ if in_blockquote:
31
+ combined_lines.append(_render_blockquote(blockquote_lines, is_expandable))
32
+
33
+ return "\n".join(combined_lines)
34
+
35
+
36
+ def _render_blockquote(lines: list[str], expandable: bool) -> str:
37
+ if expandable:
38
+ return "<blockquote expandable>" + "\n".join(lines) + "</blockquote>"
39
+ return "<blockquote>" + "\n".join(lines) + "</blockquote>"
@@ -0,0 +1,55 @@
1
+ """High-level Telegram Markdown → HTML renderer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
8
+ from .inline import (apply_custom_italic, convert_html_chars,
9
+ extract_inline_code_snippets, split_by_tag)
10
+ from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
11
+ from .preprocess import combine_blockquotes
12
+
13
+
14
+ def telegram_format(text: str) -> str:
15
+ text = combine_blockquotes(text)
16
+
17
+ output, block_map = extract_and_convert_code_blocks(text)
18
+ output, inline_snippets = extract_inline_code_snippets(output)
19
+
20
+ output = convert_html_chars(output)
21
+
22
+ output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
23
+ output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
24
+
25
+ output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
26
+ output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
27
+
28
+ output = split_by_tag(output, "**", "b")
29
+ output = split_by_tag(output, "__", "u")
30
+ output = split_by_tag(output, "~~", "s")
31
+ output = split_by_tag(output, "||", 'span class="tg-spoiler"')
32
+
33
+ output = apply_custom_italic(output)
34
+ output = split_by_tag(output, "_", "i")
35
+
36
+ output = re.sub(r"【[^】]+】", "", output)
37
+
38
+ link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
39
+ output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
40
+
41
+ for placeholder, snippet in inline_snippets.items():
42
+ escaped = (
43
+ snippet.replace("&", "&amp;")
44
+ .replace("<", "&lt;")
45
+ .replace(">", "&gt;")
46
+ )
47
+ output = output.replace(placeholder, f"<code>{escaped}</code>")
48
+
49
+ output = reinsert_code_blocks(output, block_map)
50
+ output = remove_blockquote_escaping(output)
51
+ output = remove_spoiler_escaping(output)
52
+
53
+ output = re.sub(r"\n{3,}", "\n\n", output)
54
+
55
+ return output.strip()
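End to end, the rewritten renderer above performs the same steps as the 0.3.7 implementation it replaces; a hedged sketch with an illustrative input:

```python
from chatgpt_md_converter import telegram_format

md = "# Title\n\nSome **bold**, `inline code` and a [link](https://example.com)."
print(telegram_format(md))
# <b>Title</b>
#
# Some <b>bold</b>, <code>inline code</code> and a <a href="https://example.com">link</a>.
```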
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.7
3
+ Version: 0.3.9
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -114,6 +114,24 @@ Hidden by default
114
114
  Multiple lines</blockquote>
115
115
  ```
116
116
 
117
+
118
+ ## Performance
119
+
120
+ Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
121
+
122
+ | Sample | Direction | Avg ms/call | Ops/sec |
123
+ |--------------|---------------|-------------|---------|
124
+ | short_inline | Markdown→HTML | 0.043 | 23,476 |
125
+ | short_inline | HTML→Markdown | 0.078 | 12,824 |
126
+ | medium_block | Markdown→HTML | 0.108 | 9,270 |
127
+ | medium_block | HTML→Markdown | 0.155 | 6,437 |
128
+ | long_mixed | Markdown→HTML | 0.446 | 2,242 |
129
+ | long_mixed | HTML→Markdown | 0.730 | 1,370 |
130
+
131
+ These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
132
+
133
+ Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
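As a rough local cross-check (separate from the bundled `scripts/benchmark.py`), a hypothetical `timeit` sketch; the sample text and iteration count are illustrative.

```python
import timeit

from chatgpt_md_converter import html_to_telegram_markdown, telegram_format

md_sample = "Some **bold**, _italic_ and `code`. " * 20
html_sample = telegram_format(md_sample)

for name, fn, sample in (
    ("Markdown→HTML", telegram_format, md_sample),
    ("HTML→Markdown", html_to_telegram_markdown, html_sample),
):
    total = timeit.timeit(lambda: fn(sample), number=1000)
    print(f"{name}: {total / 1000 * 1e3:.3f} ms/call ({1000 / total:.0f} ops/sec)")
```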
134
+
117
135
  ## Requirements
118
136
 
119
137
  - Python 3.x
@@ -0,0 +1,20 @@
1
+ chatgpt_md_converter/__init__.py,sha256=6ts2hnimdBn_qCA15LKuipUjSU9ZCqRk1GbDPc_JjO4,242
2
+ chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
3
+ chatgpt_md_converter/html_to_markdown.py,sha256=XlLpQD7W_AooWrvTtvrGVwfPPa80tDKWuT1iT6Vzygw,174
4
+ chatgpt_md_converter/telegram_formatter.py,sha256=w3tjoSdRH_UdoFmGeXe7I47dhDIceXuGOA1oCLMnUmM,87
5
+ chatgpt_md_converter/html_markdown/escaping.py,sha256=wJA4vUJQVcxpkJ4sCIYIWKaqffb_O72R93H81hTgTxA,1808
6
+ chatgpt_md_converter/html_markdown/handlers.py,sha256=dJw-IuvFG7eeTVclx9QOS2NEvqlF2K7i3MJ_llt1YYc,4939
7
+ chatgpt_md_converter/html_markdown/renderer.py,sha256=en-fAr3Bhmm4ZndDaPKV8nLVQ_7HpS_NFBSWcrQporY,438
8
+ chatgpt_md_converter/html_markdown/state.py,sha256=sxbz0ucCakI0KgR86EMZx0nvfU1oiqgVUofujFTeKoo,432
9
+ chatgpt_md_converter/html_markdown/tree.py,sha256=ryohrhO2X5QepZev3087qPoGmMznqHDwH00TNGoW6a4,2154
10
+ chatgpt_md_converter/telegram_markdown/__init__.py,sha256=C0Oexz9brpdE-TqEpiAUV78TsZdSrnnH_5yYpEJ03Us,131
11
+ chatgpt_md_converter/telegram_markdown/code_blocks.py,sha256=gQCGqZTtUusK_I6KOGqMGTd-z3TkUZSo4kMrA5g_l04,3065
12
+ chatgpt_md_converter/telegram_markdown/inline.py,sha256=Phe4T5tu7Y7drH17YW-iOVEqGMRNGe1zVxAbd192HDY,2205
13
+ chatgpt_md_converter/telegram_markdown/postprocess.py,sha256=jUf01tAIqHQ1NxNlVGsvU-Yw8SDOHtMoS7MUzaQLf_8,775
14
+ chatgpt_md_converter/telegram_markdown/preprocess.py,sha256=c9Wzs7DUumXgrgndCeHbCfV1qLzXVJlLHOtXC3Ne2Nk,1362
15
+ chatgpt_md_converter/telegram_markdown/renderer.py,sha256=ZX0reJLVC_2Fvw26dnSSpK_xr_Kpfp9oTyQw57FCqu0,1957
16
+ chatgpt_md_converter-0.3.9.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
17
+ chatgpt_md_converter-0.3.9.dist-info/METADATA,sha256=P5508ZIm4iTdBtFpA03j5CDy3E7iXEVnx2tkWSZMGbc,6604
18
+ chatgpt_md_converter-0.3.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ chatgpt_md_converter-0.3.9.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
20
+ chatgpt_md_converter-0.3.9.dist-info/RECORD,,
@@ -1,27 +0,0 @@
1
- import re
2
-
3
-
4
- def convert_html_chars(text: str) -> str:
5
- """
6
- Converts HTML reserved symbols to their respective character references.
7
- """
8
- text = text.replace("&", "&amp;")
9
- text = text.replace("<", "&lt;")
10
- text = text.replace(">", "&gt;")
11
- return text
12
-
13
-
14
- def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
15
- """
16
- Splits the text by markdown tag and replaces it with the specified HTML tag.
17
- """
18
- tag_pattern = re.compile(
19
- r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
20
- re.DOTALL,
21
- )
22
-
23
- # Special handling for the tg-spoiler tag
24
- if html_tag == 'span class="tg-spoiler"':
25
- return tag_pattern.sub(r'<span class="tg-spoiler">\1</span>', out_text)
26
-
27
- return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)
@@ -1,94 +0,0 @@
1
- import re
2
-
3
-
4
- def ensure_closing_delimiters(text: str) -> str:
5
- """Append missing closing backtick delimiters."""
6
-
7
- code_block_re = re.compile(
8
- r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
9
- flags=re.DOTALL,
10
- )
11
-
12
- # Remove complete code blocks from consideration so inner backticks
13
- # don't affect delimiter balancing.
14
- cleaned = code_block_re.sub("", text)
15
-
16
- # Detect unclosed fences by tracking opening fence lengths.
17
- stack = []
18
- for line in cleaned.splitlines():
19
- m = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", line.strip())
20
- if not m:
21
- continue
22
- fence = m.group("fence")
23
- if stack and fence == stack[-1]:
24
- stack.pop()
25
- else:
26
- stack.append(fence)
27
-
28
- if stack:
29
- text += "\n" + stack[-1]
30
-
31
- cleaned_inline = code_block_re.sub("", text)
32
-
33
- # Balance triple backticks that are not part of a complete fence.
34
- if cleaned_inline.count("```") % 2 != 0:
35
- text += "```"
36
-
37
- # Balance single backticks outside fenced blocks.
38
- cleaned_inline = code_block_re.sub("", text)
39
- if cleaned_inline.count("`") % 2 != 0:
40
- text += "`"
41
-
42
- return text
43
-
44
-
45
- def extract_and_convert_code_blocks(text: str):
46
- """
47
- Extracts code blocks from the text, converting them to HTML <pre><code> format,
48
- and replaces them with placeholders. Also ensures closing delimiters for unmatched blocks.
49
- """
50
- text = ensure_closing_delimiters(text)
51
- placeholders = []
52
- code_blocks = {}
53
-
54
- def replacer(match):
55
- language = match.group("lang") if match.group("lang") else ""
56
- code_content = match.group("code")
57
-
58
- # Properly escape HTML entities in code content
59
- escaped_content = (
60
- code_content.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
61
- )
62
-
63
- placeholder = f"CODEBLOCKPLACEHOLDER{len(placeholders)}"
64
- placeholders.append(placeholder)
65
- if not language:
66
- html_code_block = f"<pre><code>{escaped_content}</code></pre>"
67
- else:
68
- html_code_block = (
69
- f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
70
- )
71
- return (placeholder, html_code_block)
72
-
73
- modified_text = text
74
- code_block_pattern = re.compile(
75
- r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
76
- flags=re.DOTALL,
77
- )
78
- for match in code_block_pattern.finditer(text):
79
- placeholder, html_code_block = replacer(
80
- match
81
- )
82
- code_blocks[placeholder] = html_code_block
83
- modified_text = modified_text.replace(match.group(0), placeholder, 1)
84
-
85
- return modified_text, code_blocks
86
-
87
-
88
- def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
89
- """
90
- Reinserts HTML code blocks into the text, replacing their placeholders.
91
- """
92
- for placeholder, html_code_block in code_blocks.items():
93
- text = text.replace(placeholder, html_code_block, 1)
94
- return text
@@ -1,68 +0,0 @@
1
- def combine_blockquotes(text: str) -> str:
2
- """
3
- Combines multiline blockquotes into a single blockquote while keeping the \n characters.
4
- Supports both regular blockquotes (>) and expandable blockquotes (**>).
5
- """
6
- lines = text.split("\n")
7
- combined_lines = []
8
- blockquote_lines = []
9
- in_blockquote = False
10
- is_expandable = False
11
-
12
- for line in lines:
13
- if line.startswith("**>"):
14
- # Expandable blockquote
15
- in_blockquote = True
16
- is_expandable = True
17
- blockquote_lines.append(line[3:].strip())
18
- elif line.startswith(">"):
19
- # Regular blockquote
20
- if not in_blockquote:
21
- # This is a new blockquote
22
- in_blockquote = True
23
- is_expandable = False
24
- blockquote_lines.append(line[1:].strip())
25
- else:
26
- if in_blockquote:
27
- # End of blockquote, combine the lines
28
- if is_expandable:
29
- combined_lines.append(
30
- "<blockquote expandable>"
31
- + "\n".join(blockquote_lines)
32
- + "</blockquote>"
33
- )
34
- else:
35
- combined_lines.append(
36
- "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
37
- )
38
- blockquote_lines = []
39
- in_blockquote = False
40
- is_expandable = False
41
- combined_lines.append(line)
42
-
43
- if in_blockquote:
44
- # Handle the case where the file ends with a blockquote
45
- if is_expandable:
46
- combined_lines.append(
47
- "<blockquote expandable>"
48
- + "\n".join(blockquote_lines)
49
- + "</blockquote>"
50
- )
51
- else:
52
- combined_lines.append(
53
- "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
54
- )
55
-
56
- return "\n".join(combined_lines)
57
-
58
-
59
- def fix_asterisk_equations(text: str) -> str:
60
- """
61
- Replaces numeric expressions with '*' in them with '×'
62
- to avoid accidental italic formatting.
63
- e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
64
- """
65
- import re
66
-
67
- eq_pattern = re.compile(r"(\d+)\s*\*\s*(\d+)")
68
- return eq_pattern.sub(r"\1×\2", text)
@@ -1,12 +0,0 @@
1
- chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
- chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
- chatgpt_md_converter/extractors.py,sha256=uThH9vnjlEwZowCbxvcZreMZUPqUEiuq0nbWva3K-CE,3023
4
- chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
- chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
- chatgpt_md_converter/html_splitter.py,sha256=8ao4QU5PFDFCHMg8pj5kBqmxSOUO6RfzqQfk4o1F8ms,3897
7
- chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
8
- chatgpt_md_converter-0.3.7.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
9
- chatgpt_md_converter-0.3.7.dist-info/METADATA,sha256=4gweCWqlv3a6pR6FJbf-ycCEToIjCRf2Ohnk5p81bwQ,5792
10
- chatgpt_md_converter-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
- chatgpt_md_converter-0.3.7.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
12
- chatgpt_md_converter-0.3.7.dist-info/RECORD,,