chatgpt-md-converter 0.3.8__tar.gz → 0.3.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/PKG-INFO +19 -1
  2. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/README.md +18 -0
  3. chatgpt_md_converter-0.3.10/chatgpt_md_converter/__init__.py +5 -0
  4. chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/escaping.py +68 -0
  5. chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/handlers.py +216 -0
  6. chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/renderer.py +16 -0
  7. chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/state.py +16 -0
  8. chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/tree.py +65 -0
  9. chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_to_markdown.py +5 -0
  10. chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_formatter.py +3 -0
  11. chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/__init__.py +5 -0
  12. chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/code_blocks.py +95 -0
  13. chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/inline.py +73 -0
  14. chatgpt_md_converter-0.3.8/chatgpt_md_converter/helpers.py → chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/postprocess.py +5 -11
  15. chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/preprocess.py +39 -0
  16. chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/renderer.py +55 -0
  17. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter.egg-info/PKG-INFO +19 -1
  18. chatgpt_md_converter-0.3.10/chatgpt_md_converter.egg-info/SOURCES.txt +26 -0
  19. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/setup.py +1 -1
  20. chatgpt_md_converter-0.3.10/tests/test_html_to_markdown_inline_spacing.py +25 -0
  21. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/tests/test_parser.py +12 -4
  22. chatgpt_md_converter-0.3.10/tests/test_roundtrip_markdown.py +32 -0
  23. chatgpt_md_converter-0.3.8/chatgpt_md_converter/__init__.py +0 -4
  24. chatgpt_md_converter-0.3.8/chatgpt_md_converter/converters.py +0 -27
  25. chatgpt_md_converter-0.3.8/chatgpt_md_converter/extractors.py +0 -95
  26. chatgpt_md_converter-0.3.8/chatgpt_md_converter/formatters.py +0 -68
  27. chatgpt_md_converter-0.3.8/chatgpt_md_converter/telegram_formatter.py +0 -99
  28. chatgpt_md_converter-0.3.8/chatgpt_md_converter.egg-info/SOURCES.txt +0 -16
  29. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/LICENSE +0 -0
  30. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter/html_splitter.py +0 -0
  31. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
  32. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
  33. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/setup.cfg +0 -0
  34. {chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/tests/test_splitter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.8
3
+ Version: 0.3.10
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -114,6 +114,24 @@ Hidden by default
114
114
  Multiple lines</blockquote>
115
115
  ```
116
116
 
117
+
118
+ ## Performance
119
+
120
+ Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
121
+
122
+ | Sample | Direction | Avg ms/call | Ops/sec |
123
+ |--------------|---------------|-------------|---------|
124
+ | short_inline | Markdown→HTML | 0.043 | 23,476 |
125
+ | short_inline | HTML→Markdown | 0.078 | 12,824 |
126
+ | medium_block | Markdown→HTML | 0.108 | 9,270 |
127
+ | medium_block | HTML→Markdown | 0.155 | 6,437 |
128
+ | long_mixed | Markdown→HTML | 0.446 | 2,242 |
129
+ | long_mixed | HTML→Markdown | 0.730 | 1,370 |
130
+
131
+ These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
132
+
133
+ Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
134
+
117
135
  ## Requirements
118
136
 
119
137
  - Python 3.x
@@ -91,6 +91,24 @@ Hidden by default
91
91
  Multiple lines</blockquote>
92
92
  ```
93
93
 
94
+
95
+ ## Performance
96
+
97
+ Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
98
+
99
+ | Sample | Direction | Avg ms/call | Ops/sec |
100
+ |--------------|---------------|-------------|---------|
101
+ | short_inline | Markdown→HTML | 0.043 | 23,476 |
102
+ | short_inline | HTML→Markdown | 0.078 | 12,824 |
103
+ | medium_block | Markdown→HTML | 0.108 | 9,270 |
104
+ | medium_block | HTML→Markdown | 0.155 | 6,437 |
105
+ | long_mixed | Markdown→HTML | 0.446 | 2,242 |
106
+ | long_mixed | HTML→Markdown | 0.730 | 1,370 |
107
+
108
+ These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
109
+
110
+ Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
111
+
94
112
  ## Requirements
95
113
 
96
114
  - Python 3.x
@@ -0,0 +1,5 @@
1
+ from .html_splitter import split_html_for_telegram
2
+ from .html_to_markdown import html_to_telegram_markdown
3
+ from .telegram_formatter import telegram_format
4
+
5
+ __all__ = ["telegram_format", "split_html_for_telegram", "html_to_telegram_markdown"]
@@ -0,0 +1,68 @@
1
+ """Shared escaping utilities for Telegram Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import html
6
+ import re
7
+
8
+ from .tree import Node
9
+
10
_SIMPLE_STAR_ITALIC = re.compile(
    r"(?<!\\)(?<!\*)\*(?=[^\s])([^\*\n]+?)(?<!\s)\*(?![A-Za-z0-9\*])",
)


def _canonicalize_star_italics(text: str) -> str:
    """Rewrite simple ``*italic*`` spans as ``_italic_``.

    A span is left untouched when its inner text contains ``*``, ``_`` or a
    backtick, since swapping the delimiter there could change its meaning.
    """

    def _swap(found: re.Match[str]) -> str:
        body = found.group(1)
        if any(marker in body for marker in "*_`"):
            # Keep the original text verbatim for anything non-trivial.
            return found.group(0)
        return "_" + body + "_"

    return _SIMPLE_STAR_ITALIC.sub(_swap, text)
23
+
24
+
25
def normalise_text(text: str) -> str:
    """Decode HTML entities in *text* and turn non-breaking spaces into spaces.

    Falsy input (empty string) yields an empty string.
    """
    return html.unescape(text).replace("\u00a0", " ") if text else ""
30
+
31
+
32
def collect_text(node: Node) -> str:
    """Return the entity-decoded text content of *node* and its descendants.

    Text nodes contribute their unescaped text, ``br`` elements contribute a
    newline, other elements are flattened recursively. Children whose kind is
    neither ``"text"`` nor ``"element"`` are ignored.
    """
    if node.kind == "text":
        return html.unescape(node.text)

    def _pieces():
        for child in node.children:
            if child.kind == "element" and child.tag.lower() == "br":
                yield "\n"
            elif child.kind in ("text", "element"):
                # A text child takes the early-return path of the recursion.
                yield collect_text(child)

    return "".join(_pieces())
45
+
46
+
47
def escape_inline_code(text: str) -> str:
    """Prefix every backtick in *text* with a backslash."""
    return "\\`".join(text.split("`"))
49
+
50
+
51
def escape_link_label(label: str) -> str:
    """Backslash-escape square brackets and parentheses in a link label."""
    return "".join("\\" + ch if ch in "[]()" else ch for ch in label)
56
+
57
+
58
def escape_link_url(url: str) -> str:
    """Escape backslashes and closing parentheses in a link target."""
    # One pass over the characters; equivalent to doubling backslashes first
    # and then escaping ")" because neither replacement re-triggers the other.
    return url.translate(str.maketrans({"\\": "\\\\", ")": "\\)"}))
60
+
61
+
62
def post_process(markdown: str) -> str:
    """Tidy rendered Markdown before it is returned to the caller.

    Steps, in order:

    1. Turn leading ``•`` bullets into ``- `` list markers.
    2. Drop carriage returns and trailing whitespace on every line.
    3. Collapse runs of three or more newlines into a single blank line.
    4. Canonicalise simple ``*italic*`` spans to ``_italic_``.
    5. Strip leading and trailing whitespace from the whole document.
    """
    text = re.sub(r"(^|\n)•\s", r"\1- ", markdown)
    # Normalise line endings and trailing whitespace *before* collapsing blank
    # lines: "\r\n\r\n\r\n" or whitespace-only lines would otherwise hide a
    # run of empty lines from the collapse and survive into the output.
    text = text.replace("\r", "")
    text = "\n".join(line.rstrip() for line in text.split("\n"))
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = _canonicalize_star_italics(text)
    return text.strip()
@@ -0,0 +1,216 @@
1
+ """Tag-specific renderers for Telegram Markdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, Dict
6
+
7
+ from .escaping import (collect_text, escape_inline_code, escape_link_label,
8
+ escape_link_url, normalise_text)
9
+ from .state import RenderState
10
+ from .tree import Node
11
+
12
+ InlineHandler = Callable[[Node, RenderState], str]
13
+
14
+
15
+ _INLINE_MARKERS: Dict[str, tuple[str, str]] = {
16
+ "u": ("__", "__"),
17
+ "ins": ("__", "__"),
18
+ "s": ("~~", "~~"),
19
+ "strike": ("~~", "~~"),
20
+ "del": ("~~", "~~"),
21
+ }
22
+
23
+
24
def render_nodes(nodes: list[Node], state: RenderState) -> str:
    """Render each node in order and concatenate the results."""
    pieces = [render_node(item, state) for item in nodes]
    return "".join(pieces)
26
+
27
+
28
def render_node(node: Node, state: RenderState) -> str:
    """Render a single node.

    Text nodes are normalised; elements are routed through ``TAG_DISPATCH``
    by lower-cased tag name, and tags without a handler render as their
    children only.
    """
    if node.kind != "text":
        handler = TAG_DISPATCH.get(node.tag.lower())
        if not handler:
            return render_nodes(node.children, state)
        return handler(node, state)
    return normalise_text(node.text)
36
+
37
+
38
def _split_surrounding_whitespace(text: str) -> tuple[str, str, str]:
    """Split *text* into ``(leading whitespace, core, trailing whitespace)``.

    When *text* is entirely whitespace, all of it is returned as the leading
    part and the other two parts are empty.
    """
    without_lead = text.lstrip()
    lead = text[: len(text) - len(without_lead)]
    core = without_lead.rstrip()
    trail = without_lead[len(core):]
    return lead, core, trail
51
+
52
+
53
def _italic_boundary_conflict(marker: str, core: str) -> bool:
    """Tell whether wrapping *core* in *marker* would collide at an edge.

    For ``*`` any leading or trailing star conflicts. For ``_`` only a lone
    underscore at an edge conflicts; a doubled ``__`` at that edge does not.
    Any other marker never conflicts.
    """
    if marker == "*":
        return core[:1] == "*" or core[-1:] == "*"
    if marker != "_":
        return False

    lone_start = core.startswith("_") and not core.startswith("__")
    lone_end = core.endswith("_") and not core.endswith("__")
    return lone_start or lone_end
69
+
70
+
71
def _choose_italic_marker(state: RenderState, core: str) -> str:
    """Pick ``*`` or ``_`` as the italic delimiter for *core*.

    Underscore is preferred for the first italic level inside bold and for
    odd italic nesting depths; star is preferred otherwise. The preferred
    marker is used unless it conflicts with the edges of *core*, in which
    case the other one is tried; if both conflict the preferred one wins.
    """
    depth = state.italic_depth
    underscore_first = (state.bold_depth > 0 and depth == 0) or depth % 2 != 0
    order = ("_", "*") if underscore_first else ("*", "_")
    return next(
        (mark for mark in order if not _italic_boundary_conflict(mark, core)),
        order[0],
    )
86
+
87
+
88
def _handle_bold(node: Node, state: RenderState) -> str:
    """Render a bold element as ``**text**``.

    Surrounding whitespace is moved outside the markers; content that is
    empty or whitespace-only is emitted without markers.
    """
    nested = state.child(bold_depth=state.bold_depth + 1)
    before, body, after = _split_surrounding_whitespace(
        render_nodes(node.children, nested)
    )
    if body:
        return "".join((before, "**", body, "**", after))
    return before + after
95
+
96
+
97
def _handle_italic(node: Node, state: RenderState) -> str:
    """Render an italic element with whichever of ``*`` / ``_`` fits.

    Children are rendered one italic level deeper, but the marker is chosen
    from the state of the enclosing context. Surrounding whitespace is kept
    outside the markers; empty content gets no markers.
    """
    nested = state.child(italic_depth=state.italic_depth + 1)
    before, body, after = _split_surrounding_whitespace(
        render_nodes(node.children, nested)
    )
    if not body:
        return before + after
    mark = _choose_italic_marker(state, body)
    return "".join((before, mark, body, mark, after))
106
+
107
+
108
def _handle_inline_marker(node: Node, state: RenderState) -> str:
    """Render a tag listed in ``_INLINE_MARKERS`` using its delimiter pair.

    Surrounding whitespace is kept outside the delimiters; empty content is
    emitted without delimiters.
    """
    opener, closer = _INLINE_MARKERS[node.tag.lower()]
    before, body, after = _split_surrounding_whitespace(
        render_nodes(node.children, state)
    )
    if body:
        return "".join((before, opener, body, closer, after))
    return before + after
115
+
116
+
117
def _handle_spoiler(node: Node, state: RenderState) -> str:
    """Render spoiler content as ``||text||``, whitespace kept outside."""
    before, body, after = _split_surrounding_whitespace(
        render_nodes(node.children, state)
    )
    if body:
        return "".join((before, "||", body, "||", after))
    return before + after
123
+
124
+
125
def _handle_code(node: Node, state: RenderState) -> str:
    """Render inline code: plain text content, backticks escaped, in backticks."""
    return "`" + escape_inline_code(collect_text(node)) + "`"
128
+
129
+
130
def _handle_pre(node: Node, state: RenderState) -> str:
    """Render a ``<pre>`` element as a fenced code block.

    When the only child is a ``<code>`` element, its text is used and the
    first ``language-xxx`` class (if any) becomes the fence language.
    Otherwise the text of all children is used without a language. A newline
    follows the opening fence when a language is present or the content
    spans several lines.
    """
    children = node.children
    language: str | None = None
    sole = children[0] if len(children) == 1 else None

    if sole is not None and sole.kind == "element" and sole.tag.lower() == "code":
        content_node = sole
        language = next(
            (
                token.split("-", 1)[1]
                for token in (sole.attrs.get("class") or "").split()
                if token.startswith("language-")
            ),
            None,
        )
    else:
        # Wrap loose children so collect_text can flatten them uniformly.
        content_node = Node(kind="element", tag="__virtual__", children=children)

    inner_text = collect_text(content_node)
    opening = f"```{language}" if language else "```"
    separator = "\n" if language or "\n" in inner_text else ""
    return f"{opening}{separator}{inner_text}```"
150
+
151
+
152
def _handle_link(node: Node, state: RenderState) -> str:
    """Render an anchor as ``[label](url)``.

    An empty label falls back to the URL itself. Targets starting with
    ``tg://emoji?`` are emitted in image form, ``![label](url)``.
    """
    href = node.attrs.get("href", "") or ""
    label = render_nodes(node.children, state) or href
    bang = "!" if href.startswith("tg://emoji?") else ""
    return f"{bang}[{escape_link_label(label)}]({escape_link_url(href)})"
164
+
165
+
166
def _handle_blockquote(node: Node, state: RenderState) -> str:
    """Render a blockquote by prefixing each line of its content.

    Regular quotes use ``> line`` (a bare ``>`` for empty lines). Quotes with
    an ``expandable`` attribute use ``**>`` on the first line and ``>`` on
    the rest, with no space after the prefix.
    """
    content = render_nodes(node.children, state)
    body_lines = [line.rstrip("\r") for line in content.split("\n")]

    if "expandable" in node.attrs:
        quoted = [
            ("**>" if position == 0 else ">") + line
            for position, line in enumerate(body_lines)
        ]
    else:
        quoted = [f"> {line}" if line else ">" for line in body_lines]
    return "\n".join(quoted)
179
+
180
+
181
def _handle_tg_emoji(node: Node, state: RenderState) -> str:
    """Render ``<tg-emoji>`` as ``![label](tg://emoji?id=...)``.

    Without an ``emoji-id`` attribute only the rendered label is returned.
    """
    label = render_nodes(node.children, state)
    emoji_id = node.attrs.get("emoji-id")
    if not emoji_id:
        return label
    return f"![{escape_link_label(label)}](tg://emoji?id={emoji_id})"
188
+
189
+
190
def _handle_span(node: Node, state: RenderState) -> str:
    """Render a ``<span>``: spoiler markup for ``tg-spoiler``, else children only."""
    class_names = (node.attrs.get("class") or "").split()
    if "tg-spoiler" in class_names:
        return _handle_spoiler(node, state)
    # Every other class, including "tg-emoji", is transparent.
    return render_nodes(node.children, state)
197
+
198
+
199
# Lower-cased tag name -> handler used by render_node.
TAG_DISPATCH: Dict[str, InlineHandler] = {
    **dict.fromkeys(("b", "strong"), _handle_bold),
    **dict.fromkeys(("i", "em"), _handle_italic),
    # u, ins, s, strike, del: every tag that has a delimiter pair.
    **dict.fromkeys(_INLINE_MARKERS, _handle_inline_marker),
    "span": _handle_span,
    "tg-spoiler": _handle_spoiler,
    "code": _handle_code,
    "pre": _handle_pre,
    "a": _handle_link,
    "blockquote": _handle_blockquote,
    "tg-emoji": _handle_tg_emoji,
}
@@ -0,0 +1,16 @@
1
+ """High-level HTML → Telegram Markdown renderer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List
6
+
7
+ from .escaping import post_process
8
+ from .handlers import render_nodes
9
+ from .state import RenderState
10
+ from .tree import Node, build_tree
11
+
12
+
13
def html_to_telegram_markdown(html_text: str) -> str:
    """Convert an HTML fragment to Telegram-flavoured Markdown.

    The fragment is parsed into a node tree, rendered from a fresh
    ``RenderState`` and then passed through ``post_process``.
    """
    return post_process(render_nodes(build_tree(html_text), RenderState()))
@@ -0,0 +1,16 @@
1
+ """Rendering state for HTML → Telegram Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass(frozen=True)
class RenderState:
    """Immutable nesting counters carried through rendering."""

    # Number of enclosing bold elements.
    bold_depth: int = 0
    # Number of enclosing italic elements.
    italic_depth: int = 0

    def child(self, **updates: int) -> "RenderState":
        """Return a new state with the fields named in *updates* replaced."""
        merged = {
            "bold_depth": self.bold_depth,
            "italic_depth": self.italic_depth,
            **updates,
        }
        return RenderState(**merged)
@@ -0,0 +1,65 @@
1
+ """DOM-like tree construction for Telegram HTML fragments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from html.parser import HTMLParser
7
+ from typing import Dict, List, Optional
8
+
9
+
10
@dataclass
class Node:
    """One parsed item: either a run of text or an element with children."""

    kind: str  # "text" or "element"
    text: str = ""
    tag: str = ""
    attrs: Dict[str, Optional[str]] = field(default_factory=dict)
    children: List["Node"] = field(default_factory=list)


class _HTMLTreeBuilder(HTMLParser):
    """HTMLParser subclass that assembles ``Node`` objects under ``root``."""

    SELF_CLOSING_TAGS = {"br"}

    def __init__(self) -> None:
        # Character references are kept raw here; they arrive through the
        # entityref/charref callbacks below and are stored as literal text.
        super().__init__(convert_charrefs=False)
        self.root = Node(kind="element", tag="__root__")
        self._stack: List[Node] = [self.root]

    def _attach(self, node: Node) -> None:
        """Append *node* to the element currently being filled."""
        self._stack[-1].children.append(node)

    def handle_starttag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
        if tag not in self.SELF_CLOSING_TAGS:
            element = Node(kind="element", tag=tag, attrs=dict(attrs))
            self._attach(element)
            self._stack.append(element)
        elif tag == "br":
            # A line break is stored as a newline text node, not an element.
            self._attach(Node(kind="text", text="\n"))

    def handle_endtag(self, tag: str) -> None:
        # Close the innermost open element with this tag (and anything opened
        # inside it). The root at index 0 is never closed; unmatched end tags
        # are ignored.
        depth = len(self._stack) - 1
        while depth > 0:
            if self._stack[depth].tag == tag:
                del self._stack[depth:]
                return
            depth -= 1

    def handle_startendtag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
        if tag in self.SELF_CLOSING_TAGS:
            self.handle_starttag(tag, attrs)
        else:
            # Self-closed element: attach it without opening a new scope.
            self._attach(Node(kind="element", tag=tag, attrs=dict(attrs)))

    def handle_data(self, data: str) -> None:
        if data:
            self._attach(Node(kind="text", text=data))

    def handle_entityref(self, name: str) -> None:
        self.handle_data(f"&{name};")

    def handle_charref(self, name: str) -> None:
        self.handle_data(f"&#{name};")


def build_tree(html_text: str) -> List[Node]:
    """Parse HTML and return the list of top-level nodes."""
    parser = _HTMLTreeBuilder()
    parser.feed(html_text)
    parser.close()
    return parser.root.children
@@ -0,0 +1,5 @@
1
+ """Backward-compatible entry point for HTML → Telegram Markdown."""
2
+
3
+ from .html_markdown.renderer import html_to_telegram_markdown
4
+
5
+ __all__ = ["html_to_telegram_markdown"]
@@ -0,0 +1,3 @@
1
+ from .telegram_markdown.renderer import telegram_format
2
+
3
+ __all__ = ['telegram_format']
@@ -0,0 +1,5 @@
1
+ """Modular Telegram Markdown → HTML conversion helpers."""
2
+
3
+ from .renderer import telegram_format
4
+
5
+ __all__ = ["telegram_format"]
@@ -0,0 +1,95 @@
1
+ """Code block extraction utilities for Telegram Markdown conversion."""
2
+
3
+ import re
4
+
5
+ _CODE_BLOCK_RE = re.compile(
6
+ r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
7
+ flags=re.DOTALL,
8
+ )
9
+
10
+
11
+
12
def _count_unescaped_backticks(text: str) -> int:
    """Count backticks preceded by an even number of backslashes (zero included)."""
    total = 0
    pending_backslashes = 0  # length of the backslash run ending just before the current char
    for char in text:
        if char == "\\":
            pending_backslashes += 1
            continue
        if char == "`" and pending_backslashes % 2 == 0:
            total += 1
        pending_backslashes = 0
    return total
26
+
27
def ensure_closing_delimiters(text: str) -> str:
    """Append any missing closing backtick fences for Markdown code blocks.

    Three checks run in sequence, each on the text as left by the previous:

    1. A fence opened on its own line (optionally with a language) that is
       never closed gets a matching fence on a new line.
    2. With complete fenced blocks removed, an odd number of triple
       backticks gets one more appended.
    3. With complete fenced blocks removed, an odd number of unescaped
       backticks gets a single one appended.
    """
    pending_fence = None
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if pending_fence is not None:
            # Inside a block: only a line ending with the same fence closes it.
            if candidate.endswith(pending_fence):
                pending_fence = None
            continue
        opener = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", candidate)
        if opener:
            pending_fence = opener.group("fence")

    if pending_fence is not None:
        text = (text if text.endswith("\n") else text + "\n") + pending_fence

    if _CODE_BLOCK_RE.sub("", text).count("```") % 2:
        text += "```"

    if _count_unescaped_backticks(_CODE_BLOCK_RE.sub("", text)) % 2:
        text += "`"

    return text
54
+
55
+
56
def extract_and_convert_code_blocks(text: str):
    """Replace fenced code blocks with placeholders and return HTML renderings.

    Returns ``(modified_text, code_blocks)`` where ``code_blocks`` maps each
    ``CODEBLOCKPLACEHOLDER<n>`` token to its ``<pre><code>`` HTML. Missing
    closing fences are repaired first.
    """
    text = ensure_closing_delimiters(text)
    code_blocks: dict[str, str] = {}
    fenced = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )

    def _stash(match: re.Match[str]) -> str:
        body = match.group("code")
        # "&" must be escaped first so the entities added next stay intact.
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            body = body.replace(raw, entity)
        language = match.group("lang") or ""
        opening = (
            f'<pre><code class="language-{language}">' if language else "<pre><code>"
        )
        token = f"CODEBLOCKPLACEHOLDER{len(code_blocks)}"
        code_blocks[token] = f"{opening}{body}</code></pre>"
        return token

    modified = fenced.sub(_stash, text)
    return modified, code_blocks
89
+
90
+
91
def reinsert_code_blocks(text: str, code_blocks: dict[str, str]) -> str:
    """Insert rendered HTML code blocks back into their placeholders.

    The first occurrence of each placeholder in *text* is replaced by its
    HTML; later duplicates are left as they are. All placeholders are
    substituted in a single pass, so HTML that has just been inserted is
    never rescanned, and a shorter placeholder can never match the prefix of
    a longer one (``...1`` inside ``...10``).

    Args:
        text: Text containing placeholder tokens.
        code_blocks: Mapping of placeholder token to replacement HTML.
            Empty keys are ignored.

    Returns:
        *text* with the placeholders replaced.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so this guarantees the most specific placeholder wins.
    keys = sorted((key for key in code_blocks if key), key=len, reverse=True)
    if not keys:
        return text

    pattern = re.compile("|".join(re.escape(key) for key in keys))
    unused = set(keys)

    def _swap(match: re.Match[str]) -> str:
        key = match.group(0)
        if key in unused:
            unused.discard(key)
            return code_blocks[key]
        return key

    return pattern.sub(_swap, text)
@@ -0,0 +1,73 @@
1
+ """Inline text helpers for Telegram Markdown conversion."""
2
+
3
+ import re
4
+
5
+ _inline_code_pattern = re.compile(r"`([^`]+)`")
6
+
7
+ _BOLD_PATTERN = re.compile(r"(?<!\\)\*\*(?=\S)(.*?)(?<=\S)\*\*", re.DOTALL)
8
+ _UNDERLINE_PATTERN = re.compile(
9
+ r"(?<!\\)(?<![A-Za-z0-9_])__(?=\S)(.*?)(?<=\S)__(?![A-Za-z0-9_])",
10
+ re.DOTALL,
11
+ )
12
+ _ITALIC_UNDERSCORE_PATTERN = re.compile(
13
+ r"(?<!\\)(?<![A-Za-z0-9_])_(?=\S)(.*?)(?<=\S)_(?![A-Za-z0-9_])",
14
+ re.DOTALL,
15
+ )
16
+ _STRIKETHROUGH_PATTERN = re.compile(r"(?<!\\)~~(?=\S)(.*?)(?<=\S)~~", re.DOTALL)
17
+ _SPOILER_PATTERN = re.compile(r"(?<!\\)\|\|(?=\S)([^\n]*?)(?<=\S)\|\|")
18
+ _ITALIC_STAR_PATTERN = re.compile(
19
+ r"(?<![A-Za-z0-9\\])\*(?!\*)(?=[^\s])(.*?)(?<![\s\\])\*(?![A-Za-z0-9\\])",
20
+ re.DOTALL,
21
+ )
22
+
23
+ _PATTERN_MAP = {
24
+ "**": _BOLD_PATTERN,
25
+ "__": _UNDERLINE_PATTERN,
26
+ "_": _ITALIC_UNDERSCORE_PATTERN,
27
+ "~~": _STRIKETHROUGH_PATTERN,
28
+ "||": _SPOILER_PATTERN,
29
+ }
30
+
31
+
32
def convert_html_chars(text: str) -> str:
    """Escape ``&``, ``<`` and ``>`` as HTML entities."""
    # "&" goes first so the ampersands of the later entities are not re-escaped.
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        text = text.replace(raw, entity)
    return text
37
+
38
+
39
def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
    """Wrap every ``md_tag``-delimited span of *out_text* in ``html_tag``.

    A precompiled pattern from ``_PATTERN_MAP`` is used when one exists for
    *md_tag*; otherwise a generic pattern is built that requires non-space
    characters just inside the delimiters and an unescaped opener.
    """
    matcher = _PATTERN_MAP.get(md_tag)
    if matcher is None:
        quoted = re.escape(md_tag)
        matcher = re.compile(
            rf"(?<!\\){quoted}(?=\S)(.*?)(?<=\S){quoted}",
            re.DOTALL,
        )

    if html_tag == 'span class="tg-spoiler"':
        # The closing tag must not repeat the attribute.
        opening, closing = '<span class="tg-spoiler">', "</span>"
    else:
        opening, closing = f"<{html_tag}>", f"</{html_tag}>"

    return matcher.sub(lambda found: opening + found.group(1) + closing, out_text)
55
+
56
+
57
def extract_inline_code_snippets(text: str):
    """Swap inline code spans for placeholders.

    Returns ``(modified_text, snippets)`` where ``snippets`` maps each
    ``INLINECODEPLACEHOLDER<n>`` token to the code found between the
    backticks, numbered in order of appearance.
    """
    snippets: dict[str, str] = {}

    def _stash(found: re.Match[str]) -> str:
        token = f"INLINECODEPLACEHOLDER{len(snippets)}"
        snippets[token] = found.group(1)
        return token

    return _inline_code_pattern.sub(_stash, text), snippets
70
+
71
+
72
def apply_custom_italic(text: str) -> str:
    """Wrap spans matched by ``_ITALIC_STAR_PATTERN`` in ``<i>`` tags."""
    return _ITALIC_STAR_PATTERN.sub(
        lambda found: f"<i>{found.group(1)}</i>", text
    )
@@ -1,25 +1,19 @@
1
+ """Post-processing helpers for Telegram Markdown conversion."""
2
+
3
+
1
4
  def remove_blockquote_escaping(output: str) -> str:
2
- """
3
- Removes the escaping from blockquote tags, including expandable blockquotes.
4
- """
5
- # Regular blockquotes
5
+ """Unescape blockquote tags produced during formatting."""
6
6
  output = output.replace("&lt;blockquote&gt;", "<blockquote>").replace(
7
7
  "&lt;/blockquote&gt;", "</blockquote>"
8
8
  )
9
-
10
- # Expandable blockquotes
11
9
  output = output.replace(
12
10
  "&lt;blockquote expandable&gt;", "<blockquote expandable>"
13
11
  ).replace("&lt;/blockquote&gt;", "</blockquote>")
14
-
15
12
  return output
16
13
 
17
14
 
18
15
  def remove_spoiler_escaping(output: str) -> str:
19
- """
20
- Ensures spoiler tags are correctly formatted (rather than being escaped).
21
- """
22
- # Fix any incorrectly escaped spoiler tags
16
+ """Ensure spoiler spans remain HTML tags, not escaped text."""
23
17
  output = output.replace(
24
18
  '&lt;span class="tg-spoiler"&gt;', '<span class="tg-spoiler">'
25
19
  )
@@ -0,0 +1,39 @@
1
+ """Pre-processing helpers for Telegram Markdown conversion."""
2
+
3
+
4
def combine_blockquotes(text: str) -> str:
    """Collapse consecutive Markdown blockquote lines into Telegram HTML blocks.

    Lines starting with ``>`` or ``**>`` are stripped of that prefix and
    surrounding whitespace, then grouped until the next line that starts
    with neither. A ``**>`` line anywhere in a group makes the whole group
    an expandable blockquote.
    """
    output: list[str] = []
    quoted: list[str] = []
    expandable = False

    def _flush() -> None:
        nonlocal expandable
        if quoted:
            output.append(_render_blockquote(list(quoted), expandable))
            quoted.clear()
        expandable = False

    for line in text.split("\n"):
        if line.startswith("**>"):
            expandable = True
            quoted.append(line[3:].strip())
        elif line.startswith(">"):
            if not quoted:
                # A fresh group opened by a plain ">" is not expandable.
                expandable = False
            quoted.append(line[1:].strip())
        else:
            _flush()
            output.append(line)

    _flush()
    return "\n".join(output)


def _render_blockquote(lines: list[str], expandable: bool) -> str:
    """Join *lines* with newlines inside a (possibly expandable) blockquote tag."""
    opening = "<blockquote expandable>" if expandable else "<blockquote>"
    return opening + "\n".join(lines) + "</blockquote>"
@@ -0,0 +1,55 @@
1
+ """High-level Telegram Markdown → HTML renderer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
8
+ from .inline import (apply_custom_italic, convert_html_chars,
9
+ extract_inline_code_snippets, split_by_tag)
10
+ from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
11
+ from .preprocess import combine_blockquotes
12
+
13
+
14
def telegram_format(text: str) -> str:
    """Convert Markdown text to Telegram-compatible HTML.

    Pipeline:

    1. Merge blockquote lines into ``<blockquote>`` tags.
    2. Pull fenced code blocks and inline code out behind placeholders so
       the formatting passes cannot touch them.
    3. HTML-escape the remaining text.
    4. Convert headings, list bullets, bold/italic/underline/strikethrough/
       spoiler markers, citation brackets and links.
    5. Restore inline code and code blocks, un-escape the blockquote and
       spoiler tags, collapse blank-line runs and strip the result.
    """
    text = combine_blockquotes(text)

    output, block_map = extract_and_convert_code_blocks(text)
    output, inline_snippets = extract_inline_code_snippets(output)

    output = convert_html_chars(output)

    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)

    # Triple markers must be handled before the double/single ones below.
    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)

    output = split_by_tag(output, "**", "b")
    output = split_by_tag(output, "__", "u")
    output = split_by_tag(output, "~~", "s")
    output = split_by_tag(output, "||", 'span class="tg-spoiler"')

    output = apply_custom_italic(output)
    output = split_by_tag(output, "_", "i")

    output = re.sub(r"【[^】]+】", "", output)

    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)

    if inline_snippets:
        # Restore every inline snippet in ONE pass, trying longer placeholders
        # first. Replacing them one by one with str.replace lets
        # "INLINECODEPLACEHOLDER1" eat the front of "INLINECODEPLACEHOLDER10"
        # once a message holds eleven or more inline code spans.
        longest_first = sorted(inline_snippets, key=len, reverse=True)
        placeholder_re = re.compile(
            "|".join(re.escape(token) for token in longest_first)
        )

        def _restore_inline(match: re.Match[str]) -> str:
            snippet = inline_snippets[match.group(0)]
            escaped = (
                snippet.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
            )
            return f"<code>{escaped}</code>"

        output = placeholder_re.sub(_restore_inline, output)

    output = reinsert_code_blocks(output, block_map)
    output = remove_blockquote_escaping(output)
    output = remove_spoiler_escaping(output)

    output = re.sub(r"\n{3,}", "\n\n", output)

    return output.strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.8
3
+ Version: 0.3.10
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -114,6 +114,24 @@ Hidden by default
114
114
  Multiple lines</blockquote>
115
115
  ```
116
116
 
117
+
118
+ ## Performance
119
+
120
+ Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
121
+
122
+ | Sample | Direction | Avg ms/call | Ops/sec |
123
+ |--------------|---------------|-------------|---------|
124
+ | short_inline | Markdown→HTML | 0.043 | 23,476 |
125
+ | short_inline | HTML→Markdown | 0.078 | 12,824 |
126
+ | medium_block | Markdown→HTML | 0.108 | 9,270 |
127
+ | medium_block | HTML→Markdown | 0.155 | 6,437 |
128
+ | long_mixed | Markdown→HTML | 0.446 | 2,242 |
129
+ | long_mixed | HTML→Markdown | 0.730 | 1,370 |
130
+
131
+ These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
132
+
133
+ Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
134
+
117
135
  ## Requirements
118
136
 
119
137
  - Python 3.x
@@ -0,0 +1,26 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ chatgpt_md_converter/__init__.py
5
+ chatgpt_md_converter/html_splitter.py
6
+ chatgpt_md_converter/html_to_markdown.py
7
+ chatgpt_md_converter/telegram_formatter.py
8
+ chatgpt_md_converter.egg-info/PKG-INFO
9
+ chatgpt_md_converter.egg-info/SOURCES.txt
10
+ chatgpt_md_converter.egg-info/dependency_links.txt
11
+ chatgpt_md_converter.egg-info/top_level.txt
12
+ chatgpt_md_converter/html_markdown/escaping.py
13
+ chatgpt_md_converter/html_markdown/handlers.py
14
+ chatgpt_md_converter/html_markdown/renderer.py
15
+ chatgpt_md_converter/html_markdown/state.py
16
+ chatgpt_md_converter/html_markdown/tree.py
17
+ chatgpt_md_converter/telegram_markdown/__init__.py
18
+ chatgpt_md_converter/telegram_markdown/code_blocks.py
19
+ chatgpt_md_converter/telegram_markdown/inline.py
20
+ chatgpt_md_converter/telegram_markdown/postprocess.py
21
+ chatgpt_md_converter/telegram_markdown/preprocess.py
22
+ chatgpt_md_converter/telegram_markdown/renderer.py
23
+ tests/test_html_to_markdown_inline_spacing.py
24
+ tests/test_parser.py
25
+ tests/test_roundtrip_markdown.py
26
+ tests/test_splitter.py
@@ -2,7 +2,7 @@ from setuptools import setup
2
2
 
3
3
  setup(
4
4
  name="chatgpt_md_converter",
5
- version="0.3.8",
5
+ version="0.3.10",
6
6
  author="Kostiantyn Kriuchkov",
7
7
  author_email="latand666@gmail.com",
8
8
  description="A package for converting markdown to HTML for chat Telegram bots",
@@ -0,0 +1,25 @@
1
+ import pytest
2
+
3
+ from chatgpt_md_converter import html_to_telegram_markdown
4
+
5
+
6
+ @pytest.mark.parametrize(
7
+ ("html", "expected"),
8
+ [
9
+ ("Start <b>bold </b>finish", "Start **bold** finish"),
10
+ ("Start <b> bold</b> finish", "Start **bold** finish"),
11
+ ("Start <i> italics </i>finish", "Start _italics_ finish"),
12
+ ("Start <i>value_</i>end", "Start *value_*end"),
13
+ ("Start <u> underline </u>finish", "Start __underline__ finish"),
14
+ (
15
+ "Start <span class=\"tg-spoiler\"> secret </span>end",
16
+ "Start ||secret|| end",
17
+ ),
18
+ (
19
+ "Intro <b>bold <i> inner </i> block</b> outro",
20
+ "Intro **bold _inner_ block** outro",
21
+ ),
22
+ ],
23
+ )
24
+ def test_html_to_markdown_strips_inline_whitespace(html: str, expected: str) -> None:
25
+ assert html_to_telegram_markdown(html) == expected
@@ -1,5 +1,6 @@
1
- from chatgpt_md_converter.extractors import ensure_closing_delimiters
2
1
  from chatgpt_md_converter.telegram_formatter import telegram_format
2
+ from chatgpt_md_converter.telegram_markdown.code_blocks import \
3
+ ensure_closing_delimiters
3
4
 
4
5
 
5
6
  def test_split_by_tag_bold():
@@ -889,7 +890,7 @@ print("hello world ```"')
889
890
  </code></pre>
890
891
  <pre><code class="language-python">print("Some another text")
891
892
  </code></pre>""" # But the code block is still closed correctly.
892
-
893
+
893
894
  output = telegram_format(input_text)
894
895
  def show_output():
895
896
  print(f"Expected was: \n\n{expected_output}\n\n")
@@ -909,7 +910,7 @@ print("hello world ```"')
909
910
  </code></pre>
910
911
  <pre><code class="language-python">print("Some another text")
911
912
  </code></pre>""" # But the code block is still closed correctly.
912
-
913
+
913
914
  output = telegram_format(input_text)
914
915
  def show_output():
915
916
  print(f"Expected was: \n\n{expected_output}\n\n")
@@ -934,4 +935,11 @@ print("hello world ```")
934
935
  def show_output():
935
936
  print(f"Expected was: \n\n{expected_output}\n\n")
936
937
  print(f"output was: \n\n{output}")
937
- assert output == expected_output, show_output()
938
+ assert output == expected_output, show_output()
939
+
940
+ def test_inline_code_with_escaped_backtick_trailing_text():
941
+ """Ensure inline code with escaped backtick does not gain an extra closing tick."""
942
+ input_text = "Escaped \\*asterisks\\* and `code with \\` backtick`"
943
+ expected_output = "Escaped \\*asterisks\\* and <code>code with \\</code> backtick`"
944
+ output = telegram_format(input_text)
945
+ assert output == expected_output
@@ -0,0 +1,32 @@
1
+
2
+ import pytest
3
+
4
+ from chatgpt_md_converter import html_to_telegram_markdown, telegram_format
5
+ from tests.fixtures.markdown_roundtrips import ROUND_TRIP_CASES
6
+
7
+
8
+ @pytest.mark.parametrize("_case, markdown_input, expected_markdown", ROUND_TRIP_CASES)
9
+ def test_html_round_trip_normalizes_markdown(_case, markdown_input, expected_markdown):
10
+ html1 = telegram_format(markdown_input)
11
+ markdown2 = html_to_telegram_markdown(html1)
12
+ html2 = telegram_format(markdown2)
13
+ markdown3 = html_to_telegram_markdown(html2)
14
+ html3 = telegram_format(markdown3)
15
+
16
+ assert markdown2 == expected_markdown
17
+ assert markdown3 == expected_markdown
18
+ assert html1 == html2 == html3
19
+ assert '<br' not in html1
20
+ assert '<br' not in html2
21
+ assert '<br' not in html3
22
+
23
+
24
+ @pytest.mark.parametrize("_case, markdown_input, _", ROUND_TRIP_CASES)
25
+ def test_markdown_html_markdown_cycle_is_idempotent(_case, markdown_input, _):
26
+ html_first = telegram_format(markdown_input)
27
+ markdown_second = html_to_telegram_markdown(html_first)
28
+ html_third = telegram_format(markdown_second)
29
+
30
+ assert '<br' not in html_first
31
+ assert '<br' not in html_third
32
+ assert html_first == html_third
@@ -1,4 +0,0 @@
1
- from .telegram_formatter import telegram_format
2
- from .html_splitter import split_html_for_telegram
3
-
4
- __all__ = ["telegram_format", "split_html_for_telegram"]
@@ -1,27 +0,0 @@
1
- import re
2
-
3
-
4
- def convert_html_chars(text: str) -> str:
5
- """
6
- Converts HTML reserved symbols to their respective character references.
7
- """
8
- text = text.replace("&", "&amp;")
9
- text = text.replace("<", "&lt;")
10
- text = text.replace(">", "&gt;")
11
- return text
12
-
13
-
14
- def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
15
- """
16
- Splits the text by markdown tag and replaces it with the specified HTML tag.
17
- """
18
- tag_pattern = re.compile(
19
- r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
20
- re.DOTALL,
21
- )
22
-
23
- # Special handling for the tg-spoiler tag
24
- if html_tag == 'span class="tg-spoiler"':
25
- return tag_pattern.sub(r'<span class="tg-spoiler">\1</span>', out_text)
26
-
27
- return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)
@@ -1,95 +0,0 @@
1
- import re
2
-
3
-
4
- def ensure_closing_delimiters(text: str) -> str:
5
- # Append missing closing backtick delimiters.
6
-
7
- code_block_re = re.compile(
8
- r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
9
- flags=re.DOTALL,
10
- )
11
-
12
- # Track an open fence. Once a fence is opened, everything until the same
13
- # fence is encountered again is treated as plain text. This mimics how
14
- # Markdown handles fences and allows fence-like strings inside code blocks.
15
- open_fence = None
16
- for line in text.splitlines():
17
- stripped = line.strip()
18
- if open_fence is None:
19
- m = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
20
- if m:
21
- open_fence = m.group("fence")
22
- else:
23
- if stripped.endswith(open_fence):
24
- open_fence = None
25
-
26
- # If a fence was left open, append a matching closing fence.
27
- if open_fence is not None:
28
- if not text.endswith("\n"):
29
- text += "\n"
30
- text += open_fence
31
-
32
- cleaned_inline = code_block_re.sub("", text)
33
-
34
- # Balance triple backticks that are not part of a complete fence.
35
- if cleaned_inline.count("```") % 2 != 0:
36
- text += "```"
37
-
38
- # Balance single backticks outside fenced blocks.
39
- cleaned_inline = code_block_re.sub("", text)
40
- if cleaned_inline.count("`") % 2 != 0:
41
- text += "`"
42
-
43
- return text
44
-
45
-
46
- def extract_and_convert_code_blocks(text: str):
47
- """
48
- Extracts code blocks from the text, converting them to HTML <pre><code> format,
49
- and replaces them with placeholders. Also ensures closing delimiters for unmatched blocks.
50
- """
51
- text = ensure_closing_delimiters(text)
52
- placeholders = []
53
- code_blocks = {}
54
-
55
- def replacer(match):
56
- language = match.group("lang") if match.group("lang") else ""
57
- code_content = match.group("code")
58
-
59
- # Properly escape HTML entities in code content
60
- escaped_content = (
61
- code_content.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
62
- )
63
-
64
- placeholder = f"CODEBLOCKPLACEHOLDER{len(placeholders)}"
65
- placeholders.append(placeholder)
66
- if not language:
67
- html_code_block = f"<pre><code>{escaped_content}</code></pre>"
68
- else:
69
- html_code_block = (
70
- f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
71
- )
72
- return (placeholder, html_code_block)
73
-
74
- modified_text = text
75
- code_block_pattern = re.compile(
76
- r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
77
- flags=re.DOTALL,
78
- )
79
- for match in code_block_pattern.finditer(text):
80
- placeholder, html_code_block = replacer(
81
- match
82
- )
83
- code_blocks[placeholder] = html_code_block
84
- modified_text = modified_text.replace(match.group(0), placeholder, 1)
85
-
86
- return modified_text, code_blocks
87
-
88
-
89
- def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
90
- """
91
- Reinserts HTML code blocks into the text, replacing their placeholders.
92
- """
93
- for placeholder, html_code_block in code_blocks.items():
94
- text = text.replace(placeholder, html_code_block, 1)
95
- return text
@@ -1,68 +0,0 @@
1
- def combine_blockquotes(text: str) -> str:
2
- """
3
- Combines multiline blockquotes into a single blockquote while keeping the \n characters.
4
- Supports both regular blockquotes (>) and expandable blockquotes (**>).
5
- """
6
- lines = text.split("\n")
7
- combined_lines = []
8
- blockquote_lines = []
9
- in_blockquote = False
10
- is_expandable = False
11
-
12
- for line in lines:
13
- if line.startswith("**>"):
14
- # Expandable blockquote
15
- in_blockquote = True
16
- is_expandable = True
17
- blockquote_lines.append(line[3:].strip())
18
- elif line.startswith(">"):
19
- # Regular blockquote
20
- if not in_blockquote:
21
- # This is a new blockquote
22
- in_blockquote = True
23
- is_expandable = False
24
- blockquote_lines.append(line[1:].strip())
25
- else:
26
- if in_blockquote:
27
- # End of blockquote, combine the lines
28
- if is_expandable:
29
- combined_lines.append(
30
- "<blockquote expandable>"
31
- + "\n".join(blockquote_lines)
32
- + "</blockquote>"
33
- )
34
- else:
35
- combined_lines.append(
36
- "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
37
- )
38
- blockquote_lines = []
39
- in_blockquote = False
40
- is_expandable = False
41
- combined_lines.append(line)
42
-
43
- if in_blockquote:
44
- # Handle the case where the file ends with a blockquote
45
- if is_expandable:
46
- combined_lines.append(
47
- "<blockquote expandable>"
48
- + "\n".join(blockquote_lines)
49
- + "</blockquote>"
50
- )
51
- else:
52
- combined_lines.append(
53
- "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
54
- )
55
-
56
- return "\n".join(combined_lines)
57
-
58
-
59
- def fix_asterisk_equations(text: str) -> str:
60
- """
61
- Replaces numeric expressions with '*' in them with '×'
62
- to avoid accidental italic formatting.
63
- e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
64
- """
65
- import re
66
-
67
- eq_pattern = re.compile(r"(\d+)\s*\*\s*(\d+)")
68
- return eq_pattern.sub(r"\1×\2", text)
@@ -1,99 +0,0 @@
1
- import re
2
-
3
- from .converters import convert_html_chars, split_by_tag
4
- from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
5
- from .formatters import combine_blockquotes
6
- from .helpers import remove_blockquote_escaping, remove_spoiler_escaping
7
-
8
-
9
- def extract_inline_code_snippets(text: str):
10
- """
11
- Extracts inline code (single-backtick content) from the text,
12
- replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
13
- This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
14
- """
15
- placeholders = []
16
- code_snippets = {}
17
- inline_code_pattern = re.compile(r"`([^`]+)`")
18
-
19
- def replacer(match):
20
- snippet = match.group(1)
21
- placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
22
- placeholders.append(placeholder)
23
- code_snippets[placeholder] = snippet
24
- return placeholder
25
-
26
- new_text = inline_code_pattern.sub(replacer, text)
27
- return new_text, code_snippets
28
-
29
-
30
- def telegram_format(text: str) -> str:
31
- """
32
- Converts markdown in the provided text to HTML supported by Telegram.
33
- """
34
-
35
- # Step 0: Combine blockquotes
36
- text = combine_blockquotes(text)
37
-
38
- # Step 1: Extract and convert triple-backtick code blocks first
39
- output, triple_code_blocks = extract_and_convert_code_blocks(text)
40
-
41
- # Step 2: Extract inline code snippets
42
- output, inline_code_snippets = extract_inline_code_snippets(output)
43
-
44
- # Step 3: Convert HTML reserved symbols in the text (not in code blocks)
45
- output = convert_html_chars(output)
46
-
47
- # Convert headings (H1-H6)
48
- output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
49
-
50
- # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
51
- output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
52
-
53
- # Nested Bold and Italic
54
- output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
55
- output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
56
-
57
- # Process markdown for bold (**), underline (__), strikethrough (~~), and spoiler (||)
58
- output = split_by_tag(output, "**", "b")
59
- output = split_by_tag(output, "__", "u")
60
- output = split_by_tag(output, "~~", "s")
61
- output = split_by_tag(output, "||", 'span class="tg-spoiler"')
62
-
63
- # Custom approach for single-asterisk italic
64
- italic_pattern = re.compile(
65
- r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])", re.DOTALL
66
- )
67
- output = italic_pattern.sub(r"<i>\1</i>", output)
68
-
69
- # Process single underscore-based italic
70
- output = split_by_tag(output, "_", "i")
71
-
72
- # Remove storage links (Vector storage placeholders like 【4:0†source】)
73
- output = re.sub(r"【[^】]+】", "", output)
74
-
75
- # Convert Markdown links/images to <a href="">…</a>
76
- link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
77
- output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
78
-
79
- # Step 4: Reinsert inline code snippets, applying HTML escaping to the content
80
- for placeholder, snippet in inline_code_snippets.items():
81
- # Apply HTML escaping to the content of inline code
82
- escaped_snippet = (
83
- snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
84
- )
85
- output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
86
-
87
- # Step 5: Reinsert the converted triple-backtick code blocks
88
- output = reinsert_code_blocks(output, triple_code_blocks)
89
-
90
- # Step 6: Remove blockquote escaping
91
- output = remove_blockquote_escaping(output)
92
-
93
- # Step 7: Remove spoiler tag escaping
94
- output = remove_spoiler_escaping(output)
95
-
96
- # Clean up multiple consecutive newlines, but preserve intentional spacing
97
- output = re.sub(r"\n{3,}", "\n\n", output)
98
-
99
- return output.strip()
@@ -1,16 +0,0 @@
1
- LICENSE
2
- README.md
3
- setup.py
4
- chatgpt_md_converter/__init__.py
5
- chatgpt_md_converter/converters.py
6
- chatgpt_md_converter/extractors.py
7
- chatgpt_md_converter/formatters.py
8
- chatgpt_md_converter/helpers.py
9
- chatgpt_md_converter/html_splitter.py
10
- chatgpt_md_converter/telegram_formatter.py
11
- chatgpt_md_converter.egg-info/PKG-INFO
12
- chatgpt_md_converter.egg-info/SOURCES.txt
13
- chatgpt_md_converter.egg-info/dependency_links.txt
14
- chatgpt_md_converter.egg-info/top_level.txt
15
- tests/test_parser.py
16
- tests/test_splitter.py