chatgpt-md-converter 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatgpt_md_converter/__init__.py +3 -2
- chatgpt_md_converter/html_markdown/escaping.py +68 -0
- chatgpt_md_converter/html_markdown/handlers.py +155 -0
- chatgpt_md_converter/html_markdown/renderer.py +16 -0
- chatgpt_md_converter/html_markdown/state.py +16 -0
- chatgpt_md_converter/html_markdown/tree.py +65 -0
- chatgpt_md_converter/html_to_markdown.py +5 -0
- chatgpt_md_converter/telegram_formatter.py +2 -98
- chatgpt_md_converter/telegram_markdown/__init__.py +5 -0
- chatgpt_md_converter/telegram_markdown/code_blocks.py +95 -0
- chatgpt_md_converter/telegram_markdown/inline.py +73 -0
- chatgpt_md_converter/{helpers.py → telegram_markdown/postprocess.py} +5 -11
- chatgpt_md_converter/telegram_markdown/preprocess.py +39 -0
- chatgpt_md_converter/telegram_markdown/renderer.py +55 -0
- {chatgpt_md_converter-0.3.8.dist-info → chatgpt_md_converter-0.3.9.dist-info}/METADATA +19 -1
- chatgpt_md_converter-0.3.9.dist-info/RECORD +20 -0
- chatgpt_md_converter/converters.py +0 -27
- chatgpt_md_converter/extractors.py +0 -95
- chatgpt_md_converter/formatters.py +0 -68
- chatgpt_md_converter-0.3.8.dist-info/RECORD +0 -12
- {chatgpt_md_converter-0.3.8.dist-info → chatgpt_md_converter-0.3.9.dist-info}/WHEEL +0 -0
- {chatgpt_md_converter-0.3.8.dist-info → chatgpt_md_converter-0.3.9.dist-info}/licenses/LICENSE +0 -0
- {chatgpt_md_converter-0.3.8.dist-info → chatgpt_md_converter-0.3.9.dist-info}/top_level.txt +0 -0
chatgpt_md_converter/__init__.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from .telegram_formatter import telegram_format
|
|
2
1
|
from .html_splitter import split_html_for_telegram
|
|
2
|
+
from .html_to_markdown import html_to_telegram_markdown
|
|
3
|
+
from .telegram_formatter import telegram_format
|
|
3
4
|
|
|
4
|
-
__all__ = ["telegram_format", "split_html_for_telegram"]
|
|
5
|
+
__all__ = ["telegram_format", "split_html_for_telegram", "html_to_telegram_markdown"]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Shared escaping utilities for Telegram Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import html
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from .tree import Node
|
|
9
|
+
|
|
10
|
+
_SIMPLE_STAR_ITALIC = re.compile(
|
|
11
|
+
r"(?<!\\)(?<!\*)\*(?=[^\s])([^\*\n]+?)(?<!\s)\*(?![A-Za-z0-9\*])",
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _canonicalize_star_italics(text: str) -> str:
|
|
16
|
+
def _replace(match: re.Match[str]) -> str:
|
|
17
|
+
inner = match.group(1)
|
|
18
|
+
if "*" in inner or "_" in inner or "`" in inner:
|
|
19
|
+
return match.group(0)
|
|
20
|
+
return f"_{inner}_"
|
|
21
|
+
|
|
22
|
+
return _SIMPLE_STAR_ITALIC.sub(_replace, text)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def normalise_text(text: str) -> str:
    """Decode HTML entities in *text* and turn U+00A0 into a plain space.

    Falsy input (for example an empty string) yields ``""``.
    """
    return html.unescape(text).replace("\u00a0", " ") if text else ""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def collect_text(node: Node) -> str:
    """Return the entity-decoded text content of *node*, ignoring markup.

    A text node yields its own decoded text. For any other node the direct
    children are walked in order: text children are decoded, ``br`` elements
    become a newline, other elements are collected recursively, and children
    of any other kind are skipped.
    """
    if node.kind == "text":
        return html.unescape(node.text)

    pieces: list[str] = []
    for item in node.children:
        if item.kind == "text":
            pieces.append(html.unescape(item.text))
            continue
        if item.kind != "element":
            continue
        pieces.append("\n" if item.tag.lower() == "br" else collect_text(item))
    return "".join(pieces)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def escape_inline_code(text: str) -> str:
    """Prefix every backtick in *text* with a backslash."""
    return "\\`".join(text.split("`"))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def escape_link_label(label: str) -> str:
    """Backslash-escape ``[``, ``]``, ``(`` and ``)`` in a link label."""
    return label.translate({ord(symbol): "\\" + symbol for symbol in "[]()"})
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def escape_link_url(url: str) -> str:
    """Double every backslash in *url* and backslash-escape every ``)``."""
    return url.translate({ord("\\"): "\\\\", ord(")"): "\\)"})
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def post_process(markdown: str) -> str:
    """Tidy rendered Markdown before it is returned to the caller.

    Steps, in order: turn leading ``•`` bullets into ``- ``, drop carriage
    returns, trim trailing whitespace on every line, collapse runs of three
    or more newlines into a single blank line, rewrite simple ``*italic*``
    spans with underscores, and strip the result.
    """
    text = re.sub(r"(^|\n)•\s", r"\1- ", markdown)
    text = text.replace("\r", "")
    text = "\n".join(line.rstrip() for line in text.split("\n"))
    # Collapse blank-line runs only after "\r" removal and trimming; otherwise
    # CRLF input or whitespace-only lines would hide a run of empty lines from
    # the pattern and leave it in the output.
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = _canonicalize_star_italics(text)
    return text.strip()
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Tag-specific renderers for Telegram Markdown."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Callable, Dict
|
|
6
|
+
|
|
7
|
+
from .escaping import (collect_text, escape_inline_code, escape_link_label,
|
|
8
|
+
escape_link_url, normalise_text)
|
|
9
|
+
from .state import RenderState
|
|
10
|
+
from .tree import Node
|
|
11
|
+
|
|
12
|
+
InlineHandler = Callable[[Node, RenderState], str]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_INLINE_MARKERS: Dict[str, tuple[str, str]] = {
|
|
16
|
+
"u": ("__", "__"),
|
|
17
|
+
"ins": ("__", "__"),
|
|
18
|
+
"s": ("~~", "~~"),
|
|
19
|
+
"strike": ("~~", "~~"),
|
|
20
|
+
"del": ("~~", "~~"),
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def render_nodes(nodes: list[Node], state: RenderState) -> str:
    """Render every node in *nodes* with *state* and concatenate the results."""
    rendered = [render_node(item, state) for item in nodes]
    return "".join(rendered)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def render_node(node: Node, state: RenderState) -> str:
    """Render a single node to Markdown.

    Text nodes are normalised and returned directly. Element nodes are handed
    to the handler registered for their lower-cased tag in ``TAG_DISPATCH``;
    elements without a handler contribute only their rendered children.
    """
    if node.kind == "text":
        return normalise_text(node.text)

    tag_handler = TAG_DISPATCH.get(node.tag.lower())
    if not tag_handler:
        return render_nodes(node.children, state)
    return tag_handler(node, state)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _handle_bold(node: Node, state: RenderState) -> str:
    """Wrap the rendered children in ``**``, rendering them one bold level deeper."""
    nested = state.child(bold_depth=state.bold_depth + 1)
    return "**" + render_nodes(node.children, nested) + "**"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _handle_italic(node: Node, state: RenderState) -> str:
    """Wrap the rendered children in an italic delimiter.

    The delimiter is ``_`` for the outermost italic inside bold and for every
    odd italic nesting level; otherwise it is ``*``. Children are rendered
    with the italic depth increased by one.
    """
    level = state.italic_depth
    if level % 2 == 1 or (level == 0 and state.bold_depth > 0):
        delimiter = "_"
    else:
        delimiter = "*"
    nested = state.child(italic_depth=level + 1)
    body = render_nodes(node.children, nested)
    return delimiter + body + delimiter
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _handle_inline_marker(node: Node, state: RenderState) -> str:
    """Wrap the rendered children in the delimiters registered for the tag.

    Raises ``KeyError`` when the lower-cased tag is not in ``_INLINE_MARKERS``.
    """
    opening, closing = _INLINE_MARKERS[node.tag.lower()]
    return "".join((opening, render_nodes(node.children, state), closing))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _handle_spoiler(node: Node, state: RenderState) -> str:
    """Wrap the rendered children in ``||`` spoiler delimiters."""
    return "||" + render_nodes(node.children, state) + "||"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _handle_code(node: Node, state: RenderState) -> str:
    """Render inline code: the node's plain text, backtick-escaped, in backticks.

    *state* is unused; nested markup is flattened to text by ``collect_text``.
    """
    raw = collect_text(node)
    escaped = escape_inline_code(raw)
    return "`" + escaped + "`"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _handle_pre(node: Node, state: RenderState) -> str:
    """Render a ``<pre>`` element as a fenced code block.

    When the element's only child is a ``<code>`` element, that child supplies
    the text and, through the first ``language-*`` class, the fence language.
    Otherwise all children are collected through a temporary wrapper node.
    A newline follows the opening fence when a language is present or the
    text spans several lines. *state* is unused.
    """
    kids = node.children
    only_child = kids[0] if len(kids) == 1 else None
    lang: str | None = None

    if (
        only_child is not None
        and only_child.kind == "element"
        and only_child.tag.lower() == "code"
    ):
        source = only_child
        css_classes = (source.attrs.get("class") or "").split()
        lang = next(
            (name.split("-", 1)[1] for name in css_classes if name.startswith("language-")),
            None,
        )
    else:
        source = Node(kind="element", tag="__virtual__", children=kids)

    body = collect_text(source)
    opening = f"```{lang}" if lang else "```"
    separator = "\n" if lang or "\n" in body else ""
    return f"{opening}{separator}{body}```"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _handle_link(node: Node, state: RenderState) -> str:
    """Render an ``<a>`` element as a Markdown link.

    The label is the rendered children, falling back to the href when that is
    empty. Label and URL are escaped before being emitted. Links whose href
    starts with ``tg://emoji?`` are written in image form,
    ``![label](url)``; every other link is written as ``[label](url)``.
    """
    href = node.attrs.get("href", "") or ""
    label = render_nodes(node.children, state) or href

    escaped_label = escape_link_label(label)
    escaped_url = escape_link_url(href)

    if href.startswith("tg://emoji?"):
        # Custom-emoji links use the image syntax; returning an empty string
        # here would silently drop the emoji from the output.
        return f"![{escaped_label}]({escaped_url})"
    return f"[{escaped_label}]({escaped_url})"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _handle_blockquote(node: Node, state: RenderState) -> str:
    """Render a ``<blockquote>`` as ``>``-prefixed lines.

    With an ``expandable`` attribute the first line is prefixed with ``**>``
    and the rest with ``>``, with no space after the prefix. Without it,
    non-empty lines get ``> `` and empty lines a bare ``>``.
    """
    body = render_nodes(node.children, state)
    is_expandable = "expandable" in node.attrs

    quoted: list[str] = []
    for position, raw_line in enumerate(body.split("\n")):
        content = raw_line.rstrip("\r")
        marker = "**>" if is_expandable and position == 0 else ">"
        if is_expandable or not content:
            quoted.append(marker + content)
        else:
            quoted.append(marker + " " + content)
    return "\n".join(quoted)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _handle_tg_emoji(node: Node, state: RenderState) -> str:
    """Render a ``<tg-emoji>`` element.

    With an ``emoji-id`` attribute the element becomes
    ``![label](tg://emoji?id=<emoji-id>)``, where the label is the rendered
    children. Without one, only the label is returned.
    """
    emoji_id = node.attrs.get("emoji-id")
    label = render_nodes(node.children, state)
    if not emoji_id:
        return label
    href = f"tg://emoji?id={emoji_id}"
    # Emit the image-style custom-emoji link; an empty string here would
    # discard both the fallback label and the emoji id.
    return f"![{label}]({href})"
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _handle_span(node: Node, state: RenderState) -> str:
    """Render a ``<span>``: spoilers get ``||``, anything else is transparent."""
    css_classes = (node.attrs.get("class") or "").split()
    if "tg-spoiler" in css_classes:
        return _handle_spoiler(node, state)
    # Spans with the "tg-emoji" class and spans with no recognised class are
    # rendered the same way: only their children are emitted.
    return render_nodes(node.children, state)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Lower-cased tag name -> handler used by render_node. Tags that share a
# handler are grouped; tags missing from the table render only their children.
TAG_DISPATCH: Dict[str, InlineHandler] = {
    **dict.fromkeys(("b", "strong"), _handle_bold),
    **dict.fromkeys(("i", "em"), _handle_italic),
    **dict.fromkeys(("u", "ins", "s", "strike", "del"), _handle_inline_marker),
    "span": _handle_span,
    "tg-spoiler": _handle_spoiler,
    "code": _handle_code,
    "pre": _handle_pre,
    "a": _handle_link,
    "blockquote": _handle_blockquote,
    "tg-emoji": _handle_tg_emoji,
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""High-level HTML → Telegram Markdown renderer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
from .escaping import post_process
|
|
8
|
+
from .handlers import render_nodes
|
|
9
|
+
from .state import RenderState
|
|
10
|
+
from .tree import Node, build_tree
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def html_to_telegram_markdown(html_text: str) -> str:
    """Convert an HTML fragment to Telegram Markdown.

    The fragment is parsed into a node tree, rendered with a fresh
    ``RenderState``, and the result is passed through ``post_process``.
    """
    top_level: List[Node] = build_tree(html_text)
    return post_process(render_nodes(top_level, RenderState()))
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Rendering state for HTML → Telegram Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
class RenderState:
    """Immutable pair of nesting counters carried through rendering."""

    # Bold nesting level; defaults to 0.
    bold_depth: int = 0
    # Italic nesting level; defaults to 0.
    italic_depth: int = 0

    def child(self, **updates: int) -> "RenderState":
        """Return a new state whose fields are this state's, overridden by *updates*.

        Unknown field names raise ``TypeError`` from the constructor.
        """
        merged = {
            "bold_depth": self.bold_depth,
            "italic_depth": self.italic_depth,
            **updates,
        }
        return RenderState(**merged)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""DOM-like tree construction for Telegram HTML fragments."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from html.parser import HTMLParser
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Node:
    """One node of the parsed HTML tree: a run of text or an element."""

    # Discriminator: "text" or "element".
    kind: str
    # Raw character data (used by text nodes).
    text: str = ""
    # Tag name (used by element nodes).
    tag: str = ""
    # Attribute name -> value; a fresh dict per instance.
    attrs: Dict[str, Optional[str]] = field(default_factory=dict)
    # Child nodes in document order; a fresh list per instance.
    children: List["Node"] = field(default_factory=list)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class _HTMLTreeBuilder(HTMLParser):
    """``HTMLParser`` subclass that assembles ``Node`` objects under a synthetic root.

    Character and entity references are not converted by the parser
    (``convert_charrefs=False``); they are stored verbatim as text nodes.
    """

    SELF_CLOSING_TAGS = {"br"}

    def __init__(self) -> None:
        super().__init__(convert_charrefs=False)
        self.root = Node(kind="element", tag="__root__")
        # Open elements, innermost last; index 0 is always the root.
        self._stack: List[Node] = [self.root]

    def handle_starttag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
        """Open a new element, or record ``<br>`` as a newline text node."""
        if tag not in self.SELF_CLOSING_TAGS:
            element = Node(kind="element", tag=tag, attrs=dict(attrs))
            self._stack[-1].children.append(element)
            self._stack.append(element)
        elif tag == "br":
            self._stack[-1].children.append(Node(kind="text", text="\n"))

    def handle_endtag(self, tag: str) -> None:
        """Close the innermost open element named *tag*, plus anything inside it.

        End tags with no matching open element are ignored; the root is never
        closed.
        """
        depth = len(self._stack) - 1
        while depth > 0:
            if self._stack[depth].tag == tag:
                del self._stack[depth:]
                return
            depth -= 1

    def handle_startendtag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
        """Add a self-closed element as a childless node without opening it."""
        if tag in self.SELF_CLOSING_TAGS:
            self.handle_starttag(tag, attrs)
        else:
            element = Node(kind="element", tag=tag, attrs=dict(attrs))
            self._stack[-1].children.append(element)

    def handle_data(self, data: str) -> None:
        """Append non-empty character data to the current element."""
        if not data:
            return
        self._stack[-1].children.append(Node(kind="text", text=data))

    def handle_entityref(self, name: str) -> None:
        """Store a named entity reference verbatim as text."""
        self.handle_data(f"&{name};")

    def handle_charref(self, name: str) -> None:
        """Store a numeric character reference verbatim as text."""
        self.handle_data(f"&#{name};")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def build_tree(html_text: str) -> List[Node]:
    """Parse *html_text* and return the children of the synthetic root node."""
    parser = _HTMLTreeBuilder()
    parser.feed(html_text)
    # close() makes the parser process any data still buffered.
    parser.close()
    return parser.root.children
|
|
@@ -1,99 +1,3 @@
|
|
|
1
|
-
import re
|
|
1
|
+
from .telegram_markdown.renderer import telegram_format
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
|
|
5
|
-
from .formatters import combine_blockquotes
|
|
6
|
-
from .helpers import remove_blockquote_escaping, remove_spoiler_escaping
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def extract_inline_code_snippets(text: str):
|
|
10
|
-
"""
|
|
11
|
-
Extracts inline code (single-backtick content) from the text,
|
|
12
|
-
replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
|
|
13
|
-
This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
|
|
14
|
-
"""
|
|
15
|
-
placeholders = []
|
|
16
|
-
code_snippets = {}
|
|
17
|
-
inline_code_pattern = re.compile(r"`([^`]+)`")
|
|
18
|
-
|
|
19
|
-
def replacer(match):
|
|
20
|
-
snippet = match.group(1)
|
|
21
|
-
placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
|
|
22
|
-
placeholders.append(placeholder)
|
|
23
|
-
code_snippets[placeholder] = snippet
|
|
24
|
-
return placeholder
|
|
25
|
-
|
|
26
|
-
new_text = inline_code_pattern.sub(replacer, text)
|
|
27
|
-
return new_text, code_snippets
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def telegram_format(text: str) -> str:
|
|
31
|
-
"""
|
|
32
|
-
Converts markdown in the provided text to HTML supported by Telegram.
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
# Step 0: Combine blockquotes
|
|
36
|
-
text = combine_blockquotes(text)
|
|
37
|
-
|
|
38
|
-
# Step 1: Extract and convert triple-backtick code blocks first
|
|
39
|
-
output, triple_code_blocks = extract_and_convert_code_blocks(text)
|
|
40
|
-
|
|
41
|
-
# Step 2: Extract inline code snippets
|
|
42
|
-
output, inline_code_snippets = extract_inline_code_snippets(output)
|
|
43
|
-
|
|
44
|
-
# Step 3: Convert HTML reserved symbols in the text (not in code blocks)
|
|
45
|
-
output = convert_html_chars(output)
|
|
46
|
-
|
|
47
|
-
# Convert headings (H1-H6)
|
|
48
|
-
output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
|
|
49
|
-
|
|
50
|
-
# Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
|
|
51
|
-
output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
|
|
52
|
-
|
|
53
|
-
# Nested Bold and Italic
|
|
54
|
-
output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
|
|
55
|
-
output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
|
|
56
|
-
|
|
57
|
-
# Process markdown for bold (**), underline (__), strikethrough (~~), and spoiler (||)
|
|
58
|
-
output = split_by_tag(output, "**", "b")
|
|
59
|
-
output = split_by_tag(output, "__", "u")
|
|
60
|
-
output = split_by_tag(output, "~~", "s")
|
|
61
|
-
output = split_by_tag(output, "||", 'span class="tg-spoiler"')
|
|
62
|
-
|
|
63
|
-
# Custom approach for single-asterisk italic
|
|
64
|
-
italic_pattern = re.compile(
|
|
65
|
-
r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])", re.DOTALL
|
|
66
|
-
)
|
|
67
|
-
output = italic_pattern.sub(r"<i>\1</i>", output)
|
|
68
|
-
|
|
69
|
-
# Process single underscore-based italic
|
|
70
|
-
output = split_by_tag(output, "_", "i")
|
|
71
|
-
|
|
72
|
-
# Remove storage links (Vector storage placeholders like 【4:0†source】)
|
|
73
|
-
output = re.sub(r"【[^】]+】", "", output)
|
|
74
|
-
|
|
75
|
-
# Convert Markdown links/images to <a href="">…</a>
|
|
76
|
-
link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
|
|
77
|
-
output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
|
|
78
|
-
|
|
79
|
-
# Step 4: Reinsert inline code snippets, applying HTML escaping to the content
|
|
80
|
-
for placeholder, snippet in inline_code_snippets.items():
|
|
81
|
-
# Apply HTML escaping to the content of inline code
|
|
82
|
-
escaped_snippet = (
|
|
83
|
-
snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
|
84
|
-
)
|
|
85
|
-
output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
|
|
86
|
-
|
|
87
|
-
# Step 5: Reinsert the converted triple-backtick code blocks
|
|
88
|
-
output = reinsert_code_blocks(output, triple_code_blocks)
|
|
89
|
-
|
|
90
|
-
# Step 6: Remove blockquote escaping
|
|
91
|
-
output = remove_blockquote_escaping(output)
|
|
92
|
-
|
|
93
|
-
# Step 7: Remove spoiler tag escaping
|
|
94
|
-
output = remove_spoiler_escaping(output)
|
|
95
|
-
|
|
96
|
-
# Clean up multiple consecutive newlines, but preserve intentional spacing
|
|
97
|
-
output = re.sub(r"\n{3,}", "\n\n", output)
|
|
98
|
-
|
|
99
|
-
return output.strip()
|
|
3
|
+
__all__ = ['telegram_format']
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Code block extraction utilities for Telegram Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
_CODE_BLOCK_RE = re.compile(
|
|
6
|
+
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
|
|
7
|
+
flags=re.DOTALL,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _count_unescaped_backticks(text: str) -> int:
|
|
13
|
+
"""Return the number of backticks not escaped by a backslash."""
|
|
14
|
+
count = 0
|
|
15
|
+
for index, char in enumerate(text):
|
|
16
|
+
if char != "`":
|
|
17
|
+
continue
|
|
18
|
+
backslashes = 0
|
|
19
|
+
j = index - 1
|
|
20
|
+
while j >= 0 and text[j] == '\\':
|
|
21
|
+
backslashes += 1
|
|
22
|
+
j -= 1
|
|
23
|
+
if backslashes % 2 == 0:
|
|
24
|
+
count += 1
|
|
25
|
+
return count
|
|
26
|
+
|
|
27
|
+
def ensure_closing_delimiters(text: str) -> str:
    """Append whatever closing backticks *text* is missing.

    Three checks run in order: a fence opened on its own line and never
    closed gets a closing fence on a new line; an odd number of triple
    backticks outside complete blocks gets a trailing triple backtick; an odd
    number of unescaped backticks outside complete blocks gets a trailing
    single backtick.
    """
    pending = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if pending is not None:
            # Inside an open block: only a line ending with the fence closes it.
            if line.endswith(pending):
                pending = None
            continue
        opener = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", line)
        if opener:
            pending = opener.group("fence")

    if pending is not None:
        text = (text if text.endswith("\n") else text + "\n") + pending

    if _CODE_BLOCK_RE.sub("", text).count("```") % 2 != 0:
        text += "```"

    # Recompute after the possible append above so the inline check sees the
    # final set of complete blocks.
    if _count_unescaped_backticks(_CODE_BLOCK_RE.sub("", text)) % 2 != 0:
        text += "`"

    return text
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def extract_and_convert_code_blocks(text: str):
    """Replace fenced code blocks with placeholders and return HTML renderings.

    Missing closing delimiters are added first. Each fenced block is then
    swapped for ``CODEBLOCKPLACEHOLDER<n>`` (numbered in order of appearance)
    and rendered as ``<pre><code>…</code></pre>``, with a ``language-<lang>``
    class when the fence names a language.

    Returns a ``(modified_text, code_blocks)`` tuple where ``code_blocks`` maps
    each placeholder to its HTML.
    """
    text = ensure_closing_delimiters(text)
    code_blocks: dict[str, str] = {}

    pattern = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
        flags=re.DOTALL,
    )

    def _to_placeholder(match: "re.Match[str]") -> str:
        language = match.group("lang") or ""
        # "&" must be escaped first so the entities produced for "<" and ">"
        # are not escaped a second time.
        escaped = (
            match.group("code")
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )
        placeholder = f"CODEBLOCKPLACEHOLDER{len(code_blocks)}"
        if language:
            html_block = f'<pre><code class="language-{language}">{escaped}</code></pre>'
        else:
            html_block = f"<pre><code>{escaped}</code></pre>"
        code_blocks[placeholder] = html_block
        return placeholder

    # A single substitution pass replaces each match exactly where it was
    # found, rather than searching the text again for the matched string.
    modified = pattern.sub(_to_placeholder, text)
    return modified, code_blocks
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def reinsert_code_blocks(text: str, code_blocks: dict[str, str]) -> str:
    """Replace the first occurrence of each placeholder with its HTML block.

    Placeholders are processed in the mapping's iteration order.
    """
    restored = text
    for token in code_blocks:
        restored = restored.replace(token, code_blocks[token], 1)
    return restored
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Inline text helpers for Telegram Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
_inline_code_pattern = re.compile(r"`([^`]+)`")
|
|
6
|
+
|
|
7
|
+
_BOLD_PATTERN = re.compile(r"(?<!\\)\*\*(?=\S)(.*?)(?<=\S)\*\*", re.DOTALL)
|
|
8
|
+
_UNDERLINE_PATTERN = re.compile(
|
|
9
|
+
r"(?<!\\)(?<![A-Za-z0-9_])__(?=\S)(.*?)(?<=\S)__(?![A-Za-z0-9_])",
|
|
10
|
+
re.DOTALL,
|
|
11
|
+
)
|
|
12
|
+
_ITALIC_UNDERSCORE_PATTERN = re.compile(
|
|
13
|
+
r"(?<!\\)(?<![A-Za-z0-9_])_(?=\S)(.*?)(?<=\S)_(?![A-Za-z0-9_])",
|
|
14
|
+
re.DOTALL,
|
|
15
|
+
)
|
|
16
|
+
_STRIKETHROUGH_PATTERN = re.compile(r"(?<!\\)~~(?=\S)(.*?)(?<=\S)~~", re.DOTALL)
|
|
17
|
+
_SPOILER_PATTERN = re.compile(r"(?<!\\)\|\|(?=\S)([^\n]*?)(?<=\S)\|\|")
|
|
18
|
+
_ITALIC_STAR_PATTERN = re.compile(
|
|
19
|
+
r"(?<![A-Za-z0-9\\])\*(?!\*)(?=[^\s])(.*?)(?<![\s\\])\*(?![A-Za-z0-9\\])",
|
|
20
|
+
re.DOTALL,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
_PATTERN_MAP = {
|
|
24
|
+
"**": _BOLD_PATTERN,
|
|
25
|
+
"__": _UNDERLINE_PATTERN,
|
|
26
|
+
"_": _ITALIC_UNDERSCORE_PATTERN,
|
|
27
|
+
"~~": _STRIKETHROUGH_PATTERN,
|
|
28
|
+
"||": _SPOILER_PATTERN,
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def convert_html_chars(text: str) -> str:
    """Escape the HTML-reserved characters ``&``, ``<`` and ``>`` in *text*."""
    # "&" goes first so the ampersands introduced by the "<" and ">" entities
    # are not escaped again.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    return text
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
    """Wrap every ``md_tag``-delimited span of *out_text* in an HTML element.

    ``md_tag`` is the Markdown delimiter (for example ``"**"``). Delimiters
    listed in ``_PATTERN_MAP`` use their precompiled pattern; any other
    delimiter gets a generic pattern that requires an unescaped opener and
    content starting and ending on a non-whitespace character.

    ``html_tag`` is the opening tag without angle brackets and may carry
    attributes (for example ``'span class="tg-spoiler"'``). The closing tag
    uses only the tag name, so attribute-bearing tags close correctly.
    """
    pattern = _PATTERN_MAP.get(md_tag)
    if pattern is None:
        delimiter = re.escape(md_tag)
        pattern = re.compile(
            rf"(?<!\\){delimiter}(?=\S)(.*?)(?<=\S){delimiter}",
            re.DOTALL,
        )

    # Everything before the first whitespace is the element name; an empty or
    # blank html_tag is passed through unchanged.
    closing_name = (html_tag.split(None, 1) or [html_tag])[0]

    def _wrap(match: "re.Match[str]") -> str:
        return f"<{html_tag}>{match.group(1)}</{closing_name}>"

    return pattern.sub(_wrap, out_text)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def extract_inline_code_snippets(text: str):
    """Swap each inline code span for a numbered placeholder.

    Returns ``(modified_text, snippets)`` where ``snippets`` maps each
    ``INLINECODEPLACEHOLDER<n>`` token to the code found between the backticks.
    """
    found: dict[str, str] = {}

    def _stash(match: re.Match[str]) -> str:
        # Every token is unique, so the mapping size is the next index.
        token = f"INLINECODEPLACEHOLDER{len(found)}"
        found[token] = match.group(1)
        return token

    return _inline_code_pattern.sub(_stash, text), found
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def apply_custom_italic(text: str) -> str:
    """Wrap every span matched by ``_ITALIC_STAR_PATTERN`` in ``<i>…</i>``."""
    italic_template = r"<i>\1</i>"
    return _ITALIC_STAR_PATTERN.sub(italic_template, text)
|
|
@@ -1,25 +1,19 @@
|
|
|
1
|
+
"""Post-processing helpers for Telegram Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
def remove_blockquote_escaping(output: str) -> str:
    """Unescape blockquote tags produced during formatting.

    Turns the entity-escaped forms of ``<blockquote>``,
    ``<blockquote expandable>`` and ``</blockquote>`` back into real tags and
    leaves all other text untouched.
    """
    replacements = (
        ("&lt;blockquote&gt;", "<blockquote>"),
        ("&lt;blockquote expandable&gt;", "<blockquote expandable>"),
        ("&lt;/blockquote&gt;", "</blockquote>"),
    )
    for escaped, tag in replacements:
        output = output.replace(escaped, tag)
    return output
|
|
16
13
|
|
|
17
14
|
|
|
18
15
|
def remove_spoiler_escaping(output: str) -> str:
|
|
19
|
-
"""
|
|
20
|
-
Ensures spoiler tags are correctly formatted (rather than being escaped).
|
|
21
|
-
"""
|
|
22
|
-
# Fix any incorrectly escaped spoiler tags
|
|
16
|
+
"""Ensure spoiler spans remain HTML tags, not escaped text."""
|
|
23
17
|
output = output.replace(
|
|
24
18
|
'&lt;span class="tg-spoiler"&gt;', '<span class="tg-spoiler">'
|
|
25
19
|
)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Pre-processing helpers for Telegram Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def combine_blockquotes(text: str) -> str:
|
|
5
|
+
"""Collapse consecutive Markdown blockquote lines into Telegram HTML blocks."""
|
|
6
|
+
lines = text.split("\n")
|
|
7
|
+
combined_lines = []
|
|
8
|
+
blockquote_lines = []
|
|
9
|
+
in_blockquote = False
|
|
10
|
+
is_expandable = False
|
|
11
|
+
|
|
12
|
+
for line in lines:
|
|
13
|
+
if line.startswith("**>"):
|
|
14
|
+
in_blockquote = True
|
|
15
|
+
is_expandable = True
|
|
16
|
+
blockquote_lines.append(line[3:].strip())
|
|
17
|
+
elif line.startswith(">"):
|
|
18
|
+
if not in_blockquote:
|
|
19
|
+
in_blockquote = True
|
|
20
|
+
is_expandable = False
|
|
21
|
+
blockquote_lines.append(line[1:].strip())
|
|
22
|
+
else:
|
|
23
|
+
if in_blockquote:
|
|
24
|
+
combined_lines.append(_render_blockquote(blockquote_lines, is_expandable))
|
|
25
|
+
blockquote_lines = []
|
|
26
|
+
in_blockquote = False
|
|
27
|
+
is_expandable = False
|
|
28
|
+
combined_lines.append(line)
|
|
29
|
+
|
|
30
|
+
if in_blockquote:
|
|
31
|
+
combined_lines.append(_render_blockquote(blockquote_lines, is_expandable))
|
|
32
|
+
|
|
33
|
+
return "\n".join(combined_lines)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _render_blockquote(lines: list[str], expandable: bool) -> str:
|
|
37
|
+
if expandable:
|
|
38
|
+
return "<blockquote expandable>" + "\n".join(lines) + "</blockquote>"
|
|
39
|
+
return "<blockquote>" + "\n".join(lines) + "</blockquote>"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""High-level Telegram Markdown → HTML renderer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
|
|
8
|
+
from .inline import (apply_custom_italic, convert_html_chars,
|
|
9
|
+
extract_inline_code_snippets, split_by_tag)
|
|
10
|
+
from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
|
|
11
|
+
from .preprocess import combine_blockquotes
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def telegram_format(text: str) -> str:
    """Convert Markdown in *text* to HTML supported by Telegram.

    Code (fenced and inline) is lifted out first so its content is never
    treated as Markdown, the remaining text is HTML-escaped and converted,
    and the code is put back at the end.
    """
    text = combine_blockquotes(text)

    output, block_map = extract_and_convert_code_blocks(text)
    output, inline_snippets = extract_inline_code_snippets(output)

    # Escape HTML-reserved characters in everything that is not code.
    output = convert_html_chars(output)

    # Headings (# .. ######) become bold lines.
    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
    # Bullets are converted before italics so a leading "*" is read as a bullet.
    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)

    # Combined bold+italic and underline+italic.
    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)

    output = split_by_tag(output, "**", "b")
    output = split_by_tag(output, "__", "u")
    output = split_by_tag(output, "~~", "s")
    output = split_by_tag(output, "||", 'span class="tg-spoiler"')

    output = apply_custom_italic(output)
    output = split_by_tag(output, "_", "i")

    # Remove storage links (vector-storage placeholders like 【4:0†source】).
    output = re.sub(r"【[^】]+】", "", output)

    # Markdown links and images both become <a href="…">…</a>.
    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)

    # Reinsert inline code, longest placeholder first: "…PLACEHOLDER1" is a
    # prefix of "…PLACEHOLDER10", so replacing it first would corrupt the
    # longer tokens once there are more than ten snippets.
    by_length = sorted(
        inline_snippets.items(), key=lambda item: len(item[0]), reverse=True
    )
    for placeholder, snippet in by_length:
        escaped = (
            snippet.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )
        output = output.replace(placeholder, f"<code>{escaped}</code>")

    output = reinsert_code_blocks(output, block_map)
    # The blockquote and spoiler tags were escaped above along with the rest
    # of the text; turn them back into real tags.
    output = remove_blockquote_escaping(output)
    output = remove_spoiler_escaping(output)

    # Collapse runs of three or more newlines to a single blank line.
    output = re.sub(r"\n{3,}", "\n\n", output)

    return output.strip()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chatgpt_md_converter
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.3.9
|
|
4
4
|
Summary: A package for converting markdown to HTML for chat Telegram bots
|
|
5
5
|
Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
|
|
6
6
|
Author: Kostiantyn Kriuchkov
|
|
@@ -114,6 +114,24 @@ Hidden by default
|
|
|
114
114
|
Multiple lines</blockquote>
|
|
115
115
|
```
|
|
116
116
|
|
|
117
|
+
|
|
118
|
+
## Performance
|
|
119
|
+
|
|
120
|
+
Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
|
|
121
|
+
|
|
122
|
+
| Sample | Direction | Avg ms/call | Ops/sec |
|
|
123
|
+
|--------------|---------------|-------------|---------|
|
|
124
|
+
| short_inline | Markdown→HTML | 0.043 | 23,476 |
|
|
125
|
+
| short_inline | HTML→Markdown | 0.078 | 12,824 |
|
|
126
|
+
| medium_block | Markdown→HTML | 0.108 | 9,270 |
|
|
127
|
+
| medium_block | HTML→Markdown | 0.155 | 6,437 |
|
|
128
|
+
| long_mixed | Markdown→HTML | 0.446 | 2,242 |
|
|
129
|
+
| long_mixed | HTML→Markdown | 0.730 | 1,370 |
|
|
130
|
+
|
|
131
|
+
These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
|
|
132
|
+
|
|
133
|
+
Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
|
|
134
|
+
|
|
117
135
|
## Requirements
|
|
118
136
|
|
|
119
137
|
- Python 3.x
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
chatgpt_md_converter/__init__.py,sha256=6ts2hnimdBn_qCA15LKuipUjSU9ZCqRk1GbDPc_JjO4,242
|
|
2
|
+
chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
|
|
3
|
+
chatgpt_md_converter/html_to_markdown.py,sha256=XlLpQD7W_AooWrvTtvrGVwfPPa80tDKWuT1iT6Vzygw,174
|
|
4
|
+
chatgpt_md_converter/telegram_formatter.py,sha256=w3tjoSdRH_UdoFmGeXe7I47dhDIceXuGOA1oCLMnUmM,87
|
|
5
|
+
chatgpt_md_converter/html_markdown/escaping.py,sha256=wJA4vUJQVcxpkJ4sCIYIWKaqffb_O72R93H81hTgTxA,1808
|
|
6
|
+
chatgpt_md_converter/html_markdown/handlers.py,sha256=dJw-IuvFG7eeTVclx9QOS2NEvqlF2K7i3MJ_llt1YYc,4939
|
|
7
|
+
chatgpt_md_converter/html_markdown/renderer.py,sha256=en-fAr3Bhmm4ZndDaPKV8nLVQ_7HpS_NFBSWcrQporY,438
|
|
8
|
+
chatgpt_md_converter/html_markdown/state.py,sha256=sxbz0ucCakI0KgR86EMZx0nvfU1oiqgVUofujFTeKoo,432
|
|
9
|
+
chatgpt_md_converter/html_markdown/tree.py,sha256=ryohrhO2X5QepZev3087qPoGmMznqHDwH00TNGoW6a4,2154
|
|
10
|
+
chatgpt_md_converter/telegram_markdown/__init__.py,sha256=C0Oexz9brpdE-TqEpiAUV78TsZdSrnnH_5yYpEJ03Us,131
|
|
11
|
+
chatgpt_md_converter/telegram_markdown/code_blocks.py,sha256=gQCGqZTtUusK_I6KOGqMGTd-z3TkUZSo4kMrA5g_l04,3065
|
|
12
|
+
chatgpt_md_converter/telegram_markdown/inline.py,sha256=Phe4T5tu7Y7drH17YW-iOVEqGMRNGe1zVxAbd192HDY,2205
|
|
13
|
+
chatgpt_md_converter/telegram_markdown/postprocess.py,sha256=jUf01tAIqHQ1NxNlVGsvU-Yw8SDOHtMoS7MUzaQLf_8,775
|
|
14
|
+
chatgpt_md_converter/telegram_markdown/preprocess.py,sha256=c9Wzs7DUumXgrgndCeHbCfV1qLzXVJlLHOtXC3Ne2Nk,1362
|
|
15
|
+
chatgpt_md_converter/telegram_markdown/renderer.py,sha256=ZX0reJLVC_2Fvw26dnSSpK_xr_Kpfp9oTyQw57FCqu0,1957
|
|
16
|
+
chatgpt_md_converter-0.3.9.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
17
|
+
chatgpt_md_converter-0.3.9.dist-info/METADATA,sha256=P5508ZIm4iTdBtFpA03j5CDy3E7iXEVnx2tkWSZMGbc,6604
|
|
18
|
+
chatgpt_md_converter-0.3.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
chatgpt_md_converter-0.3.9.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
20
|
+
chatgpt_md_converter-0.3.9.dist-info/RECORD,,
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def convert_html_chars(text: str) -> str:
|
|
5
|
-
"""
|
|
6
|
-
Converts HTML reserved symbols to their respective character references.
|
|
7
|
-
"""
|
|
8
|
-
text = text.replace("&", "&amp;")
|
|
9
|
-
text = text.replace("<", "&lt;")
|
|
10
|
-
text = text.replace(">", "&gt;")
|
|
11
|
-
return text
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
|
|
15
|
-
"""
|
|
16
|
-
Splits the text by markdown tag and replaces it with the specified HTML tag.
|
|
17
|
-
"""
|
|
18
|
-
tag_pattern = re.compile(
|
|
19
|
-
r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
|
|
20
|
-
re.DOTALL,
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
# Special handling for the tg-spoiler tag
|
|
24
|
-
if html_tag == 'span class="tg-spoiler"':
|
|
25
|
-
return tag_pattern.sub(r'<span class="tg-spoiler">\1</span>', out_text)
|
|
26
|
-
|
|
27
|
-
return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def ensure_closing_delimiters(text: str) -> str:
|
|
5
|
-
# Append missing closing backtick delimiters.
|
|
6
|
-
|
|
7
|
-
code_block_re = re.compile(
|
|
8
|
-
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
|
|
9
|
-
flags=re.DOTALL,
|
|
10
|
-
)
|
|
11
|
-
|
|
12
|
-
# Track an open fence. Once a fence is opened, everything until the same
|
|
13
|
-
# fence is encountered again is treated as plain text. This mimics how
|
|
14
|
-
# Markdown handles fences and allows fence-like strings inside code blocks.
|
|
15
|
-
open_fence = None
|
|
16
|
-
for line in text.splitlines():
|
|
17
|
-
stripped = line.strip()
|
|
18
|
-
if open_fence is None:
|
|
19
|
-
m = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
|
|
20
|
-
if m:
|
|
21
|
-
open_fence = m.group("fence")
|
|
22
|
-
else:
|
|
23
|
-
if stripped.endswith(open_fence):
|
|
24
|
-
open_fence = None
|
|
25
|
-
|
|
26
|
-
# If a fence was left open, append a matching closing fence.
|
|
27
|
-
if open_fence is not None:
|
|
28
|
-
if not text.endswith("\n"):
|
|
29
|
-
text += "\n"
|
|
30
|
-
text += open_fence
|
|
31
|
-
|
|
32
|
-
cleaned_inline = code_block_re.sub("", text)
|
|
33
|
-
|
|
34
|
-
# Balance triple backticks that are not part of a complete fence.
|
|
35
|
-
if cleaned_inline.count("```") % 2 != 0:
|
|
36
|
-
text += "```"
|
|
37
|
-
|
|
38
|
-
# Balance single backticks outside fenced blocks.
|
|
39
|
-
cleaned_inline = code_block_re.sub("", text)
|
|
40
|
-
if cleaned_inline.count("`") % 2 != 0:
|
|
41
|
-
text += "`"
|
|
42
|
-
|
|
43
|
-
return text
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def extract_and_convert_code_blocks(text: str):
|
|
47
|
-
"""
|
|
48
|
-
Extracts code blocks from the text, converting them to HTML <pre><code> format,
|
|
49
|
-
and replaces them with placeholders. Also ensures closing delimiters for unmatched blocks.
|
|
50
|
-
"""
|
|
51
|
-
text = ensure_closing_delimiters(text)
|
|
52
|
-
placeholders = []
|
|
53
|
-
code_blocks = {}
|
|
54
|
-
|
|
55
|
-
def replacer(match):
|
|
56
|
-
language = match.group("lang") if match.group("lang") else ""
|
|
57
|
-
code_content = match.group("code")
|
|
58
|
-
|
|
59
|
-
# Properly escape HTML entities in code content
|
|
60
|
-
escaped_content = (
|
|
61
|
-
code_content.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
placeholder = f"CODEBLOCKPLACEHOLDER{len(placeholders)}"
|
|
65
|
-
placeholders.append(placeholder)
|
|
66
|
-
if not language:
|
|
67
|
-
html_code_block = f"<pre><code>{escaped_content}</code></pre>"
|
|
68
|
-
else:
|
|
69
|
-
html_code_block = (
|
|
70
|
-
f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
|
|
71
|
-
)
|
|
72
|
-
return (placeholder, html_code_block)
|
|
73
|
-
|
|
74
|
-
modified_text = text
|
|
75
|
-
code_block_pattern = re.compile(
|
|
76
|
-
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
|
|
77
|
-
flags=re.DOTALL,
|
|
78
|
-
)
|
|
79
|
-
for match in code_block_pattern.finditer(text):
|
|
80
|
-
placeholder, html_code_block = replacer(
|
|
81
|
-
match
|
|
82
|
-
)
|
|
83
|
-
code_blocks[placeholder] = html_code_block
|
|
84
|
-
modified_text = modified_text.replace(match.group(0), placeholder, 1)
|
|
85
|
-
|
|
86
|
-
return modified_text, code_blocks
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
|
|
90
|
-
"""
|
|
91
|
-
Reinserts HTML code blocks into the text, replacing their placeholders.
|
|
92
|
-
"""
|
|
93
|
-
for placeholder, html_code_block in code_blocks.items():
|
|
94
|
-
text = text.replace(placeholder, html_code_block, 1)
|
|
95
|
-
return text
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
def combine_blockquotes(text: str) -> str:
|
|
2
|
-
"""
|
|
3
|
-
Combines multiline blockquotes into a single blockquote while keeping the \n characters.
|
|
4
|
-
Supports both regular blockquotes (>) and expandable blockquotes (**>).
|
|
5
|
-
"""
|
|
6
|
-
lines = text.split("\n")
|
|
7
|
-
combined_lines = []
|
|
8
|
-
blockquote_lines = []
|
|
9
|
-
in_blockquote = False
|
|
10
|
-
is_expandable = False
|
|
11
|
-
|
|
12
|
-
for line in lines:
|
|
13
|
-
if line.startswith("**>"):
|
|
14
|
-
# Expandable blockquote
|
|
15
|
-
in_blockquote = True
|
|
16
|
-
is_expandable = True
|
|
17
|
-
blockquote_lines.append(line[3:].strip())
|
|
18
|
-
elif line.startswith(">"):
|
|
19
|
-
# Regular blockquote
|
|
20
|
-
if not in_blockquote:
|
|
21
|
-
# This is a new blockquote
|
|
22
|
-
in_blockquote = True
|
|
23
|
-
is_expandable = False
|
|
24
|
-
blockquote_lines.append(line[1:].strip())
|
|
25
|
-
else:
|
|
26
|
-
if in_blockquote:
|
|
27
|
-
# End of blockquote, combine the lines
|
|
28
|
-
if is_expandable:
|
|
29
|
-
combined_lines.append(
|
|
30
|
-
"<blockquote expandable>"
|
|
31
|
-
+ "\n".join(blockquote_lines)
|
|
32
|
-
+ "</blockquote>"
|
|
33
|
-
)
|
|
34
|
-
else:
|
|
35
|
-
combined_lines.append(
|
|
36
|
-
"<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
|
|
37
|
-
)
|
|
38
|
-
blockquote_lines = []
|
|
39
|
-
in_blockquote = False
|
|
40
|
-
is_expandable = False
|
|
41
|
-
combined_lines.append(line)
|
|
42
|
-
|
|
43
|
-
if in_blockquote:
|
|
44
|
-
# Handle the case where the file ends with a blockquote
|
|
45
|
-
if is_expandable:
|
|
46
|
-
combined_lines.append(
|
|
47
|
-
"<blockquote expandable>"
|
|
48
|
-
+ "\n".join(blockquote_lines)
|
|
49
|
-
+ "</blockquote>"
|
|
50
|
-
)
|
|
51
|
-
else:
|
|
52
|
-
combined_lines.append(
|
|
53
|
-
"<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
return "\n".join(combined_lines)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def fix_asterisk_equations(text: str) -> str:
|
|
60
|
-
"""
|
|
61
|
-
Replaces numeric expressions with '*' in them with '×'
|
|
62
|
-
to avoid accidental italic formatting.
|
|
63
|
-
e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
|
|
64
|
-
"""
|
|
65
|
-
import re
|
|
66
|
-
|
|
67
|
-
eq_pattern = re.compile(r"(\d+)\s*\*\s*(\d+)")
|
|
68
|
-
return eq_pattern.sub(r"\1×\2", text)
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
chatgpt_md_converter/__init__.py,sha256=HF8fLq9o1A4HMDjPWCQ43NSby_L29Zgd4S_g3ORyyCA,157
|
|
2
|
-
chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
|
|
3
|
-
chatgpt_md_converter/extractors.py,sha256=k1oRlocn0K4OyU3-k2mrhKanKNdU-664t1CTcf8hYdE,3212
|
|
4
|
-
chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
|
|
5
|
-
chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
|
|
6
|
-
chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
|
|
7
|
-
chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
|
|
8
|
-
chatgpt_md_converter-0.3.8.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
9
|
-
chatgpt_md_converter-0.3.8.dist-info/METADATA,sha256=ngfuia4mAfiHBySgX_hKii8ty1O9hOkCotqX9Fzidm4,5792
|
|
10
|
-
chatgpt_md_converter-0.3.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
11
|
-
chatgpt_md_converter-0.3.8.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
12
|
-
chatgpt_md_converter-0.3.8.dist-info/RECORD,,
|
|
File without changes
|
{chatgpt_md_converter-0.3.8.dist-info → chatgpt_md_converter-0.3.9.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|