chatgpt-md-converter 0.3.11__tar.gz → 0.4.0b1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/PKG-INFO +1 -1
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/__init__.py +14 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/handlers.py +3 -2
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/__init__.py +68 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/entity.py +64 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/__init__.py +13 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/blockquotes.py +117 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/headings.py +56 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/inline.py +295 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/links.py +59 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/parser.py +300 -0
- chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/utf16.py +50 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/preprocess.py +4 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/renderer.py +1 -2
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/PKG-INFO +1 -1
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/SOURCES.txt +12 -1
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/setup.py +1 -1
- chatgpt_md_converter-0.4.0b1/tests/test_entities.py +298 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/tests/test_parser.py +35 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/tests/test_roundtrip_markdown.py +7 -0
- chatgpt_md_converter-0.4.0b1/tests/test_telegram_api.py +501 -0
- chatgpt_md_converter-0.3.11/chatgpt_md_converter/__init__.py +0 -5
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/LICENSE +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/README.md +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/escaping.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/renderer.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/state.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/tree.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_splitter.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_to_markdown.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_formatter.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/__init__.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/code_blocks.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/inline.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/postprocess.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/setup.cfg +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/tests/test_html_to_markdown_inline_spacing.py +0 -0
- {chatgpt_md_converter-0.3.11 → chatgpt_md_converter-0.4.0b1}/tests/test_splitter.py +0 -0
chatgpt_md_converter/__init__.py (new file in 0.4.0b1)
@@ -0,0 +1,14 @@
+from .html_splitter import split_html_for_telegram
+from .html_to_markdown import html_to_telegram_markdown
+from .telegram_entities import (EntityType, TelegramEntity,
+                                telegram_format_entities)
+from .telegram_formatter import telegram_format
+
+__all__ = [
+    "telegram_format",
+    "telegram_format_entities",
+    "TelegramEntity",
+    "EntityType",
+    "split_html_for_telegram",
+    "html_to_telegram_markdown",
+]
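With this new top-level __init__.py, the entity-based converter is importable straight from the package root. A minimal sketch, reusing the example from the telegram_entities docstring shown further down:

from chatgpt_md_converter import telegram_format_entities

text, entities = telegram_format_entities("**Hello** world!")
# text     -> "Hello world!"
# entities -> [{"type": "bold", "offset": 0, "length": 5}]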
chatgpt_md_converter/html_markdown/handlers.py
@@ -169,11 +169,12 @@ def _handle_blockquote(node: Node, state: RenderState) -> str:
     expandable = "expandable" in node.attrs
     rendered: list[str] = []
     for index, line in enumerate(lines):
-        prefix = "**>" if expandable and index == 0 else ">"
         stripped = line.rstrip("\r")
         if expandable:
-
+            marker = ">**" if index == 0 else ">"
+            rendered.append(f"{marker} {stripped}" if stripped else marker)
         else:
+            prefix = ">"
         rendered.append(f"{prefix} {stripped}" if stripped else prefix)
     return "\n".join(rendered)
 
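The expandable branch now emits the ">**" spelling on the first line, which the new entity extractors below also recognize. For illustration, a standalone sketch of just the rewritten loop body (the surrounding function is not shown in this hunk; the lines value is a made-up stand-in):

lines = ["spoiler summary", "hidden details"]
expandable = True

rendered = []
for index, line in enumerate(lines):
    stripped = line.rstrip("\r")
    if expandable:
        marker = ">**" if index == 0 else ">"
        rendered.append(f"{marker} {stripped}" if stripped else marker)
    else:
        prefix = ">"
        rendered.append(f"{prefix} {stripped}" if stripped else prefix)

print("\n".join(rendered))
# >** spoiler summary
# > hidden details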
chatgpt_md_converter/telegram_entities/__init__.py (new file)
@@ -0,0 +1,68 @@
+"""
+Telegram entity conversion module.
+
+This module provides functions to convert Markdown text to Telegram's
+native entity format (plain text + MessageEntity objects).
+"""
+
+from typing import List, Tuple
+
+from .entity import EntityType, TelegramEntity
+from .parser import parse_entities
+
+
+def telegram_format_entities(text: str) -> Tuple[str, List[dict]]:
+    """
+    Convert Markdown text to Telegram format with entities.
+
+    This function parses Markdown syntax and returns plain text along with
+    a list of entity dictionaries suitable for the Telegram Bot API.
+
+    Supported Markdown elements:
+    - **bold**
+    - *italic* or _italic_
+    - __underline__
+    - ~~strikethrough~~
+    - ||spoiler||
+    - `inline code`
+    - ```language
+      code blocks
+      ```
+    - [link text](url)
+    - > blockquotes
+    - >** expandable blockquotes
+    - # Headings (converted to bold)
+    - Lists with - or *
+
+    Args:
+        text: Markdown-formatted text
+
+    Returns:
+        Tuple of (plain_text, entities) where:
+        - plain_text: Text with all Markdown markers removed
+        - entities: List of dicts with 'type', 'offset', 'length' keys
+          (plus 'url' for links, 'language' for code blocks)
+
+    Example:
+        >>> text, entities = telegram_format_entities("**Hello** world!")
+        >>> print(text)
+        Hello world!
+        >>> print(entities)
+        [{'type': 'bold', 'offset': 0, 'length': 5}]
+
+        # Use with python-telegram-bot:
+        await bot.send_message(chat_id, text=text, entities=entities)
+
+        # Use with aiogram:
+        await message.answer(text, entities=entities)
+    """
+    plain_text, entity_objects = parse_entities(text)
+    return plain_text, [e.to_dict() for e in entity_objects]
+
+
+__all__ = [
+    "telegram_format_entities",
+    "TelegramEntity",
+    "EntityType",
+    "parse_entities",
+]
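Since the returned entities are already plain dicts, they can be dropped directly into a Bot API sendMessage payload as well; a sketch, where the chat_id and the bot token are placeholders:

import json

from chatgpt_md_converter import telegram_format_entities

text, entities = telegram_format_entities("__underlined__ and ||hidden||")
payload = {"chat_id": 123456789, "text": text, "entities": entities}
body = json.dumps(payload)
# JSON body for POST https://api.telegram.org/bot<TOKEN>/sendMessage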
chatgpt_md_converter/telegram_entities/entity.py (new file)
@@ -0,0 +1,64 @@
+"""Telegram entity data structures."""
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+
+class EntityType(Enum):
+    """Telegram MessageEntity types."""
+
+    BOLD = "bold"
+    ITALIC = "italic"
+    UNDERLINE = "underline"
+    STRIKETHROUGH = "strikethrough"
+    SPOILER = "spoiler"
+    CODE = "code"
+    PRE = "pre"
+    TEXT_LINK = "text_link"
+    BLOCKQUOTE = "blockquote"
+    EXPANDABLE_BLOCKQUOTE = "expandable_blockquote"
+
+
+@dataclass
+class TelegramEntity:
+    """
+    Represents a Telegram MessageEntity.
+
+    Attributes:
+        type: The entity type (bold, italic, code, etc.)
+        offset: Start position in UTF-16 code units
+        length: Length in UTF-16 code units
+        url: URL for TEXT_LINK entities
+        language: Programming language for PRE (code block) entities
+    """
+
+    type: EntityType
+    offset: int
+    length: int
+    url: Optional[str] = None
+    language: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        """Convert to dict for JSON serialization / Telegram API."""
+        result = {
+            "type": self.type.value,
+            "offset": self.offset,
+            "length": self.length,
+        }
+        if self.url is not None:
+            result["url"] = self.url
+        if self.language is not None:
+            result["language"] = self.language
+        return result
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "TelegramEntity":
+        """Create entity from dictionary."""
+        return cls(
+            type=EntityType(data["type"]),
+            offset=data["offset"],
+            length=data["length"],
+            url=data.get("url"),
+            language=data.get("language"),
+        )
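A quick round-trip sketch of the dataclass helpers defined above (the URL is a made-up example):

from chatgpt_md_converter import EntityType, TelegramEntity

entity = TelegramEntity(type=EntityType.TEXT_LINK, offset=0, length=4, url="https://example.com")
as_dict = entity.to_dict()
# {"type": "text_link", "offset": 0, "length": 4, "url": "https://example.com"}
# the optional "language" key is omitted because it is None

assert TelegramEntity.from_dict(as_dict) == entity  # dataclass equality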
chatgpt_md_converter/telegram_entities/extractors/__init__.py (new file)
@@ -0,0 +1,13 @@
+"""Entity extractors for different Markdown elements."""
+
+from .blockquotes import extract_blockquote_entities
+from .headings import extract_heading_entities
+from .inline import extract_inline_formatting_entities
+from .links import extract_link_entities
+
+__all__ = [
+    "extract_inline_formatting_entities",
+    "extract_link_entities",
+    "extract_blockquote_entities",
+    "extract_heading_entities",
+]
chatgpt_md_converter/telegram_entities/extractors/blockquotes.py (new file)
@@ -0,0 +1,117 @@
+"""Blockquote entity extraction."""
+
+import re
+from typing import List, Tuple
+
+from ..entity import EntityType, TelegramEntity
+
+# Pattern for regular blockquotes: > text
+_BLOCKQUOTE_LINE_PATTERN = re.compile(r"^>(?!\*\*)\s?(.*)$", re.MULTILINE)
+
+# Pattern for expandable blockquotes: >** text or **> text
+_EXPANDABLE_BLOCKQUOTE_PATTERN = re.compile(
+    r"^(?:>\*\*|\*\*>)\s?(.*)$", re.MULTILINE
+)
+
+
+def extract_blockquote_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
+    """
+    Extract blockquotes and return plain text with BLOCKQUOTE entities.
+
+    Handles both regular (>) and expandable (>** or **>) blockquotes.
+    Consecutive blockquote lines are combined into a single entity.
+
+    Args:
+        text: Input text with blockquote markers
+
+    Returns:
+        Tuple of (text_without_markers, list_of_entities)
+    """
+    entities: List[TelegramEntity] = []
+
+    # First, handle expandable blockquotes
+    result_parts: List[str] = []
+
+    # Find all expandable blockquote lines and group consecutive ones
+    lines = text.split("\n")
+    i = 0
+    current_offset = 0
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Check for expandable blockquote
+        exp_match = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(line)
+        if exp_match:
+            # Collect consecutive expandable blockquote lines
+            quote_lines = []
+
+            while i < len(lines):
+                m = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(lines[i])
+                if m:
+                    quote_lines.append(m.group(1))
+                    i += 1
+                else:
+                    break
+
+            quote_content = "\n".join(quote_lines)
+            quote_offset = current_offset
+            current_offset += len(quote_content) + (1 if i < len(lines) else 0)
+
+            result_parts.append(quote_content)
+            if i < len(lines):
+                result_parts.append("\n")
+
+            entities.append(
+                TelegramEntity(
+                    type=EntityType.EXPANDABLE_BLOCKQUOTE,
+                    offset=quote_offset,
+                    length=len(quote_content),
+                )
+            )
+            continue
+
+        # Check for regular blockquote
+        reg_match = _BLOCKQUOTE_LINE_PATTERN.match(line)
+        if reg_match:
+            # Collect consecutive regular blockquote lines
+            quote_lines = []
+            start_offset = current_offset
+
+            while i < len(lines):
+                # Don't match expandable as regular
+                if _EXPANDABLE_BLOCKQUOTE_PATTERN.match(lines[i]):
+                    break
+                m = _BLOCKQUOTE_LINE_PATTERN.match(lines[i])
+                if m:
+                    quote_lines.append(m.group(1))
+                    i += 1
+                else:
+                    break
+
+            quote_content = "\n".join(quote_lines)
+            current_offset += len(quote_content) + (1 if i < len(lines) else 0)
+
+            result_parts.append(quote_content)
+            if i < len(lines):
+                result_parts.append("\n")
+
+            entities.append(
+                TelegramEntity(
+                    type=EntityType.BLOCKQUOTE,
+                    offset=start_offset,
+                    length=len(quote_content),
+                )
+            )
+            continue
+
+        # Regular line
+        current_offset += len(line) + (1 if i < len(lines) - 1 else 0)
+        result_parts.append(line)
+        if i < len(lines) - 1:
+            result_parts.append("\n")
+        i += 1
+
+    result_text = "".join(result_parts)
+
+    return result_text, entities
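A sketch of the expected behaviour of this extractor on made-up inputs (ASCII only, so character offsets and UTF-16 offsets coincide):

from chatgpt_md_converter.telegram_entities.extractors import extract_blockquote_entities

text, entities = extract_blockquote_entities("intro\n> quoted one\n> quoted two\nafter")
# text     -> "intro\nquoted one\nquoted two\nafter"
# entities -> one BLOCKQUOTE entity covering "quoted one\nquoted two" (offset=6, length=21)

text2, entities2 = extract_blockquote_entities(">** hidden one\n>** hidden two")
# text2     -> "hidden one\nhidden two"
# entities2 -> one EXPANDABLE_BLOCKQUOTE entity (offset=0, length=21)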
chatgpt_md_converter/telegram_entities/extractors/headings.py (new file)
@@ -0,0 +1,56 @@
+"""Heading entity extraction (converted to bold)."""
+
+import re
+from typing import List, Tuple
+
+from ..entity import EntityType, TelegramEntity
+
+# Pattern for Markdown headings: # Heading, ## Heading, etc.
+_HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
+
+
+def extract_heading_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
+    """
+    Extract Markdown headings and convert them to bold entities.
+
+    Telegram doesn't have native heading support, so headings are converted
+    to bold text (matching the HTML converter behavior).
+
+    Args:
+        text: Input text with Markdown headings
+
+    Returns:
+        Tuple of (text_with_headings_converted, list_of_bold_entities)
+    """
+    entities: List[TelegramEntity] = []
+    result_parts: List[str] = []
+    last_end = 0
+
+    for match in _HEADING_PATTERN.finditer(text):
+        # Add text before this heading
+        result_parts.append(text[last_end : match.start()])
+
+        # Calculate position in output
+        current_offset = sum(len(p) for p in result_parts)
+
+        # Extract heading text (without the # markers)
+        heading_text = match.group(2)
+
+        # Add the heading text
+        result_parts.append(heading_text)
+
+        # Create bold entity for the heading
+        entities.append(
+            TelegramEntity(
+                type=EntityType.BOLD,
+                offset=current_offset,
+                length=len(heading_text),
+            )
+        )
+
+        last_end = match.end()
+
+    # Add remaining text
+    result_parts.append(text[last_end:])
+
+    return "".join(result_parts), entities
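A small sketch of this extractor on a made-up heading:

from chatgpt_md_converter.telegram_entities.extractors import extract_heading_entities

text, entities = extract_heading_entities("# Release notes\nDetails below")
# text     -> "Release notes\nDetails below"
# entities -> one BOLD entity covering "Release notes" (offset=0, length=13)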
chatgpt_md_converter/telegram_entities/extractors/inline.py (new file)
@@ -0,0 +1,295 @@
+"""Inline formatting entity extraction (bold, italic, underline, etc.)."""
+
+import re
+from typing import List, Tuple
+
+from ..entity import EntityType, TelegramEntity
+
+# Patterns for different formatting types
+# Order matters - longer markers first to avoid partial matches
+_PATTERNS = [
+    # Bold+Italic: ***text***
+    (
+        re.compile(r"(?<![\\\*])\*\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)\*\*\*(?!\*)", re.DOTALL),
+        [EntityType.BOLD, EntityType.ITALIC],
+        3,
+    ),
+    # Underline+Italic: ___text___
+    (
+        re.compile(
+            r"(?<![\\_])___(?!_)(?=\S)([\s\S]*?)(?<=\S)___(?!_)",
+            re.DOTALL,
+        ),
+        [EntityType.UNDERLINE, EntityType.ITALIC],
+        3,
+    ),
+    # Bold: **text**
+    (
+        re.compile(r"(?<![\\\*])\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*\*(?!\*)", re.DOTALL),
+        [EntityType.BOLD],
+        2,
+    ),
+    # Underline: __text__
+    (
+        re.compile(
+            r"(?<![\\_])__(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)__(?!_)",
+            re.DOTALL,
+        ),
+        [EntityType.UNDERLINE],
+        2,
+    ),
+    # Strikethrough: ~~text~~
+    (
+        re.compile(r"(?<![\\~])~~(?!~)(?=\S)([\s\S]*?)(?<=\S)(?<!~)~~(?!~)", re.DOTALL),
+        [EntityType.STRIKETHROUGH],
+        2,
+    ),
+    # Spoiler: ||text||
+    (
+        re.compile(r"(?<![\\|])\|\|(?!\|)(?=\S)([^\n]*?)(?<=\S)(?<!\|)\|\|(?!\|)"),
+        [EntityType.SPOILER],
+        2,
+    ),
+    # Italic with asterisk: *text* (must not be adjacent to other asterisks)
+    (
+        re.compile(
+            r"(?<![A-Za-z0-9\\\*])\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*(?![A-Za-z0-9\*])",
+            re.DOTALL,
+        ),
+        [EntityType.ITALIC],
+        1,
+    ),
+    # Italic with underscore: _text_
+    (
+        re.compile(
+            r"(?<![A-Za-z0-9\\_])_(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)_(?![A-Za-z0-9_])",
+            re.DOTALL,
+        ),
+        [EntityType.ITALIC],
+        1,
+    ),
+]
+
+
+class _Match:
+    """Represents a formatting match with its properties."""
+
+    def __init__(
+        self,
+        start: int,
+        end: int,
+        inner_start: int,
+        inner_end: int,
+        entity_types: List[EntityType],
+        marker_len: int,
+    ):
+        self.start = start
+        self.end = end
+        self.inner_start = inner_start
+        self.inner_end = inner_end
+        self.entity_types = entity_types
+        self.marker_len = marker_len
+        self.children: List["_Match"] = []
+
+    def contains(self, other: "_Match") -> bool:
+        """Check if this match's inner content fully contains another match."""
+        return self.inner_start <= other.start and other.end <= self.inner_end
+
+
+def _find_all_matches(text: str) -> List[_Match]:
+    """Find all formatting matches in text."""
+    matches = []
+
+    for pattern, entity_types, marker_len in _PATTERNS:
+        for match in pattern.finditer(text):
+            matches.append(
+                _Match(
+                    start=match.start(),
+                    end=match.end(),
+                    inner_start=match.start() + marker_len,
+                    inner_end=match.end() - marker_len,
+                    entity_types=list(entity_types),
+                    marker_len=marker_len,
+                )
+            )
+
+    # Sort by start position, then by length descending (longer first)
+    matches.sort(key=lambda m: (m.start, -(m.end - m.start)))
+
+    return matches
+
+
+def _build_match_tree(matches: List[_Match]) -> List[_Match]:
+    """
+    Build a tree of matches where nested matches are children.
+    Returns only top-level matches (others are nested as children).
+    """
+    if not matches:
+        return []
+
+    result: List[_Match] = []
+
+    for match in matches:
+        # Find if this match should be nested inside an existing result
+        placed = False
+        for existing in result:
+            if existing.contains(match):
+                # Recursively try to place in existing's children
+                placed = _try_place_in_children(existing, match)
+                if placed:
+                    break
+
+        if not placed:
+            # Check if this match overlaps with any existing (invalid)
+            overlaps = False
+            for existing in result:
+                if _matches_overlap(match, existing):
+                    overlaps = True
+                    break
+
+            if not overlaps:
+                result.append(match)
+
+    return result
+
+
+def _try_place_in_children(parent: _Match, child: _Match) -> bool:
+    """Try to place a child match in the parent's children list."""
+    # First check if it fits in any existing child
+    for existing_child in parent.children:
+        if existing_child.contains(child):
+            return _try_place_in_children(existing_child, child)
+
+    # Check for overlaps with existing children
+    for existing_child in parent.children:
+        if _matches_overlap(child, existing_child):
+            return False
+
+    # Can add as a direct child
+    parent.children.append(child)
+    return True
+
+
+def _matches_overlap(m1: _Match, m2: _Match) -> bool:
+    """Check if two matches have invalid overlap (partial, not nested)."""
+    # No overlap
+    if m1.end <= m2.start or m2.end <= m1.start:
+        return False
+    # m1 contains m2 in inner content
+    if m1.inner_start <= m2.start and m2.end <= m1.inner_end:
+        return False
+    # m2 contains m1 in inner content
+    if m2.inner_start <= m1.start and m1.end <= m2.inner_end:
+        return False
+    # Invalid overlap
+    return True
+
+
+def _process_match(
+    text: str,
+    match: _Match,
+    base_offset: int,
+) -> Tuple[str, List[TelegramEntity]]:
+    """
+    Process a single match and its children, returning plain text and entities.
+
+    Args:
+        text: The text containing the match
+        match: The match to process
+        base_offset: Offset in the final output where this match starts
+
+    Returns:
+        Tuple of (processed_text, entities)
+    """
+    inner_text = text[match.inner_start : match.inner_end]
+    entities: List[TelegramEntity] = []
+
+    # If there are children, process them
+    if match.children:
+        # Sort children by position
+        match.children.sort(key=lambda m: m.start)
+
+        # Process children recursively
+        processed_parts: List[str] = []
+        child_entities: List[TelegramEntity] = []
+        last_end = match.inner_start
+
+        for child in match.children:
+            # Add text before this child
+            processed_parts.append(text[last_end : child.start])
+
+            # Calculate child's offset in the final output
+            child_offset = base_offset + sum(len(p) for p in processed_parts)
+
+            # Process child recursively
+            child_text, child_ents = _process_match(text, child, child_offset)
+            processed_parts.append(child_text)
+            child_entities.extend(child_ents)
+
+            last_end = child.end
+
+        # Add remaining text after last child
+        processed_parts.append(text[last_end : match.inner_end])
+
+        inner_text = "".join(processed_parts)
+        entities.extend(child_entities)
+
+    # Create entities for this match
+    for entity_type in match.entity_types:
+        entities.append(
+            TelegramEntity(
+                type=entity_type,
+                offset=base_offset,
+                length=len(inner_text),
+            )
+        )
+
+    return inner_text, entities
+
+
+def extract_inline_formatting_entities(
+    text: str,
+) -> Tuple[str, List[TelegramEntity]]:
+    """
+    Extract inline formatting (bold, italic, etc.) and return plain text with entities.
+
+    Handles nested formatting where one style is fully contained within another.
+
+    Args:
+        text: Input text with Markdown formatting markers
+
+    Returns:
+        Tuple of (text_without_markers, list_of_entities)
+    """
+    matches = _find_all_matches(text)
+    top_level_matches = _build_match_tree(matches)
+
+    if not top_level_matches:
+        return text, []
+
+    # Sort by position
+    top_level_matches.sort(key=lambda m: m.start)
+
+    # Process all matches
+    result_parts: List[str] = []
+    all_entities: List[TelegramEntity] = []
+    last_end = 0
+
+    for match in top_level_matches:
+        # Add text before this match
+        result_parts.append(text[last_end : match.start])
+
+        # Calculate offset for this match
+        current_offset = sum(len(p) for p in result_parts)
+
+        # Process match and its children
+        processed_text, entities = _process_match(text, match, current_offset)
+        result_parts.append(processed_text)
+        all_entities.extend(entities)
+
+        last_end = match.end
+
+    # Add remaining text
+    result_parts.append(text[last_end:])
+
+    return "".join(result_parts), all_entities
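A sketch of the nested-formatting behaviour described above, on a made-up ASCII input (so character offsets equal UTF-16 offsets):

from chatgpt_md_converter.telegram_entities.extractors import extract_inline_formatting_entities

text, entities = extract_inline_formatting_entities(
    "**bold with _italic_ inside** and ~~struck~~"
)
# text -> "bold with italic inside and struck"
# entities -> a BOLD entity (offset=0, length=23), an ITALIC entity nested inside it
#             (offset=10, length=6), and a STRIKETHROUGH entity (offset=28, length=6)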