chatgpt-md-converter 0.3.12__tar.gz → 0.4.0b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/PKG-INFO +1 -1
  2. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/__init__.py +14 -0
  3. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/__init__.py +68 -0
  4. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/entity.py +64 -0
  5. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/__init__.py +13 -0
  6. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/blockquotes.py +117 -0
  7. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/headings.py +56 -0
  8. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/inline.py +295 -0
  9. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/extractors/links.py +59 -0
  10. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/parser.py +300 -0
  11. chatgpt_md_converter-0.4.0b1/chatgpt_md_converter/telegram_entities/utf16.py +50 -0
  12. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/PKG-INFO +1 -1
  13. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/SOURCES.txt +12 -1
  14. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/setup.py +1 -1
  15. chatgpt_md_converter-0.4.0b1/tests/test_entities.py +298 -0
  16. chatgpt_md_converter-0.4.0b1/tests/test_telegram_api.py +501 -0
  17. chatgpt_md_converter-0.3.12/chatgpt_md_converter/__init__.py +0 -5
  18. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/LICENSE +0 -0
  19. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/README.md +0 -0
  20. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/escaping.py +0 -0
  21. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/handlers.py +0 -0
  22. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/renderer.py +0 -0
  23. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/state.py +0 -0
  24. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_markdown/tree.py +0 -0
  25. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_splitter.py +0 -0
  26. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/html_to_markdown.py +0 -0
  27. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_formatter.py +0 -0
  28. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/__init__.py +0 -0
  29. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/code_blocks.py +0 -0
  30. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/inline.py +0 -0
  31. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/postprocess.py +0 -0
  32. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/preprocess.py +0 -0
  33. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter/telegram_markdown/renderer.py +0 -0
  34. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/dependency_links.txt +0 -0
  35. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/chatgpt_md_converter.egg-info/top_level.txt +0 -0
  36. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/setup.cfg +0 -0
  37. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/tests/test_html_to_markdown_inline_spacing.py +0 -0
  38. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/tests/test_parser.py +0 -0
  39. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/tests/test_roundtrip_markdown.py +0 -0
  40. {chatgpt_md_converter-0.3.12 → chatgpt_md_converter-0.4.0b1}/tests/test_splitter.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.12
3
+ Version: 0.4.0b1
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -0,0 +1,14 @@
1
+ from .html_splitter import split_html_for_telegram
2
+ from .html_to_markdown import html_to_telegram_markdown
3
+ from .telegram_entities import (EntityType, TelegramEntity,
4
+ telegram_format_entities)
5
+ from .telegram_formatter import telegram_format
6
+
7
+ __all__ = [
8
+ "telegram_format",
9
+ "telegram_format_entities",
10
+ "TelegramEntity",
11
+ "EntityType",
12
+ "split_html_for_telegram",
13
+ "html_to_telegram_markdown",
14
+ ]
@@ -0,0 +1,68 @@
1
+ """
2
+ Telegram entity conversion module.
3
+
4
+ This module provides functions to convert Markdown text to Telegram's
5
+ native entity format (plain text + MessageEntity objects).
6
+ """
7
+
8
+ from typing import List, Tuple
9
+
10
+ from .entity import EntityType, TelegramEntity
11
+ from .parser import parse_entities
12
+
13
+
14
def telegram_format_entities(text: str) -> Tuple[str, List[dict]]:
    """
    Turn Markdown into plain text plus Telegram entity dictionaries.

    The input is handed to ``parse_entities``; each entity object it returns
    is then serialized through its ``to_dict()`` method.

    Markdown elements listed as supported:
    - **bold**
    - *italic* or _italic_
    - __underline__
    - ~~strikethrough~~
    - ||spoiler||
    - `inline code`
    - ```language
      code blocks
      ```
    - [link text](url)
    - > blockquotes
    - >** expandable blockquotes
    - # Headings (converted to bold)
    - Lists with - or *

    Args:
        text: Markdown-formatted text

    Returns:
        Tuple of (plain_text, entities) where:
        - plain_text: the text with Markdown markers removed
        - entities: list of dicts carrying 'type', 'offset' and 'length'
          (plus 'url' for links, 'language' for code blocks)

    Example:
        >>> text, entities = telegram_format_entities("**Hello** world!")
        >>> print(text)
        Hello world!
        >>> print(entities)
        [{'type': 'bold', 'offset': 0, 'length': 5}]

        # Use with python-telegram-bot:
        await bot.send_message(chat_id, text=text, entities=entities)

        # Use with aiogram:
        await message.answer(text, entities=entities)
    """
    stripped_text, parsed_entities = parse_entities(text)
    entity_dicts = [entity.to_dict() for entity in parsed_entities]
    return stripped_text, entity_dicts
61
+
62
+
63
+ __all__ = [
64
+ "telegram_format_entities",
65
+ "TelegramEntity",
66
+ "EntityType",
67
+ "parse_entities",
68
+ ]
@@ -0,0 +1,64 @@
1
+ """Telegram entity data structures."""
2
+
3
+ from dataclasses import dataclass
4
+ from enum import Enum
5
+ from typing import Optional
6
+
7
+
8
class EntityType(Enum):
    """
    Telegram MessageEntity types.

    Each member's value is the string that ``TelegramEntity.to_dict`` writes
    into the ``"type"`` field, and that ``TelegramEntity.from_dict`` reads back.
    """

    # Inline text styles
    BOLD = "bold"
    ITALIC = "italic"
    UNDERLINE = "underline"
    STRIKETHROUGH = "strikethrough"
    SPOILER = "spoiler"
    # Inline code span
    CODE = "code"
    # Code block; the entity may additionally carry a ``language``
    PRE = "pre"
    # Link with custom text; the entity carries the target in ``url``
    TEXT_LINK = "text_link"
    # Quote blocks (regular and expandable)
    BLOCKQUOTE = "blockquote"
    EXPANDABLE_BLOCKQUOTE = "expandable_blockquote"
21
+
22
+
23
@dataclass
class TelegramEntity:
    """
    One Telegram MessageEntity: a typed span over the plain message text.

    Attributes:
        type: Kind of entity (bold, italic, code, etc.)
        offset: Where the span starts, in UTF-16 code units
        length: How long the span is, in UTF-16 code units
        url: Link target; only meaningful for TEXT_LINK entities
        language: Programming language; only meaningful for PRE entities
    """

    type: EntityType
    offset: int
    length: int
    url: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize to a plain dict (JSON / Telegram API), omitting unset optionals."""
        payload = {
            "type": self.type.value,
            "offset": self.offset,
            "length": self.length,
        }
        # Optional keys are emitted only when set, in a fixed order.
        optional_fields = (("url", self.url), ("language", self.language))
        for key, value in optional_fields:
            if value is not None:
                payload[key] = value
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "TelegramEntity":
        """Build an entity from a dict shaped like the output of ``to_dict``."""
        kind = EntityType(data["type"])
        start = data["offset"]
        span = data["length"]
        return cls(kind, start, span, data.get("url"), data.get("language"))
@@ -0,0 +1,13 @@
1
+ """Entity extractors for different Markdown elements."""
2
+
3
+ from .blockquotes import extract_blockquote_entities
4
+ from .headings import extract_heading_entities
5
+ from .inline import extract_inline_formatting_entities
6
+ from .links import extract_link_entities
7
+
8
+ __all__ = [
9
+ "extract_inline_formatting_entities",
10
+ "extract_link_entities",
11
+ "extract_blockquote_entities",
12
+ "extract_heading_entities",
13
+ ]
@@ -0,0 +1,117 @@
1
+ """Blockquote entity extraction."""
2
+
3
+ import re
4
+ from typing import List, Tuple
5
+
6
+ from ..entity import EntityType, TelegramEntity
7
+
8
# Pattern for regular blockquotes: > text
# Matches ">" at the start of a line, at most one whitespace character, then
# the quoted text (group 1). The (?!\*\*) lookahead rejects lines starting
# with ">**" so they are left for the expandable pattern below.
_BLOCKQUOTE_LINE_PATTERN = re.compile(r"^>(?!\*\*)\s?(.*)$", re.MULTILINE)

# Pattern for expandable blockquotes: >** text or **> text
# Matches either marker at the start of a line, at most one whitespace
# character, then the quoted text (group 1).
_EXPANDABLE_BLOCKQUOTE_PATTERN = re.compile(
    r"^(?:>\*\*|\*\*>)\s?(.*)$", re.MULTILINE
)
15
+
16
+
17
def _classify_quote_line(line: str):
    """Return (entity type or None, line content with any quote marker removed)."""
    hit = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(line)
    if hit is not None:
        return EntityType.EXPANDABLE_BLOCKQUOTE, hit.group(1)
    hit = _BLOCKQUOTE_LINE_PATTERN.match(line)
    if hit is not None:
        return EntityType.BLOCKQUOTE, hit.group(1)
    return None, line


def extract_blockquote_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
    """
    Strip blockquote markers and describe each quote as an entity.

    Both regular (>) and expandable (>** or **>) quote lines are recognised.
    A run of consecutive lines of the same kind becomes a single entity whose
    content is those lines joined with newlines. Offsets and lengths are
    counted with ``len()`` on the output text.

    Args:
        text: Input text with blockquote markers

    Returns:
        Tuple of (text_without_markers, list_of_entities)
    """
    classified = [_classify_quote_line(raw) for raw in text.split("\n")]
    total = len(classified)

    pieces: List[str] = []
    found: List[TelegramEntity] = []
    position = 0  # length of everything emitted into ``pieces`` so far
    idx = 0

    while idx < total:
        kind, content = classified[idx]

        if kind is None:
            # Ordinary line: copied through unchanged.
            pieces.append(content)
            position += len(content)
            idx += 1
        else:
            # Extend the run over every following line of the same quote kind.
            run_end = idx
            while run_end < total and classified[run_end][0] is kind:
                run_end += 1

            body = "\n".join(part for _, part in classified[idx:run_end])
            found.append(
                TelegramEntity(type=kind, offset=position, length=len(body))
            )
            pieces.append(body)
            position += len(body)
            idx = run_end

        # Restore the newline consumed by split(), except after the last line.
        if idx < total:
            pieces.append("\n")
            position += 1

    return "".join(pieces), found
@@ -0,0 +1,56 @@
1
+ """Heading entity extraction (converted to bold)."""
2
+
3
+ import re
4
+ from typing import List, Tuple
5
+
6
+ from ..entity import EntityType, TelegramEntity
7
+
8
+ # Pattern for Markdown headings: # Heading, ## Heading, etc.
9
+ _HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
10
+
11
+
12
def extract_heading_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
    """
    Extract Markdown headings and convert them to bold entities.

    Telegram doesn't have native heading support, so each heading line is
    replaced by its text (the leading "#" markers and separator are dropped)
    and a BOLD entity covering that text is emitted. Offsets and lengths are
    counted with ``len()`` on the output text.

    Args:
        text: Input text with Markdown headings

    Returns:
        Tuple of (text_with_headings_converted, list_of_bold_entities)
    """
    entities: List[TelegramEntity] = []
    result_parts: List[str] = []
    last_end = 0
    # Running length of the output; kept incrementally so the offset of each
    # heading is O(1) instead of re-summing every part on each match.
    output_len = 0

    for match in _HEADING_PATTERN.finditer(text):
        # Text between the previous heading and this one is copied verbatim.
        preceding = text[last_end : match.start()]
        result_parts.append(preceding)
        output_len += len(preceding)

        # Heading text without the "#" markers.
        heading_text = match.group(2)
        result_parts.append(heading_text)

        entities.append(
            TelegramEntity(
                type=EntityType.BOLD,
                offset=output_len,
                length=len(heading_text),
            )
        )
        output_len += len(heading_text)

        last_end = match.end()

    # Whatever follows the last heading.
    result_parts.append(text[last_end:])

    return "".join(result_parts), entities
@@ -0,0 +1,295 @@
1
+ """Inline formatting entity extraction (bold, italic, underline, etc.)."""
2
+
3
+ import re
4
+ from typing import List, Tuple
5
+
6
+ from ..entity import EntityType, TelegramEntity
7
+
8
# Patterns for different formatting types
# Order matters - longer markers first to avoid partial matches
#
# Each entry is a 3-tuple:
#   (compiled regex, entity types emitted for a hit, marker length)
# The marker length is the number of marker characters on each side of the
# content; _find_all_matches uses it to derive the inner (content) span.
# Every regex refuses an opening marker preceded by a backslash or by another
# marker character, and requires non-whitespace right after the opening marker
# and right before the closing marker.
_PATTERNS = [
    # Bold+Italic: ***text***
    (
        re.compile(r"(?<![\\\*])\*\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)\*\*\*(?!\*)", re.DOTALL),
        [EntityType.BOLD, EntityType.ITALIC],
        3,
    ),
    # Underline+Italic: ___text___
    (
        re.compile(
            r"(?<![\\_])___(?!_)(?=\S)([\s\S]*?)(?<=\S)___(?!_)",
            re.DOTALL,
        ),
        [EntityType.UNDERLINE, EntityType.ITALIC],
        3,
    ),
    # Bold: **text**
    (
        re.compile(r"(?<![\\\*])\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*\*(?!\*)", re.DOTALL),
        [EntityType.BOLD],
        2,
    ),
    # Underline: __text__
    (
        re.compile(
            r"(?<![\\_])__(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)__(?!_)",
            re.DOTALL,
        ),
        [EntityType.UNDERLINE],
        2,
    ),
    # Strikethrough: ~~text~~
    (
        re.compile(r"(?<![\\~])~~(?!~)(?=\S)([\s\S]*?)(?<=\S)(?<!~)~~(?!~)", re.DOTALL),
        [EntityType.STRIKETHROUGH],
        2,
    ),
    # Spoiler: ||text||
    # Unlike the other patterns the content is [^\n]*?, so a spoiler cannot
    # span a line break.
    (
        re.compile(r"(?<![\\|])\|\|(?!\|)(?=\S)([^\n]*?)(?<=\S)(?<!\|)\|\|(?!\|)"),
        [EntityType.SPOILER],
        2,
    ),
    # Italic with asterisk: *text* (must not be adjacent to other asterisks)
    # The markers must also not touch an ASCII letter or digit on the outside,
    # so an asterisk inside a word (e.g. "2*3") does not start or end italics.
    (
        re.compile(
            r"(?<![A-Za-z0-9\\\*])\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*(?![A-Za-z0-9\*])",
            re.DOTALL,
        ),
        [EntityType.ITALIC],
        1,
    ),
    # Italic with underscore: _text_
    # Same outside-word rule as above, which keeps snake_case identifiers intact.
    (
        re.compile(
            r"(?<![A-Za-z0-9\\_])_(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)_(?![A-Za-z0-9_])",
            re.DOTALL,
        ),
        [EntityType.ITALIC],
        1,
    ),
]
72
+
73
+
74
class _Match:
    """A single formatting hit: outer span, inner span, and nested hits."""

    def __init__(
        self,
        start: int,
        end: int,
        inner_start: int,
        inner_end: int,
        entity_types: List[EntityType],
        marker_len: int,
    ):
        # Outer span: includes the opening and closing markers.
        self.start, self.end = start, end
        # Inner span: the content between the markers.
        self.inner_start, self.inner_end = inner_start, inner_end
        self.entity_types = entity_types
        self.marker_len = marker_len
        # Filled in later when the match tree is built.
        self.children: List["_Match"] = []

    def contains(self, other: "_Match") -> bool:
        """Tell whether ``other`` (markers included) lies within this match's content."""
        if other.start < self.inner_start:
            return False
        return other.end <= self.inner_end
97
+
98
+
99
def _find_all_matches(text: str) -> List[_Match]:
    """
    Collect every hit of every pattern in ``_PATTERNS``.

    The result is ordered by start position; among hits that start at the
    same position the longer one comes first.
    """
    found = [
        _Match(
            start=hit.start(),
            end=hit.end(),
            inner_start=hit.start() + width,
            inner_end=hit.end() - width,
            # Copy so each match owns its own list of entity types.
            entity_types=list(kinds),
            marker_len=width,
        )
        for regex, kinds, width in _PATTERNS
        for hit in regex.finditer(text)
    ]

    # (start - end) is the negated length, so longer spans sort earlier.
    return sorted(found, key=lambda m: (m.start, m.start - m.end))
120
+
121
+
122
def _build_match_tree(matches: List[_Match]) -> List[_Match]:
    """
    Build a tree of matches where nested matches are children.

    Matches are taken in the given order. A match that lies inside the
    content of an already accepted top-level match is offered to that match's
    subtree; if it cannot be nested there (it partially overlaps one of the
    descendants) it is discarded. A match that partially overlaps a top-level
    match is discarded as well. Everything else becomes a new top-level match.

    Args:
        matches: Candidate matches, expected in the order produced by
            ``_find_all_matches``

    Returns:
        Only the top-level matches; nested ones are reachable via ``children``
    """
    top_level: List[_Match] = []

    for match in matches:
        container = next(
            (existing for existing in top_level if existing.contains(match)),
            None,
        )
        if container is not None:
            # The match belongs inside ``container``. If nesting fails it must
            # be dropped, NOT promoted to top level: it would overlap
            # ``container`` and its text would be emitted twice.
            _try_place_in_children(container, match)
            continue

        # Not inside anything: accept unless it partially overlaps a peer.
        if not any(_matches_overlap(match, existing) for existing in top_level):
            top_level.append(match)

    return top_level
154
+
155
+
156
def _try_place_in_children(parent: _Match, child: _Match) -> bool:
    """
    Insert ``child`` somewhere below ``parent``.

    Returns True when the child was attached (directly or deeper in the
    tree), False when it partially overlaps an existing match and was
    therefore rejected.
    """
    siblings = parent.children

    # Descend into the first existing child whose content encloses the newcomer.
    enclosing = next((s for s in siblings if s.contains(child)), None)
    if enclosing is not None:
        return _try_place_in_children(enclosing, child)

    # A partial overlap with any sibling makes the newcomer invalid.
    if any(_matches_overlap(child, sibling) for sibling in siblings):
        return False

    siblings.append(child)
    return True
171
+
172
+
173
def _matches_overlap(m1: _Match, m2: _Match) -> bool:
    """
    Report whether two matches overlap partially.

    Disjoint spans, and spans where one match sits entirely inside the other
    match's content, are not considered overlapping.
    """
    disjoint = m1.end <= m2.start or m2.end <= m1.start
    m2_inside_m1 = m1.inner_start <= m2.start and m2.end <= m1.inner_end
    m1_inside_m2 = m2.inner_start <= m1.start and m1.end <= m2.inner_end
    return not (disjoint or m2_inside_m1 or m1_inside_m2)
186
+
187
+
188
def _process_match(
    text: str,
    match: _Match,
    base_offset: int,
) -> Tuple[str, List[TelegramEntity]]:
    """
    Process a single match and its children, returning plain text and entities.

    The match's markers are dropped; nested matches are processed recursively
    so their markers are dropped too. Entities of nested matches come first
    in the returned list, followed by one entity per entity type of this
    match, each covering the whole processed content. Offsets and lengths are
    counted with ``len()``.

    Note: ``match.children`` is sorted in place by start position.

    Args:
        text: The text containing the match
        match: The match to process
        base_offset: Offset in the final output where this match starts

    Returns:
        Tuple of (processed_text, entities)
    """
    entities: List[TelegramEntity] = []

    if not match.children:
        inner_text = text[match.inner_start : match.inner_end]
    else:
        match.children.sort(key=lambda m: m.start)

        processed_parts: List[str] = []
        # Running length of processed_parts; kept incrementally so each
        # child's offset is O(1) instead of re-summing all parts.
        emitted = 0
        last_end = match.inner_start

        for child in match.children:
            # Literal text between the previous child and this one.
            gap = text[last_end : child.start]
            processed_parts.append(gap)
            emitted += len(gap)

            child_text, child_entities = _process_match(
                text, child, base_offset + emitted
            )
            processed_parts.append(child_text)
            emitted += len(child_text)
            entities.extend(child_entities)

            last_end = child.end

        # Literal text after the last child.
        processed_parts.append(text[last_end : match.inner_end])
        inner_text = "".join(processed_parts)

    # One entity per type (e.g. ***x*** yields both BOLD and ITALIC).
    for entity_type in match.entity_types:
        entities.append(
            TelegramEntity(
                type=entity_type,
                offset=base_offset,
                length=len(inner_text),
            )
        )

    return inner_text, entities
248
+
249
+
250
def extract_inline_formatting_entities(
    text: str,
) -> Tuple[str, List[TelegramEntity]]:
    """
    Extract inline formatting (bold, italic, etc.) and return plain text with entities.

    Handles nested formatting where one style is fully contained within
    another. Offsets and lengths are counted with ``len()`` on the output
    text.

    Args:
        text: Input text with Markdown formatting markers

    Returns:
        Tuple of (text_without_markers, list_of_entities); the input text and
        an empty list when nothing matches
    """
    top_level_matches = _build_match_tree(_find_all_matches(text))

    if not top_level_matches:
        return text, []

    top_level_matches.sort(key=lambda m: m.start)

    result_parts: List[str] = []
    all_entities: List[TelegramEntity] = []
    last_end = 0
    # Running length of the output; kept incrementally so the offset of each
    # match is O(1) instead of re-summing every part on each iteration.
    output_len = 0

    for match in top_level_matches:
        # Literal text between the previous match and this one.
        preceding = text[last_end : match.start]
        result_parts.append(preceding)
        output_len += len(preceding)

        processed_text, entities = _process_match(text, match, output_len)
        result_parts.append(processed_text)
        output_len += len(processed_text)
        all_entities.extend(entities)

        last_end = match.end

    # Literal text after the last match.
    result_parts.append(text[last_end:])

    return "".join(result_parts), all_entities
@@ -0,0 +1,59 @@
1
+ """Link entity extraction."""
2
+
3
+ import re
4
+ from typing import List, Tuple
5
+
6
+ from ..entity import EntityType, TelegramEntity
7
+
8
# Pattern for Markdown links: [text](url)
# Also handles image links: ![alt](url) - treated the same as regular links
# (the optional leading "!" is matched but not captured).
# Group 1 is the link text; it may contain bracketed segments such as "[x]".
# Group 2 is the URL: one or more characters other than ")", so the match
# ends at the first ")" following "(".
_LINK_PATTERN = re.compile(r"!?\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)")
11
+
12
+
13
def extract_link_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
    """
    Extract Markdown links and return plain text with TEXT_LINK entities.

    Handles both regular links [text](url) and image links ![alt](url).
    Each link is replaced by its text (alt text for images) and a TEXT_LINK
    entity carrying the URL is emitted for it. Offsets and lengths are
    counted with ``len()`` on the output text.

    Args:
        text: Input text with Markdown links

    Returns:
        Tuple of (text_with_links_replaced, list_of_entities)
    """
    entities: List[TelegramEntity] = []
    result_parts: List[str] = []
    last_end = 0
    # Running length of the output; kept incrementally so the offset of each
    # link is O(1) instead of re-summing every part on each match.
    output_len = 0

    for match in _LINK_PATTERN.finditer(text):
        # Text between the previous link and this one is copied verbatim.
        preceding = text[last_end : match.start()]
        result_parts.append(preceding)
        output_len += len(preceding)

        link_text = match.group(1)
        url = match.group(2)

        # Only the link text survives in the output.
        result_parts.append(link_text)

        entities.append(
            TelegramEntity(
                type=EntityType.TEXT_LINK,
                offset=output_len,
                length=len(link_text),
                url=url,
            )
        )
        output_len += len(link_text)

        last_end = match.end()

    # Whatever follows the last link.
    result_parts.append(text[last_end:])

    return "".join(result_parts), entities