chatgpt-md-converter 0.3.12__py3-none-any.whl → 0.4.0b1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
chatgpt_md_converter/__init__.py
@@ -1,5 +1,14 @@
  from .html_splitter import split_html_for_telegram
  from .html_to_markdown import html_to_telegram_markdown
+ from .telegram_entities import (EntityType, TelegramEntity,
+                                 telegram_format_entities)
  from .telegram_formatter import telegram_format

- __all__ = ["telegram_format", "split_html_for_telegram", "html_to_telegram_markdown"]
+ __all__ = [
+     "telegram_format",
+     "telegram_format_entities",
+     "TelegramEntity",
+     "EntityType",
+     "split_html_for_telegram",
+     "html_to_telegram_markdown",
+ ]
chatgpt_md_converter/telegram_entities/__init__.py
@@ -0,0 +1,68 @@
+ """
+ Telegram entity conversion module.
+
+ This module provides functions to convert Markdown text to Telegram's
+ native entity format (plain text + MessageEntity objects).
+ """
+
+ from typing import List, Tuple
+
+ from .entity import EntityType, TelegramEntity
+ from .parser import parse_entities
+
+
+ def telegram_format_entities(text: str) -> Tuple[str, List[dict]]:
+     """
+     Convert Markdown text to Telegram format with entities.
+
+     This function parses Markdown syntax and returns plain text along with
+     a list of entity dictionaries suitable for the Telegram Bot API.
+
+     Supported Markdown elements:
+     - **bold**
+     - *italic* or _italic_
+     - __underline__
+     - ~~strikethrough~~
+     - ||spoiler||
+     - `inline code`
+     - ```language
+       code blocks
+       ```
+     - [link text](url)
+     - > blockquotes
+     - >** expandable blockquotes
+     - # Headings (converted to bold)
+     - Lists with - or *
+
+     Args:
+         text: Markdown-formatted text
+
+     Returns:
+         Tuple of (plain_text, entities) where:
+         - plain_text: Text with all Markdown markers removed
+         - entities: List of dicts with 'type', 'offset', 'length' keys
+           (plus 'url' for links, 'language' for code blocks)
+
+     Example:
+         >>> text, entities = telegram_format_entities("**Hello** world!")
+         >>> print(text)
+         Hello world!
+         >>> print(entities)
+         [{'type': 'bold', 'offset': 0, 'length': 5}]
+
+         # Use with python-telegram-bot:
+         await bot.send_message(chat_id, text=text, entities=entities)
+
+         # Use with aiogram:
+         await message.answer(text, entities=entities)
+     """
+     plain_text, entity_objects = parse_entities(text)
+     return plain_text, [e.to_dict() for e in entity_objects]
+
+
+ __all__ = [
+     "telegram_format_entities",
+     "TelegramEntity",
+     "EntityType",
+     "parse_entities",
+ ]
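The docstring above is the intended entry point for bots that send entity-formatted messages. As a rough, illustrative sketch (the `requests` call, bot token, and chat id below are placeholders, not part of the package), the returned tuple maps directly onto the Bot API's sendMessage parameters:

    import requests

    from chatgpt_md_converter import telegram_format_entities

    BOT_TOKEN = "123456:ABC-placeholder"   # placeholder token, not real
    CHAT_ID = 123456789                    # placeholder chat id

    text, entities = telegram_format_entities("**Hello** world!")
    # text == "Hello world!"
    # entities == [{"type": "bold", "offset": 0, "length": 5}]

    # sendMessage accepts a JSON array of MessageEntity objects in "entities".
    requests.post(
        f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage",
        json={"chat_id": CHAT_ID, "text": text, "entities": entities},
    )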
chatgpt_md_converter/telegram_entities/entity.py
@@ -0,0 +1,64 @@
+ """Telegram entity data structures."""
+
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Optional
+
+
+ class EntityType(Enum):
+     """Telegram MessageEntity types."""
+
+     BOLD = "bold"
+     ITALIC = "italic"
+     UNDERLINE = "underline"
+     STRIKETHROUGH = "strikethrough"
+     SPOILER = "spoiler"
+     CODE = "code"
+     PRE = "pre"
+     TEXT_LINK = "text_link"
+     BLOCKQUOTE = "blockquote"
+     EXPANDABLE_BLOCKQUOTE = "expandable_blockquote"
+
+
+ @dataclass
+ class TelegramEntity:
+     """
+     Represents a Telegram MessageEntity.
+
+     Attributes:
+         type: The entity type (bold, italic, code, etc.)
+         offset: Start position in UTF-16 code units
+         length: Length in UTF-16 code units
+         url: URL for TEXT_LINK entities
+         language: Programming language for PRE (code block) entities
+     """
+
+     type: EntityType
+     offset: int
+     length: int
+     url: Optional[str] = None
+     language: Optional[str] = None
+
+     def to_dict(self) -> dict:
+         """Convert to dict for JSON serialization / Telegram API."""
+         result = {
+             "type": self.type.value,
+             "offset": self.offset,
+             "length": self.length,
+         }
+         if self.url is not None:
+             result["url"] = self.url
+         if self.language is not None:
+             result["language"] = self.language
+         return result
+
+     @classmethod
+     def from_dict(cls, data: dict) -> "TelegramEntity":
+         """Create entity from dictionary."""
+         return cls(
+             type=EntityType(data["type"]),
+             offset=data["offset"],
+             length=data["length"],
+             url=data.get("url"),
+             language=data.get("language"),
+         )
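The dataclass above is a thin wrapper around the Bot API's MessageEntity shape. A small sketch of the to_dict/from_dict round trip it supports, using only the fields defined above:

    from chatgpt_md_converter import EntityType, TelegramEntity

    link = TelegramEntity(type=EntityType.TEXT_LINK, offset=0, length=4,
                          url="https://example.com")
    payload = link.to_dict()
    # {'type': 'text_link', 'offset': 0, 'length': 4, 'url': 'https://example.com'}
    assert TelegramEntity.from_dict(payload) == link  # dataclass equality holds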
chatgpt_md_converter/telegram_entities/extractors/__init__.py
@@ -0,0 +1,13 @@
+ """Entity extractors for different Markdown elements."""
+
+ from .blockquotes import extract_blockquote_entities
+ from .headings import extract_heading_entities
+ from .inline import extract_inline_formatting_entities
+ from .links import extract_link_entities
+
+ __all__ = [
+     "extract_inline_formatting_entities",
+     "extract_link_entities",
+     "extract_blockquote_entities",
+     "extract_heading_entities",
+ ]
chatgpt_md_converter/telegram_entities/extractors/blockquotes.py
@@ -0,0 +1,117 @@
+ """Blockquote entity extraction."""
+
+ import re
+ from typing import List, Tuple
+
+ from ..entity import EntityType, TelegramEntity
+
+ # Pattern for regular blockquotes: > text
+ _BLOCKQUOTE_LINE_PATTERN = re.compile(r"^>(?!\*\*)\s?(.*)$", re.MULTILINE)
+
+ # Pattern for expandable blockquotes: >** text or **> text
+ _EXPANDABLE_BLOCKQUOTE_PATTERN = re.compile(
+     r"^(?:>\*\*|\*\*>)\s?(.*)$", re.MULTILINE
+ )
+
+
+ def extract_blockquote_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
+     """
+     Extract blockquotes and return plain text with BLOCKQUOTE entities.
+
+     Handles both regular (>) and expandable (>** or **>) blockquotes.
+     Consecutive blockquote lines are combined into a single entity.
+
+     Args:
+         text: Input text with blockquote markers
+
+     Returns:
+         Tuple of (text_without_markers, list_of_entities)
+     """
+     entities: List[TelegramEntity] = []
+
+     # First, handle expandable blockquotes
+     result_parts: List[str] = []
+
+     # Find all expandable blockquote lines and group consecutive ones
+     lines = text.split("\n")
+     i = 0
+     current_offset = 0
+
+     while i < len(lines):
+         line = lines[i]
+
+         # Check for expandable blockquote
+         exp_match = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(line)
+         if exp_match:
+             # Collect consecutive expandable blockquote lines
+             quote_lines = []
+
+             while i < len(lines):
+                 m = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(lines[i])
+                 if m:
+                     quote_lines.append(m.group(1))
+                     i += 1
+                 else:
+                     break
+
+             quote_content = "\n".join(quote_lines)
+             quote_offset = current_offset
+             current_offset += len(quote_content) + (1 if i < len(lines) else 0)
+
+             result_parts.append(quote_content)
+             if i < len(lines):
+                 result_parts.append("\n")
+
+             entities.append(
+                 TelegramEntity(
+                     type=EntityType.EXPANDABLE_BLOCKQUOTE,
+                     offset=quote_offset,
+                     length=len(quote_content),
+                 )
+             )
+             continue
+
+         # Check for regular blockquote
+         reg_match = _BLOCKQUOTE_LINE_PATTERN.match(line)
+         if reg_match:
+             # Collect consecutive regular blockquote lines
+             quote_lines = []
+             start_offset = current_offset
+
+             while i < len(lines):
+                 # Don't match expandable as regular
+                 if _EXPANDABLE_BLOCKQUOTE_PATTERN.match(lines[i]):
+                     break
+                 m = _BLOCKQUOTE_LINE_PATTERN.match(lines[i])
+                 if m:
+                     quote_lines.append(m.group(1))
+                     i += 1
+                 else:
+                     break
+
+             quote_content = "\n".join(quote_lines)
+             current_offset += len(quote_content) + (1 if i < len(lines) else 0)
+
+             result_parts.append(quote_content)
+             if i < len(lines):
+                 result_parts.append("\n")
+
+             entities.append(
+                 TelegramEntity(
+                     type=EntityType.BLOCKQUOTE,
+                     offset=start_offset,
+                     length=len(quote_content),
+                 )
+             )
+             continue
+
+         # Regular line
+         current_offset += len(line) + (1 if i < len(lines) - 1 else 0)
+         result_parts.append(line)
+         if i < len(lines) - 1:
+             result_parts.append("\n")
+         i += 1
+
+     result_text = "".join(result_parts)
+
+     return result_text, entities
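As the docstring notes, consecutive quoted lines collapse into a single entity. A small worked example, with values traced from the code above rather than taken from a test suite:

    from chatgpt_md_converter.telegram_entities.extractors.blockquotes import (
        extract_blockquote_entities,
    )

    text, entities = extract_blockquote_entities("> first\n> second\nafter")
    # text == "first\nsecond\nafter"
    # entities == [TelegramEntity(type=EntityType.BLOCKQUOTE, offset=0, length=12)]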
chatgpt_md_converter/telegram_entities/extractors/headings.py
@@ -0,0 +1,56 @@
+ """Heading entity extraction (converted to bold)."""
+
+ import re
+ from typing import List, Tuple
+
+ from ..entity import EntityType, TelegramEntity
+
+ # Pattern for Markdown headings: # Heading, ## Heading, etc.
+ _HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
+
+
+ def extract_heading_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
+     """
+     Extract Markdown headings and convert them to bold entities.
+
+     Telegram doesn't have native heading support, so headings are converted
+     to bold text (matching the HTML converter behavior).
+
+     Args:
+         text: Input text with Markdown headings
+
+     Returns:
+         Tuple of (text_with_headings_converted, list_of_bold_entities)
+     """
+     entities: List[TelegramEntity] = []
+     result_parts: List[str] = []
+     last_end = 0
+
+     for match in _HEADING_PATTERN.finditer(text):
+         # Add text before this heading
+         result_parts.append(text[last_end : match.start()])
+
+         # Calculate position in output
+         current_offset = sum(len(p) for p in result_parts)
+
+         # Extract heading text (without the # markers)
+         heading_text = match.group(2)
+
+         # Add the heading text
+         result_parts.append(heading_text)
+
+         # Create bold entity for the heading
+         entities.append(
+             TelegramEntity(
+                 type=EntityType.BOLD,
+                 offset=current_offset,
+                 length=len(heading_text),
+             )
+         )
+
+         last_end = match.end()
+
+     # Add remaining text
+     result_parts.append(text[last_end:])
+
+     return "".join(result_parts), entities
chatgpt_md_converter/telegram_entities/extractors/inline.py
@@ -0,0 +1,295 @@
+ """Inline formatting entity extraction (bold, italic, underline, etc.)."""
+
+ import re
+ from typing import List, Tuple
+
+ from ..entity import EntityType, TelegramEntity
+
+ # Patterns for different formatting types
+ # Order matters - longer markers first to avoid partial matches
+ _PATTERNS = [
+     # Bold+Italic: ***text***
+     (
+         re.compile(r"(?<![\\\*])\*\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)\*\*\*(?!\*)", re.DOTALL),
+         [EntityType.BOLD, EntityType.ITALIC],
+         3,
+     ),
+     # Underline+Italic: ___text___
+     (
+         re.compile(
+             r"(?<![\\_])___(?!_)(?=\S)([\s\S]*?)(?<=\S)___(?!_)",
+             re.DOTALL,
+         ),
+         [EntityType.UNDERLINE, EntityType.ITALIC],
+         3,
+     ),
+     # Bold: **text**
+     (
+         re.compile(r"(?<![\\\*])\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*\*(?!\*)", re.DOTALL),
+         [EntityType.BOLD],
+         2,
+     ),
+     # Underline: __text__
+     (
+         re.compile(
+             r"(?<![\\_])__(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)__(?!_)",
+             re.DOTALL,
+         ),
+         [EntityType.UNDERLINE],
+         2,
+     ),
+     # Strikethrough: ~~text~~
+     (
+         re.compile(r"(?<![\\~])~~(?!~)(?=\S)([\s\S]*?)(?<=\S)(?<!~)~~(?!~)", re.DOTALL),
+         [EntityType.STRIKETHROUGH],
+         2,
+     ),
+     # Spoiler: ||text||
+     (
+         re.compile(r"(?<![\\|])\|\|(?!\|)(?=\S)([^\n]*?)(?<=\S)(?<!\|)\|\|(?!\|)"),
+         [EntityType.SPOILER],
+         2,
+     ),
+     # Italic with asterisk: *text* (must not be adjacent to other asterisks)
+     (
+         re.compile(
+             r"(?<![A-Za-z0-9\\\*])\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*(?![A-Za-z0-9\*])",
+             re.DOTALL,
+         ),
+         [EntityType.ITALIC],
+         1,
+     ),
+     # Italic with underscore: _text_
+     (
+         re.compile(
+             r"(?<![A-Za-z0-9\\_])_(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)_(?![A-Za-z0-9_])",
+             re.DOTALL,
+         ),
+         [EntityType.ITALIC],
+         1,
+     ),
+ ]
+
+
+ class _Match:
+     """Represents a formatting match with its properties."""
+
+     def __init__(
+         self,
+         start: int,
+         end: int,
+         inner_start: int,
+         inner_end: int,
+         entity_types: List[EntityType],
+         marker_len: int,
+     ):
+         self.start = start
+         self.end = end
+         self.inner_start = inner_start
+         self.inner_end = inner_end
+         self.entity_types = entity_types
+         self.marker_len = marker_len
+         self.children: List["_Match"] = []
+
+     def contains(self, other: "_Match") -> bool:
+         """Check if this match's inner content fully contains another match."""
+         return self.inner_start <= other.start and other.end <= self.inner_end
+
+
+ def _find_all_matches(text: str) -> List[_Match]:
+     """Find all formatting matches in text."""
+     matches = []
+
+     for pattern, entity_types, marker_len in _PATTERNS:
+         for match in pattern.finditer(text):
+             matches.append(
+                 _Match(
+                     start=match.start(),
+                     end=match.end(),
+                     inner_start=match.start() + marker_len,
+                     inner_end=match.end() - marker_len,
+                     entity_types=list(entity_types),
+                     marker_len=marker_len,
+                 )
+             )
+
+     # Sort by start position, then by length descending (longer first)
+     matches.sort(key=lambda m: (m.start, -(m.end - m.start)))
+
+     return matches
+
+
+ def _build_match_tree(matches: List[_Match]) -> List[_Match]:
+     """
+     Build a tree of matches where nested matches are children.
+     Returns only top-level matches (others are nested as children).
+     """
+     if not matches:
+         return []
+
+     result: List[_Match] = []
+
+     for match in matches:
+         # Find if this match should be nested inside an existing result
+         placed = False
+         for existing in result:
+             if existing.contains(match):
+                 # Recursively try to place in existing's children
+                 placed = _try_place_in_children(existing, match)
+                 if placed:
+                     break
+
+         if not placed:
+             # Check if this match overlaps with any existing (invalid)
+             overlaps = False
+             for existing in result:
+                 if _matches_overlap(match, existing):
+                     overlaps = True
+                     break
+
+             if not overlaps:
+                 result.append(match)
+
+     return result
+
+
+ def _try_place_in_children(parent: _Match, child: _Match) -> bool:
+     """Try to place a child match in the parent's children list."""
+     # First check if it fits in any existing child
+     for existing_child in parent.children:
+         if existing_child.contains(child):
+             return _try_place_in_children(existing_child, child)
+
+     # Check for overlaps with existing children
+     for existing_child in parent.children:
+         if _matches_overlap(child, existing_child):
+             return False
+
+     # Can add as a direct child
+     parent.children.append(child)
+     return True
+
+
+ def _matches_overlap(m1: _Match, m2: _Match) -> bool:
+     """Check if two matches have invalid overlap (partial, not nested)."""
+     # No overlap
+     if m1.end <= m2.start or m2.end <= m1.start:
+         return False
+     # m1 contains m2 in inner content
+     if m1.inner_start <= m2.start and m2.end <= m1.inner_end:
+         return False
+     # m2 contains m1 in inner content
+     if m2.inner_start <= m1.start and m1.end <= m2.inner_end:
+         return False
+     # Invalid overlap
+     return True
+
+
+ def _process_match(
+     text: str,
+     match: _Match,
+     base_offset: int,
+ ) -> Tuple[str, List[TelegramEntity]]:
+     """
+     Process a single match and its children, returning plain text and entities.
+
+     Args:
+         text: The text containing the match
+         match: The match to process
+         base_offset: Offset in the final output where this match starts
+
+     Returns:
+         Tuple of (processed_text, entities)
+     """
+     inner_text = text[match.inner_start : match.inner_end]
+     entities: List[TelegramEntity] = []
+
+     # If there are children, process them
+     if match.children:
+         # Sort children by position
+         match.children.sort(key=lambda m: m.start)
+
+         # Process children recursively
+         processed_parts: List[str] = []
+         child_entities: List[TelegramEntity] = []
+         last_end = match.inner_start
+
+         for child in match.children:
+             # Add text before this child
+             processed_parts.append(text[last_end : child.start])
+
+             # Calculate child's offset in the final output
+             child_offset = base_offset + sum(len(p) for p in processed_parts)
+
+             # Process child recursively
+             child_text, child_ents = _process_match(text, child, child_offset)
+             processed_parts.append(child_text)
+             child_entities.extend(child_ents)
+
+             last_end = child.end
+
+         # Add remaining text after last child
+         processed_parts.append(text[last_end : match.inner_end])
+
+         inner_text = "".join(processed_parts)
+         entities.extend(child_entities)
+
+     # Create entities for this match
+     for entity_type in match.entity_types:
+         entities.append(
+             TelegramEntity(
+                 type=entity_type,
+                 offset=base_offset,
+                 length=len(inner_text),
+             )
+         )
+
+     return inner_text, entities
+
+
+ def extract_inline_formatting_entities(
+     text: str,
+ ) -> Tuple[str, List[TelegramEntity]]:
+     """
+     Extract inline formatting (bold, italic, etc.) and return plain text with entities.
+
+     Handles nested formatting where one style is fully contained within another.
+
+     Args:
+         text: Input text with Markdown formatting markers
+
+     Returns:
+         Tuple of (text_without_markers, list_of_entities)
+     """
+     matches = _find_all_matches(text)
+     top_level_matches = _build_match_tree(matches)
+
+     if not top_level_matches:
+         return text, []
+
+     # Sort by position
+     top_level_matches.sort(key=lambda m: m.start)
+
+     # Process all matches
+     result_parts: List[str] = []
+     all_entities: List[TelegramEntity] = []
+     last_end = 0
+
+     for match in top_level_matches:
+         # Add text before this match
+         result_parts.append(text[last_end : match.start])
+
+         # Calculate offset for this match
+         current_offset = sum(len(p) for p in result_parts)
+
+         # Process match and its children
+         processed_text, entities = _process_match(text, match, current_offset)
+         result_parts.append(processed_text)
+         all_entities.extend(entities)
+
+         last_end = match.end
+
+     # Add remaining text
+     result_parts.append(text[last_end:])
+
+     return "".join(result_parts), all_entities
chatgpt_md_converter/telegram_entities/extractors/links.py
@@ -0,0 +1,59 @@
+ """Link entity extraction."""
+
+ import re
+ from typing import List, Tuple
+
+ from ..entity import EntityType, TelegramEntity
+
+ # Pattern for Markdown links: [text](url)
+ # Also handles image links: ![alt](url) - treated the same as regular links
+ _LINK_PATTERN = re.compile(r"!?\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)")
+
+
+ def extract_link_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
+     """
+     Extract Markdown links and return plain text with TEXT_LINK entities.
+
+     Handles both regular links [text](url) and image links ![alt](url).
+     Image links are converted to text links showing the alt text.
+
+     Args:
+         text: Input text with Markdown links
+
+     Returns:
+         Tuple of (text_with_links_replaced, list_of_entities)
+     """
+     entities: List[TelegramEntity] = []
+     result_parts: List[str] = []
+     last_end = 0
+
+     for match in _LINK_PATTERN.finditer(text):
+         # Add text before this link
+         result_parts.append(text[last_end : match.start()])
+
+         # Calculate position in output
+         current_offset = sum(len(p) for p in result_parts)
+
+         # Extract link text and URL
+         link_text = match.group(1)
+         url = match.group(2)
+
+         # Add the link text (without the markdown syntax)
+         result_parts.append(link_text)
+
+         # Create entity
+         entities.append(
+             TelegramEntity(
+                 type=EntityType.TEXT_LINK,
+                 offset=current_offset,
+                 length=len(link_text),
+                 url=url,
+             )
+         )
+
+         last_end = match.end()
+
+     # Add remaining text
+     result_parts.append(text[last_end:])
+
+     return "".join(result_parts), entities
chatgpt_md_converter/telegram_entities/parser.py
@@ -0,0 +1,300 @@
+ """Main parser that combines all entity extractors."""
+
+ import re
+ from typing import List, Tuple
+
+ from .entity import EntityType, TelegramEntity
+ from .extractors import (extract_blockquote_entities, extract_heading_entities,
+                          extract_inline_formatting_entities,
+                          extract_link_entities)
+ from .utf16 import utf16_len
+
+ # Placeholder prefix for protected content
+ _CODE_BLOCK_PLACEHOLDER = "\x00CODEBLOCK"
+ _INLINE_CODE_PLACEHOLDER = "\x00INLINECODE"
+
+
+ def _convert_list_markers(text: str) -> str:
+     """Convert Markdown list markers (* or -) to bullet points."""
+     return re.sub(r"^(\s*)[\-\*]\s+", r"\1• ", text, flags=re.MULTILINE)
+
+
+ def _remove_citation_markers(text: str) -> str:
+     """Remove ChatGPT-style citation markers like 【1】."""
+     return re.sub(r"【[^】]+】", "", text)
+
+
+ def _adjust_entities_to_utf16(
+     text: str, entities: List[TelegramEntity]
+ ) -> List[TelegramEntity]:
+     """
+     Convert entity offsets and lengths from Python char indices to UTF-16 code units.
+
+     Telegram requires UTF-16 code units for entity positions.
+     """
+     adjusted = []
+     for entity in entities:
+         # Clamp offset and length to text bounds
+         offset = min(entity.offset, len(text))
+         length = min(entity.length, len(text) - offset)
+
+         if length <= 0:
+             continue
+
+         # Get the text portions
+         before_text = text[:offset]
+         entity_text = text[offset : offset + length]
+
+         # Convert to UTF-16 units
+         utf16_offset = utf16_len(before_text)
+         utf16_length = utf16_len(entity_text)
+
+         if utf16_length > 0:
+             adjusted.append(
+                 TelegramEntity(
+                     type=entity.type,
+                     offset=utf16_offset,
+                     length=utf16_length,
+                     url=entity.url,
+                     language=entity.language,
+                 )
+             )
+
+     return adjusted
+
+
+ def _validate_and_sort_entities(
+     entities: List[TelegramEntity],
+ ) -> List[TelegramEntity]:
+     """
+     Sort entities by offset and filter invalid ones.
+     """
+     # Filter out zero-length and negative entities
+     entities = [e for e in entities if e.length > 0 and e.offset >= 0]
+
+     # Sort by offset, then by length descending (longer first for nesting)
+     entities = sorted(entities, key=lambda e: (e.offset, -e.length))
+
+     return entities
+
+
+ def _clean_multiple_newlines(text: str) -> str:
+     """Reduce 3+ consecutive newlines to just 2."""
+     return re.sub(r"\n{3,}", "\n\n", text)
+
+
+ def _extract_with_placeholders(
+     text: str, pattern: re.Pattern, placeholder_prefix: str
+ ) -> Tuple[str, dict]:
+     """
+     Extract matches and replace with placeholders.
+     Returns (modified_text, {placeholder: (content, entity_info)})
+     """
+     extractions = {}
+     counter = [0]
+
+     def replacer(match):
+         placeholder = f"{placeholder_prefix}{counter[0]}\x00"
+         counter[0] += 1
+         extractions[placeholder] = match
+         return placeholder
+
+     modified = pattern.sub(replacer, text)
+     return modified, extractions
+
+
+ def parse_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
+     """
+     Parse Markdown text and return plain text with Telegram entities.
+
+     Uses a placeholder-based approach to handle the order of extraction correctly:
+     1. Replace code blocks and inline code with placeholders
+     2. Extract all other formatting (blockquotes, headings, links, inline styles)
+     3. Restore placeholders and calculate final offsets
+
+     Args:
+         text: Markdown-formatted text
+
+     Returns:
+         Tuple of (plain_text, list_of_entities)
+         Entities have offsets/lengths in UTF-16 code units.
+     """
+     all_entities: List[TelegramEntity] = []
+
+     # Phase 1: Extract code blocks to placeholders
+     code_block_pattern = re.compile(
+         r"(?P<fence>`{3,})(?P<lang>\w+)?\n(?P<code>[\s\S]*?)(?P=fence)",
+         flags=re.MULTILINE,
+     )
+     code_block_map = {}
+     code_block_counter = [0]
+
+     def replace_code_block(match):
+         placeholder = f"{_CODE_BLOCK_PLACEHOLDER}{code_block_counter[0]}\x00"
+         code_block_counter[0] += 1
+         # Strip trailing newline from code content (appears before closing fence)
+         code_content = match.group("code").rstrip("\n")
+         language = match.group("lang") or None
+         code_block_map[placeholder] = (code_content, language)
+         return placeholder
+
+     # Ensure closing delimiters
+     text = _ensure_closing_delimiters(text)
+     text = code_block_pattern.sub(replace_code_block, text)
+
+     # Phase 2: Extract inline code to placeholders
+     inline_code_pattern = re.compile(r"`([^`\n]+)`")
+     inline_code_map = {}
+     inline_code_counter = [0]
+
+     def replace_inline_code(match):
+         placeholder = f"{_INLINE_CODE_PLACEHOLDER}{inline_code_counter[0]}\x00"
+         inline_code_counter[0] += 1
+         code_content = match.group(1)
+         inline_code_map[placeholder] = code_content
+         return placeholder
+
+     text = inline_code_pattern.sub(replace_inline_code, text)
+
+     # Phase 3: Extract other formatting (on text with placeholders)
+     # Order matters: inline formatting first (removes markers), then links
+     text, blockquote_entities = extract_blockquote_entities(text)
+     all_entities.extend(blockquote_entities)
+
+     text, heading_entities = extract_heading_entities(text)
+     all_entities.extend(heading_entities)
+
+     text, inline_entities = extract_inline_formatting_entities(text)
+     all_entities.extend(inline_entities)
+
+     # Extract links AFTER inline formatting so offsets are correct
+     text, link_entities = extract_link_entities(text)
+     all_entities.extend(link_entities)
+
+     # Phase 4: Restore code placeholders and create entities
+     # Collect all placeholders with their info
+     all_placeholders = []
+
+     for placeholder, (code_content, language) in code_block_map.items():
+         if placeholder in text:
+             pos = text.find(placeholder)
+             all_placeholders.append({
+                 'placeholder': placeholder,
+                 'content': code_content,
+                 'position': pos,
+                 'type': EntityType.PRE,
+                 'language': language,
+             })
+
+     for placeholder, code_content in inline_code_map.items():
+         if placeholder in text:
+             pos = text.find(placeholder)
+             all_placeholders.append({
+                 'placeholder': placeholder,
+                 'content': code_content,
+                 'position': pos,
+                 'type': EntityType.CODE,
+                 'language': None,
+             })
+
+     # Sort by position ascending (restore from start to end)
+     # This way, when we shift entities, the later entities get adjusted correctly
+     all_placeholders.sort(key=lambda x: x['position'])
+
+     code_entities: List[TelegramEntity] = []
+
+     for ph_info in all_placeholders:
+         placeholder = ph_info['placeholder']
+         code_content = ph_info['content']
+         offset = text.find(placeholder)
+         text = text.replace(placeholder, code_content, 1)
+
+         code_entities.append(
+             TelegramEntity(
+                 type=ph_info['type'],
+                 offset=offset,
+                 length=len(code_content),
+                 language=ph_info['language'],
+             )
+         )
+
+         # Adjust existing entities (both all_entities and code_entities) after this position
+         placeholder_len = len(placeholder)
+         content_len = len(code_content)
+         shift = content_len - placeholder_len
+         all_entities = _shift_entities_after(all_entities, offset, shift)
+         # Also shift already-created code entities (except the one we just added)
+         code_entities = _shift_entities_after(code_entities[:-1], offset, shift) + [code_entities[-1]]
+
+     all_entities.extend(code_entities)
+
+     # Phase 5: Clean up
+     text = _convert_list_markers(text)
+     text = _remove_citation_markers(text)
+     text = _clean_multiple_newlines(text)
+
+     # Validate and sort entities
+     all_entities = _validate_and_sort_entities(all_entities)
+
+     # Convert to UTF-16 offsets
+     all_entities = _adjust_entities_to_utf16(text, all_entities)
+
+     return text.strip(), all_entities
+
+
+ def _shift_entities_after(
+     entities: List[TelegramEntity], position: int, shift: int
+ ) -> List[TelegramEntity]:
+     """Shift entity offsets that come after a given position."""
+     result = []
+     for e in entities:
+         if e.offset >= position:
+             result.append(
+                 TelegramEntity(
+                     type=e.type,
+                     offset=e.offset + shift,
+                     length=e.length,
+                     url=e.url,
+                     language=e.language,
+                 )
+             )
+         else:
+             result.append(e)
+     return result
+
+
+ def _ensure_closing_delimiters(text: str) -> str:
+     """Append any missing closing backtick fences for Markdown code blocks."""
+     code_block_re = re.compile(
+         r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
+         flags=re.DOTALL,
+     )
+
+     open_fence = None
+     for line in text.splitlines():
+         stripped = line.strip()
+         if open_fence is None:
+             match = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
+             if match:
+                 open_fence = match.group("fence")
+         else:
+             if stripped == open_fence:
+                 open_fence = None
+
+     if open_fence is not None:
+         if not text.endswith("\n"):
+             text += "\n"
+         text += open_fence
+
+     # Check for unclosed triple backticks
+     temp = code_block_re.sub("", text)
+     if temp.count("```") % 2 != 0:
+         text += "\n```"
+
+     # Check for unclosed single backticks (inline code)
+     temp = code_block_re.sub("", text)
+     temp = re.sub(r"``+", "", temp)
+     if temp.count("`") % 2 != 0:
+         text += "`"
+
+     return text
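The placeholder flow is easiest to see with inline code: the backtick span is protected in phase 2 so the other extractors never touch it, then restored in phase 4 as a CODE entity at its final position. A hand-traced sketch of the expected result:

    from chatgpt_md_converter.telegram_entities.parser import parse_entities

    text, entities = parse_entities("Use `pip` to install.")
    # text == "Use pip to install."
    # entities == [TelegramEntity(type=EntityType.CODE, offset=4, length=3)]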
chatgpt_md_converter/telegram_entities/utf16.py
@@ -0,0 +1,50 @@
+ """UTF-16 encoding utilities for Telegram entity offset calculation."""
+
+
+ def utf16_len(text: str) -> int:
+     """
+     Calculate the length of a string in UTF-16 code units.
+
+     Telegram uses UTF-16 code units for entity offsets and lengths.
+     Characters outside the Basic Multilingual Plane (like emoji) take 2 units.
+
+     Args:
+         text: The string to measure
+
+     Returns:
+         Length in UTF-16 code units
+     """
+     return len(text.encode("utf-16-le")) // 2
+
+
+ def char_to_utf16_offset(text: str, char_index: int) -> int:
+     """
+     Convert a Python string index to a UTF-16 offset.
+
+     Args:
+         text: The full text string
+         char_index: Python string index (0-based)
+
+     Returns:
+         UTF-16 offset for the same position
+     """
+     return utf16_len(text[:char_index])
+
+
+ def utf16_to_char_offset(text: str, utf16_offset: int) -> int:
+     """
+     Convert a UTF-16 offset to a Python string index.
+
+     Args:
+         text: The full text string
+         utf16_offset: UTF-16 offset
+
+     Returns:
+         Python string index for the same position
+     """
+     current_utf16 = 0
+     for i, char in enumerate(text):
+         if current_utf16 >= utf16_offset:
+             return i
+         current_utf16 += len(char.encode("utf-16-le")) // 2
+     return len(text)
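A quick check of the UTF-16 arithmetic: ASCII characters count as one code unit, while astral-plane characters such as emoji count as two.

    from chatgpt_md_converter.telegram_entities.utf16 import (
        char_to_utf16_offset,
        utf16_len,
        utf16_to_char_offset,
    )

    assert utf16_len("Hi 😀") == 5              # 3 ASCII units + 2 units for the emoji
    assert char_to_utf16_offset("😀x", 1) == 2  # "x" starts at UTF-16 offset 2
    assert utf16_to_char_offset("😀x", 2) == 1  # and maps back to Python index 1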
chatgpt_md_converter-0.4.0b1.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: chatgpt_md_converter
- Version: 0.3.12
+ Version: 0.4.0b1
  Summary: A package for converting markdown to HTML for chat Telegram bots
  Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
  Author: Kostiantyn Kriuchkov
chatgpt_md_converter-0.4.0b1.dist-info/RECORD
@@ -0,0 +1,29 @@
+ chatgpt_md_converter/__init__.py,sha256=pq0o14l7pBFPm-YsLj0A7nO2FPgF48MSRCEc7b9ktGQ,447
+ chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
+ chatgpt_md_converter/html_to_markdown.py,sha256=XlLpQD7W_AooWrvTtvrGVwfPPa80tDKWuT1iT6Vzygw,174
+ chatgpt_md_converter/telegram_formatter.py,sha256=w3tjoSdRH_UdoFmGeXe7I47dhDIceXuGOA1oCLMnUmM,87
+ chatgpt_md_converter/html_markdown/escaping.py,sha256=wJA4vUJQVcxpkJ4sCIYIWKaqffb_O72R93H81hTgTxA,1808
+ chatgpt_md_converter/html_markdown/handlers.py,sha256=zKGRg__41SP7bKs8jodOWAEZJb2FNaC5_raoUiWdBUE,6696
+ chatgpt_md_converter/html_markdown/renderer.py,sha256=en-fAr3Bhmm4ZndDaPKV8nLVQ_7HpS_NFBSWcrQporY,438
+ chatgpt_md_converter/html_markdown/state.py,sha256=sxbz0ucCakI0KgR86EMZx0nvfU1oiqgVUofujFTeKoo,432
+ chatgpt_md_converter/html_markdown/tree.py,sha256=ryohrhO2X5QepZev3087qPoGmMznqHDwH00TNGoW6a4,2154
+ chatgpt_md_converter/telegram_entities/__init__.py,sha256=dopG-8_gWX8xPeD-9dyHdurs5VPrz-wAFFRvHNKiUNg,1855
+ chatgpt_md_converter/telegram_entities/entity.py,sha256=oygQxwBsE7AGm2etq6HFZIeo7tBCwsUGniLP17-_Oz0,1705
+ chatgpt_md_converter/telegram_entities/parser.py,sha256=P7uQeGaNLLuFa5QLEkkEhdSqaB9xIlUwuZjbXZ8hkGQ,9885
+ chatgpt_md_converter/telegram_entities/utf16.py,sha256=eH-yX7d1wZwb3nRdk3kq1LFd-NQMqYHutPbkvX5_DC0,1283
+ chatgpt_md_converter/telegram_entities/extractors/__init__.py,sha256=FinTAoRNjuHza0LcEBtpNnBvSR8PFo6cVVDkLg0cV6w,407
+ chatgpt_md_converter/telegram_entities/extractors/blockquotes.py,sha256=Di8nG5Oej0hLbBB-WJ3GtlZCvCaa_BNmoUdpFGo9mnY,3596
+ chatgpt_md_converter/telegram_entities/extractors/headings.py,sha256=AzjF9jElWfw3d4Qx-81fku7gyTkvb0pKlmow0zUXSk4,1602
+ chatgpt_md_converter/telegram_entities/extractors/inline.py,sha256=DYSs7cJEFY3-fGtdMdOA7DO5ERtEF8r2GQns5WcPyto,8745
+ chatgpt_md_converter/telegram_entities/extractors/links.py,sha256=fe35PDGKzbF0cRac3HgQ9mVFvYvrtt9LDmS_pL9GPlk,1671
+ chatgpt_md_converter/telegram_markdown/__init__.py,sha256=C0Oexz9brpdE-TqEpiAUV78TsZdSrnnH_5yYpEJ03Us,131
+ chatgpt_md_converter/telegram_markdown/code_blocks.py,sha256=VPkSisvb6DiS5KAcq0OaX4sqR1YX4VgZvJEXZeAjIWk,3067
+ chatgpt_md_converter/telegram_markdown/inline.py,sha256=MPzj5VpDqrlvPy69CCwUIOsWgtgIFfbB4CliV5Wz-TY,2207
+ chatgpt_md_converter/telegram_markdown/postprocess.py,sha256=jUf01tAIqHQ1NxNlVGsvU-Yw8SDOHtMoS7MUzaQLf_8,775
+ chatgpt_md_converter/telegram_markdown/preprocess.py,sha256=k9XBtwgXkh07SlsqbdcZHwOMHhUGOjiIbOehO5wBnu0,1561
+ chatgpt_md_converter/telegram_markdown/renderer.py,sha256=39ZehJq6PVWm-sigeBz7vCycwzEmV4Mwiw36jkGIgXI,1960
+ chatgpt_md_converter-0.4.0b1.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
+ chatgpt_md_converter-0.4.0b1.dist-info/METADATA,sha256=9wvQrKaXzPu-_VKWRW_cK4vmbaZkwgtzMrbENVHLZb4,6606
+ chatgpt_md_converter-0.4.0b1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ chatgpt_md_converter-0.4.0b1.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
+ chatgpt_md_converter-0.4.0b1.dist-info/RECORD,,
chatgpt_md_converter-0.3.12.dist-info/RECORD
@@ -1,20 +0,0 @@
- chatgpt_md_converter/__init__.py,sha256=6ts2hnimdBn_qCA15LKuipUjSU9ZCqRk1GbDPc_JjO4,242
- chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
- chatgpt_md_converter/html_to_markdown.py,sha256=XlLpQD7W_AooWrvTtvrGVwfPPa80tDKWuT1iT6Vzygw,174
- chatgpt_md_converter/telegram_formatter.py,sha256=w3tjoSdRH_UdoFmGeXe7I47dhDIceXuGOA1oCLMnUmM,87
- chatgpt_md_converter/html_markdown/escaping.py,sha256=wJA4vUJQVcxpkJ4sCIYIWKaqffb_O72R93H81hTgTxA,1808
- chatgpt_md_converter/html_markdown/handlers.py,sha256=zKGRg__41SP7bKs8jodOWAEZJb2FNaC5_raoUiWdBUE,6696
- chatgpt_md_converter/html_markdown/renderer.py,sha256=en-fAr3Bhmm4ZndDaPKV8nLVQ_7HpS_NFBSWcrQporY,438
- chatgpt_md_converter/html_markdown/state.py,sha256=sxbz0ucCakI0KgR86EMZx0nvfU1oiqgVUofujFTeKoo,432
- chatgpt_md_converter/html_markdown/tree.py,sha256=ryohrhO2X5QepZev3087qPoGmMznqHDwH00TNGoW6a4,2154
- chatgpt_md_converter/telegram_markdown/__init__.py,sha256=C0Oexz9brpdE-TqEpiAUV78TsZdSrnnH_5yYpEJ03Us,131
- chatgpt_md_converter/telegram_markdown/code_blocks.py,sha256=VPkSisvb6DiS5KAcq0OaX4sqR1YX4VgZvJEXZeAjIWk,3067
- chatgpt_md_converter/telegram_markdown/inline.py,sha256=MPzj5VpDqrlvPy69CCwUIOsWgtgIFfbB4CliV5Wz-TY,2207
- chatgpt_md_converter/telegram_markdown/postprocess.py,sha256=jUf01tAIqHQ1NxNlVGsvU-Yw8SDOHtMoS7MUzaQLf_8,775
- chatgpt_md_converter/telegram_markdown/preprocess.py,sha256=k9XBtwgXkh07SlsqbdcZHwOMHhUGOjiIbOehO5wBnu0,1561
- chatgpt_md_converter/telegram_markdown/renderer.py,sha256=39ZehJq6PVWm-sigeBz7vCycwzEmV4Mwiw36jkGIgXI,1960
- chatgpt_md_converter-0.3.12.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
- chatgpt_md_converter-0.3.12.dist-info/METADATA,sha256=CWvPYndrqJad_RD-zJABwdDSPhVhkcuXzVwf8Z7BTjw,6605
- chatgpt_md_converter-0.3.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- chatgpt_md_converter-0.3.12.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
- chatgpt_md_converter-0.3.12.dist-info/RECORD,,