chatgpt-md-converter 0.3.12__py3-none-any.whl → 0.4.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatgpt_md_converter/__init__.py +10 -1
- chatgpt_md_converter/telegram_entities/__init__.py +68 -0
- chatgpt_md_converter/telegram_entities/entity.py +64 -0
- chatgpt_md_converter/telegram_entities/extractors/__init__.py +13 -0
- chatgpt_md_converter/telegram_entities/extractors/blockquotes.py +117 -0
- chatgpt_md_converter/telegram_entities/extractors/headings.py +56 -0
- chatgpt_md_converter/telegram_entities/extractors/inline.py +295 -0
- chatgpt_md_converter/telegram_entities/extractors/links.py +91 -0
- chatgpt_md_converter/telegram_entities/parser.py +300 -0
- chatgpt_md_converter/telegram_entities/utf16.py +50 -0
- {chatgpt_md_converter-0.3.12.dist-info → chatgpt_md_converter-0.4.0b2.dist-info}/METADATA +1 -1
- chatgpt_md_converter-0.4.0b2.dist-info/RECORD +29 -0
- chatgpt_md_converter-0.3.12.dist-info/RECORD +0 -20
- {chatgpt_md_converter-0.3.12.dist-info → chatgpt_md_converter-0.4.0b2.dist-info}/WHEEL +0 -0
- {chatgpt_md_converter-0.3.12.dist-info → chatgpt_md_converter-0.4.0b2.dist-info}/licenses/LICENSE +0 -0
- {chatgpt_md_converter-0.3.12.dist-info → chatgpt_md_converter-0.4.0b2.dist-info}/top_level.txt +0 -0
chatgpt_md_converter/__init__.py
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
from .html_splitter import split_html_for_telegram
|
|
2
2
|
from .html_to_markdown import html_to_telegram_markdown
|
|
3
|
+
from .telegram_entities import (EntityType, TelegramEntity,
|
|
4
|
+
telegram_format_entities)
|
|
3
5
|
from .telegram_formatter import telegram_format
|
|
4
6
|
|
|
5
|
-
__all__ = [
|
|
7
|
+
__all__ = [
|
|
8
|
+
"telegram_format",
|
|
9
|
+
"telegram_format_entities",
|
|
10
|
+
"TelegramEntity",
|
|
11
|
+
"EntityType",
|
|
12
|
+
"split_html_for_telegram",
|
|
13
|
+
"html_to_telegram_markdown",
|
|
14
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Telegram entity conversion module.
|
|
3
|
+
|
|
4
|
+
This module provides functions to convert Markdown text to Telegram's
|
|
5
|
+
native entity format (plain text + MessageEntity objects).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Tuple
|
|
9
|
+
|
|
10
|
+
from .entity import EntityType, TelegramEntity
|
|
11
|
+
from .parser import parse_entities
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def telegram_format_entities(text: str) -> Tuple[str, List[dict]]:
    """
    Convert Markdown text into Telegram's native entity representation.

    The input is parsed for Markdown syntax — **bold**, *italic* / _italic_,
    __underline__, ~~strikethrough~~, ||spoiler||, `inline code`, fenced
    ```code blocks```, [link text](url), > blockquotes, >** expandable
    blockquotes, # headings (rendered as bold) and - / * list markers —
    and returned as plain text plus a list of entity dicts suitable for
    the Telegram Bot API.

    Args:
        text: Markdown-formatted input.

    Returns:
        Tuple of (plain_text, entities):
          - plain_text: the input with all Markdown markers removed
          - entities: dicts with 'type', 'offset', 'length' keys
            (plus 'url' for links and 'language' for code blocks)

    Example:
        >>> text, entities = telegram_format_entities("**Hello** world!")
        >>> print(text)
        Hello world!
        >>> print(entities)
        [{'type': 'bold', 'offset': 0, 'length': 5}]

        # Use with python-telegram-bot:
        await bot.send_message(chat_id, text=text, entities=entities)

        # Use with aiogram:
        await message.answer(text, entities=entities)
    """
    # parse_entities does the heavy lifting; here we only serialize the
    # entity objects into plain dicts for direct Bot API consumption.
    stripped_text, parsed_entities = parse_entities(text)
    entity_dicts = [entity.to_dict() for entity in parsed_entities]
    return stripped_text, entity_dicts
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
__all__ = [
|
|
64
|
+
"telegram_format_entities",
|
|
65
|
+
"TelegramEntity",
|
|
66
|
+
"EntityType",
|
|
67
|
+
"parse_entities",
|
|
68
|
+
]
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Telegram entity data structures."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EntityType(Enum):
    """Closed set of Telegram ``MessageEntity`` type strings produced here."""

    # Inline text styles
    BOLD = "bold"
    ITALIC = "italic"
    UNDERLINE = "underline"
    STRIKETHROUGH = "strikethrough"
    SPOILER = "spoiler"
    # Code: inline span vs. fenced block
    CODE = "code"
    PRE = "pre"
    # Hyperlink with an explicit URL payload
    TEXT_LINK = "text_link"
    # Quote blocks (collapsible variant included)
    BLOCKQUOTE = "blockquote"
    EXPANDABLE_BLOCKQUOTE = "expandable_blockquote"


@dataclass
class TelegramEntity:
    """
    One Telegram ``MessageEntity``.

    Attributes:
        type: The entity kind (bold, italic, code, ...).
        offset: Start position, in UTF-16 code units.
        length: Span length, in UTF-16 code units.
        url: Target URL — only meaningful for ``TEXT_LINK`` entities.
        language: Syntax-highlight language — only for ``PRE`` entities.
    """

    type: EntityType
    offset: int
    length: int
    url: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize to the plain-dict shape the Telegram API expects."""
        payload = {
            "type": self.type.value,
            "offset": self.offset,
            "length": self.length,
        }
        # Optional fields are emitted only when present.
        for key, value in (("url", self.url), ("language", self.language)):
            if value is not None:
                payload[key] = value
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "TelegramEntity":
        """Inverse of :meth:`to_dict`; missing optional keys become ``None``."""
        return cls(
            type=EntityType(data["type"]),
            offset=data["offset"],
            length=data["length"],
            url=data.get("url"),
            language=data.get("language"),
        )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Entity extractors for different Markdown elements."""
|
|
2
|
+
|
|
3
|
+
from .blockquotes import extract_blockquote_entities
|
|
4
|
+
from .headings import extract_heading_entities
|
|
5
|
+
from .inline import extract_inline_formatting_entities
|
|
6
|
+
from .links import extract_link_entities
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"extract_inline_formatting_entities",
|
|
10
|
+
"extract_link_entities",
|
|
11
|
+
"extract_blockquote_entities",
|
|
12
|
+
"extract_heading_entities",
|
|
13
|
+
]
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Blockquote entity extraction."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
from ..entity import EntityType, TelegramEntity
|
|
7
|
+
|
|
8
|
+
# Pattern for regular blockquotes: > text
# (?!\*\*) keeps ">**"-prefixed lines for the expandable pattern below.
_BLOCKQUOTE_LINE_PATTERN = re.compile(r"^>(?!\*\*)\s?(.*)$", re.MULTILINE)

# Pattern for expandable blockquotes: >** text or **> text
_EXPANDABLE_BLOCKQUOTE_PATTERN = re.compile(
    r"^(?:>\*\*|\*\*>)\s?(.*)$", re.MULTILINE
)


def extract_blockquote_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
    """
    Extract blockquotes and return plain text with BLOCKQUOTE entities.

    Handles both regular (>) and expandable (>** or **>) blockquotes.
    Consecutive blockquote lines are combined into a single entity.

    Offsets/lengths here are Python character indices into the returned
    text; UTF-16 conversion is presumably done by a later pass — confirm
    against the caller.

    Args:
        text: Input text with blockquote markers

    Returns:
        Tuple of (text_without_markers, list_of_entities)
    """
    entities: List[TelegramEntity] = []

    # First, handle expandable blockquotes
    result_parts: List[str] = []

    # Find all expandable blockquote lines and group consecutive ones
    lines = text.split("\n")
    i = 0
    # Running character offset into the OUTPUT text (markers removed).
    current_offset = 0

    while i < len(lines):
        line = lines[i]

        # Check for expandable blockquote
        exp_match = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(line)
        if exp_match:
            # Collect consecutive expandable blockquote lines
            quote_lines = []

            while i < len(lines):
                m = _EXPANDABLE_BLOCKQUOTE_PATTERN.match(lines[i])
                if m:
                    quote_lines.append(m.group(1))
                    i += 1
                else:
                    break

            quote_content = "\n".join(quote_lines)
            quote_offset = current_offset
            # +1 accounts for the newline re-inserted after the quote when
            # more lines follow (i already points past the quote here).
            current_offset += len(quote_content) + (1 if i < len(lines) else 0)

            result_parts.append(quote_content)
            if i < len(lines):
                result_parts.append("\n")

            entities.append(
                TelegramEntity(
                    type=EntityType.EXPANDABLE_BLOCKQUOTE,
                    offset=quote_offset,
                    length=len(quote_content),
                )
            )
            continue

        # Check for regular blockquote
        reg_match = _BLOCKQUOTE_LINE_PATTERN.match(line)
        if reg_match:
            # Collect consecutive regular blockquote lines
            quote_lines = []
            start_offset = current_offset

            while i < len(lines):
                # Don't match expandable as regular
                if _EXPANDABLE_BLOCKQUOTE_PATTERN.match(lines[i]):
                    break
                m = _BLOCKQUOTE_LINE_PATTERN.match(lines[i])
                if m:
                    quote_lines.append(m.group(1))
                    i += 1
                else:
                    break

            quote_content = "\n".join(quote_lines)
            current_offset += len(quote_content) + (1 if i < len(lines) else 0)

            result_parts.append(quote_content)
            if i < len(lines):
                result_parts.append("\n")

            entities.append(
                TelegramEntity(
                    type=EntityType.BLOCKQUOTE,
                    offset=start_offset,
                    length=len(quote_content),
                )
            )
            continue

        # Regular line.
        # Note the bound differs from the quote branches: i has NOT been
        # advanced yet here, so "more lines follow" is i < len(lines) - 1.
        current_offset += len(line) + (1 if i < len(lines) - 1 else 0)
        result_parts.append(line)
        if i < len(lines) - 1:
            result_parts.append("\n")
        i += 1

    result_text = "".join(result_parts)

    return result_text, entities
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Heading entity extraction (converted to bold)."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
from ..entity import EntityType, TelegramEntity
|
|
7
|
+
|
|
8
|
+
# Pattern for Markdown headings: # Heading through ###### Heading.
_HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)


def extract_heading_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
    """
    Extract Markdown headings and convert them to bold entities.

    Telegram has no native heading entity, so each heading line keeps its
    title text and gains a BOLD entity instead (mirroring the HTML
    converter's behavior). The ``#`` markers are dropped from the output.

    Args:
        text: Input text with Markdown headings

    Returns:
        Tuple of (text_with_headings_converted, list_of_bold_entities)
    """
    bold_entities: List[TelegramEntity] = []
    pieces: List[str] = []
    out_len = 0  # length of the output assembled so far
    consumed = 0  # index into the original text

    for m in _HEADING_PATTERN.finditer(text):
        # Copy everything between the previous heading and this one.
        gap = text[consumed : m.start()]
        pieces.append(gap)
        out_len += len(gap)

        # Keep only the title (group 2); the '#' run is discarded.
        title = m.group(2)
        pieces.append(title)
        bold_entities.append(
            TelegramEntity(
                type=EntityType.BOLD,
                offset=out_len,
                length=len(title),
            )
        )
        out_len += len(title)
        consumed = m.end()

    # Tail after the last heading.
    pieces.append(text[consumed:])

    return "".join(pieces), bold_entities
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""Inline formatting entity extraction (bold, italic, underline, etc.)."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
from ..entity import EntityType, TelegramEntity
|
|
7
|
+
|
|
8
|
+
# Patterns for different formatting types.
# Order matters - longer markers first to avoid partial matches.
# Each entry is (compiled_regex, entity_types, marker_len) where
# marker_len is the number of delimiter characters on EACH side; the
# extractor uses it to compute the inner (unmarked) span of a match.
_PATTERNS = [
    # Bold+Italic: ***text***
    (
        re.compile(r"(?<![\\\*])\*\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)\*\*\*(?!\*)", re.DOTALL),
        [EntityType.BOLD, EntityType.ITALIC],
        3,
    ),
    # Underline+Italic: ___text___
    (
        re.compile(
            r"(?<![\\_])___(?!_)(?=\S)([\s\S]*?)(?<=\S)___(?!_)",
            re.DOTALL,
        ),
        [EntityType.UNDERLINE, EntityType.ITALIC],
        3,
    ),
    # Bold: **text**  (lookarounds reject escaped or run-adjacent asterisks)
    (
        re.compile(r"(?<![\\\*])\*\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*\*(?!\*)", re.DOTALL),
        [EntityType.BOLD],
        2,
    ),
    # Underline: __text__
    (
        re.compile(
            r"(?<![\\_])__(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)__(?!_)",
            re.DOTALL,
        ),
        [EntityType.UNDERLINE],
        2,
    ),
    # Strikethrough: ~~text~~
    (
        re.compile(r"(?<![\\~])~~(?!~)(?=\S)([\s\S]*?)(?<=\S)(?<!~)~~(?!~)", re.DOTALL),
        [EntityType.STRIKETHROUGH],
        2,
    ),
    # Spoiler: ||text||  ([^\n] restricts spoilers to a single line)
    (
        re.compile(r"(?<![\\|])\|\|(?!\|)(?=\S)([^\n]*?)(?<=\S)(?<!\|)\|\|(?!\|)"),
        [EntityType.SPOILER],
        2,
    ),
    # Italic with asterisk: *text* (must not be adjacent to other asterisks)
    (
        re.compile(
            r"(?<![A-Za-z0-9\\\*])\*(?!\*)(?=\S)([\s\S]*?)(?<=\S)(?<!\*)\*(?![A-Za-z0-9\*])",
            re.DOTALL,
        ),
        [EntityType.ITALIC],
        1,
    ),
    # Italic with underscore: _text_ (alphanumeric guards avoid snake_case)
    (
        re.compile(
            r"(?<![A-Za-z0-9\\_])_(?!_)(?=\S)([\s\S]*?)(?<=\S)(?<!_)_(?![A-Za-z0-9_])",
            re.DOTALL,
        ),
        [EntityType.ITALIC],
        1,
    ),
]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class _Match:
    """
    A located formatting span.

    ``start``/``end`` delimit the full match including markers;
    ``inner_start``/``inner_end`` delimit the content between the markers.
    ``children`` holds matches fully nested inside this one.
    """

    def __init__(
        self,
        start: int,
        end: int,
        inner_start: int,
        inner_end: int,
        entity_types: List[EntityType],
        marker_len: int,
    ):
        self.start = start
        self.end = end
        self.inner_start = inner_start
        self.inner_end = inner_end
        self.entity_types = entity_types
        self.marker_len = marker_len
        # Filled in later while the match tree is assembled.
        self.children: List["_Match"] = []

    def contains(self, other: "_Match") -> bool:
        """Return True when *other* lies entirely inside this inner span."""
        return other.start >= self.inner_start and other.end <= self.inner_end
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _find_all_matches(text: str) -> List[_Match]:
    """Run every formatting pattern over *text* and collect raw matches."""
    found: List[_Match] = [
        _Match(
            start=m.start(),
            end=m.end(),
            inner_start=m.start() + width,
            inner_end=m.end() - width,
            entity_types=list(kinds),
            marker_len=width,
        )
        for pattern, kinds, width in _PATTERNS
        for m in pattern.finditer(text)
    ]

    # Earliest first; ties broken by longer span so outer wrappers come
    # before anything nested inside them.
    found.sort(key=lambda m: (m.start, -(m.end - m.start)))

    return found
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _build_match_tree(matches: List[_Match]) -> List[_Match]:
    """
    Arrange matches into a forest: nested matches become children.

    Only top-level matches are returned; matches that partially overlap
    an already-accepted match (invalid Markdown nesting) are dropped.
    """
    if not matches:
        return []

    roots: List[_Match] = []

    for candidate in matches:
        # Try to tuck the candidate under a root whose inner span holds it.
        nested = False
        for root in roots:
            if root.contains(candidate) and _try_place_in_children(root, candidate):
                nested = True
                break
        if nested:
            continue

        # Not nested anywhere: accept as a new root unless it collides
        # with an existing root.
        if any(_matches_overlap(candidate, root) for root in roots):
            continue
        roots.append(candidate)

    return roots
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _try_place_in_children(parent: _Match, child: _Match) -> bool:
    """Recursively insert *child* below *parent*; False on invalid overlap."""
    # Descend into the first existing child that fully contains it.
    for sibling in parent.children:
        if sibling.contains(child):
            return _try_place_in_children(sibling, child)

    # Reject if it partially overlaps any sibling at this level.
    if any(_matches_overlap(child, sibling) for sibling in parent.children):
        return False

    # Otherwise it becomes a direct child here.
    parent.children.append(child)
    return True
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _matches_overlap(m1: _Match, m2: _Match) -> bool:
    """
    Return True only for an invalid PARTIAL overlap.

    Disjoint spans and properly nested spans (one inside the other's
    inner content) are fine and return False.
    """
    disjoint = m1.end <= m2.start or m2.end <= m1.start
    m2_inside_m1 = m1.inner_start <= m2.start and m2.end <= m1.inner_end
    m1_inside_m2 = m2.inner_start <= m1.start and m1.end <= m2.inner_end
    return not (disjoint or m2_inside_m1 or m1_inside_m2)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _process_match(
    text: str,
    match: _Match,
    base_offset: int,
) -> Tuple[str, List[TelegramEntity]]:
    """
    Process a single match and its children, returning plain text and entities.

    The returned text is the match's inner content with all nested markers
    stripped; entities for this match span that whole stripped content.

    Args:
        text: The text containing the match
        match: The match to process
        base_offset: Offset in the final output where this match starts

    Returns:
        Tuple of (processed_text, entities)
    """
    # Default: inner content with no nested formatting to strip.
    inner_text = text[match.inner_start : match.inner_end]
    entities: List[TelegramEntity] = []

    # If there are children, process them
    if match.children:
        # Sort children by position
        match.children.sort(key=lambda m: m.start)

        # Process children recursively
        processed_parts: List[str] = []
        child_entities: List[TelegramEntity] = []
        last_end = match.inner_start

        for child in match.children:
            # Add text before this child
            processed_parts.append(text[last_end : child.start])

            # Calculate child's offset in the final output: base plus
            # everything already emitted for this match's inner content.
            child_offset = base_offset + sum(len(p) for p in processed_parts)

            # Process child recursively
            child_text, child_ents = _process_match(text, child, child_offset)
            processed_parts.append(child_text)
            child_entities.extend(child_ents)

            last_end = child.end

        # Add remaining text after last child
        processed_parts.append(text[last_end : match.inner_end])

        # Re-derive inner_text with the children's markers removed.
        inner_text = "".join(processed_parts)
        entities.extend(child_entities)

    # Create entities for this match (one per type, e.g. ***x*** emits
    # both BOLD and ITALIC over the same span).
    for entity_type in match.entity_types:
        entities.append(
            TelegramEntity(
                type=entity_type,
                offset=base_offset,
                length=len(inner_text),
            )
        )

    return inner_text, entities
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def extract_inline_formatting_entities(
    text: str,
) -> Tuple[str, List[TelegramEntity]]:
    """
    Extract inline formatting (bold, italic, etc.) and return plain text with entities.

    Supports nesting where one style is fully contained inside another;
    partially overlapping markers are ignored.

    Args:
        text: Input text with Markdown formatting markers

    Returns:
        Tuple of (text_without_markers, list_of_entities)
    """
    roots = _build_match_tree(_find_all_matches(text))
    if not roots:
        return text, []

    roots.sort(key=lambda m: m.start)

    pieces: List[str] = []
    collected: List[TelegramEntity] = []
    consumed = 0  # index into the original text
    out_len = 0  # length of output assembled so far

    for root in roots:
        # Untouched text between matches is copied verbatim.
        gap = text[consumed : root.start]
        pieces.append(gap)
        out_len += len(gap)

        # Strip this match (and its nested children) recursively.
        rendered, ents = _process_match(text, root, out_len)
        pieces.append(rendered)
        out_len += len(rendered)
        collected.extend(ents)

        consumed = root.end

    # Tail after the final match.
    pieces.append(text[consumed:])

    return "".join(pieces), collected
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Link entity extraction."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
from ..entity import EntityType, TelegramEntity
|
|
7
|
+
|
|
8
|
+
# Pattern for Markdown links: [text](url).
# The optional leading "!" also captures image links ![alt](url), which are
# treated exactly like regular links.
_LINK_PATTERN = re.compile(r"!?\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)")


def extract_link_entities(
    text: str,
    existing_entities: List[TelegramEntity] | None = None,
) -> Tuple[str, List[TelegramEntity], List[TelegramEntity]]:
    """
    Extract Markdown links and return plain text with TEXT_LINK entities.

    Image links are converted to text links showing the alt text. Because
    removing the bracket/paren syntax shifts later text leftward, any
    *existing_entities* are returned with their offsets shifted to match.

    Args:
        text: Input text with Markdown links
        existing_entities: Optional list of entities to adjust offsets for

    Returns:
        Tuple of (text_with_links_replaced, link_entities, adjusted_existing_entities)
    """
    link_entities: List[TelegramEntity] = []
    pieces: List[str] = []
    consumed = 0  # index into the original text
    out_len = 0  # length of output assembled so far

    # (start position in the original text, number of syntax chars dropped)
    removals: List[Tuple[int, int]] = []

    for m in _LINK_PATTERN.finditer(text):
        # Copy the text between links verbatim.
        gap = text[consumed : m.start()]
        pieces.append(gap)
        out_len += len(gap)

        label = m.group(1)
        target = m.group(2)

        # Everything except the visible label is removed from the output.
        removals.append((m.start(), len(m.group(0)) - len(label)))

        pieces.append(label)
        link_entities.append(
            TelegramEntity(
                type=EntityType.TEXT_LINK,
                offset=out_len,
                length=len(label),
                url=target,
            )
        )
        out_len += len(label)
        consumed = m.end()

    # Tail after the final link.
    pieces.append(text[consumed:])

    # Shift previously computed entities left by the total syntax removed
    # before each one.
    shifted: List[TelegramEntity] = []
    if existing_entities:
        for ent in existing_entities:
            shift = sum(width for pos, width in removals if pos < ent.offset)
            shifted.append(
                TelegramEntity(
                    type=ent.type,
                    offset=ent.offset - shift,
                    length=ent.length,
                    url=ent.url,
                    language=ent.language,
                )
            )

    return "".join(pieces), link_entities, shifted
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""Main parser that combines all entity extractors."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
from .entity import EntityType, TelegramEntity
|
|
7
|
+
from .extractors import (extract_blockquote_entities, extract_heading_entities,
|
|
8
|
+
extract_inline_formatting_entities,
|
|
9
|
+
extract_link_entities)
|
|
10
|
+
from .utf16 import utf16_len
|
|
11
|
+
|
|
12
|
+
# Placeholder prefix for protected content
|
|
13
|
+
_CODE_BLOCK_PLACEHOLDER = "\x00CODEBLOCK"
|
|
14
|
+
_INLINE_CODE_PLACEHOLDER = "\x00INLINECODE"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _convert_list_markers(text: str) -> str:
    """Replace leading Markdown list markers (``-`` or ``*``) with a bullet (•)."""
    bullet_pattern = re.compile(r"^(\s*)[\-\*]\s+", re.MULTILINE)
    # \1 preserves the original indentation in front of the bullet.
    return bullet_pattern.sub(r"\1• ", text)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _remove_citation_markers(text: str) -> str:
    """Strip ChatGPT-style citation markers such as 【1】 from the text."""
    citation_pattern = re.compile(r"【[^】]+】")
    return citation_pattern.sub("", text)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _adjust_entities_to_utf16(
    text: str, entities: List[TelegramEntity]
) -> List[TelegramEntity]:
    """
    Convert entity offsets and lengths from Python char indices to UTF-16 code units.

    Telegram measures entity positions in UTF-16 code units, so characters
    outside the BMP (e.g. emoji) count as two. Entities that fall outside
    the text or end up empty are dropped.
    """
    converted: List[TelegramEntity] = []
    text_len = len(text)

    for ent in entities:
        # Clamp to the actual bounds of the text.
        start = min(ent.offset, text_len)
        span = min(ent.length, text_len - start)
        if span <= 0:
            continue

        # Measure the prefix and the entity slice in UTF-16 units.
        u16_offset = utf16_len(text[:start])
        u16_length = utf16_len(text[start : start + span])
        if u16_length <= 0:
            continue

        converted.append(
            TelegramEntity(
                type=ent.type,
                offset=u16_offset,
                length=u16_length,
                url=ent.url,
                language=ent.language,
            )
        )

    return converted
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _validate_and_sort_entities(
    entities: List[TelegramEntity],
) -> List[TelegramEntity]:
    """
    Drop invalid entities and return the rest in canonical order.

    Invalid means zero/negative length or negative offset. Ordering is by
    offset ascending, with longer entities first on ties so that outer
    (nesting) entities precede inner ones.
    """
    valid = (e for e in entities if e.length > 0 and e.offset >= 0)
    return sorted(valid, key=lambda e: (e.offset, -e.length))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _clean_multiple_newlines(text: str) -> str:
    """Collapse runs of three or more newlines down to a single blank line."""
    excess_newlines = re.compile(r"\n{3,}")
    return excess_newlines.sub("\n\n", text)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _extract_with_placeholders(
    text: str, pattern: re.Pattern, placeholder_prefix: str
) -> Tuple[str, dict]:
    """
    Replace every *pattern* match in *text* with a unique placeholder token.

    Each token is ``f"{placeholder_prefix}{i}\\x00"`` with a sequential
    counter ``i``. Returns ``(modified_text, mapping)`` where *mapping*
    maps each placeholder string to the ``re.Match`` object it replaced.
    """
    mapping: dict = {}
    next_id = 0

    def _stash(match):
        nonlocal next_id
        token = f"{placeholder_prefix}{next_id}\x00"
        next_id += 1
        mapping[token] = match
        return token

    return pattern.sub(_stash, text), mapping
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def parse_entities(text: str) -> Tuple[str, List[TelegramEntity]]:
    """
    Parse Markdown text and return plain text with Telegram entities.

    Uses a placeholder-based approach to handle the order of extraction correctly:
    1. Replace code blocks and inline code with placeholders
    2. Extract all other formatting (blockquotes, headings, links, inline styles)
    3. Restore placeholders and calculate final offsets

    Args:
        text: Markdown-formatted text

    Returns:
        Tuple of (plain_text, list_of_entities)
        Entities have offsets/lengths in UTF-16 code units.
    """
    all_entities: List[TelegramEntity] = []

    # Phase 1: Extract code blocks to placeholders.
    # Named backreference (?P=fence) requires the closing fence to have the
    # same number of backticks as the opening one.
    code_block_pattern = re.compile(
        r"(?P<fence>`{3,})(?P<lang>\w+)?\n(?P<code>[\s\S]*?)(?P=fence)",
        flags=re.MULTILINE,
    )
    code_block_map = {}
    # One-element list so the nested closure can mutate the counter.
    code_block_counter = [0]

    def replace_code_block(match):
        # \x00 terminator prevents placeholder "...1" matching inside "...10".
        placeholder = f"{_CODE_BLOCK_PLACEHOLDER}{code_block_counter[0]}\x00"
        code_block_counter[0] += 1
        # Strip trailing newline from code content (appears before closing fence)
        code_content = match.group("code").rstrip("\n")
        language = match.group("lang") or None
        code_block_map[placeholder] = (code_content, language)
        return placeholder

    # Ensure closing delimiters so the fence regex can always find a close.
    text = _ensure_closing_delimiters(text)
    text = code_block_pattern.sub(replace_code_block, text)

    # Phase 2: Extract inline code to placeholders.
    # Runs after fenced blocks were removed, so single backticks here are
    # genuine inline code.
    inline_code_pattern = re.compile(r"`([^`\n]+)`")
    inline_code_map = {}
    inline_code_counter = [0]

    def replace_inline_code(match):
        placeholder = f"{_INLINE_CODE_PLACEHOLDER}{inline_code_counter[0]}\x00"
        inline_code_counter[0] += 1
        code_content = match.group(1)
        inline_code_map[placeholder] = code_content
        return placeholder

    text = inline_code_pattern.sub(replace_inline_code, text)

    # Phase 3: Extract other formatting (on text with placeholders)
    # Order matters: inline formatting first (removes markers), then links
    text, blockquote_entities = extract_blockquote_entities(text)
    all_entities.extend(blockquote_entities)

    text, heading_entities = extract_heading_entities(text)
    all_entities.extend(heading_entities)

    text, inline_entities = extract_inline_formatting_entities(text)
    all_entities.extend(inline_entities)

    # Extract links AFTER inline formatting, adjusting existing entity offsets
    text, link_entities, all_entities = extract_link_entities(text, all_entities)
    all_entities.extend(link_entities)

    # Phase 4: Restore code placeholders and create entities
    # Collect all placeholders with their info
    all_placeholders = []

    # A placeholder may have been consumed by an earlier extractor (e.g. it
    # sat inside a stripped marker), hence the membership check before use.
    for placeholder, (code_content, language) in code_block_map.items():
        if placeholder in text:
            pos = text.find(placeholder)
            all_placeholders.append({
                'placeholder': placeholder,
                'content': code_content,
                'position': pos,
                'type': EntityType.PRE,
                'language': language,
            })

    for placeholder, code_content in inline_code_map.items():
        if placeholder in text:
            pos = text.find(placeholder)
            all_placeholders.append({
                'placeholder': placeholder,
                'content': code_content,
                'position': pos,
                'type': EntityType.CODE,
                'language': None,
            })

    # Sort by position ascending (restore from start to end)
    # This way, when we shift entities, the later entities get adjusted correctly
    all_placeholders.sort(key=lambda x: x['position'])

    code_entities: List[TelegramEntity] = []

    for ph_info in all_placeholders:
        placeholder = ph_info['placeholder']
        code_content = ph_info['content']
        # Re-find rather than reuse ph_info['position']: earlier restorations
        # in this loop have already shifted character positions.
        offset = text.find(placeholder)
        text = text.replace(placeholder, code_content, 1)

        code_entities.append(
            TelegramEntity(
                type=ph_info['type'],
                offset=offset,
                length=len(code_content),
                language=ph_info['language'],
            )
        )

        # Adjust existing entities (both all_entities and code_entities) after this position
        placeholder_len = len(placeholder)
        content_len = len(code_content)
        shift = content_len - placeholder_len
        all_entities = _shift_entities_after(all_entities, offset, shift)
        # Also shift already-created code entities (except the one we just added)
        code_entities = _shift_entities_after(code_entities[:-1], offset, shift) + [code_entities[-1]]

    all_entities.extend(code_entities)

    # Phase 5: Clean up
    # NOTE(review): these helpers and the final .strip() mutate the text AFTER
    # entity offsets were computed; presumably they only touch regions without
    # entities (list markers, citations, runs of newlines, edges) — confirm
    # against _convert_list_markers/_remove_citation_markers implementations.
    text = _convert_list_markers(text)
    text = _remove_citation_markers(text)
    text = _clean_multiple_newlines(text)

    # Validate and sort entities
    all_entities = _validate_and_sort_entities(all_entities)

    # Convert to UTF-16 offsets
    all_entities = _adjust_entities_to_utf16(text, all_entities)

    return text.strip(), all_entities
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _shift_entities_after(
    entities: List[TelegramEntity], position: int, shift: int
) -> List[TelegramEntity]:
    """Return a copy of *entities* where every entity at or past *position* is moved by *shift*."""
    adjusted: List[TelegramEntity] = []
    for entity in entities:
        if entity.offset < position:
            # Entirely before the edit point: keep the original object.
            adjusted.append(entity)
            continue
        adjusted.append(
            TelegramEntity(
                type=entity.type,
                offset=entity.offset + shift,
                length=entity.length,
                url=entity.url,
                language=entity.language,
            )
        )
    return adjusted
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _ensure_closing_delimiters(text: str) -> str:
|
|
267
|
+
"""Append any missing closing backtick fences for Markdown code blocks."""
|
|
268
|
+
code_block_re = re.compile(
|
|
269
|
+
r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
|
|
270
|
+
flags=re.DOTALL,
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
open_fence = None
|
|
274
|
+
for line in text.splitlines():
|
|
275
|
+
stripped = line.strip()
|
|
276
|
+
if open_fence is None:
|
|
277
|
+
match = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
|
|
278
|
+
if match:
|
|
279
|
+
open_fence = match.group("fence")
|
|
280
|
+
else:
|
|
281
|
+
if stripped == open_fence:
|
|
282
|
+
open_fence = None
|
|
283
|
+
|
|
284
|
+
if open_fence is not None:
|
|
285
|
+
if not text.endswith("\n"):
|
|
286
|
+
text += "\n"
|
|
287
|
+
text += open_fence
|
|
288
|
+
|
|
289
|
+
# Check for unclosed triple backticks
|
|
290
|
+
temp = code_block_re.sub("", text)
|
|
291
|
+
if temp.count("```") % 2 != 0:
|
|
292
|
+
text += "\n```"
|
|
293
|
+
|
|
294
|
+
# Check for unclosed single backticks (inline code)
|
|
295
|
+
temp = code_block_re.sub("", text)
|
|
296
|
+
temp = re.sub(r"``+", "", temp)
|
|
297
|
+
if temp.count("`") % 2 != 0:
|
|
298
|
+
text += "`"
|
|
299
|
+
|
|
300
|
+
return text
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""UTF-16 encoding utilities for Telegram entity offset calculation."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def utf16_len(text: str) -> int:
    """
    Return the length of *text* measured in UTF-16 code units.

    Telegram expresses entity offsets and lengths in UTF-16 code units;
    characters beyond the Basic Multilingual Plane (e.g. emoji) occupy two
    units each.

    Args:
        text: The string to measure

    Returns:
        Length in UTF-16 code units
    """
    encoded = text.encode("utf-16-le")
    return len(encoded) // 2
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def char_to_utf16_offset(text: str, char_index: int) -> int:
    """
    Translate a Python string index into the equivalent UTF-16 offset.

    Args:
        text: The full text string
        char_index: Python string index (0-based)

    Returns:
        UTF-16 offset of the same position
    """
    # The UTF-16 offset is simply the UTF-16 length of the prefix.
    prefix = text[:char_index]
    return len(prefix.encode("utf-16-le")) // 2
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def utf16_to_char_offset(text: str, utf16_offset: int) -> int:
    """
    Translate a UTF-16 offset back into a Python string index.

    Args:
        text: The full text string
        utf16_offset: UTF-16 offset

    Returns:
        Python string index of the same position; clamps to ``len(text)``
        when the offset lies past the end of the string.
    """
    index = 0
    units_seen = 0
    while index < len(text):
        if units_seen >= utf16_offset:
            return index
        # BMP characters take one unit, astral characters two.
        units_seen += len(text[index].encode("utf-16-le")) // 2
        index += 1
    return len(text)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
chatgpt_md_converter/__init__.py,sha256=pq0o14l7pBFPm-YsLj0A7nO2FPgF48MSRCEc7b9ktGQ,447
|
|
2
|
+
chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
|
|
3
|
+
chatgpt_md_converter/html_to_markdown.py,sha256=XlLpQD7W_AooWrvTtvrGVwfPPa80tDKWuT1iT6Vzygw,174
|
|
4
|
+
chatgpt_md_converter/telegram_formatter.py,sha256=w3tjoSdRH_UdoFmGeXe7I47dhDIceXuGOA1oCLMnUmM,87
|
|
5
|
+
chatgpt_md_converter/html_markdown/escaping.py,sha256=wJA4vUJQVcxpkJ4sCIYIWKaqffb_O72R93H81hTgTxA,1808
|
|
6
|
+
chatgpt_md_converter/html_markdown/handlers.py,sha256=zKGRg__41SP7bKs8jodOWAEZJb2FNaC5_raoUiWdBUE,6696
|
|
7
|
+
chatgpt_md_converter/html_markdown/renderer.py,sha256=en-fAr3Bhmm4ZndDaPKV8nLVQ_7HpS_NFBSWcrQporY,438
|
|
8
|
+
chatgpt_md_converter/html_markdown/state.py,sha256=sxbz0ucCakI0KgR86EMZx0nvfU1oiqgVUofujFTeKoo,432
|
|
9
|
+
chatgpt_md_converter/html_markdown/tree.py,sha256=ryohrhO2X5QepZev3087qPoGmMznqHDwH00TNGoW6a4,2154
|
|
10
|
+
chatgpt_md_converter/telegram_entities/__init__.py,sha256=dopG-8_gWX8xPeD-9dyHdurs5VPrz-wAFFRvHNKiUNg,1855
|
|
11
|
+
chatgpt_md_converter/telegram_entities/entity.py,sha256=oygQxwBsE7AGm2etq6HFZIeo7tBCwsUGniLP17-_Oz0,1705
|
|
12
|
+
chatgpt_md_converter/telegram_entities/parser.py,sha256=rNYtWwZuet5_HObrupehnOiNaBoheDusGwOTaX5mQBs,9925
|
|
13
|
+
chatgpt_md_converter/telegram_entities/utf16.py,sha256=eH-yX7d1wZwb3nRdk3kq1LFd-NQMqYHutPbkvX5_DC0,1283
|
|
14
|
+
chatgpt_md_converter/telegram_entities/extractors/__init__.py,sha256=FinTAoRNjuHza0LcEBtpNnBvSR8PFo6cVVDkLg0cV6w,407
|
|
15
|
+
chatgpt_md_converter/telegram_entities/extractors/blockquotes.py,sha256=Di8nG5Oej0hLbBB-WJ3GtlZCvCaa_BNmoUdpFGo9mnY,3596
|
|
16
|
+
chatgpt_md_converter/telegram_entities/extractors/headings.py,sha256=AzjF9jElWfw3d4Qx-81fku7gyTkvb0pKlmow0zUXSk4,1602
|
|
17
|
+
chatgpt_md_converter/telegram_entities/extractors/inline.py,sha256=DYSs7cJEFY3-fGtdMdOA7DO5ERtEF8r2GQns5WcPyto,8745
|
|
18
|
+
chatgpt_md_converter/telegram_entities/extractors/links.py,sha256=AmCS8mx7ObY2aL5q7owULemjx-Ivuto_4PtKsL7K45Q,2898
|
|
19
|
+
chatgpt_md_converter/telegram_markdown/__init__.py,sha256=C0Oexz9brpdE-TqEpiAUV78TsZdSrnnH_5yYpEJ03Us,131
|
|
20
|
+
chatgpt_md_converter/telegram_markdown/code_blocks.py,sha256=VPkSisvb6DiS5KAcq0OaX4sqR1YX4VgZvJEXZeAjIWk,3067
|
|
21
|
+
chatgpt_md_converter/telegram_markdown/inline.py,sha256=MPzj5VpDqrlvPy69CCwUIOsWgtgIFfbB4CliV5Wz-TY,2207
|
|
22
|
+
chatgpt_md_converter/telegram_markdown/postprocess.py,sha256=jUf01tAIqHQ1NxNlVGsvU-Yw8SDOHtMoS7MUzaQLf_8,775
|
|
23
|
+
chatgpt_md_converter/telegram_markdown/preprocess.py,sha256=k9XBtwgXkh07SlsqbdcZHwOMHhUGOjiIbOehO5wBnu0,1561
|
|
24
|
+
chatgpt_md_converter/telegram_markdown/renderer.py,sha256=39ZehJq6PVWm-sigeBz7vCycwzEmV4Mwiw36jkGIgXI,1960
|
|
25
|
+
chatgpt_md_converter-0.4.0b2.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
26
|
+
chatgpt_md_converter-0.4.0b2.dist-info/METADATA,sha256=pxqmox4G4H1wVWmbDAWa0WB4Rh8BdT4AmXuFHfuD2cc,6606
|
|
27
|
+
chatgpt_md_converter-0.4.0b2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
28
|
+
chatgpt_md_converter-0.4.0b2.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
29
|
+
chatgpt_md_converter-0.4.0b2.dist-info/RECORD,,
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
chatgpt_md_converter/__init__.py,sha256=6ts2hnimdBn_qCA15LKuipUjSU9ZCqRk1GbDPc_JjO4,242
|
|
2
|
-
chatgpt_md_converter/html_splitter.py,sha256=DdjJx0I-A9rZHOxS-0LXsy7YUrgrkrtdeqZtEQ7eooA,7853
|
|
3
|
-
chatgpt_md_converter/html_to_markdown.py,sha256=XlLpQD7W_AooWrvTtvrGVwfPPa80tDKWuT1iT6Vzygw,174
|
|
4
|
-
chatgpt_md_converter/telegram_formatter.py,sha256=w3tjoSdRH_UdoFmGeXe7I47dhDIceXuGOA1oCLMnUmM,87
|
|
5
|
-
chatgpt_md_converter/html_markdown/escaping.py,sha256=wJA4vUJQVcxpkJ4sCIYIWKaqffb_O72R93H81hTgTxA,1808
|
|
6
|
-
chatgpt_md_converter/html_markdown/handlers.py,sha256=zKGRg__41SP7bKs8jodOWAEZJb2FNaC5_raoUiWdBUE,6696
|
|
7
|
-
chatgpt_md_converter/html_markdown/renderer.py,sha256=en-fAr3Bhmm4ZndDaPKV8nLVQ_7HpS_NFBSWcrQporY,438
|
|
8
|
-
chatgpt_md_converter/html_markdown/state.py,sha256=sxbz0ucCakI0KgR86EMZx0nvfU1oiqgVUofujFTeKoo,432
|
|
9
|
-
chatgpt_md_converter/html_markdown/tree.py,sha256=ryohrhO2X5QepZev3087qPoGmMznqHDwH00TNGoW6a4,2154
|
|
10
|
-
chatgpt_md_converter/telegram_markdown/__init__.py,sha256=C0Oexz9brpdE-TqEpiAUV78TsZdSrnnH_5yYpEJ03Us,131
|
|
11
|
-
chatgpt_md_converter/telegram_markdown/code_blocks.py,sha256=VPkSisvb6DiS5KAcq0OaX4sqR1YX4VgZvJEXZeAjIWk,3067
|
|
12
|
-
chatgpt_md_converter/telegram_markdown/inline.py,sha256=MPzj5VpDqrlvPy69CCwUIOsWgtgIFfbB4CliV5Wz-TY,2207
|
|
13
|
-
chatgpt_md_converter/telegram_markdown/postprocess.py,sha256=jUf01tAIqHQ1NxNlVGsvU-Yw8SDOHtMoS7MUzaQLf_8,775
|
|
14
|
-
chatgpt_md_converter/telegram_markdown/preprocess.py,sha256=k9XBtwgXkh07SlsqbdcZHwOMHhUGOjiIbOehO5wBnu0,1561
|
|
15
|
-
chatgpt_md_converter/telegram_markdown/renderer.py,sha256=39ZehJq6PVWm-sigeBz7vCycwzEmV4Mwiw36jkGIgXI,1960
|
|
16
|
-
chatgpt_md_converter-0.3.12.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
17
|
-
chatgpt_md_converter-0.3.12.dist-info/METADATA,sha256=CWvPYndrqJad_RD-zJABwdDSPhVhkcuXzVwf8Z7BTjw,6605
|
|
18
|
-
chatgpt_md_converter-0.3.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
-
chatgpt_md_converter-0.3.12.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
20
|
-
chatgpt_md_converter-0.3.12.dist-info/RECORD,,
|
|
File without changes
|
{chatgpt_md_converter-0.3.12.dist-info → chatgpt_md_converter-0.4.0b2.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{chatgpt_md_converter-0.3.12.dist-info → chatgpt_md_converter-0.4.0b2.dist-info}/top_level.txt
RENAMED
|
File without changes
|