chatgpt-md-converter 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatgpt_md_converter/formatters.py +11 -0
- chatgpt_md_converter/telegram_formatter.py +63 -20
- {chatgpt_md_converter-0.1.2.dist-info → chatgpt_md_converter-0.2.0.dist-info}/METADATA +1 -1
- chatgpt_md_converter-0.2.0.dist-info/RECORD +11 -0
- {chatgpt_md_converter-0.1.2.dist-info → chatgpt_md_converter-0.2.0.dist-info}/WHEEL +1 -1
- chatgpt_md_converter-0.1.2.dist-info/RECORD +0 -11
- {chatgpt_md_converter-0.1.2.dist-info → chatgpt_md_converter-0.2.0.dist-info}/LICENSE +0 -0
- {chatgpt_md_converter-0.1.2.dist-info → chatgpt_md_converter-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -26,3 +26,14 @@ def combine_blockquotes(text: str) -> str:
|
|
|
26
26
|
)
|
|
27
27
|
|
|
28
28
|
return "\n".join(combined_lines)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def fix_asterisk_equations(text: str) -> str:
    """
    Replaces '*' between two numbers with '×' to avoid accidental italic formatting.

    Whitespace around the asterisk is removed, and chained products are
    converted completely:
    e.g. '6*8' -> '6×8', '6 * 8' -> '6×8', '2*3*4' -> '2×3×4'

    Only whitespace on the same line is bridged, so a bullet line such as
    '* 8 items' that follows a line ending in a digit is left untouched.

    Args:
        text: The text to process.

    Returns:
        The text with numeric multiplication asterisks replaced by '×'.
    """
    import re

    # Lookarounds check for the neighbouring digits without consuming them, so
    # the middle operand of '2*3*4' can anchor both asterisks.
    # [^\S\r\n] means "whitespace except line breaks".
    eq_pattern = re.compile(r"(?<=\d)[^\S\r\n]*\*[^\S\r\n]*(?=\d)")
    return eq_pattern.sub("×", text)
|
|
@@ -1,57 +1,100 @@
|
|
|
1
1
|
import re
|
|
2
|
+
|
|
2
3
|
from .converters import convert_html_chars, split_by_tag
|
|
3
4
|
from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
|
|
4
5
|
from .formatters import combine_blockquotes
|
|
5
6
|
from .helpers import remove_blockquote_escaping
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
def extract_inline_code_snippets(text: str):
    """
    Extracts inline code (single-backtick content) from the text,
    replacing each occurrence with a unique placeholder.

    This ensures characters like '*' or '_' inside inline code won't be
    interpreted as Markdown by later processing steps.

    Placeholders have the form 'INLINECODEPLACEHOLDER<n>END'. The trailing
    'END' guarantees that no placeholder is a prefix of another
    (e.g. '...1END' is not contained in '...10END'), so restoring them with
    plain str.replace() stays correct for any number of snippets.

    Args:
        text: The text to scan for inline code.

    Returns:
        A tuple (new_text, code_snippets): new_text is the input with every
        inline code span replaced by its placeholder, and code_snippets maps
        each placeholder to the code text without the surrounding backticks,
        in order of appearance.
    """
    code_snippets = {}
    inline_code_pattern = re.compile(r"`([^`]+)`")

    def replacer(match):
        # Every match adds exactly one entry, so the dict size is the next index.
        placeholder = f"INLINECODEPLACEHOLDER{len(code_snippets)}END"
        code_snippets[placeholder] = match.group(1)
        return placeholder

    new_text = inline_code_pattern.sub(replacer, text)
    return new_text, code_snippets
|
|
28
|
+
|
|
29
|
+
|
|
8
30
|
def telegram_format(text: str) -> str:
    """
    Converts markdown in the provided text to HTML supported by Telegram.

    Code is taken out of the text before any Markdown rule runs and put back
    afterwards, so formatting characters inside code are never interpreted.

    Args:
        text: Markdown-formatted input text.

    Returns:
        The converted text, with runs of three or more newlines collapsed to
        two and surrounding whitespace stripped.
    """

    # Step 0: Combine blockquotes
    text = combine_blockquotes(text)

    # Step 1: Convert HTML reserved symbols
    text = convert_html_chars(text)

    # Step 2: Extract and convert triple-backtick code blocks first
    output, triple_code_blocks = extract_and_convert_code_blocks(text)

    # Step 2.5: Extract inline code snippets (single backticks) so they won't be parsed as italics, etc.
    output, inline_code_snippets = extract_inline_code_snippets(output)

    # Step 3: Escape HTML special characters in the output text (for non-code parts).
    # Code content sits behind placeholders at this point and is not affected.
    output = output.replace("<", "&lt;").replace(">", "&gt;")

    # Convert headings (H1-H6)
    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)

    # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)

    # Nested Bold and Italic
    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)

    # Process markdown for bold (**), underline (__), strikethrough (~~)
    output = split_by_tag(output, "**", "b")
    output = split_by_tag(output, "__", "u")
    output = split_by_tag(output, "~~", "s")

    # Custom approach for single-asterisk italic: the asterisks must not touch
    # an alphanumeric character on the outside or whitespace on the inside.
    italic_pattern = re.compile(
        r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])",
        re.DOTALL
    )
    output = italic_pattern.sub(r"<i>\1</i>", output)

    # Process single underscore-based italic
    output = split_by_tag(output, "_", "i")

    # Remove storage links (Vector storage placeholders like 【4:0†source】)
    output = re.sub(r"【[^】]+】", "", output)

    # Convert Markdown links/images to <a href="">…</a>
    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)

    # Step 3.5: Reinsert inline code snippets, escaping special chars in code content.
    # All placeholders are restored in one pass with the longest names tried
    # first, so a placeholder that is a prefix of another (…1 vs …10) cannot
    # clobber it, and restored code is never rescanned for placeholders.
    # NOTE(review): convert_html_chars already ran on the whole text in Step 1;
    # if it escapes these characters too, inline code is escaped twice — confirm.
    if inline_code_snippets:
        placeholder_pattern = re.compile(
            "|".join(
                re.escape(placeholder)
                for placeholder in sorted(inline_code_snippets, key=len, reverse=True)
            )
        )

        def restore_inline_code(match):
            snippet = inline_code_snippets[match.group(0)]
            escaped_snippet = (
                snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
            )
            return f"<code>{escaped_snippet}</code>"

        output = placeholder_pattern.sub(restore_inline_code, output)

    # Step 4: Reinsert the converted triple-backtick code blocks
    output = reinsert_code_blocks(output, triple_code_blocks)

    # Step 5: Remove blockquote escaping
    output = remove_blockquote_escaping(output)

    # Clean up multiple consecutive newlines, but preserve intentional spacing
    output = re.sub(r"\n{3,}", "\n\n", output)

    return output.strip()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
|
|
2
|
+
chatgpt_md_converter/converters.py,sha256=nfbKCcYCAYBk_0RQntCVQFQgAlEUWrGtLWULE1wETmU,657
|
|
3
|
+
chatgpt_md_converter/extractors.py,sha256=RNwo57_6jCe-HoX5eCvvZcjSTc2uPax-6QEtXqXA5QQ,1880
|
|
4
|
+
chatgpt_md_converter/formatters.py,sha256=daekV8M-42E3_N1uXx6M4EbZpSToHo8Vt8fl8AP_yyA,1197
|
|
5
|
+
chatgpt_md_converter/helpers.py,sha256=9CtBeMzKYrymECNPl0MXsW0Vscp4A02a64a5z0sVWqE,261
|
|
6
|
+
chatgpt_md_converter/telegram_formatter.py,sha256=MDyC_gkjN7J-LoMxQaJ1awcEQZzcaYFosOdCgDeDkRU,4036
|
|
7
|
+
chatgpt_md_converter-0.2.0.dist-info/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
8
|
+
chatgpt_md_converter-0.2.0.dist-info/METADATA,sha256=zm80EZ56yxE7Z3AZkZXIm9CKxPDamouKuqFjS4y1xgU,3086
|
|
9
|
+
chatgpt_md_converter-0.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
10
|
+
chatgpt_md_converter-0.2.0.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
11
|
+
chatgpt_md_converter-0.2.0.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
|
|
2
|
-
chatgpt_md_converter/converters.py,sha256=nfbKCcYCAYBk_0RQntCVQFQgAlEUWrGtLWULE1wETmU,657
|
|
3
|
-
chatgpt_md_converter/extractors.py,sha256=RNwo57_6jCe-HoX5eCvvZcjSTc2uPax-6QEtXqXA5QQ,1880
|
|
4
|
-
chatgpt_md_converter/formatters.py,sha256=gG_SavtZI0BVl7SqkwGZ_usCB89ZPpAQWofpDUd9DzU,878
|
|
5
|
-
chatgpt_md_converter/helpers.py,sha256=9CtBeMzKYrymECNPl0MXsW0Vscp4A02a64a5z0sVWqE,261
|
|
6
|
-
chatgpt_md_converter/telegram_formatter.py,sha256=3XSNWda_5LKRShjZlkO-D7c1Uq77pfvUGlhqliEO0eU,2007
|
|
7
|
-
chatgpt_md_converter-0.1.2.dist-info/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
|
|
8
|
-
chatgpt_md_converter-0.1.2.dist-info/METADATA,sha256=roSPyHowfr_bCIlyWkja5ozrq3j8zjAQI1cI_0Iqodo,3086
|
|
9
|
-
chatgpt_md_converter-0.1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
10
|
-
chatgpt_md_converter-0.1.2.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
|
|
11
|
-
chatgpt_md_converter-0.1.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|