chatgpt-md-converter 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@ def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
16
16
  Splits the text by markdown tag and replaces it with the specified HTML tag.
17
17
  """
18
18
  tag_pattern = re.compile(
19
- r"{}(.*?){}".format(re.escape(md_tag), re.escape(md_tag)), re.DOTALL
19
+ r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
20
+ re.DOTALL,
20
21
  )
21
22
  return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)
@@ -1,9 +1,6 @@
1
- import re
2
-
3
-
4
1
  def combine_blockquotes(text: str) -> str:
5
2
  """
6
- Combines multiline blockquotes into a single blockquote.
3
+ Combines multiline blockquotes into a single blockquote while keeping the \n characters.
7
4
  """
8
5
  lines = text.split("\n")
9
6
  combined_lines = []
@@ -17,7 +14,7 @@ def combine_blockquotes(text: str) -> str:
17
14
  else:
18
15
  if in_blockquote:
19
16
  combined_lines.append(
20
- "<blockquote>" + " ".join(blockquote_lines) + "</blockquote>"
17
+ "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
21
18
  )
22
19
  blockquote_lines = []
23
20
  in_blockquote = False
@@ -25,7 +22,18 @@ def combine_blockquotes(text: str) -> str:
25
22
 
26
23
  if in_blockquote:
27
24
  combined_lines.append(
28
- "<blockquote>" + " ".join(blockquote_lines) + "</blockquote>"
25
+ "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
29
26
  )
30
27
 
31
28
  return "\n".join(combined_lines)
29
+
30
+
31
+ def fix_asterisk_equations(text: str) -> str:
32
+ """
33
+ Replaces numeric expressions with '*' in them with '×'
34
+ to avoid accidental italic formatting.
35
+ e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
36
+ """
37
+ import re
38
+ eq_pattern = re.compile(r'(\d+)\s*\*\s*(\d+)')
39
+ return eq_pattern.sub(r'\1×\2', text)
@@ -1,56 +1,100 @@
1
1
  import re
2
+
2
3
  from .converters import convert_html_chars, split_by_tag
3
4
  from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
4
5
  from .formatters import combine_blockquotes
5
6
  from .helpers import remove_blockquote_escaping
6
7
 
7
8
 
9
+ def extract_inline_code_snippets(text: str):
10
+ """
11
+ Extracts inline code (single-backtick content) from the text,
12
+ replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
13
+ This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
14
+ """
15
+ placeholders = []
16
+ code_snippets = {}
17
+ inline_code_pattern = re.compile(r"`([^`]+)`")
18
+
19
+ def replacer(match):
20
+ snippet = match.group(1)
21
+ placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
22
+ placeholders.append(placeholder)
23
+ code_snippets[placeholder] = snippet
24
+ return placeholder
25
+
26
+ new_text = inline_code_pattern.sub(replacer, text)
27
+ return new_text, code_snippets
28
+
29
+
8
30
  def telegram_format(text: str) -> str:
9
31
  """
10
32
  Converts markdown in the provided text to HTML supported by Telegram.
11
33
  """
34
+
12
35
  # Step 0: Combine blockquotes
13
36
  text = combine_blockquotes(text)
14
37
 
15
38
  # Step 1: Convert HTML reserved symbols
16
39
  text = convert_html_chars(text)
17
40
 
18
- # Step 2: Extract and convert code blocks first
19
- output, code_blocks = extract_and_convert_code_blocks(text)
41
+ # Step 2: Extract and convert triple-backtick code blocks first
42
+ output, triple_code_blocks = extract_and_convert_code_blocks(text)
43
+
44
+ # Step 2.5: Extract inline code snippets (single backticks) so they won't be parsed as italics, etc.
45
+ output, inline_code_snippets = extract_inline_code_snippets(output)
20
46
 
21
- # Step 3: Escape HTML special characters in the output text
47
+ # Step 3: Escape HTML special characters in the output text (for non-code parts)
48
+ # We do NOT want to escape what's inside placeholders here, only what's outside code placeholders.
22
49
  output = output.replace("<", "&lt;").replace(">", "&gt;")
23
50
 
24
- # Inline code
25
- output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
51
+ # Convert headings (H1-H6)
52
+ output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
53
+
54
+ # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
55
+ output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
56
+
57
+ # Remove this old inline code replacement — now handled by extract_inline_code_snippets()
58
+ # output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
26
59
 
27
60
  # Nested Bold and Italic
28
61
  output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
62
+ output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
29
63
 
30
- # Process markdown formatting tags (bold, underline, italic, strikethrough)
31
- # and convert them to their respective HTML tags
64
+ # Process markdown for bold (**), underline (__), strikethrough (~~)
32
65
  output = split_by_tag(output, "**", "b")
33
66
  output = split_by_tag(output, "__", "u")
34
- output = split_by_tag(output, "_", "i")
35
- output = split_by_tag(output, "*", "i")
36
67
  output = split_by_tag(output, "~~", "s")
37
68
 
38
- # Remove storage links
39
- output = re.sub(r"【[^】]+】", "", output)
69
+ # Custom approach for single-asterisk italic
70
+ italic_pattern = re.compile(
71
+ r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])",
72
+ re.DOTALL
73
+ )
74
+ output = italic_pattern.sub(r"<i>\1</i>", output)
40
75
 
41
- # Convert links
42
- output = re.sub(r"\[(.*?)\]\((.*?)\)", r'<a href="\2">\1</a>', output)
76
+ # Process single underscore-based italic
77
+ output = split_by_tag(output, "_", "i")
78
+
79
+ # Remove storage links (Vector storage placeholders like 【4:0†source】)
80
+ output = re.sub(r"【[^】]+】", "", output)
43
81
 
44
- # Convert lists
45
- output = re.sub(r"^\s*[\-\*] (.+)", r"• \1", output, flags=re.MULTILINE)
82
+ # Convert Markdown links/images to <a href="">…</a>
83
+ link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
84
+ output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
46
85
 
47
- # Convert headings
48
- output = re.sub(r"^\s*#+ (.+)", r"<b>\1</b>", output, flags=re.MULTILINE)
86
+ # Step 3.5: Reinsert inline code snippets, escaping special chars in code content
87
+ for placeholder, snippet in inline_code_snippets.items():
88
+ escaped_snippet = snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
89
+ output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
49
90
 
50
- # Step 4: Reinsert the converted HTML code blocks
51
- output = reinsert_code_blocks(output, code_blocks)
91
+ # Step 4: Reinsert the converted triple-backtick code blocks
92
+ output = reinsert_code_blocks(output, triple_code_blocks)
52
93
 
53
94
  # Step 5: Remove blockquote escaping
54
95
  output = remove_blockquote_escaping(output)
55
96
 
56
- return output
97
+ # Clean up multiple consecutive newlines, but preserve intentional spacing
98
+ output = re.sub(r"\n{3,}", "\n\n", output)
99
+
100
+ return output.strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: chatgpt_md_converter
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
5
  Home-page: https://github.com/Latand/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
@@ -0,0 +1,11 @@
1
+ chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
+ chatgpt_md_converter/converters.py,sha256=nfbKCcYCAYBk_0RQntCVQFQgAlEUWrGtLWULE1wETmU,657
3
+ chatgpt_md_converter/extractors.py,sha256=RNwo57_6jCe-HoX5eCvvZcjSTc2uPax-6QEtXqXA5QQ,1880
4
+ chatgpt_md_converter/formatters.py,sha256=daekV8M-42E3_N1uXx6M4EbZpSToHo8Vt8fl8AP_yyA,1197
5
+ chatgpt_md_converter/helpers.py,sha256=9CtBeMzKYrymECNPl0MXsW0Vscp4A02a64a5z0sVWqE,261
6
+ chatgpt_md_converter/telegram_formatter.py,sha256=MDyC_gkjN7J-LoMxQaJ1awcEQZzcaYFosOdCgDeDkRU,4036
7
+ chatgpt_md_converter-0.2.0.dist-info/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
8
+ chatgpt_md_converter-0.2.0.dist-info/METADATA,sha256=zm80EZ56yxE7Z3AZkZXIm9CKxPDamouKuqFjS4y1xgU,3086
9
+ chatgpt_md_converter-0.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
10
+ chatgpt_md_converter-0.2.0.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
11
+ chatgpt_md_converter-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +0,0 @@
1
- chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
- chatgpt_md_converter/converters.py,sha256=-SbsAiMetDZVkC7PrEQrrKlpoagnUycCL1WNBozd7u0,635
3
- chatgpt_md_converter/extractors.py,sha256=RNwo57_6jCe-HoX5eCvvZcjSTc2uPax-6QEtXqXA5QQ,1880
4
- chatgpt_md_converter/formatters.py,sha256=T85JwXI7t3PpqAHvkV7FFrmBar6pYRYLVLpET0TeRp0,856
5
- chatgpt_md_converter/helpers.py,sha256=9CtBeMzKYrymECNPl0MXsW0Vscp4A02a64a5z0sVWqE,261
6
- chatgpt_md_converter/telegram_formatter.py,sha256=3TrQpuVm1P4Qv1ZMrcBwD7sA2GI-yCDGuTUwUZSlw3E,1896
7
- chatgpt_md_converter-0.1.1.dist-info/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
8
- chatgpt_md_converter-0.1.1.dist-info/METADATA,sha256=635E9EVpEVMP1fpbPAuDl0x_Y9cu3Ye5WfnwdjnWeYc,3086
9
- chatgpt_md_converter-0.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
10
- chatgpt_md_converter-0.1.1.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
11
- chatgpt_md_converter-0.1.1.dist-info/RECORD,,