chatgpt-md-converter 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,16 +2,43 @@ import re
2
2
 
3
3
 
4
4
# A complete fenced block: an opening run of three or more backticks, an
# optional language tag, the body, and a closing run identical to the opener.
_CODE_BLOCK_RE = re.compile(
    r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
    flags=re.DOTALL,
)

# A line consisting solely of a fence, optionally followed by a language tag.
_FENCE_LINE_RE = re.compile(r"^(?P<fence>`{3,})(?P<lang>\w+)?$")


def ensure_closing_delimiters(text: str) -> str:
    """Append missing closing backtick delimiters.

    Complete fenced code blocks are removed from consideration first, so
    backticks inside them do not influence the balancing. Three repairs are
    then applied in order, each looking at the text as left by the previous
    one:

    1. If a fence line is left open, a newline plus a matching fence is
       appended.
    2. If the count of triple backticks outside complete blocks is odd, a
       triple backtick is appended.
    3. If the count of single backticks outside complete blocks is odd, a
       single backtick is appended.

    Args:
        text: Markdown text that may contain unterminated code delimiters.

    Returns:
        ``text`` with any required closing delimiters appended; the input is
        returned unchanged when everything is already balanced.
    """
    # Text with every complete fenced block removed.
    remainder = _CODE_BLOCK_RE.sub("", text)

    # Track fence lines that were opened but never closed by an equal fence.
    open_fences = []
    for line in remainder.splitlines():
        fence_match = _FENCE_LINE_RE.match(line.strip())
        if fence_match is None:
            continue
        fence = fence_match.group("fence")
        if open_fences and open_fences[-1] == fence:
            open_fences.pop()
        else:
            open_fences.append(fence)

    if open_fences:
        text += "\n" + open_fences[-1]
        # The appended fence may complete a block, so strip again.
        remainder = _CODE_BLOCK_RE.sub("", text)

    # Balance triple backticks that are not part of a complete fence.
    if remainder.count("```") % 2 != 0:
        text += "```"
        remainder = _CODE_BLOCK_RE.sub("", text)

    # Balance single backticks outside fenced blocks.
    if remainder.count("`") % 2 != 0:
        text += "`"

    return text
16
43
 
17
44
 
@@ -25,8 +52,8 @@ def extract_and_convert_code_blocks(text: str):
25
52
  code_blocks = {}
26
53
 
27
54
  def replacer(match):
28
- language = match.group(1) if match.group(1) else ""
29
- code_content = match.group(3)
55
+ language = match.group("lang") if match.group("lang") else ""
56
+ code_content = match.group("code")
30
57
 
31
58
  # Properly escape HTML entities in code content
32
59
  escaped_content = (
@@ -44,8 +71,14 @@ def extract_and_convert_code_blocks(text: str):
44
71
  return (placeholder, html_code_block)
45
72
 
46
73
  modified_text = text
47
- for match in re.finditer(r"```(\w*)?(\n)?(.*?)```", text, flags=re.DOTALL):
48
- placeholder, html_code_block = replacer(match)
74
+ code_block_pattern = re.compile(
75
+ r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
76
+ flags=re.DOTALL,
77
+ )
78
+ for match in code_block_pattern.finditer(text):
79
+ placeholder, html_code_block = replacer(
80
+ match
81
+ )
49
82
  code_blocks[placeholder] = html_code_block
50
83
  modified_text = modified_text.replace(match.group(0), placeholder, 1)
51
84
 
@@ -0,0 +1,114 @@
1
+ import re
2
+ from html.parser import HTMLParser
3
+
4
+ MAX_LENGTH = 4096
5
+
6
+
7
class HTMLTagTracker(HTMLParser):
    """Record which tracked HTML tags are still open in the markup fed so far.

    Feed markup with ``feed()``; afterwards ``get_open_tags_html()`` returns
    the opening tags needed to re-enter the current nesting and
    ``get_closing_tags_html()`` returns the closing tags needed to leave it.
    Tags outside ``TRACKED_TAGS`` are ignored.
    """

    # Tags whose open/close state is recorded; every other tag is skipped.
    TRACKED_TAGS = frozenset(
        ("b", "i", "u", "s", "code", "pre", "a", "span", "blockquote")
    )

    def __init__(self):
        super().__init__()
        # Currently open tags as (tag, attrs) pairs, outermost first.
        self.open_tags = []

    def handle_starttag(self, tag, attrs):
        """Push a tracked opening tag together with its attributes."""
        if tag in self.TRACKED_TAGS:
            self.open_tags.append((tag, attrs))

    def handle_endtag(self, tag):
        """Remove the innermost open tag with the same name, if there is one."""
        for index in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[index][0] == tag:
                del self.open_tags[index]
                break

    def get_open_tags_html(self):
        """Return the opening tags (outermost first) for all open tags.

        The parser hands attribute values over already unescaped, and gives
        ``None`` for attributes written without a value. Values are therefore
        escaped again here, and valueless attributes are emitted as a bare
        name, so the result is valid markup.
        """
        from html import escape

        parts = []
        for tag, attrs in self.open_tags:
            rendered = []
            for name, value in attrs:
                if value is None:
                    rendered.append(name)
                else:
                    rendered.append(f'{name}="{escape(value, quote=True)}"')
            attr_str = " " + " ".join(rendered) if rendered else ""
            parts.append(f"<{tag}{attr_str}>")
        return "".join(parts)

    def get_closing_tags_html(self):
        """Return the closing tags (innermost first) for all open tags."""
        return "".join(f"</{tag}>" for tag, _ in reversed(self.open_tags))
34
+
35
+
36
def split_pre_block(pre_block: str, max_length: "int | None" = None) -> list[str]:
    """Split one ``<pre>`` block into several complete blocks by line.

    The content is cut only at line boundaries, and every resulting chunk is
    wrapped in the same opening and closing tags as the input, including any
    attributes on ``<code>`` such as ``class="language-python"``.

    Args:
        pre_block: Either ``<pre><code ...>...</code></pre>`` or a plain
            ``<pre>...</pre>`` string. Anything that does not match the first
            form is assumed to be the second form, and its first 5 and last 6
            characters are stripped as the wrapper.
        max_length: Maximum length of each returned chunk including its
            wrapper tags. Defaults to the module-level ``MAX_LENGTH``.

    Returns:
        The list of wrapped chunks, empty when the block has no content. A
        single line that does not fit the budget on its own is still emitted
        as one (oversized) chunk rather than being cut mid-line.
    """
    limit = MAX_LENGTH if max_length is None else max_length

    # language-aware: <pre><code class="language-python">...</code></pre>
    match = re.match(r"<pre><code(.*?)>(.*)</code></pre>", pre_block, re.DOTALL)
    if match:
        attr, content = match.groups()
        opener, closer = f"<pre><code{attr}>", "</code></pre>"
    else:
        # regular <pre>...</pre>
        content = pre_block[5:-6]
        opener, closer = "<pre>", "</pre>"

    # Room left for content once the real wrapper (attributes included) is
    # accounted for.
    budget = limit - len(opener) - len(closer)

    chunks, buf = [], ""
    for line in content.splitlines(keepends=True):
        # Flush only a non-empty buffer, so an oversized first line never
        # produces an empty wrapper chunk.
        if buf and len(buf) + len(line) > budget:
            chunks.append(f"{opener}{buf}{closer}")
            buf = ""
        buf += line
    if buf:
        chunks.append(f"{opener}{buf}{closer}")
    return chunks
64
+
65
+
66
def split_html_for_telegram(text: str) -> list[str]:
    """Split HTML text into chunks that aim to stay within ``MAX_LENGTH``.

    The text is first broken into pieces that are never cut internally:
    ``<pre>`` blocks (pre-split by ``split_pre_block``) and, for everything
    else, the fragments and separators obtained by splitting on blank lines,
    ``<br>`` tags and newlines. Pieces are then packed greedily into chunks.

    One ``HTMLTagTracker`` follows the tag nesting across all pieces. When a
    chunk has to end while tracked tags are open, their closing tags are
    appended to that chunk and the matching opening tags are placed at the
    start of the next one, so each chunk is balanced on its own. The length
    of the pending closing tags is counted against the limit.

    Args:
        text: HTML-formatted text.

    Returns:
        The chunks in order. Parts between ``<pre>`` blocks that contain only
        whitespace are dropped, as is a whitespace-only tail. A single piece
        that is longer than the limit by itself is emitted as one oversized
        chunk.
    """
    pre_pattern = re.compile(r"(<pre>.*?</pre>|<pre><code.*?</code></pre>)", re.DOTALL)
    separator_pattern = re.compile(r"(\n\s*\n|<br\s*/?>|\n)")

    # Splitting on a pattern with one capturing group alternates plain text
    # (even indexes) with the captured <pre> blocks (odd indexes), so parity
    # identifies pre blocks reliably.
    pieces = []
    for index, part in enumerate(pre_pattern.split(text)):
        if not part.strip():
            continue
        if index % 2 == 1:
            pieces.extend(split_pre_block(part))
        else:
            pieces.extend(block for block in separator_pattern.split(part) if block)

    chunks = []
    tracker = HTMLTagTracker()
    reopened = ""  # opening tags re-emitted at the start of the current chunk
    current = ""
    for piece in pieces:
        # Tag state at the end of `current`, captured before the tracker
        # sees the next piece.
        reopen_html = tracker.get_open_tags_html()
        close_html = tracker.get_closing_tags_html()
        tracker.feed(piece)

        projected = len(current) + len(piece) + len(tracker.get_closing_tags_html())
        body = current[len(reopened):]
        if projected > MAX_LENGTH and body:
            # Do not emit a chunk that carries nothing but whitespace.
            if body.strip():
                chunks.append(current + close_html)
            reopened = reopen_html
            current = reopened + piece
            # Restart tracking from the content of the new chunk.
            tracker = HTMLTagTracker()
            tracker.feed(current)
        else:
            current += piece

    if current[len(reopened):].strip():
        chunks.append(current + tracker.get_closing_tags_html())

    return chunks
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatgpt_md_converter
3
- Version: 0.3.5
3
+ Version: 0.3.7
4
4
  Summary: A package for converting markdown to HTML for chat Telegram bots
5
- Home-page: https://github.com/Latand/formatter-chatgpt-telegram
5
+ Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
6
6
  Author: Kostiantyn Kriuchkov
7
7
  Author-email: latand666@gmail.com
8
8
  Classifier: Programming Language :: Python :: 3
@@ -0,0 +1,12 @@
1
+ chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
+ chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
+ chatgpt_md_converter/extractors.py,sha256=uThH9vnjlEwZowCbxvcZreMZUPqUEiuq0nbWva3K-CE,3023
4
+ chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
+ chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
+ chatgpt_md_converter/html_splitter.py,sha256=8ao4QU5PFDFCHMg8pj5kBqmxSOUO6RfzqQfk4o1F8ms,3897
7
+ chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
8
+ chatgpt_md_converter-0.3.7.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
9
+ chatgpt_md_converter-0.3.7.dist-info/METADATA,sha256=4gweCWqlv3a6pR6FJbf-ycCEToIjCRf2Ohnk5p81bwQ,5792
10
+ chatgpt_md_converter-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
11
+ chatgpt_md_converter-0.3.7.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
12
+ chatgpt_md_converter-0.3.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.0.0)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +0,0 @@
1
- chatgpt_md_converter/__init__.py,sha256=AfkikySkXsJ8HKQcSlU7B1KBHz54QCGJ7MO5Ka9oWRM,79
2
- chatgpt_md_converter/converters.py,sha256=fgebhbhMcIOqnr0xuV04v81RD91FfaGfA0kO417cDqc,831
3
- chatgpt_md_converter/extractors.py,sha256=WU38iAG-MANmilqR73gAvxqqXvx4JT8q3xrac_GRXGI,2071
4
- chatgpt_md_converter/formatters.py,sha256=UbjRG7bLETIGDaFDbFybwW8dKYBMDmgLmIasJiw_j60,2304
5
- chatgpt_md_converter/helpers.py,sha256=2Nc9_s0HcLq79mBt7Hje19LzbO6z9mUNgayoMyWkIhI,874
6
- chatgpt_md_converter/telegram_formatter.py,sha256=YlWW8JUlXqP_3chz53_kj15o4d2uW0RlVsuJVcCrzic,3872
7
- chatgpt_md_converter-0.3.5.dist-info/licenses/LICENSE,sha256=SDr2jeP-s2g4vf17-jdLXrrqA4_mU7L_RtSJlv4Y2mk,1077
8
- chatgpt_md_converter-0.3.5.dist-info/METADATA,sha256=ly_GTnX933MbdUCZKKQAn3kQrICrRiCWvgaJ1dBgWCc,5785
9
- chatgpt_md_converter-0.3.5.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
10
- chatgpt_md_converter-0.3.5.dist-info/top_level.txt,sha256=T2o7csVtZgr-Pwm83aSUkZn0humJmDFNqW38tRSsNqw,21
11
- chatgpt_md_converter-0.3.5.dist-info/RECORD,,