PyPI - chatgpt-md-converter - Versions diffs - 0.3.8__tar.gz → 0.3.10__tar.gz - Mend

chatgpt-md-converter 0.3.8tar.gz → 0.3.10tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatgpt_md_converter
-Version: 0.3.8
+Version: 0.3.10
 Summary: A package for converting markdown to HTML for chat Telegram bots
 Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
 Author: Kostiantyn Kriuchkov
@@ -114,6 +114,24 @@ Hidden by default
 Multiple lines</blockquote>
 ```
+## Performance
+Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
+| Sample       | Direction     | Avg ms/call | Ops/sec |
+|--------------|---------------|-------------|---------|
+| short_inline | Markdown→HTML | 0.043       | 23,476  |
+| short_inline | HTML→Markdown | 0.078       | 12,824  |
+| medium_block | Markdown→HTML | 0.108       |  9,270  |
+| medium_block | HTML→Markdown | 0.155       |  6,437  |
+| long_mixed   | Markdown→HTML | 0.446       |  2,242  |
+| long_mixed   | HTML→Markdown | 0.730       |  1,370  |
+These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
+Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
 ## Requirements
 - Python 3.x

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/README.md RENAMED Viewed

@@ -91,6 +91,24 @@ Hidden by default
 Multiple lines</blockquote>
 ```
+## Performance
+Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
+| Sample       | Direction     | Avg ms/call | Ops/sec |
+|--------------|---------------|-------------|---------|
+| short_inline | Markdown→HTML | 0.043       | 23,476  |
+| short_inline | HTML→Markdown | 0.078       | 12,824  |
+| medium_block | Markdown→HTML | 0.108       |  9,270  |
+| medium_block | HTML→Markdown | 0.155       |  6,437  |
+| long_mixed   | Markdown→HTML | 0.446       |  2,242  |
+| long_mixed   | HTML→Markdown | 0.730       |  1,370  |
+These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
+Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
 ## Requirements
 - Python 3.x

chatgpt_md_converter-0.3.10/chatgpt_md_converter/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .html_splitter import split_html_for_telegram
+from .html_to_markdown import html_to_telegram_markdown
+from .telegram_formatter import telegram_format
+__all__ = ["telegram_format", "split_html_for_telegram", "html_to_telegram_markdown"]

chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/escaping.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Shared escaping utilities for Telegram Markdown conversion."""
+from __future__ import annotations
+import html
+import re
+from .tree import Node
+# A single-line ``*text*`` span: the opening star is not escaped or doubled, the
+# inner text has no stars or newlines and no whitespace at its edges, and the
+# closing star is not followed by an alphanumeric character or another star.
+_SIMPLE_STAR_ITALIC = re.compile(
+    r"(?<!\\)(?<!\*)\*(?=[^\s])([^\*\n]+?)(?<!\s)\*(?![A-Za-z0-9\*])",
+)
+def _canonicalize_star_italics(text: str) -> str:
+    """Rewrite simple ``*italic*`` spans as ``_italic_``.
+    Spans whose inner text contains ``*``, ``_`` or a backtick are returned unchanged.
+    """
+    def _swap_markers(found: re.Match[str]) -> str:
+        body = found.group(1)
+        if any(symbol in body for symbol in ("*", "_", "`")):
+            return found.group(0)
+        return f"_{body}_"
+    return _SIMPLE_STAR_ITALIC.sub(_swap_markers, text)
+def normalise_text(text: str) -> str:
+    """Decode HTML entities in *text* and replace non-breaking spaces with plain spaces."""
+    return html.unescape(text).replace("\u00a0", " ") if text else ""
+def collect_text(node: Node) -> str:
+    """Return the entity-decoded text beneath *node*, ignoring markup.
+    ``<br>`` elements contribute a newline; other elements contribute the text of
+    their own subtree.
+    """
+    if node.kind == "text":
+        return html.unescape(node.text)
+    def _pieces():
+        for child in node.children:
+            if child.kind == "text":
+                yield html.unescape(child.text)
+            elif child.kind == "element":
+                yield "\n" if child.tag.lower() == "br" else collect_text(child)
+    return "".join(_pieces())
+def escape_inline_code(text: str) -> str:
+    """Prefix every backtick in *text* with a backslash."""
+    return "\\`".join(text.split("`"))
+def escape_link_label(label: str) -> str:
+    """Backslash-escape square brackets and parentheses inside a link label."""
+    table = str.maketrans({ch: f"\\{ch}" for ch in "[]()"})
+    return label.translate(table)
+def escape_link_url(url: str) -> str:
+    """Escape backslashes and closing parentheses so *url* fits inside ``(...)``."""
+    # Backslashes are doubled first so the escapes added for ")" are not doubled again.
+    backslashes_doubled = url.replace("\\", "\\\\")
+    return backslashes_doubled.replace(")", "\\)")
+def post_process(markdown: str) -> str:
+    """Normalise rendered Markdown before it is returned.
+    Removes carriage returns, rewrites line-leading ``•`` bullets as ``- ``, collapses
+    runs of three or more newlines to two, strips trailing whitespace on every line,
+    canonicalises simple ``*italic*`` spans, and strips the result.
+    """
+    # Drop "\r" first: in "\n\r\n\r\n" the carriage returns would otherwise hide a run of
+    # blank lines from the "\n{3,}" collapse, and "\r•" would hide a bullet from the
+    # line-start match, so CRLF input is handled the same way as LF input.
+    text = markdown.replace("\r", "")
+    text = re.sub(r"(^|\n)•\s", r"\1- ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = "\n".join(line.rstrip() for line in text.split("\n"))
+    text = _canonicalize_star_italics(text)
+    return text.strip()

chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/handlers.py ADDED Viewed

@@ -0,0 +1,216 @@
+"""Tag-specific renderers for Telegram Markdown."""
+from __future__ import annotations
+from typing import Callable, Dict
+from .escaping import (collect_text, escape_inline_code, escape_link_label,
+                       escape_link_url, normalise_text)
+from .state import RenderState
+from .tree import Node
+# Callable type of a tag renderer: (node, render state) -> Markdown text.
+InlineHandler = Callable[[Node, RenderState], str]
+# (opening, closing) Markdown markers looked up by `_handle_inline_marker`,
+# keyed by lower-cased tag name: underline tags map to "__", strike-through tags to "~~".
+_INLINE_MARKERS: Dict[str, tuple[str, str]] = {
+    "u": ("__", "__"),
+    "ins": ("__", "__"),
+    "s": ("~~", "~~"),
+    "strike": ("~~", "~~"),
+    "del": ("~~", "~~"),
+}
+def render_nodes(nodes: list[Node], state: RenderState) -> str:
+    """Render each node in *nodes* with ``render_node`` and concatenate the results."""
+    rendered = [render_node(item, state) for item in nodes]
+    return "".join(rendered)
+def render_node(node: Node, state: RenderState) -> str:
+    """Render one node: text is normalised, elements go through ``TAG_DISPATCH``.
+    Elements whose tag has no registered handler render as their children only.
+    """
+    if node.kind == "text":
+        return normalise_text(node.text)
+    renderer = TAG_DISPATCH.get(node.tag.lower())
+    return renderer(node, state) if renderer else render_nodes(node.children, state)
+def _split_surrounding_whitespace(text: str) -> tuple[str, str, str]:
+    """Split *text* into ``(leading whitespace, core, trailing whitespace)``."""
+    without_leading = text.lstrip()
+    start = len(text) - len(without_leading)
+    core = without_leading.rstrip()
+    end = start + len(core)
+    # For all-whitespace input everything lands in the leading part.
+    return text[:start], core, text[end:]
+def _italic_boundary_conflict(marker: str, core: str) -> bool:
+    """Tell whether wrapping *core* in *marker* would put the marker next to the same character.
+    For ``*`` any leading or trailing star conflicts. For ``_`` only a single
+    underscore at an edge conflicts; a doubled ``__`` edge does not.
+    """
+    if marker == "*":
+        return core[:1] == "*" or core[-1:] == "*"
+    if marker != "_":
+        return False
+    single_leading = core.startswith("_") and not core.startswith("__")
+    single_trailing = core.endswith("_") and not core.endswith("__")
+    return single_leading or single_trailing
+def _choose_italic_marker(state: RenderState, core: str) -> str:
+    """Pick ``*`` or ``_`` for an italic span around *core*.
+    ``*`` is preferred at even italic depth, except for the outermost italic inside
+    bold, where ``_`` is preferred. The first preference without a boundary conflict
+    wins; if both conflict, the first preference is used anyway.
+    """
+    depth = state.italic_depth
+    star_first = depth % 2 == 0 and not (state.bold_depth > 0 and depth == 0)
+    candidates = ["*", "_"] if star_first else ["_", "*"]
+    return next(
+        (option for option in candidates if not _italic_boundary_conflict(option, core)),
+        candidates[0],
+    )
+def _handle_bold(node: Node, state: RenderState) -> str:
+    """Render a bold element as ``**text**``, keeping edge whitespace outside the markers."""
+    nested = state.child(bold_depth=state.bold_depth + 1)
+    before, body, after = _split_surrounding_whitespace(render_nodes(node.children, nested))
+    if body:
+        return f"{before}**{body}**{after}"
+    # Nothing but whitespace: emit it unwrapped.
+    return before + after
+def _handle_italic(node: Node, state: RenderState) -> str:
+    """Render an italic element with ``*`` or ``_``, keeping edge whitespace outside the markers."""
+    nested = state.child(italic_depth=state.italic_depth + 1)
+    before, body, after = _split_surrounding_whitespace(render_nodes(node.children, nested))
+    if body:
+        # The marker is chosen from the enclosing state, not the nested one.
+        marker = _choose_italic_marker(state, body)
+        return f"{before}{marker}{body}{marker}{after}"
+    return before + after
+def _handle_inline_marker(node: Node, state: RenderState) -> str:
+    """Wrap the element's content in the marker pair registered for its tag in ``_INLINE_MARKERS``."""
+    opening, closing = _INLINE_MARKERS[node.tag.lower()]
+    before, body, after = _split_surrounding_whitespace(render_nodes(node.children, state))
+    if body:
+        return f"{before}{opening}{body}{closing}{after}"
+    return before + after
+def _handle_spoiler(node: Node, state: RenderState) -> str:
+    """Render spoiler content as ``||text||``, keeping edge whitespace outside the markers."""
+    before, body, after = _split_surrounding_whitespace(render_nodes(node.children, state))
+    if body:
+        return f"{before}||{body}||{after}"
+    return before + after
+def _handle_code(node: Node, state: RenderState) -> str:
+    """Render inline code: the element's plain text in backticks, inner backticks escaped."""
+    escaped = escape_inline_code(collect_text(node))
+    return f"`{escaped}`"
+def _handle_pre(node: Node, state: RenderState) -> str:
+    """Render ``<pre>`` as a fenced code block.
+    When the only child is a ``<code>`` element, the first ``language-*`` class on
+    it supplies the fence language. Content follows the fence on a new line when a
+    language is present or the content spans several lines.
+    """
+    kids = node.children
+    language: str | None = None
+    wraps_code = len(kids) == 1 and kids[0].kind == "element" and kids[0].tag.lower() == "code"
+    if wraps_code:
+        source = kids[0]
+        class_names = (source.attrs.get("class") or "").split()
+        language = next(
+            (name.split("-", 1)[1] for name in class_names if name.startswith("language-")),
+            None,
+        )
+    else:
+        # Wrap the children in a synthetic element so they can be collected in one call.
+        source = Node(kind="element", tag="__virtual__", children=kids)
+    body = collect_text(source)
+    fence = f"```{language}" if language else "```"
+    if language or "\n" in body:
+        return f"{fence}\n{body}```"
+    return f"{fence}{body}```"
+def _handle_link(node: Node, state: RenderState) -> str:
+    """Render ``<a>`` as ``[label](url)``, or ``![label](url)`` for ``tg://emoji?`` links.
+    An empty label falls back to the href itself.
+    """
+    href = node.attrs.get("href", "") or ""
+    label = render_nodes(node.children, state) or href
+    safe_label = escape_link_label(label)
+    safe_url = escape_link_url(href)
+    if href.startswith("tg://emoji?"):
+        return f"![{safe_label}]({safe_url})"
+    return f"[{safe_label}]({safe_url})"
+def _handle_blockquote(node: Node, state: RenderState) -> str:
+    """Render a blockquote by prefixing every line with ``>``.
+    Plain quotes use ``> text`` (bare ``>`` for empty lines). Quotes carrying the
+    ``expandable`` attribute start with ``**>`` and put no space after the prefix.
+    """
+    expandable = "expandable" in node.attrs
+    rendered: list[str] = []
+    for index, line in enumerate(render_nodes(node.children, state).split("\n")):
+        content = line.rstrip("\r")
+        if expandable:
+            rendered.append(("**>" if index == 0 else ">") + content)
+        elif content:
+            rendered.append(f"> {content}")
+        else:
+            rendered.append(">")
+    return "\n".join(rendered)
+def _handle_tg_emoji(node: Node, state: RenderState) -> str:
+    """Render ``<tg-emoji emoji-id=...>`` as ``![label](tg://emoji?id=...)``.
+    Without an ``emoji-id`` attribute only the rendered label is returned.
+    """
+    label = render_nodes(node.children, state)
+    emoji_id = node.attrs.get("emoji-id")
+    if not emoji_id:
+        return label
+    href = f"tg://emoji?id={emoji_id}"
+    return f"![{escape_link_label(label)}]({href})"
+def _handle_span(node: Node, state: RenderState) -> str:
+    """Render ``<span>``: ``tg-spoiler`` spans become spoilers, all others render their children."""
+    class_names = (node.attrs.get("class") or "").split()
+    if "tg-spoiler" in class_names:
+        return _handle_spoiler(node, state)
+    # ``tg-emoji`` spans and unclassified spans both pass their content through.
+    return render_nodes(node.children, state)
+# Renderer lookup used by `render_node`, keyed by lower-cased tag name.
+# A tag with no entry here is rendered as the concatenation of its children.
+TAG_DISPATCH: Dict[str, Callable[[Node, RenderState], str]] = {
+    "b": _handle_bold,
+    "strong": _handle_bold,
+    "i": _handle_italic,
+    "em": _handle_italic,
+    "u": _handle_inline_marker,
+    "ins": _handle_inline_marker,
+    "s": _handle_inline_marker,
+    "strike": _handle_inline_marker,
+    "del": _handle_inline_marker,
+    "span": _handle_span,
+    "tg-spoiler": _handle_spoiler,
+    "code": _handle_code,
+    "pre": _handle_pre,
+    "a": _handle_link,
+    "blockquote": _handle_blockquote,
+    "tg-emoji": _handle_tg_emoji,
+}

chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/renderer.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""High-level HTML → Telegram Markdown renderer."""
+from __future__ import annotations
+from typing import List
+from .escaping import post_process
+from .handlers import render_nodes
+from .state import RenderState
+from .tree import Node, build_tree
+def html_to_telegram_markdown(html_text: str) -> str:
+    """Convert an HTML fragment to Telegram Markdown: parse, render, then post-process."""
+    tree: List[Node] = build_tree(html_text)
+    return post_process(render_nodes(tree, RenderState()))

chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/state.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Rendering state for HTML → Telegram Markdown conversion."""
+from __future__ import annotations
+from dataclasses import dataclass
+@dataclass(frozen=True)
+class RenderState:
+    """Immutable nesting counters passed down while rendering."""
+    # Number of enclosing bold elements.
+    bold_depth: int = 0
+    # Number of enclosing italic elements.
+    italic_depth: int = 0
+    def child(self, **updates: int) -> "RenderState":
+        """Return a new state with the fields named in *updates* replaced."""
+        merged = {
+            "bold_depth": self.bold_depth,
+            "italic_depth": self.italic_depth,
+            **updates,
+        }
+        return RenderState(**merged)

chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_markdown/tree.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""DOM-like tree construction for Telegram HTML fragments."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from html.parser import HTMLParser
+from typing import Dict, List, Optional
+@dataclass
+class Node:
+    """One node of the parsed tree: a run of text or an element with children."""
+    kind: str  # "text" or "element"
+    # Character data; filled in for text nodes.
+    text: str = ""
+    # Tag name; filled in for element nodes.
+    tag: str = ""
+    # Attribute name -> value mapping built from the parser's (name, value) pairs.
+    attrs: Dict[str, Optional[str]] = field(default_factory=dict)
+    # Nested nodes in the order they were parsed.
+    children: List["Node"] = field(default_factory=list)
+class _HTMLTreeBuilder(HTMLParser):
+    """``HTMLParser`` subclass that assembles parsed markup into a tree of ``Node`` objects."""
+    # Tags that never open a nesting level.
+    SELF_CLOSING_TAGS = {"br"}
+    def __init__(self) -> None:
+        # With convert_charrefs=False, references reach handle_entityref/handle_charref
+        # below, which keep them as raw text for later decoding.
+        super().__init__(convert_charrefs=False)
+        self.root = Node(kind="element", tag="__root__")
+        # Currently open elements; the last entry receives new children.
+        self._stack: List[Node] = [self.root]
+    def handle_starttag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
+        """Open a new element, or record ``<br>`` as a newline text node."""
+        if tag in self.SELF_CLOSING_TAGS:
+            if tag == "br":
+                self._stack[-1].children.append(Node(kind="text", text="\n"))
+            return
+        node = Node(kind="element", tag=tag, attrs=dict(attrs))
+        self._stack[-1].children.append(node)
+        self._stack.append(node)
+    def handle_endtag(self, tag: str) -> None:
+        """Close the innermost open *tag* and anything still open inside it.
+        End tags with no matching open element are ignored.
+        """
+        # Walk from the top of the stack down to index 1; the root at index 0 is never closed.
+        for index in range(len(self._stack) - 1, 0, -1):
+            if self._stack[index].tag == tag:
+                del self._stack[index:]
+                return
+    def handle_startendtag(self, tag: str, attrs: List[tuple[str, Optional[str]]]) -> None:
+        """Handle ``<tag/>``: attach the element without pushing it on the stack."""
+        if tag in self.SELF_CLOSING_TAGS:
+            self.handle_starttag(tag, attrs)
+            return
+        node = Node(kind="element", tag=tag, attrs=dict(attrs))
+        self._stack[-1].children.append(node)
+    def handle_data(self, data: str) -> None:
+        """Append non-empty character data as a text node of the current element."""
+        if data:
+            self._stack[-1].children.append(Node(kind="text", text=data))
+    def handle_entityref(self, name: str) -> None:
+        """Keep a named entity reference verbatim (``&name;``) as text."""
+        self.handle_data(f"&{name};")
+    def handle_charref(self, name: str) -> None:
+        """Keep a numeric character reference verbatim (``&#name;``) as text."""
+        self.handle_data(f"&#{name};")
+def build_tree(html_text: str) -> List[Node]:
+    """Parse *html_text* and return the nodes found at the top level."""
+    parser = _HTMLTreeBuilder()
+    parser.feed(html_text)
+    # close() makes the parser process any input it is still buffering.
+    parser.close()
+    top_level = parser.root.children
+    return top_level

chatgpt_md_converter-0.3.10/chatgpt_md_converter/html_to_markdown.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Backward-compatible entry point for HTML → Telegram Markdown."""
+from .html_markdown.renderer import html_to_telegram_markdown
+__all__ = ["html_to_telegram_markdown"]

chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_formatter.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .telegram_markdown.renderer import telegram_format
+__all__ = ['telegram_format']

chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Modular Telegram Markdown → HTML conversion helpers."""
+from .renderer import telegram_format
+__all__ = ["telegram_format"]

chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/code_blocks.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""Code block extraction utilities for Telegram Markdown conversion."""
+import re
+# A fenced block: a run of three or more backticks, an optional language word, and
+# the lazily matched content up to the next occurrence of the same fence.
+# NOTE(review): the optional look-behind `(?<=\n)?` looks like a no-op, and DOTALL has
+# no effect on `[\s\S]` — confirm before simplifying.
+_CODE_BLOCK_RE = re.compile(
+    r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
+    flags=re.DOTALL,
+)
+def _count_unescaped_backticks(text: str) -> int:
+    """Count backticks preceded by an even number (including zero) of backslashes."""
+    total = 0
+    # Length of the backslash run immediately before the current character.
+    backslash_run = 0
+    for char in text:
+        if char == "\\":
+            backslash_run += 1
+            continue
+        if char == "`" and backslash_run % 2 == 0:
+            total += 1
+        backslash_run = 0
+    return total
+def ensure_closing_delimiters(text: str) -> str:
+    """Append any missing closing backtick fences for Markdown code blocks.
+    Works in three passes: close a fenced block left open at the end of the text,
+    balance any remaining triple backticks, then balance single unescaped backticks.
+    Returns the text with the needed delimiters appended.
+    """
+    # Pass 1: scan line by line, tracking the fence of the block currently open.
+    open_fence = None
+    for line in text.splitlines():
+        stripped = line.strip()
+        if open_fence is None:
+            # Only a line consisting of a fence plus an optional language word opens a block.
+            match = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
+            if match:
+                open_fence = match.group("fence")
+        else:
+            # Any line ending with the same fence closes the block.
+            if stripped.endswith(open_fence):
+                open_fence = None
+    if open_fence is not None:
+        # Put the closing fence on its own line.
+        if not text.endswith("\n"):
+            text += "\n"
+        text += open_fence
+    # Pass 2: with complete fenced blocks removed, an odd number of "```" means one is unclosed.
+    cleaned_inline = _CODE_BLOCK_RE.sub("", text)
+    if cleaned_inline.count("```") % 2 != 0:
+        text += "```"
+    # Pass 3: recompute after the possible append, then balance single unescaped backticks.
+    cleaned_inline = _CODE_BLOCK_RE.sub("", text)
+    if _count_unescaped_backticks(cleaned_inline) % 2 != 0:
+        text += "`"
+    return text
+def extract_and_convert_code_blocks(text: str):
+    """Swap fenced code blocks for placeholders and render each block as HTML.
+    Returns ``(modified_text, code_blocks)`` where *code_blocks* maps every
+    ``CODEBLOCKPLACEHOLDER<n>`` token to its ``<pre><code>`` rendering.
+    """
+    text = ensure_closing_delimiters(text)
+    fenced_block = re.compile(
+        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
+        flags=re.DOTALL,
+    )
+    code_blocks: dict[str, str] = {}
+    modified = text
+    for index, match in enumerate(fenced_block.finditer(text)):
+        language = match.group("lang") or ""
+        # Escape "&" first so the entities produced for "<" and ">" are not escaped again.
+        escaped = match.group("code").replace("&", "&amp;")
+        escaped = escaped.replace("<", "&lt;").replace(">", "&gt;")
+        if language:
+            html_block = f'<pre><code class="language-{language}">{escaped}</code></pre>'
+        else:
+            html_block = f"<pre><code>{escaped}</code></pre>"
+        placeholder = f"CODEBLOCKPLACEHOLDER{index}"
+        code_blocks[placeholder] = html_block
+        # Replace the first remaining occurrence of the matched source text.
+        modified = modified.replace(match.group(0), placeholder, 1)
+    return modified, code_blocks
+def reinsert_code_blocks(text: str, code_blocks: dict[str, str]) -> str:
+    """Replace the first occurrence of each placeholder with its rendered HTML block."""
+    restored = text
+    for marker in code_blocks:
+        restored = restored.replace(marker, code_blocks[marker], 1)
+    return restored

chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/inline.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""Inline text helpers for Telegram Markdown conversion."""
+import re
+# Precompiled inline patterns. Each captures the text between its markers in group 1.
+# Inline code: one or more non-backtick characters between single backticks.
+_inline_code_pattern = re.compile(r"`([^`]+)`")
+# Bold, strike-through and spoiler: the opening marker must not follow a backslash, and
+# the content must start and end with a non-whitespace character.
+_BOLD_PATTERN = re.compile(r"(?<!\\)\*\*(?=\S)(.*?)(?<=\S)\*\*", re.DOTALL)
+# Underline and underscore italic additionally refuse to match when the markers touch
+# an ASCII letter, digit or underscore on the outside (e.g. inside snake_case words).
+_UNDERLINE_PATTERN = re.compile(
+    r"(?<!\\)(?<![A-Za-z0-9_])__(?=\S)(.*?)(?<=\S)__(?![A-Za-z0-9_])",
+    re.DOTALL,
+)
+_ITALIC_UNDERSCORE_PATTERN = re.compile(
+    r"(?<!\\)(?<![A-Za-z0-9_])_(?=\S)(.*?)(?<=\S)_(?![A-Za-z0-9_])",
+    re.DOTALL,
+)
+_STRIKETHROUGH_PATTERN = re.compile(r"(?<!\\)~~(?=\S)(.*?)(?<=\S)~~", re.DOTALL)
+# Spoilers never span lines: the content excludes "\n".
+_SPOILER_PATTERN = re.compile(r"(?<!\\)\|\|(?=\S)([^\n]*?)(?<=\S)\|\|")
+# Star italic: single stars not adjacent to an ASCII letter, digit or backslash on the outside.
+_ITALIC_STAR_PATTERN = re.compile(
+    r"(?<![A-Za-z0-9\\])\*(?!\*)(?=[^\s])(.*?)(?<![\s\\])\*(?![A-Za-z0-9\\])",
+    re.DOTALL,
+)
+# Markdown marker -> precompiled pattern, consulted by `split_by_tag`.
+_PATTERN_MAP = {
+    "**": _BOLD_PATTERN,
+    "__": _UNDERLINE_PATTERN,
+    "_": _ITALIC_UNDERSCORE_PATTERN,
+    "~~": _STRIKETHROUGH_PATTERN,
+    "||": _SPOILER_PATTERN,
+}
+def convert_html_chars(text: str) -> str:
+    """Replace ``&``, ``<`` and ``>`` with their HTML character references."""
+    # "&" goes first so the ampersands of the other entities are not escaped again.
+    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
+        text = text.replace(raw, entity)
+    return text
+def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
+    """Replace every ``md_tag``-delimited span in *out_text* with the given HTML tag.
+    Markers listed in ``_PATTERN_MAP`` use their precompiled pattern; any other
+    marker gets a generic pattern built from the escaped marker text.
+    """
+    pattern = _PATTERN_MAP.get(md_tag)
+    if pattern is None:
+        escaped = re.escape(md_tag)
+        pattern = re.compile(
+            rf"(?<!\\){escaped}(?=\S)(.*?)(?<=\S){escaped}",
+            re.DOTALL,
+        )
+    # The spoiler "tag" carries an attribute, so its closing tag is a plain </span>.
+    is_spoiler = html_tag == 'span class="tg-spoiler"'
+    def _wrap(found: re.Match[str]) -> str:
+        body = found.group(1)
+        if is_spoiler:
+            return f'<span class="tg-spoiler">{body}</span>'
+        return f"<{html_tag}>{body}</{html_tag}>"
+    return pattern.sub(_wrap, out_text)
+def extract_inline_code_snippets(text: str):
+    """Swap inline code spans for placeholders.
+    Returns ``(modified_text, snippets)`` where *snippets* maps every
+    ``INLINECODEPLACEHOLDER<n>`` token to the code found between the backticks.
+    """
+    snippets: dict[str, str] = {}
+    def _stash(found: re.Match[str]) -> str:
+        # Each token is a new key, so the dict size doubles as the running index.
+        token = f"INLINECODEPLACEHOLDER{len(snippets)}"
+        snippets[token] = found.group(1)
+        return token
+    modified = _inline_code_pattern.sub(_stash, text)
+    return modified, snippets
+def apply_custom_italic(text: str) -> str:
+    """Turn single-star ``*italic*`` spans into ``<i>`` elements."""
+    converted = _ITALIC_STAR_PATTERN.sub(r"<i>\1</i>", text)
+    return converted

chatgpt_md_converter-0.3.8/chatgpt_md_converter/helpers.py → chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/postprocess.py RENAMED Viewed

@@ -1,25 +1,19 @@
+"""Post-processing helpers for Telegram Markdown conversion."""
 def remove_blockquote_escaping(output: str) -> str:
-    """
-    Removes the escaping from blockquote tags, including expandable blockquotes.
-    """
-    # Regular blockquotes
+    """Unescape blockquote tags produced during formatting."""
     output = output.replace("&lt;blockquote&gt;", "<blockquote>").replace(
         "&lt;/blockquote&gt;", "</blockquote>"
     )
-    # Expandable blockquotes
     output = output.replace(
         "&lt;blockquote expandable&gt;", "<blockquote expandable>"
     ).replace("&lt;/blockquote&gt;", "</blockquote>")
     return output
 def remove_spoiler_escaping(output: str) -> str:
-    """
-    Ensures spoiler tags are correctly formatted (rather than being escaped).
-    """
-    # Fix any incorrectly escaped spoiler tags
+    """Ensure spoiler spans remain HTML tags, not escaped text."""
     output = output.replace(
         '&lt;span class="tg-spoiler"&gt;', '<span class="tg-spoiler">'
     )

chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/preprocess.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""Pre-processing helpers for Telegram Markdown conversion."""
+def combine_blockquotes(text: str) -> str:
+    """Merge runs of Markdown quote lines into single ``<blockquote>`` elements.
+    Lines starting with ``>`` are collected into the current quote. A line starting
+    with ``**>`` does the same and marks the quote as expandable. Any other line
+    ends the current quote and is copied through unchanged.
+    """
+    output: list[str] = []
+    pending: list[str] = []
+    quoting = False
+    expandable = False
+    for raw in text.split("\n"):
+        if raw.startswith("**>"):
+            quoting = expandable = True
+            pending.append(raw[3:].strip())
+            continue
+        if raw.startswith(">"):
+            if not quoting:
+                quoting, expandable = True, False
+            pending.append(raw[1:].strip())
+            continue
+        if quoting:
+            # A non-quote line closes the quote collected so far.
+            output.append(_render_blockquote(pending, expandable))
+            pending = []
+            quoting = expandable = False
+        output.append(raw)
+    if quoting:
+        output.append(_render_blockquote(pending, expandable))
+    return "\n".join(output)
+def _render_blockquote(lines: list[str], expandable: bool) -> str:
+    """Join *lines* with newlines inside a (possibly expandable) blockquote element."""
+    opening = "<blockquote expandable>" if expandable else "<blockquote>"
+    return opening + "\n".join(lines) + "</blockquote>"

chatgpt_md_converter-0.3.10/chatgpt_md_converter/telegram_markdown/renderer.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""High-level Telegram Markdown → HTML renderer."""
+from __future__ import annotations
+import re
+from .code_blocks import extract_and_convert_code_blocks, reinsert_code_blocks
+from .inline import (apply_custom_italic, convert_html_chars,
+                     extract_inline_code_snippets, split_by_tag)
+from .postprocess import remove_blockquote_escaping, remove_spoiler_escaping
+from .preprocess import combine_blockquotes
+def telegram_format(text: str) -> str:
+    """Convert Telegram-flavoured Markdown to Telegram HTML.
+    The steps are order-sensitive: code is pulled out before escaping and inline
+    conversion, and put back only after every other rewrite has run.
+    """
+    # Build <blockquote> elements first; their tags get escaped below and are restored at the end.
+    text = combine_blockquotes(text)
+    # Park fenced blocks and inline code behind placeholders so later rewrites cannot touch them.
+    output, block_map = extract_and_convert_code_blocks(text)
+    output, inline_snippets = extract_inline_code_snippets(output)
+    output = convert_html_chars(output)
+    # Headings of any level become bold lines.
+    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
+    # "-" / "*" list items become bullets, keeping their indentation.
+    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
+    # Triple markers are handled before the double and single ones that they contain.
+    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
+    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
+    output = split_by_tag(output, "**", "b")
+    output = split_by_tag(output, "__", "u")
+    output = split_by_tag(output, "~~", "s")
+    output = split_by_tag(output, "||", 'span class="tg-spoiler"')
+    output = apply_custom_italic(output)
+    output = split_by_tag(output, "_", "i")
+    # Drop 【…】 spans — presumably ChatGPT citation markers; confirm with upstream.
+    output = re.sub(r"【[^】]+】", "", output)
+    # Links and images both become <a>; the optional leading "!" is discarded.
+    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
+    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
+    # Restore inline code, escaping its content here because it skipped convert_html_chars.
+    for placeholder, snippet in inline_snippets.items():
+        escaped = (
+            snippet.replace("&", "&amp;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+        )
+        output = output.replace(placeholder, f"<code>{escaped}</code>")
+    output = reinsert_code_blocks(output, block_map)
+    # Undo the escaping applied to the blockquote and spoiler tags.
+    output = remove_blockquote_escaping(output)
+    output = remove_spoiler_escaping(output)
+    # Collapse runs of three or more newlines to a single blank line.
+    output = re.sub(r"\n{3,}", "\n\n", output)
+    return output.strip()

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatgpt_md_converter
-Version: 0.3.8
+Version: 0.3.10
 Summary: A package for converting markdown to HTML for chat Telegram bots
 Home-page: https://github.com/botfather-dev/formatter-chatgpt-telegram
 Author: Kostiantyn Kriuchkov
@@ -114,6 +114,24 @@ Hidden by default
 Multiple lines</blockquote>
 ```
+## Performance
+Benchmarks were recorded on Linux 6.16.6 (Python 3.11.10) using 1,000 iterations per sample.
+| Sample       | Direction     | Avg ms/call | Ops/sec |
+|--------------|---------------|-------------|---------|
+| short_inline | Markdown→HTML | 0.043       | 23,476  |
+| short_inline | HTML→Markdown | 0.078       | 12,824  |
+| medium_block | Markdown→HTML | 0.108       |  9,270  |
+| medium_block | HTML→Markdown | 0.155       |  6,437  |
+| long_mixed   | Markdown→HTML | 0.446       |  2,242  |
+| long_mixed   | HTML→Markdown | 0.730       |  1,370  |
+These numbers provide a baseline; real-world throughput depends on text length and interpreter speed.
+Reproduce the measurements with `python scripts/benchmark.py --iterations 1000 --json benchmarks.json --summary BENCHMARKS.md`.
 ## Requirements
 - Python 3.x

chatgpt_md_converter-0.3.10/chatgpt_md_converter.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,26 @@
+LICENSE
+README.md
+setup.py
+chatgpt_md_converter/__init__.py
+chatgpt_md_converter/html_splitter.py
+chatgpt_md_converter/html_to_markdown.py
+chatgpt_md_converter/telegram_formatter.py
+chatgpt_md_converter.egg-info/PKG-INFO
+chatgpt_md_converter.egg-info/SOURCES.txt
+chatgpt_md_converter.egg-info/dependency_links.txt
+chatgpt_md_converter.egg-info/top_level.txt
+chatgpt_md_converter/html_markdown/escaping.py
+chatgpt_md_converter/html_markdown/handlers.py
+chatgpt_md_converter/html_markdown/renderer.py
+chatgpt_md_converter/html_markdown/state.py
+chatgpt_md_converter/html_markdown/tree.py
+chatgpt_md_converter/telegram_markdown/__init__.py
+chatgpt_md_converter/telegram_markdown/code_blocks.py
+chatgpt_md_converter/telegram_markdown/inline.py
+chatgpt_md_converter/telegram_markdown/postprocess.py
+chatgpt_md_converter/telegram_markdown/preprocess.py
+chatgpt_md_converter/telegram_markdown/renderer.py
+tests/test_html_to_markdown_inline_spacing.py
+tests/test_parser.py
+tests/test_roundtrip_markdown.py
+tests/test_splitter.py

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup
 setup(
     name="chatgpt_md_converter",
-    version="0.3.8",
+    version="0.3.10",
     author="Kostiantyn Kriuchkov",
     author_email="latand666@gmail.com",
     description="A package for converting markdown to HTML for chat Telegram bots",

chatgpt_md_converter-0.3.10/tests/test_html_to_markdown_inline_spacing.py ADDED Viewed

@@ -0,0 +1,25 @@
+import pytest
+from chatgpt_md_converter import html_to_telegram_markdown
+# Each case pairs an HTML fragment with the Markdown it must render to.
+@pytest.mark.parametrize(
+    ("html", "expected"),
+    [
+        ("Start <b>bold </b>finish", "Start **bold** finish"),
+        ("Start <b> bold</b> finish", "Start  **bold** finish"),
+        ("Start <i> italics </i>finish", "Start  _italics_ finish"),
+        # A trailing underscore in the content forces the star marker.
+        ("Start <i>value_</i>end", "Start *value_*end"),
+        ("Start <u> underline </u>finish", "Start  __underline__ finish"),
+        (
+            "Start <span class=\"tg-spoiler\"> secret </span>end",
+            "Start  ||secret|| end",
+        ),
+        (
+            "Intro <b>bold <i> inner </i> block</b> outro",
+            "Intro **bold  _inner_  block** outro",
+        ),
+    ],
+)
+def test_html_to_markdown_strips_inline_whitespace(html: str, expected: str) -> None:
+    """Whitespace at the edges of an inline tag must end up outside the Markdown markers."""
+    assert html_to_telegram_markdown(html) == expected

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/tests/test_parser.py RENAMED Viewed

@@ -1,5 +1,6 @@
-from chatgpt_md_converter.extractors import ensure_closing_delimiters
 from chatgpt_md_converter.telegram_formatter import telegram_format
+from chatgpt_md_converter.telegram_markdown.code_blocks import \
+    ensure_closing_delimiters
 def test_split_by_tag_bold():
@@ -889,7 +890,7 @@ print("hello world ```"')
 </code></pre>
 <pre><code class="language-python">print("Some another text")
 </code></pre>""" # But the code block is still closed correctly.
     output = telegram_format(input_text)
     def show_output():
       print(f"Expected was: \n\n{expected_output}\n\n")
@@ -909,7 +910,7 @@ print("hello world ```"')
 </code></pre>
 <pre><code class="language-python">print("Some another text")
 </code></pre>""" # But the code block is still closed correctly.
     output = telegram_format(input_text)
     def show_output():
       print(f"Expected was: \n\n{expected_output}\n\n")
@@ -934,4 +935,11 @@ print("hello world ```")
     def show_output():
       print(f"Expected was: \n\n{expected_output}\n\n")
       print(f"output was: \n\n{output}")
-    assert output == expected_output, show_output()
+    assert output == expected_output, show_output()
+def test_inline_code_with_escaped_backtick_trailing_text():
+    """Ensure inline code with escaped backtick does not gain an extra closing tick."""
+    input_text = "Escaped \\*asterisks\\* and `code with \\` backtick`"
+    # The code span closes at the first following backtick, so the backslash stays inside
+    # <code> and the final backtick remains literal text.
+    expected_output = "Escaped \\*asterisks\\* and <code>code with \\</code> backtick`"
+    output = telegram_format(input_text)
+    assert output == expected_output

chatgpt_md_converter-0.3.10/tests/test_roundtrip_markdown.py ADDED Viewed

@@ -0,0 +1,32 @@
+import pytest
+from chatgpt_md_converter import html_to_telegram_markdown, telegram_format
+from tests.fixtures.markdown_roundtrips import ROUND_TRIP_CASES
+@pytest.mark.parametrize("_case, markdown_input, expected_markdown", ROUND_TRIP_CASES)
+def test_html_round_trip_normalizes_markdown(_case, markdown_input, expected_markdown):
+    html1 = telegram_format(markdown_input)
+    markdown2 = html_to_telegram_markdown(html1)
+    html2 = telegram_format(markdown2)
+    markdown3 = html_to_telegram_markdown(html2)
+    html3 = telegram_format(markdown3)
+    assert markdown2 == expected_markdown
+    assert markdown3 == expected_markdown
+    assert html1 == html2 == html3
+    assert '<br' not in html1
+    assert '<br' not in html2
+    assert '<br' not in html3
+@pytest.mark.parametrize("_case, markdown_input, _", ROUND_TRIP_CASES)
+def test_markdown_html_markdown_cycle_is_idempotent(_case, markdown_input, _):
+    html_first = telegram_format(markdown_input)
+    markdown_second = html_to_telegram_markdown(html_first)
+    html_third = telegram_format(markdown_second)
+    assert '<br' not in html_first
+    assert '<br' not in html_third
+    assert html_first == html_third

chatgpt_md_converter-0.3.8/chatgpt_md_converter/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-from .telegram_formatter import telegram_format
-from .html_splitter import split_html_for_telegram
-__all__ = ["telegram_format", "split_html_for_telegram"]

chatgpt_md_converter-0.3.8/chatgpt_md_converter/converters.py DELETED Viewed

@@ -1,27 +0,0 @@
-import re
-def convert_html_chars(text: str) -> str:
-    """
-    Converts HTML reserved symbols to their respective character references.
-    """
-    text = text.replace("&", "&amp;")
-    text = text.replace("<", "&lt;")
-    text = text.replace(">", "&gt;")
-    return text
-def split_by_tag(out_text: str, md_tag: str, html_tag: str) -> str:
-    """
-    Splits the text by markdown tag and replaces it with the specified HTML tag.
-    """
-    tag_pattern = re.compile(
-        r"(?<!\w){}(.*?){}(?!\w)".format(re.escape(md_tag), re.escape(md_tag)),
-        re.DOTALL,
-    )
-    # Special handling for the tg-spoiler tag
-    if html_tag == 'span class="tg-spoiler"':
-        return tag_pattern.sub(r'<span class="tg-spoiler">\1</span>', out_text)
-    return tag_pattern.sub(r"<{}>\1</{}>".format(html_tag, html_tag), out_text)

chatgpt_md_converter-0.3.8/chatgpt_md_converter/extractors.py DELETED Viewed

@@ -1,95 +0,0 @@
-import re
-def ensure_closing_delimiters(text: str) -> str:
-    # Append missing closing backtick delimiters.
-    code_block_re = re.compile(
-        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?[\s\S]*?(?<=\n)?(?P=fence)",
-        flags=re.DOTALL,
-    )
-    # Track an open fence.  Once a fence is opened, everything until the same
-    # fence is encountered again is treated as plain text.  This mimics how
-    # Markdown handles fences and allows fence-like strings inside code blocks.
-    open_fence = None
-    for line in text.splitlines():
-        stripped = line.strip()
-        if open_fence is None:
-            m = re.match(r"^(?P<fence>`{3,})(?P<lang>\w+)?$", stripped)
-            if m:
-                open_fence = m.group("fence")
-        else:
-            if stripped.endswith(open_fence):
-                open_fence = None
-    # If a fence was left open, append a matching closing fence.
-    if open_fence is not None:
-        if not text.endswith("\n"):
-            text += "\n"
-        text += open_fence
-    cleaned_inline = code_block_re.sub("", text)
-    # Balance triple backticks that are not part of a complete fence.
-    if cleaned_inline.count("```") % 2 != 0:
-        text += "```"
-    # Balance single backticks outside fenced blocks.
-    cleaned_inline = code_block_re.sub("", text)
-    if cleaned_inline.count("`") % 2 != 0:
-        text += "`"
-    return text
-def extract_and_convert_code_blocks(text: str):
-    """
-    Extracts code blocks from the text, converting them to HTML <pre><code> format,
-    and replaces them with placeholders. Also ensures closing delimiters for unmatched blocks.
-    """
-    text = ensure_closing_delimiters(text)
-    placeholders = []
-    code_blocks = {}
-    def replacer(match):
-        language = match.group("lang") if match.group("lang") else ""
-        code_content = match.group("code")
-        # Properly escape HTML entities in code content
-        escaped_content = (
-            code_content.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-        )
-        placeholder = f"CODEBLOCKPLACEHOLDER{len(placeholders)}"
-        placeholders.append(placeholder)
-        if not language:
-            html_code_block = f"<pre><code>{escaped_content}</code></pre>"
-        else:
-            html_code_block = (
-                f'<pre><code class="language-{language}">{escaped_content}</code></pre>'
-            )
-        return (placeholder, html_code_block)
-    modified_text = text
-    code_block_pattern = re.compile(
-        r"(?P<fence>`{3,})(?P<lang>\w+)?\n?(?P<code>[\s\S]*?)(?<=\n)?(?P=fence)",
-        flags=re.DOTALL,
-    )
-    for match in code_block_pattern.finditer(text):
-        placeholder, html_code_block = replacer(
-            match
-        )
-        code_blocks[placeholder] = html_code_block
-        modified_text = modified_text.replace(match.group(0), placeholder, 1)
-    return modified_text, code_blocks
-def reinsert_code_blocks(text: str, code_blocks: dict) -> str:
-    """
-    Reinserts HTML code blocks into the text, replacing their placeholders.
-    """
-    for placeholder, html_code_block in code_blocks.items():
-        text = text.replace(placeholder, html_code_block, 1)
-    return text

chatgpt_md_converter-0.3.8/chatgpt_md_converter/formatters.py DELETED Viewed

@@ -1,68 +0,0 @@
-def combine_blockquotes(text: str) -> str:
-    """
-    Combines multiline blockquotes into a single blockquote while keeping the \n characters.
-    Supports both regular blockquotes (>) and expandable blockquotes (**>).
-    """
-    lines = text.split("\n")
-    combined_lines = []
-    blockquote_lines = []
-    in_blockquote = False
-    is_expandable = False
-    for line in lines:
-        if line.startswith("**>"):
-            # Expandable blockquote
-            in_blockquote = True
-            is_expandable = True
-            blockquote_lines.append(line[3:].strip())
-        elif line.startswith(">"):
-            # Regular blockquote
-            if not in_blockquote:
-                # This is a new blockquote
-                in_blockquote = True
-                is_expandable = False
-            blockquote_lines.append(line[1:].strip())
-        else:
-            if in_blockquote:
-                # End of blockquote, combine the lines
-                if is_expandable:
-                    combined_lines.append(
-                        "<blockquote expandable>"
-                        + "\n".join(blockquote_lines)
-                        + "</blockquote>"
-                    )
-                else:
-                    combined_lines.append(
-                        "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
-                    )
-                blockquote_lines = []
-                in_blockquote = False
-                is_expandable = False
-            combined_lines.append(line)
-    if in_blockquote:
-        # Handle the case where the file ends with a blockquote
-        if is_expandable:
-            combined_lines.append(
-                "<blockquote expandable>"
-                + "\n".join(blockquote_lines)
-                + "</blockquote>"
-            )
-        else:
-            combined_lines.append(
-                "<blockquote>" + "\n".join(blockquote_lines) + "</blockquote>"
-            )
-    return "\n".join(combined_lines)
-def fix_asterisk_equations(text: str) -> str:
-    """
-    Replaces numeric expressions with '*' in them with '×'
-    to avoid accidental italic formatting.
-    e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
-    """
-    import re
-    eq_pattern = re.compile(r"(\d+)\s*\*\s*(\d+)")
-    return eq_pattern.sub(r"\1×\2", text)

chatgpt_md_converter-0.3.8/chatgpt_md_converter/telegram_formatter.py DELETED Viewed

@@ -1,99 +0,0 @@
-import re
-from .converters import convert_html_chars, split_by_tag
-from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
-from .formatters import combine_blockquotes
-from .helpers import remove_blockquote_escaping, remove_spoiler_escaping
-def extract_inline_code_snippets(text: str):
-    """
-    Extracts inline code (single-backtick content) from the text,
-    replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
-    This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
-    """
-    placeholders = []
-    code_snippets = {}
-    inline_code_pattern = re.compile(r"`([^`]+)`")
-    def replacer(match):
-        snippet = match.group(1)
-        placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
-        placeholders.append(placeholder)
-        code_snippets[placeholder] = snippet
-        return placeholder
-    new_text = inline_code_pattern.sub(replacer, text)
-    return new_text, code_snippets
-def telegram_format(text: str) -> str:
-    """
-    Converts markdown in the provided text to HTML supported by Telegram.
-    """
-    # Step 0: Combine blockquotes
-    text = combine_blockquotes(text)
-    # Step 1: Extract and convert triple-backtick code blocks first
-    output, triple_code_blocks = extract_and_convert_code_blocks(text)
-    # Step 2: Extract inline code snippets
-    output, inline_code_snippets = extract_inline_code_snippets(output)
-    # Step 3: Convert HTML reserved symbols in the text (not in code blocks)
-    output = convert_html_chars(output)
-    # Convert headings (H1-H6)
-    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
-    # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
-    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
-    # Nested Bold and Italic
-    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
-    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
-    # Process markdown for bold (**), underline (__), strikethrough (~~), and spoiler (||)
-    output = split_by_tag(output, "**", "b")
-    output = split_by_tag(output, "__", "u")
-    output = split_by_tag(output, "~~", "s")
-    output = split_by_tag(output, "||", 'span class="tg-spoiler"')
-    # Custom approach for single-asterisk italic
-    italic_pattern = re.compile(
-        r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])", re.DOTALL
-    )
-    output = italic_pattern.sub(r"<i>\1</i>", output)
-    # Process single underscore-based italic
-    output = split_by_tag(output, "_", "i")
-    # Remove storage links (Vector storage placeholders like 【4:0†source】)
-    output = re.sub(r"【[^】]+】", "", output)
-    # Convert Markdown links/images to <a href="">…</a>
-    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
-    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
-    # Step 4: Reinsert inline code snippets, applying HTML escaping to the content
-    for placeholder, snippet in inline_code_snippets.items():
-        # Apply HTML escaping to the content of inline code
-        escaped_snippet = (
-            snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-        )
-        output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
-    # Step 5: Reinsert the converted triple-backtick code blocks
-    output = reinsert_code_blocks(output, triple_code_blocks)
-    # Step 6: Remove blockquote escaping
-    output = remove_blockquote_escaping(output)
-    # Step 7: Remove spoiler tag escaping
-    output = remove_spoiler_escaping(output)
-    # Clean up multiple consecutive newlines, but preserve intentional spacing
-    output = re.sub(r"\n{3,}", "\n\n", output)
-    return output.strip()

chatgpt_md_converter-0.3.8/chatgpt_md_converter.egg-info/SOURCES.txt DELETED Viewed

@@ -1,16 +0,0 @@
-LICENSE
-README.md
-setup.py
-chatgpt_md_converter/__init__.py
-chatgpt_md_converter/converters.py
-chatgpt_md_converter/extractors.py
-chatgpt_md_converter/formatters.py
-chatgpt_md_converter/helpers.py
-chatgpt_md_converter/html_splitter.py
-chatgpt_md_converter/telegram_formatter.py
-chatgpt_md_converter.egg-info/PKG-INFO
-chatgpt_md_converter.egg-info/SOURCES.txt
-chatgpt_md_converter.egg-info/dependency_links.txt
-chatgpt_md_converter.egg-info/top_level.txt
-tests/test_parser.py
-tests/test_splitter.py

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/LICENSE RENAMED Viewed

File without changes

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter/html_splitter.py RENAMED Viewed

File without changes

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/chatgpt_md_converter.egg-info/top_level.txt RENAMED Viewed

File without changes

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/setup.cfg RENAMED Viewed

File without changes

{chatgpt_md_converter-0.3.8 → chatgpt_md_converter-0.3.10}/tests/test_splitter.py RENAMED Viewed

File without changes

chatgpt-md-converter 0.3.8__tar.gz → 0.3.10__tar.gz

chatgpt-md-converter 0.3.8__tar.gz → 0.3.10__tar.gz