PyPI - chatgpt-md-converter - Versions diffs - 0.1.2__tar.gz → 0.2.0__tar.gz - Mend

chatgpt-md-converter 0.1.2 (tar.gz) → 0.2.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{chatgpt_md_converter-0.1.2 → chatgpt_md_converter-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: chatgpt_md_converter
-Version: 0.1.2
+Version: 0.2.0
 Summary: A package for converting markdown to HTML for chat Telegram bots
 Home-page: https://github.com/Latand/formatter-chatgpt-telegram
 Author: Kostiantyn Kriuchkov

{chatgpt_md_converter-0.1.2 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter/formatters.py RENAMED Viewed

@@ -26,3 +26,14 @@ def combine_blockquotes(text: str) -> str:
         )
     return "\n".join(combined_lines)
+def fix_asterisk_equations(text: str) -> str:
+    """
+    Replaces numeric expressions with '*' in them with '×'
+    to avoid accidental italic formatting.
+    e.g. '6*8' -> '6×8', '6 * 8' -> '6×8'
+    """
+    import re
+    eq_pattern = re.compile(r'(\d+)\s*\*\s*(\d+)')
+    return eq_pattern.sub(r'\1×\2', text)

chatgpt_md_converter-0.2.0/chatgpt_md_converter/telegram_formatter.py ADDED Viewed

@@ -0,0 +1,100 @@
+import re
+from .converters import convert_html_chars, split_by_tag
+from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
+from .formatters import combine_blockquotes
+from .helpers import remove_blockquote_escaping
+def extract_inline_code_snippets(text: str):
+    """
+    Extracts inline code (single-backtick content) from the text,
+    replacing it with placeholders, returning modified text and a dict of placeholders -> code text.
+    This ensures characters like '*' or '_' inside inline code won't be interpreted as Markdown.
+    """
+    placeholders = []
+    code_snippets = {}
+    inline_code_pattern = re.compile(r"`([^`]+)`")
+    def replacer(match):
+        snippet = match.group(1)
+        placeholder = f"INLINECODEPLACEHOLDER{len(placeholders)}"
+        placeholders.append(placeholder)
+        code_snippets[placeholder] = snippet
+        return placeholder
+    new_text = inline_code_pattern.sub(replacer, text)
+    return new_text, code_snippets
+def telegram_format(text: str) -> str:
+    """
+    Converts markdown in the provided text to HTML supported by Telegram.
+    """
+    # Step 0: Combine blockquotes
+    text = combine_blockquotes(text)
+    # Step 1: Convert HTML reserved symbols
+    text = convert_html_chars(text)
+    # Step 2: Extract and convert triple-backtick code blocks first
+    output, triple_code_blocks = extract_and_convert_code_blocks(text)
+    # Step 2.5: Extract inline code snippets (single backticks) so they won't be parsed as italics, etc.
+    output, inline_code_snippets = extract_inline_code_snippets(output)
+    # Step 3: Escape HTML special characters in the output text (for non-code parts)
+    # We do NOT want to escape what's inside placeholders here, only what's outside code placeholders.
+    output = output.replace("<", "&lt;").replace(">", "&gt;")
+    # Convert headings (H1-H6)
+    output = re.sub(r"^(#{1,6})\s+(.+)$", r"<b>\2</b>", output, flags=re.MULTILINE)
+    # Convert unordered lists (do this before italic detection so that leading '*' is recognized as bullet)
+    output = re.sub(r"^(\s*)[\-\*]\s+(.+)$", r"\1• \2", output, flags=re.MULTILINE)
+    # Remove this old inline code replacement — now handled by extract_inline_code_snippets()
+    # output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
+    # Nested Bold and Italic
+    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
+    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
+    # Process markdown for bold (**), underline (__), strikethrough (~~)
+    output = split_by_tag(output, "**", "b")
+    output = split_by_tag(output, "__", "u")
+    output = split_by_tag(output, "~~", "s")
+    # Custom approach for single-asterisk italic
+    italic_pattern = re.compile(
+        r"(?<![A-Za-z0-9])\*(?=[^\s])(.*?)(?<!\s)\*(?![A-Za-z0-9])",
+        re.DOTALL
+    )
+    output = italic_pattern.sub(r"<i>\1</i>", output)
+    # Process single underscore-based italic
+    output = split_by_tag(output, "_", "i")
+    # Remove storage links (Vector storage placeholders like 【4:0†source】)
+    output = re.sub(r"【[^】]+】", "", output)
+    # Convert Markdown links/images to <a href="">…</a>
+    link_pattern = r"(?:!?)\[((?:[^\[\]]|\[.*?\])*)\]\(([^)]+)\)"
+    output = re.sub(link_pattern, r'<a href="\2">\1</a>', output)
+    # Step 3.5: Reinsert inline code snippets, escaping special chars in code content
+    for placeholder, snippet in inline_code_snippets.items():
+        escaped_snippet = snippet.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+        output = output.replace(placeholder, f"<code>{escaped_snippet}</code>")
+    # Step 4: Reinsert the converted triple-backtick code blocks
+    output = reinsert_code_blocks(output, triple_code_blocks)
+    # Step 5: Remove blockquote escaping
+    output = remove_blockquote_escaping(output)
+    # Clean up multiple consecutive newlines, but preserve intentional spacing
+    output = re.sub(r"\n{3,}", "\n\n", output)
+    return output.strip()

{chatgpt_md_converter-0.1.2 → chatgpt_md_converter-0.2.0}/chatgpt_md_converter.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: chatgpt_md_converter
-Version: 0.1.2
+Version: 0.2.0
 Summary: A package for converting markdown to HTML for chat Telegram bots
 Home-page: https://github.com/Latand/formatter-chatgpt-telegram
 Author: Kostiantyn Kriuchkov

{chatgpt_md_converter-0.1.2 → chatgpt_md_converter-0.2.0}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup
 setup(
     name="chatgpt_md_converter",
-    version="0.1.2",
+    version="0.2.0",
     author="Kostiantyn Kriuchkov",
     author_email="latand666@gmail.com",
     description="A package for converting markdown to HTML for chat Telegram bots",

{chatgpt_md_converter-0.1.2 → chatgpt_md_converter-0.2.0}/tests/test_parser.py RENAMED Viewed

@@ -1,3 +1,4 @@
+from chatgpt_md_converter.extractors import ensure_closing_delimiters
 from chatgpt_md_converter.telegram_formatter import telegram_format
@@ -64,7 +65,18 @@ for i in range(3):
 [Link](http://example.com)
 """
-    expected_output = """<b>Heading</b>\nThis is a test of <b>bold</b>, <u>underline</u>, and <code>inline code</code>.\n• Item 1\n• Item 2\n\n<pre><code class="language-python">for i in range(3):\n    print(i)\n</code></pre>\n\n<a href="http://example.com">Link</a>\n"""
+    expected_output = """
+<b>Heading</b>
+This is a test of <b>bold</b>, <u>underline</u>, and <code>inline code</code>.
+• Item 1
+• Item 2
+<pre><code class="language-python">for i in range(3):
+    print(i)
+</code></pre>
+<a href="http://example.com">Link</a>
+"""
     output = telegram_format(input_text)
     assert (
         output.strip() == expected_output.strip()
@@ -129,7 +141,6 @@ def test_code_block_within_bold_text():
 def test_triple_backticks_with_nested_markdown():
     input_text = "```python\n**bold text** and __underline__ in code block```"
-    # Expecting the markdown syntax to be ignored within the code block
     expected_output = '<pre><code class="language-python">**bold text** and __underline__ in code block</code></pre>'
     output = telegram_format(input_text)
     assert (
@@ -139,7 +150,6 @@ def test_triple_backticks_with_nested_markdown():
 def test_unmatched_code_delimiters():
     input_text = "This has an `unmatched code delimiter."
-    # Expecting original input as output due to the unmatched delimiter
     expected_output = "This has an <code>unmatched code delimiter.</code>"
     output = telegram_format(input_text)
     assert output == expected_output, "Failed handling unmatched code delimiters"
@@ -281,7 +291,7 @@ def test_md_large_example():
    - Item 2
      - Subitem 1
      - Subitem 2
    - **Ordered List:**
    1. First item
@@ -349,7 +359,7 @@ def example_function():
    • Item 2
      • Subitem 1
      • Subitem 2
    • <b>Ordered List:</b>
    1. First item
@@ -396,3 +406,253 @@ Here is some <code>inline code</code>.
     assert (
         output.strip() == expected_output.strip()
     ), "Failed handling large markdown example"
+def test_unclosed_single_backtick():
+    """Test that a single unclosed backtick is properly handled"""
+    text = "Here is some `code without closing"
+    result = ensure_closing_delimiters(text)
+    assert result == "Here is some `code without closing`"
+def test_unclosed_triple_backtick():
+    """Test that unclosed triple backticks are properly handled"""
+    text = "Here is some ```code without closing"
+    result = ensure_closing_delimiters(text)
+    assert result == "Here is some ```code without closing```"
+def test_bracket_link_with_additional_text():
+    """
+    Ensures that text like '[OtherText] [Title](Link)' doesn't
+    merge 'OtherText' and 'Title' into the <a> tag text.
+    """
+    input_text = "[OtherText] [Title](https://example.com)"
+    output = telegram_format(input_text)
+    expected_output = '[OtherText] <a href="https://example.com">Title</a>'
+    assert output == expected_output, f"Output was: {output}"
+def test_heading_formatting_with_newlines():
+    """
+    Checks that headings #, ##, etc. are properly wrapped in <b> tags.
+    """
+    input_text = """# Heading1
+Some text
+## Heading2
+More text
+"""
+    output = telegram_format(input_text)
+    lines = output.splitlines()
+    assert "<b>Heading1</b>" in output
+    assert "<b>Heading2</b>" in output
+    assert lines[0] == "<b>Heading1</b>"
+    assert lines[1] == "Some text"
+    assert lines[2] == "<b>Heading2</b>"
+    assert lines[3] == "More text"
+def test_list_formatting_with_newlines():
+    """
+    Checks that list items (starting with '-' or '*') become bullet points,
+    each on its own line with proper spacing.
+    """
+    input_text = """- Item one
+- Item two
+* Item three
+Some text
+- Item four"""
+    output = telegram_format(input_text)
+    lines = [line.strip() for line in output.splitlines() if line.strip()]
+    assert "• Item one" in lines
+    assert "• Item two" in lines
+    assert "• Item three" in lines
+    assert "• Item four" in lines
+    assert "Some text" in lines
+    bullet_lines = [line for line in lines if line.startswith("•")]
+    assert len(bullet_lines) == 4
+    assert bullet_lines[0] == "• Item one"
+    assert bullet_lines[1] == "• Item two"
+    assert bullet_lines[2] == "• Item three"
+    assert bullet_lines[3] == "• Item four"
+def test_preserve_other_brackets():
+    """
+    Ensures that other bracketed text not forming a valid link is preserved literally.
+    """
+    input_text = "Look at [this], but [not a link] something else."
+    output = telegram_format(input_text)
+    assert "[this]" in output
+    assert "[not a link]" in output
+    assert "<a href=" not in output
+def test_link_with_nested_brackets():
+    """Test that links with nested brackets in the text are handled correctly"""
+    input_text = "[Link [with brackets]](https://example.com)"
+    output = telegram_format(input_text)
+    expected_output = '<a href="https://example.com">Link [with brackets]</a>'
+    assert output == expected_output, f"Output was: {output}"
+def test_link_with_spaces():
+    """Test that links with spaces are handled correctly"""
+    input_text = "[OtherText] [Title](Link)"
+    output = telegram_format(input_text)
+    expected_output = '[OtherText] <a href="Link">Title</a>'
+    assert output == expected_output, f"Output was: {output}"
+def test_ukrainian_bullet_points():
+    input_text = """Звісно, ось список цікавих речей у форматі Markdown:
+*  **Парадокс кота Шредінгера:** Чи може кіт бути одночасно живим і мертвим? 🤔
+*  **Ефект метелика:** Маленька зміна може мати великі наслідки. 🦋
+*  **Теорія струн:** Чи є наш всесвіт просто вібрацією струн? 🎶
+*  **Темна матерія та темна енергія:** Що складає 95% всесвіту? 🌌
+*  **Квантова заплутаність:** Чи можуть два об'єкти бути зв'язані на відстані? 🔗
+*  **Соліпсизм:** Чи існує щось, крім моєї свідомості? 🤨
+*  **Парадокс Фермі:** Де всі інші інопланетяни? 👽
+*  **Симуляційна гіпотеза:** Чи живемо ми в симуляції? 💻
+*  **Ефект Даннінга-Крюгера:** Чому некомпетентні люди переоцінюють себе? 🤓
+*  **Когнітивні спотворення:** Як наш мозок обманює нас? 🤯
+"""
+    expected_output = """Звісно, ось список цікавих речей у форматі Markdown:
+• <b>Парадокс кота Шредінгера:</b> Чи може кіт бути одночасно живим і мертвим? 🤔
+• <b>Ефект метелика:</b> Маленька зміна може мати великі наслідки. 🦋
+• <b>Теорія струн:</b> Чи є наш всесвіт просто вібрацією струн? 🎶
+• <b>Темна матерія та темна енергія:</b> Що складає 95% всесвіту? 🌌
+• <b>Квантова заплутаність:</b> Чи можуть два об'єкти бути зв'язані на відстані? 🔗
+• <b>Соліпсизм:</b> Чи існує щось, крім моєї свідомості? 🤨
+• <b>Парадокс Фермі:</b> Де всі інші інопланетяни? 👽
+• <b>Симуляційна гіпотеза:</b> Чи живемо ми в симуляції? 💻
+• <b>Ефект Даннінга-Крюгера:</b> Чому некомпетентні люди переоцінюють себе? 🤓
+• <b>Когнітивні спотворення:</b> Як наш мозок обманює нас? 🤯
+"""
+    output = telegram_format(input_text)
+    print(output)
+    assert output.strip() == expected_output.strip()
+def test_asterisk_in_equations():
+    """Test that asterisks in mathematical equations are not converted to italic"""
+    test_cases = [
+        ("2 * 2 = 4", "2 * 2 = 4"),
+        ("x*y + z = 10", "x*y + z = 10"),
+        ("a * b * c", "a * b * c"),
+        ("2*x + 3*y = z", "2*x + 3*y = z"),
+        ("This is *italic* but 2 * 2 is not", "This is <i>italic</i> but 2 * 2 is not"),
+        ("5 * x + *emphasized* text", "5 * x + <i>emphasized</i> text"),
+    ]
+    for input_text, expected_output in test_cases:
+        output = telegram_format(input_text)
+        assert (
+            output == expected_output
+        ), f"Failed on input: {input_text}, got: {output}"
+def test_complex_equations_with_asterisk():
+    """Test more complex mathematical expressions with asterisks"""
+    input_text = """The formula is:
+f(x) = 2*x + 3*y
+g(x) = x * (y + z)
+This is *italic* text with equation 2 * 2 = 4
+"""
+    expected_output = """The formula is:
+f(x) = 2*x + 3*y
+g(x) = x * (y + z)
+This is <i>italic</i> text with equation 2 * 2 = 4"""
+    output = telegram_format(input_text)
+    assert output.strip() == expected_output.strip(), f"Output was: {output}"
+# ----------------------------------------------------------------------------------------
+# New, more comprehensive and edge-case test methods begin here
+# ----------------------------------------------------------------------------------------
+def test_empty_string():
+    """Check behavior with an empty string."""
+    input_text = ""
+    output = telegram_format(input_text)
+    assert output == ""
+def test_spaces_only():
+    """Check behavior with a string that has only spaces."""
+    input_text = "    "
+    output = telegram_format(input_text)
+    # Should either remain blank or just be those spaces (strip() might remove them)
+    assert output.strip() == ""
+def test_asterisk_in_parentheses():
+    """Edge case with asterisk in parentheses."""
+    input_text = "(2*3) is an equation, but *italic* text is separate."
+    expected_output = "(2*3) is an equation, but <i>italic</i> text is separate."
+    output = telegram_format(input_text)
+    assert output == expected_output
+def test_underscore_in_non_italic_context():
+    """Edge case with underscores that should not convert to italic."""
+    input_text = "This_variable should remain, but _italic_ should convert."
+    expected_output = "This_variable should remain, but <i>italic</i> should convert."
+    output = telegram_format(input_text)
+    assert output == expected_output
+def test_code_block_mixed_with_unescaped_html():
+    """Ensure code block remains escaped but outside text is processed normally."""
+    input_text = """
+Some <div>stuff</div> here.
+```
+<html><body>Unescaped?</body></html>
+```
+More text with *italic*.
+"""
+    expected_output = """
+Some &lt;div&gt;stuff&lt;/div&gt; here.
+<pre><code>&lt;html&gt;&lt;body&gt;Unescaped?&lt;/body&gt;&lt;/html&gt;
+</code></pre>
+More text with <i>italic</i>.
+"""
+    output = telegram_format(input_text)
+    assert output.strip() == expected_output.strip()
+def test_equation_with_asterisks_and_italics_combined():
+    """More advanced check: combine equations and true italics side by side."""
+    input_text = "2*x + 3*y = 10, and *italic* is separate."
+    expected_output = "2*x + 3*y = 10, and <i>italic</i> is separate."
+    output = telegram_format(input_text)
+    assert output == expected_output
+def test_inline_code_with_asterisk_and_underscore():
+    """Ensure that `*` and `_` inside inline code are not interpreted as markdown."""
+    input_text = "Here is `code_with_*_asterisk` outside of `code_with__underscore__`"
+    expected_output = "Here is <code>code_with_*_asterisk</code> outside of <code>code_with__underscore__</code>"
+    output = telegram_format(input_text)
+    assert output == expected_output
+def test_heading_followed_by_equation():
+    """Check heading usage right before an equation line."""
+    input_text = """# MyHeading
+2*x + y = 4
+"""
+    # Heading should become <b>MyHeading</b>, equation line remains as is
+    expected_output = """<b>MyHeading</b>
+2*x + y = 4"""
+    output = telegram_format(input_text)
+    assert output.strip() == expected_output.strip(), f"Got: {output}"

chatgpt_md_converter-0.1.2/chatgpt_md_converter/telegram_formatter.py DELETED Viewed

@@ -1,57 +0,0 @@
-import re
-from .converters import convert_html_chars, split_by_tag
-from .extractors import extract_and_convert_code_blocks, reinsert_code_blocks
-from .formatters import combine_blockquotes
-from .helpers import remove_blockquote_escaping
-def telegram_format(text: str) -> str:
-    """
-    Converts markdown in the provided text to HTML supported by Telegram.
-    """
-    # Step 0: Combine blockquotes
-    text = combine_blockquotes(text)
-    # Step 1: Convert HTML reserved symbols
-    text = convert_html_chars(text)
-    # Step 2: Extract and convert code blocks first
-    output, code_blocks = extract_and_convert_code_blocks(text)
-    # Step 3: Escape HTML special characters in the output text
-    output = output.replace("<", "&lt;").replace(">", "&gt;")
-    # Inline code
-    output = re.sub(r"`(.*?)`", r"<code>\1</code>", output)
-    # Nested Bold and Italic
-    output = re.sub(r"\*\*\*(.*?)\*\*\*", r"<b><i>\1</i></b>", output)
-    output = re.sub(r"\_\_\_(.*?)\_\_\_", r"<u><i>\1</i></u>", output)
-    # Process markdown formatting tags (bold, underline, italic, strikethrough)
-    # and convert them to their respective HTML tags
-    output = split_by_tag(output, "**", "b")
-    output = split_by_tag(output, "__", "u")
-    output = split_by_tag(output, "_", "i")
-    output = split_by_tag(output, "*", "i")
-    output = split_by_tag(output, "~~", "s")
-    # Remove storage links
-    output = re.sub(r"【[^】]+】", "", output)
-    # Convert links
-    output = re.sub(r"!?\[(.*?)\]\((.*?)\)", r'<a href="\2">\1</a>', output)
-    # Convert headings
-    output = re.sub(r"^\s*#+ (.+)", r"<b>\1</b>", output, flags=re.MULTILINE)
-    # Convert unordered lists, preserving indentation
-    output = re.sub(r"^(\s*)[\-\*] (.+)", r"\1• \2", output, flags=re.MULTILINE)
-    # Step 4: Reinsert the converted HTML code blocks
-    output = reinsert_code_blocks(output, code_blocks)
-    # Step 5: Remove blockquote escaping
-    output = remove_blockquote_escaping(output)
-    return output