PyPI - mdformat-sembr - Versions diffs - 0.1.0__py3-none-any.whl - Mend

mdformat-sembr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

mdformat_sembr/__init__.py +14 -0
mdformat_sembr/_plugin.py +133 -0
mdformat_sembr/_sembr.py +228 -0
mdformat_sembr/py.typed +0 -0
mdformat_sembr-0.1.0.dist-info/METADATA +114 -0
mdformat_sembr-0.1.0.dist-info/RECORD +9 -0
mdformat_sembr-0.1.0.dist-info/WHEEL +4 -0
mdformat_sembr-0.1.0.dist-info/entry_points.txt +2 -0
mdformat_sembr-0.1.0.dist-info/licenses/LICENSE +21 -0

mdformat_sembr/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""mdformat-sembr: Semantic Line Breaks as CommonMark soft breaks.
+The entry point ``mdformat.parser_extension`` -> ``sembr`` resolves to the
+``_plugin`` attribute of this package (see ``pyproject.toml``). Importing it here
+exposes the interface object as ``mdformat_sembr._plugin``.
+"""
+from __future__ import annotations
+from mdformat_sembr import _plugin
+from mdformat_sembr._sembr import insert_breaks
+# Public names of the package: the plugin interface module (the target of the
+# ``mdformat.parser_extension`` entry point) and the standalone break function.
+__all__ = ["_plugin", "insert_breaks"]
+# Package version string.
+__version__ = "0.1.0"

mdformat_sembr/_plugin.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""mdformat parser-extension interface for the SemBr plugin.
+This module *is* the plugin interface object referenced by the entry point
+``mdformat_sembr:_plugin``. It exposes the members required by
+``mdformat.plugins.ParserExtensionInterface`` at module level.
+We do not change the parser or override any renderer; all work happens in a
+postprocessor registered on the ``paragraph`` node type. At that point inline
+formatting is already resolved into the rendered string, so we operate on final
+text and protect a few inline constructs by regex.
+"""
+from __future__ import annotations
+import argparse
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any
+from mdformat_sembr._sembr import (
+    DEFAULT_ABBREVIATIONS,
+    DEFAULT_CLAUSE_CHARS,
+    DEFAULT_MIN_CHARS,
+    insert_breaks,
+)
+if TYPE_CHECKING:
+    from markdown_it import MarkdownIt
+    from mdformat.renderer import RenderContext, RenderTreeNode
+#: Declares that this plugin is not meant to change the document AST: the
+#: postprocessor only re-flows paragraphs with soft breaks (bare newlines), which
+#: render like a space. Leaving this ``False`` lets mdformat's built-in
+#: ``is_md_equal`` validator gate correctness of the output.
+CHANGES_AST = False
+def update_mdit(mdit: "MarkdownIt") -> None:
+    """Leave ``mdit`` untouched; SemBr needs no parser changes."""
+    return None
+def _plugin_options(context: "RenderContext") -> Mapping[str, Any]:
+    """Look up this plugin's options in the render context.
+    Reads ``context.options["mdformat"]["plugin"]["sembr"]``. Any missing level,
+    and a falsy ``sembr`` entry, yields an empty mapping.
+    """
+    all_plugins = context.options.get("mdformat", {}).get("plugin", {})
+    sembr_opts = all_plugins.get("sembr", {})
+    return sembr_opts if sembr_opts else {}
+def _postprocess_paragraph(
+    text: str,
+    node: "RenderTreeNode",
+    context: "RenderContext",
+) -> str:
+    """Re-break one already-rendered paragraph according to the plugin options.
+    ``node`` is part of the postprocessor signature but is not used here. Options
+    missing from the context fall back to the module defaults; ``abbreviations``
+    falls back to ``None``, which makes ``insert_breaks`` use its default list.
+    """
+    opts = _plugin_options(context)
+    settings = {
+        "min_chars": int(opts.get("min_chars", DEFAULT_MIN_CHARS)),
+        "abbreviations": opts.get("abbreviations"),
+        "break_clauses": bool(opts.get("break_clauses", False)),
+        "clause_chars": opts.get("clause_chars", DEFAULT_CLAUSE_CHARS),
+    }
+    return insert_breaks(text, **settings)
+def add_cli_argument_group(group: argparse._ArgumentGroup) -> None:
+    """Add the ``--sembr-*`` command-line flags to ``group``.
+    Values are stored under ``mdit.options["mdformat"]["plugin"]["sembr"]`` and
+    merged with the TOML config. Each ``dest`` deliberately equals the matching
+    ``[plugin.sembr]`` TOML key so CLI values merge cleanly over TOML. All flags
+    default to ``None``.
+    """
+    # One (flag, keyword arguments) entry per option, registered in this order.
+    flag_specs = (
+        (
+            "--sembr-min-chars",
+            {
+                "dest": "min_chars",
+                "type": int,
+                "metavar": "N",
+                "help": (
+                    "minimum length of the segment before a break is allowed "
+                    f"(default: {DEFAULT_MIN_CHARS})"
+                ),
+            },
+        ),
+        (
+            "--sembr-abbreviations",
+            {
+                "dest": "abbreviations",
+                "action": "append",
+                "metavar": "ABBR",
+                "help": (
+                    "abbreviation after which no sentence break is inserted; "
+                    "repeat to add several (replaces the default list)"
+                ),
+            },
+        ),
+        (
+            "--sembr-break-clauses",
+            {
+                "dest": "break_clauses",
+                "action": "store_true",
+                "help": "also break after clause punctuation (Iteration 2; off by default)",
+            },
+        ),
+        (
+            "--sembr-clause-chars",
+            {
+                "dest": "clause_chars",
+                "metavar": "CHARS",
+                "help": (
+                    "clause punctuation set used when --sembr-break-clauses is on "
+                    f"(default: {DEFAULT_CLAUSE_CHARS!r})"
+                ),
+            },
+        ),
+    )
+    for flag, kwargs in flag_specs:
+        group.add_argument(flag, default=None, **kwargs)
+#: Renderer overrides keyed by ``RenderTreeNode.type``. Left empty because this
+#: plugin does not override rendering of any node type.
+RENDERERS: Mapping[str, Any] = {}
+#: Collaborative postprocessors keyed by ``RenderTreeNode.type``. Only the
+#: ``paragraph`` type is registered, so only rendered paragraph text is re-broken.
+POSTPROCESSORS: Mapping[str, Any] = {"paragraph": _postprocess_paragraph}
+# Names exported from this module: the plugin-interface members plus
+# ``DEFAULT_ABBREVIATIONS``, which is imported from ``mdformat_sembr._sembr``.
+__all__ = [
+    "CHANGES_AST",
+    "RENDERERS",
+    "POSTPROCESSORS",
+    "update_mdit",
+    "add_cli_argument_group",
+    "DEFAULT_ABBREVIATIONS",
+]

mdformat_sembr/_sembr.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""Deterministic Semantic Line Break (sembr.org) insertion.
+This module holds the pure, rule-based break logic. It is intentionally free of
+any mdformat imports so it can be unit-tested in isolation and reused by the
+plugin's paragraph postprocessor.
+The sentence-boundary regex, abbreviation list, inline-code masking and
+abbreviation guard are ported from the project's original
+``.github/hooks/markdown-format/markdown-format.py`` script.
+"""
+from __future__ import annotations
+import re
+from collections.abc import Iterable
+# ---------------------------------------------------------------------------
+# Defaults (ported from the original markdown-format.py hook script)
+# ---------------------------------------------------------------------------
+#: Minimum length (in characters, after stripping) of the text on the current
+#: line before a break may be inserted after it. Prevents splitting short
+#: enumerations and fragments.
+DEFAULT_MIN_CHARS = 15
+#: Clause punctuation used by Iteration 2 (only when ``break_clauses`` is on).
+DEFAULT_CLAUSE_CHARS = ",;:\u2014"  # comma, semicolon, colon, em dash
+#: Common abbreviations whose trailing dot must NOT end a sentence. Entries are
+#: stored without the final dot (``"e.g"``, not ``"e.g."``) and are compared
+#: case-sensitively, which is why some appear in both cases (``"Fig"``/``"fig"``).
+DEFAULT_ABBREVIATIONS: frozenset[str] = frozenset(
+    {
+        "e.g", "i.e", "etc", "vs", "cf", "viz", "al", "approx", "incl", "excl",
+        "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "St",
+        "Inc", "Ltd", "Co", "Corp", "U.S", "U.K", "U.N", "E.U",
+        "Fig", "fig", "no", "No", "vol", "Vol", "ch", "Ch",
+        "p", "pp", "para", "sec", "Sect",
+    }
+)
+# ---------------------------------------------------------------------------
+# Regexes
+# ---------------------------------------------------------------------------
+# Matches the gap after a sentence terminator (. ! ?): an optional closing
+# quote/bracket, then whitespace, then a likely sentence start (an optional
+# opening quote/bracket/backtick/emphasis marker followed by an ASCII uppercase
+# letter or a digit). Only the last terminator character is inspected, so runs
+# such as "?!" or "!?" qualify too.
+# The negative lookbehind rejects a position preceded by "...", so no break is
+# placed directly after an ellipsis.
+# NOTE: when the optional closing quote/bracket is present, the match starts at
+# that character rather than at the whitespace.
+_SENTENCE_BOUNDARY = re.compile(
+    r'(?<=[.!?])(?<!\.\.\.)["\')\]]?\s+(?=["\'(\[`*_]?[A-Z0-9])'
+)
+# Placeholder markers are delimited by NUL characters, which are not expected to
+# occur in Markdown text. ``kind`` is an uppercase tag, ``index`` a running number.
+_PLACEHOLDER = "\x00{kind}{index}\x00"
+_PLACEHOLDER_RE = re.compile(r"\x00([A-Z]+)(\d+)\x00")
+# Protected inline constructs, applied in this order by ``_mask``: inline code
+# first, then links/images, reference-style links and footnote references. A
+# link whose label contains a code span is therefore masked as a LINK token
+# wrapping a CODE token; ``_unmask`` restores such nested tokens by substituting
+# repeatedly until the text stops changing.
+_PROTECTED_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
+    # Inline code spans: one or more backticks, no embedded newline.
+    ("CODE", re.compile(r"`+[^`\n]*`+")),
+    # Images and links: optional leading '!', label in [...], target in (...).
+    ("LINK", re.compile(r"!?\[[^\]\n]*\]\([^)\n]*\)")),
+    # Reference-style links and images: [text][id], optionally prefixed by '!'.
+    ("REF", re.compile(r"!?\[[^\]\n]*\]\[[^\]\n]*\]")),
+    # Footnote references: [^id].
+    ("FOOT", re.compile(r"\[\^[^\]\n]+\]")),
+)
+# ---------------------------------------------------------------------------
+# Masking of protected regions
+# ---------------------------------------------------------------------------
+def _mask(text: str) -> tuple[str, dict[str, str]]:
+    """Swap protected inline spans for opaque NUL-delimited placeholder tokens.
+    The patterns in ``_PROTECTED_PATTERNS`` are applied one after another. The
+    result is the masked text together with a token -> original-span mapping, so
+    the break regexes cannot match inside code, links, images or footnote refs.
+    """
+    originals: dict[str, str] = {}
+    def stash(match: re.Match[str], kind: str) -> str:
+        # Every token gets a fresh key, so the number of stored spans doubles
+        # as the running index shared by all pattern kinds.
+        token = _PLACEHOLDER.format(kind=kind, index=len(originals))
+        originals[token] = match.group(0)
+        return token
+    for kind, pattern in _PROTECTED_PATTERNS:
+        # Bind ``kind`` as a default so each callback keeps its own tag.
+        text = pattern.sub(lambda m, k=kind: stash(m, k), text)
+    return text, originals
+def _unmask(text: str, store: dict[str, str]) -> str:
+    """Put the original spans back in place of the tokens made by :func:`_mask`.
+    Tokens missing from ``store`` are left as they are. Substitution is repeated
+    until the text stops changing, because a restored span may itself contain
+    another token.
+    """
+    if not store:
+        return text
+    while True:
+        restored = _PLACEHOLDER_RE.sub(
+            lambda m: store.get(m.group(0), m.group(0)), text
+        )
+        if restored == text:
+            return restored
+        text = restored
+# ---------------------------------------------------------------------------
+# Abbreviation guard
+# ---------------------------------------------------------------------------
+def _is_abbreviation_before(text: str, idx: int, abbreviations: frozenset[str]) -> bool:
+    """Tell whether the character before ``idx`` is the dot of an abbreviation.
+    The candidate token is the run of alphanumerics and dots that ends just
+    before that final dot; it must be a member of ``abbreviations``.
+    """
+    dot = idx - 1
+    if dot < 0 or text[dot] != ".":
+        return False
+    begin = dot
+    while begin > 0:
+        previous = text[begin - 1]
+        if not (previous.isalnum() or previous == "."):
+            break
+        begin -= 1
+    # The token excludes the trailing dot itself.
+    return text[begin:dot] in abbreviations
+# ---------------------------------------------------------------------------
+# Core break logic
+# ---------------------------------------------------------------------------
+def _collapse_whitespace(text: str) -> str:
+    """Collapse runs of ASCII whitespace (including newlines) to single spaces.
+    Collapse-then-rebreak is what makes the transform deterministic and
+    idempotent regardless of any existing soft breaks in the input.
+    Only space, tab, newline, carriage return, form feed and vertical tab are
+    touched. The regex class ``\\s`` and a bare ``str.strip()`` would also match
+    Unicode spaces such as NO-BREAK SPACE (U+00A0); replacing those with a plain
+    space alters the paragraph's text content instead of just its line layout,
+    so they are preserved.
+    """
+    ascii_whitespace = " \t\n\r\f\v"
+    return re.sub(r"[ \t\n\r\f\v]+", " ", text).strip(ascii_whitespace)
+def _split_points(masked: str, abbreviations: frozenset[str]) -> list[int]:
+    """Return ascending cut indices for sentence boundaries in ``masked`` text.
+    Each index is the start of the whitespace run that follows a sentence
+    terminator and its optional closing quote/bracket. The closer therefore
+    stays on the line of the sentence it closes, and a newline only ever
+    replaces existing whitespace. Boundaries whose final period belongs to one
+    of ``abbreviations`` are skipped.
+    """
+    points: list[int] = []
+    for m in _SENTENCE_BOUNDARY.finditer(masked):
+        after_terminator = m.start()
+        if _is_abbreviation_before(masked, after_terminator, abbreviations):
+            continue
+        # The match may begin with a single closing quote/bracket (the ``"`` in
+        # ``done." Next``). Cutting there would move the closer to the next line
+        # and put whitespace between it and the terminator, changing the
+        # rendered text, so step over it to the whitespace itself.
+        cut = after_terminator
+        if not masked[cut].isspace():
+            cut += 1
+        points.append(cut)
+    return points
+def _clause_split_points(masked: str, clause_chars: str) -> list[int]:
+    """List the indices where whitespace follows one of ``clause_chars``.
+    Each index is the start of a whitespace run that comes directly after a
+    clause character and is followed by more non-space text. An empty
+    ``clause_chars`` yields no cut points.
+    """
+    if not clause_chars:
+        return []
+    # Escape the characters so they are safe inside a regex character class.
+    char_class = "[" + re.escape(clause_chars) + "]"
+    finder = re.compile("(?<=" + char_class + r")\s+(?=\S)")
+    return [hit.start() for hit in finder.finditer(masked)]
+def _apply_breaks(masked: str, cut_points: Iterable[int], min_chars: int) -> str:
+    """Turn the accepted ``cut_points`` into newlines.
+    Cut points are de-duplicated and visited in ascending order. A cut is
+    accepted only when the stripped text between the start of the current line
+    and the cut is at least ``min_chars`` long; the whitespace run at an accepted
+    cut is dropped and replaced by a single newline.
+    """
+    ordered = sorted(set(cut_points))
+    if not ordered:
+        return masked
+    lines: list[str] = []
+    total = len(masked)
+    cursor = 0  # index where the line currently being built starts
+    for cut in ordered:
+        if len(masked[cursor:cut].strip()) < min_chars:
+            continue
+        lines.append(masked[cursor:cut].rstrip())
+        # Start the next line after the whitespace that followed the boundary.
+        cursor = cut
+        while cursor < total and masked[cursor].isspace():
+            cursor += 1
+    lines.append(masked[cursor:])
+    return "\n".join(lines)
+def _ends_with_hard_break(line: str) -> bool:
+    """Return True if ``line`` ends in an odd number of backslashes.
+    An odd count means the last backslash is unescaped, which marks a
+    backslash hard break when the line is followed by another line.
+    """
+    trailing = len(line) - len(line.rstrip("\\"))
+    return trailing % 2 == 1
+def _break_segment(
+    segment: str,
+    *,
+    min_chars: int,
+    abbreviations: frozenset[str],
+    break_clauses: bool,
+    clause_chars: str,
+) -> str:
+    """Collapse and re-break one stretch of text that contains no hard break."""
+    collapsed = _collapse_whitespace(segment)
+    if not collapsed:
+        return collapsed
+    masked, store = _mask(collapsed)
+    cut_points = _split_points(masked, abbreviations)
+    if break_clauses:
+        cut_points += _clause_split_points(masked, clause_chars)
+    return _unmask(_apply_breaks(masked, cut_points, min_chars), store)
+def insert_breaks(
+    text: str,
+    *,
+    min_chars: int = DEFAULT_MIN_CHARS,
+    abbreviations: Iterable[str] | None = None,
+    break_clauses: bool = False,
+    clause_chars: str = DEFAULT_CLAUSE_CHARS,
+) -> str:
+    """Insert SemBr soft breaks into a single rendered paragraph string.
+    Sentence boundaries (``.``/``!``/``?``) always break (Iteration 1). When
+    ``break_clauses`` is true, clause punctuation in ``clause_chars`` also breaks
+    (Iteration 2). Protected inline regions (code, links, images, footnote refs)
+    and abbreviations are never split.
+    Only bare ``\\n`` soft breaks are emitted — never hard breaks. Backslash hard
+    breaks already present in ``text`` (a line ending in an unescaped backslash
+    that is followed by another line) are kept in place: the text between them is
+    collapsed and re-broken separately, because collapsing the newline after the
+    backslash into a space would turn the hard break into a literal backslash.
+    The transform is deterministic.
+    ``abbreviations`` replaces the default list when given; ``None`` selects
+    ``DEFAULT_ABBREVIATIONS``. Returns the re-broken paragraph without leading or
+    trailing whitespace.
+    """
+    abbrev = (
+        DEFAULT_ABBREVIATIONS
+        if abbreviations is None
+        else frozenset(abbreviations)
+    )
+    # Group the input lines into segments, each ending at a hard break (or at
+    # the end of the text). Soft breaks stay inside a segment and are collapsed.
+    segments: list[str] = []
+    pending: list[str] = []
+    for line in text.split("\n"):
+        pending.append(line)
+        if _ends_with_hard_break(line):
+            segments.append("\n".join(pending))
+            pending = []
+    if pending:
+        segments.append("\n".join(pending))
+    rebroken = [
+        _break_segment(
+            segment,
+            min_chars=min_chars,
+            abbreviations=abbrev,
+            break_clauses=break_clauses,
+            clause_chars=clause_chars,
+        )
+        for segment in segments
+    ]
+    # A trailing empty segment means the text ended in whitespace only; drop it
+    # so the result carries no trailing newline.
+    while len(rebroken) > 1 and not rebroken[-1]:
+        rebroken.pop()
+    return "\n".join(rebroken)

mdformat_sembr/py.typed ADDED Viewed

File without changes

mdformat_sembr-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,114 @@
+Metadata-Version: 2.4
+Name: mdformat-sembr
+Version: 0.1.0
+Summary: mdformat plugin that inserts Semantic Line Breaks (sembr.org) as CommonMark soft breaks
+Project-URL: Homepage, https://codeberg.org/bugrasan/mdformat-sembr
+Project-URL: GitHub Mirror, https://github.com/bugrasan/mdformat-sembr
+Author: bugrasan
+License-Expression: MIT
+License-File: LICENSE
+Keywords: formatter,markdown,mdformat,semantic line breaks,sembr
+Classifier: Development Status :: 4 - Beta
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Requires-Python: >=3.10
+Requires-Dist: mdformat>=1.0
+Provides-Extra: test
+Requires-Dist: pytest>=7; extra == 'test'
+Description-Content-Type: text/markdown
+# mdformat-sembr
+An [mdformat](https://mdformat.readthedocs.io) parser-extension plugin that inserts
+[Semantic Line Breaks](https://sembr.org) (SemBr) as CommonMark **soft breaks**.
+SemBr is a convention for adding line breaks in Markdown source at sentence and
+clause boundaries. Because the breaks are CommonMark *soft* breaks (a bare `\n`
+inside a paragraph), they render to a single space — the rendered HTML output is
+unchanged, only the source becomes more diff-friendly.
+The plugin is fully deterministic: no ML, no network, no LLM calls. The same input
+always produces the same output.
+## Why
+Moving SemBr logic out of an LLM/agent loop into a token-free, reproducible
+formatter pass makes authored Markdown consistent and cheap to maintain.
+## Install
+```bash
+# uv (recommended) — --with is repeatable (or comma-separate the plugins)
+uv tool install mdformat --with mdformat-sembr --with mdformat-frontmatter
+# pipx — install the app, then inject the plugins into its environment
+pipx install mdformat
+pipx inject mdformat mdformat-sembr mdformat-frontmatter
+# local development
+pip install -e .
+```
+`mdformat-frontmatter` is optional: install it only if your Markdown uses
+YAML/TOML frontmatter and you want mdformat to preserve/format it. It composes
+with `mdformat-sembr` (frontmatter is a separate node type and is never broken).
+## Usage
+```bash
+mdformat --version          # should list "mdformat_sembr"
+echo "First sentence. Second sentence." | mdformat -
+```
+From Python:
+```python
+import mdformat
+mdformat.text("First sentence. Second sentence.\n", extensions={"sembr"})
+# 'First sentence.\nSecond sentence.\n'
+```
+## How it works
+The plugin registers a **postprocessor** on the `paragraph` node type. At that point
+inline formatting (emphasis, links, inline code) is already resolved into the string,
+so it operates on the final rendered text and only protects a few inline constructs by
+regex. Block-level elements (headings, code blocks, tables, frontmatter, HTML blocks)
+are separate node types and are never touched.
+`CHANGES_AST = False`: soft breaks are AST-safe by design, so mdformat's built-in
+`is_md_equal` validator gates correctness. If validation ever fails, the break logic is
+wrong — it is never worked around with `--no-validate` or hard breaks.
+## Configuration
+Configure via `[plugin.sembr]` in `.mdformat.toml`, or via CLI flags. CLI values merge
+over TOML.
+| Option          | Type        | Default   | Meaning                                                        |
+| --------------- | ----------- | --------- | -------------------------------------------------------------- |
+| `min_chars`     | int         | `15`      | Minimum length of the segment before a break is allowed.       |
+| `abbreviations` | list[str]   | see below | Tokens after which no sentence break is inserted.              |
+| `break_clauses` | bool        | `false`   | Enable clause-level breaks (SemBr "SHOULD"). Off by default.   |
+| `clause_chars`  | str         | `",;:—"`  | Clause punctuation set (only used when `break_clauses` true).  |
+CLI flags: `--sembr-min-chars`, `--sembr-abbreviations`, `--sembr-break-clauses`,
+`--sembr-clause-chars`.
+`.mdformat.toml` example:
+```toml
+[plugin.sembr]
+min_chars = 20
+break_clauses = true
+```
+## License
+MIT

mdformat_sembr-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+mdformat_sembr/__init__.py,sha256=2angvPWFxzexnHG8AEkcDZIK6y88YAkcef9qWhQPUqU,471
+mdformat_sembr/_plugin.py,sha256=Mm3godzv0eUJHYLaGDehvz7led88L1bjnDHwMaqWiyo,4119
+mdformat_sembr/_sembr.py,sha256=ouOVnHJTGvnw8-iR6_1UdySYhcAyuoTYQZ1trr3EbSE,8493
+mdformat_sembr/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mdformat_sembr-0.1.0.dist-info/METADATA,sha256=8rydE6eWbkXL92uxa2Kd44opWsxiOqoGt4-3PCd4W2o,4314
+mdformat_sembr-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+mdformat_sembr-0.1.0.dist-info/entry_points.txt,sha256=cZs7zd0X63vlwUoEMiZQzx_C30mo6WXjcCHiqLnx1ko,59
+mdformat_sembr-0.1.0.dist-info/licenses/LICENSE,sha256=xEWxditjeckt1SLFDqL2rKbJaqS72hzBGez4K1QSxYI,1084
+mdformat_sembr-0.1.0.dist-info/RECORD,,

mdformat_sembr-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

mdformat_sembr-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[mdformat.parser_extension]
+sembr = mdformat_sembr:_plugin

mdformat_sembr-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 mdformat-sembr contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.