puku-markdown 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puku_markdown-0.1.0/.github/workflows/publish.yaml +99 -0
- puku_markdown-0.1.0/.github/workflows/pytest.yaml +41 -0
- puku_markdown-0.1.0/.gitignore +5 -0
- puku_markdown-0.1.0/PKG-INFO +5 -0
- puku_markdown-0.1.0/README.md +0 -0
- puku_markdown-0.1.0/puku_markdown/__init__.py +0 -0
- puku_markdown-0.1.0/puku_markdown/_utils/constants.py +315 -0
- puku_markdown-0.1.0/puku_markdown/_utils/metrics.py +26 -0
- puku_markdown-0.1.0/puku_markdown/_utils/predicates.py +38 -0
- puku_markdown-0.1.0/puku_markdown/_utils/re_patterns.py +180 -0
- puku_markdown-0.1.0/puku_markdown/_utils/scanners/__init__.py +29 -0
- puku_markdown-0.1.0/puku_markdown/_utils/scanners/link_destination.py +95 -0
- puku_markdown-0.1.0/puku_markdown/_utils/scanners/link_title.py +113 -0
- puku_markdown-0.1.0/puku_markdown/column_resolution.py +130 -0
- puku_markdown-0.1.0/puku_markdown/elements/__init__.py +36 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/__init__.py +34 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/base.py +11 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/__init__.py +31 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/atx_heading.py +9 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/blockquote.py +10 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/fenced_code_block.py +10 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/html_block.py +20 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/indented_code_block.py +8 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/link_reference_definition.py +10 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/list.py +23 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/paragraph.py +8 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/setext_heading.py +9 -0
- puku_markdown-0.1.0/puku_markdown/elements/block/commonmark/thematic_break.py +8 -0
- puku_markdown-0.1.0/puku_markdown/elements/document.py +17 -0
- puku_markdown-0.1.0/puku_markdown/line_span.py +31 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/block_stream.py +29 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/command.py +273 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/__init__.py +55 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/atx_heading.py +134 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/blockquote.py +287 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/fenced_code_block.py +158 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/html_blocks.py +314 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/indented_code_block.py +82 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/link_reference_definition.py +443 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/list.py +553 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/locals/__init__.py +19 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/locals/blockquote.py +54 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/locals/link_reference_definition.py +101 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/locals/list.py +59 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/locals/paragraph.py +8 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/locals/setext_heading.py +9 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/paragraph.py +105 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/setext_heading.py +190 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/commonmark/rules/thematic_break.py +79 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/frame.py +331 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/frame_actuals.py +83 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/frame_spec.py +46 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/line_descriptor.py +107 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/logger.py +4 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/parse.py +308 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/rule.py +22 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/rule_chain.py +28 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/rule_chains_registry.py +87 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/rule_context.py +223 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/rule_locals.py +38 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/state.py +746 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/type_aliases.py +14 -0
- puku_markdown-0.1.0/puku_markdown/parser/block/upcall.py +35 -0
- puku_markdown-0.1.0/puku_markdown/parser/parse.py +37 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/__init__.py +5 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/change_set.py +41 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/core.py +76 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/modification.py +36 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/transactional_editor.py +173 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/transient.py +131 -0
- puku_markdown-0.1.0/puku_markdown/persistent_list/type_vars.py +4 -0
- puku_markdown-0.1.0/pyproject.toml +19 -0
- puku_markdown-0.1.0/tests/__init__.py +0 -0
- puku_markdown-0.1.0/tests/commonmark/__init__.py +0 -0
- puku_markdown-0.1.0/tests/commonmark/spec.json +5218 -0
- puku_markdown-0.1.0/tests/commonmark/spec.md +9756 -0
- puku_markdown-0.1.0/tests/commonmark/spec.py +96 -0
- puku_markdown-0.1.0/tests/commonmark/test_block_parser.py +19 -0
- puku_markdown-0.1.0/tests/commonmark/update_spec.py +76 -0
- puku_markdown-0.1.0/tests/markdown_it_py/__init__.py +4 -0
- puku_markdown-0.1.0/tests/markdown_it_py/block_parse.py +64 -0
- puku_markdown-0.1.0/tests/markdown_it_py/block_token.py +366 -0
- puku_markdown-0.1.0/uv.lock +321 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
inputs:
|
|
8
|
+
use_testpypi:
|
|
9
|
+
description: 'Publish to TestPyPI instead of PyPI'
|
|
10
|
+
required: false
|
|
11
|
+
default: false
|
|
12
|
+
type: boolean
|
|
13
|
+
|
|
14
|
+
permissions:
|
|
15
|
+
contents: read
|
|
16
|
+
|
|
17
|
+
jobs:
|
|
18
|
+
validate:
  name: Validate source branch and destination
  runs-on: ubuntu-latest
  steps:
    - name: Check branch and destination compatibility
      # Context values are handed to the script as environment variables
      # instead of being interpolated into the script text. `${{ }}` is
      # expanded before bash parses the script, so an inline ref name
      # containing shell metacharacters could inject commands.
      env:
        BRANCH: ${{ github.ref_name }}
        USE_TEST: ${{ inputs.use_testpypi }}
        EVENT: ${{ github.event_name }}
      run: |
        # TestPyPI uploads are allowed from any branch.
        if [[ "$USE_TEST" == "true" ]]; then
          echo "✅ Publishing to TestPyPI - branch $BRANCH is allowed."
          exit 0
        fi

        # Real PyPI uploads require either the main branch or a release event.
        if [[ "$BRANCH" != "main" && "$EVENT" != "release" ]]; then
          echo "❌ Error: Publishing to PyPI is only allowed from the 'main' branch or a release tag."
          echo "Current branch: $BRANCH, event: $EVENT"
          exit 1
        fi

        echo "✅ Validation passed - publishing to PyPI from $BRANCH (event: $EVENT)."
|
|
40
|
+
|
|
41
|
+
test:
|
|
42
|
+
name: Run pytest suite
|
|
43
|
+
needs: validate
|
|
44
|
+
uses: ./.github/workflows/pytest.yaml
|
|
45
|
+
secrets: inherit
|
|
46
|
+
|
|
47
|
+
build:
|
|
48
|
+
name: Build distribution
|
|
49
|
+
needs: test
|
|
50
|
+
runs-on: ubuntu-latest
|
|
51
|
+
steps:
|
|
52
|
+
- uses: actions/checkout@v4
|
|
53
|
+
|
|
54
|
+
- name: Set up Python
|
|
55
|
+
uses: actions/setup-python@v6
|
|
56
|
+
with:
|
|
57
|
+
python-version: '3.12'
|
|
58
|
+
|
|
59
|
+
- name: Install uv
|
|
60
|
+
uses: astral-sh/setup-uv@v5
|
|
61
|
+
|
|
62
|
+
- name: Build wheel and sdist
|
|
63
|
+
run: uv build
|
|
64
|
+
|
|
65
|
+
- name: Upload dist as artifact
|
|
66
|
+
uses: actions/upload-artifact@v4
|
|
67
|
+
with:
|
|
68
|
+
name: dist
|
|
69
|
+
path: dist
|
|
70
|
+
retention-days: 1
|
|
71
|
+
|
|
72
|
+
publish:
|
|
73
|
+
name: Publish to ${{ inputs.use_testpypi && 'TestPyPI' || 'PyPI' }}
|
|
74
|
+
needs: build
|
|
75
|
+
runs-on: ubuntu-latest
|
|
76
|
+
environment:
|
|
77
|
+
name: ${{ inputs.use_testpypi && 'testpypi' || 'pypi' }}
|
|
78
|
+
url: ${{ inputs.use_testpypi && 'https://test.pypi.org/project/puku-markdown' || 'https://pypi.org/project/puku-markdown' }}
|
|
79
|
+
steps:
|
|
80
|
+
- name: Download dist
|
|
81
|
+
uses: actions/download-artifact@v4
|
|
82
|
+
with:
|
|
83
|
+
name: dist
|
|
84
|
+
path: dist
|
|
85
|
+
|
|
86
|
+
- name: Publish to TestPyPI (with API token)
|
|
87
|
+
if: inputs.use_testpypi
|
|
88
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
89
|
+
with:
|
|
90
|
+
packages-dir: dist
|
|
91
|
+
repository-url: https://test.pypi.org/legacy/
|
|
92
|
+
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
|
|
93
|
+
|
|
94
|
+
- name: Publish to PyPI (with API token)
|
|
95
|
+
if: ${{ !inputs.use_testpypi }}
|
|
96
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
97
|
+
with:
|
|
98
|
+
packages-dir: dist
|
|
99
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Pytest
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_call:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main]
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
test:
|
|
15
|
+
name: Python ${{ matrix.python-version }}
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
python-version: ["3.12", "3.13"]
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- name: Checkout repository
|
|
24
|
+
uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
27
|
+
uses: actions/setup-python@v6.2.0
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
|
|
31
|
+
- name: Install uv
|
|
32
|
+
uses: astral-sh/setup-uv@v5
|
|
33
|
+
with:
|
|
34
|
+
enable-cache: true
|
|
35
|
+
cache-dependency-glob: "uv.lock"
|
|
36
|
+
|
|
37
|
+
- name: Install dependencies (including test group)
|
|
38
|
+
run: uv sync --group test
|
|
39
|
+
|
|
40
|
+
- name: Run pytest with coverage monitoring
|
|
41
|
+
run: uv run pytest --cov --cov-report=term-missing
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CommonMark constants for Markdown parsing.
|
|
3
|
+
|
|
4
|
+
Coding style note:
|
|
5
|
+
Typically, names are limited to at most 4 words (e.g., `INDENTED_CODE_BLOCK_MIN_INDENT`).
|
|
6
|
+
However, for constants that directly represent a CommonMark element (e.g., fenced code
|
|
7
|
+
blocks, thematic breaks), the element name in the prefix (e.g., `FENCED_CODE_BLOCK_`,
|
|
8
|
+
`THEMATIC_BREAK_`) is **excluded** from the word count. This allows longer, more
|
|
9
|
+
precise names without violating the spirit of the rule.
|
|
10
|
+
|
|
11
|
+
This exception is *temporary* and subject to revision.
|
|
12
|
+
|
|
13
|
+
TODO: Re-evaluate the word-limit exception for CommonMark element prefixes.
|
|
14
|
+
Consider adopting a fixed maximum total length or a different scheme.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from typing import Final
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
COMMONMARK_TAB_STOP: Final[int] = 4
|
|
21
|
+
"""Number of spaces to which a tab character expands in CommonMark parsing.
|
|
22
|
+
|
|
23
|
+
See Also:
|
|
24
|
+
CommonMark Spec 0.31.2, Section 2.2 (Tabs):
|
|
25
|
+
https://spec.commonmark.org/0.31.2/#tabs
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
NULL_CHARACTER: Final[str] = "\0"
|
|
29
|
+
|
|
30
|
+
HASH_CHARACTER: Final[str] = "#"
|
|
31
|
+
"""
|
|
32
|
+
Hash/number sign character used as the ATX heading marker in CommonMark.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
LESS_THAN_CHARACTER: Final[str] = "<"
|
|
36
|
+
"""
|
|
37
|
+
The less-than sign character '<' (Unicode U+003C).
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
GREATER_THAN_CHARACTER: Final[str] = ">"
|
|
41
|
+
"""The greater-than sign character '>' (Unicode U+003E).
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
SPACE_CHARACTER: Final[str] = " "
|
|
45
|
+
"""
|
|
46
|
+
The space character ' ' (Unicode U+0020).
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
TAB_CHARACTER: Final[str] = "\t"
|
|
50
|
+
"""
|
|
51
|
+
The tab character '\\t' (Unicode U+0009, CHARACTER TABULATION).
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
LEFT_SQUARE_BRACKET_CHARACTER: Final[str] = "["
|
|
55
|
+
"""
|
|
56
|
+
The left square bracket character '[' (Unicode U+005B, LEFT SQUARE BRACKET).
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
RIGHT_SQUARE_BRACKET_CHARACTER: Final[str] = "]"
|
|
60
|
+
"""
|
|
61
|
+
The right square bracket character ']' (Unicode U+005D, RIGHT SQUARE BRACKET).
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
LINE_FEED_CHARACTER: Final[str] = "\n"
|
|
65
|
+
"""
|
|
66
|
+
Line feed (LF), the '\\n' character.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
BACKSLASH_CHARACTER: Final[str] = "\\"
|
|
70
|
+
"""
|
|
71
|
+
Backslash, the escape character.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
COLON_CHARACTER: Final[str] = ":"
|
|
75
|
+
"""
|
|
76
|
+
Colon (:), used in reference definitions after the label.
|
|
77
|
+
"""
|
|
78
|
+
|
|
79
|
+
LEFT_PARENTHESIS_CHARACTER: Final[str] = "("
|
|
80
|
+
"""
|
|
81
|
+
Left parenthesis '(' (U+0028).
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
RIGHT_PARENTHESIS_CHARACTER: Final[str] = ")"
|
|
85
|
+
"""
|
|
86
|
+
Right parenthesis ')' (U+0029).
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
UNICODE_REPLACEMENT_CHARACTER: Final[str] = "\ufffd"
|
|
90
|
+
|
|
91
|
+
BACKTICK_CHARACTER: Final[str] = "`"
|
|
92
|
+
|
|
93
|
+
INDENTED_CODE_BLOCK_MIN_INDENT: Final[int] = 4
|
|
94
|
+
"""Minimum indentation required for a line to be part of an indented code block.
|
|
95
|
+
|
|
96
|
+
According to the CommonMark specification (Section 4.4), an indented chunk is a
|
|
97
|
+
sequence of non-blank lines, each preceded by *four or more spaces* of indentation.
|
|
98
|
+
Lines meeting this threshold form an indented code block.
|
|
99
|
+
|
|
100
|
+
Note:
|
|
101
|
+
An indented code block cannot interrupt a paragraph. Therefore, within a
|
|
102
|
+
paragraph, a line with indentation >= this value is treated as a lazy
|
|
103
|
+
continuation of the paragraph, not as a new code block.
|
|
104
|
+
|
|
105
|
+
See Also:
|
|
106
|
+
CommonMark Spec 0.31.2, Section 4.4:
|
|
107
|
+
https://spec.commonmark.org/0.31.2/#indented-code-blocks
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
THEMATIC_BREAK_MARKERS: Final[frozenset[str]] = frozenset({"*", "-", "_"})
|
|
111
|
+
"""
|
|
112
|
+
Immutable set of characters that can initiate a CommonMark thematic break.
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
THEMATIC_BREAK_MIN_MARKER_COUNT: Final[int] = 3
|
|
116
|
+
"""
|
|
117
|
+
Minimum number of identical markers required for a CommonMark thematic break.
|
|
118
|
+
|
|
119
|
+
See CommonMark Spec 0.31.2, Section 4.1:
|
|
120
|
+
https://spec.commonmark.org/0.31.2/#thematic-breaks
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
FENCED_CODE_BLOCK_MIN_MARKER_COUNT: Final[int] = 3
|
|
124
|
+
"""
|
|
125
|
+
Minimum number of consecutive backticks or tildes required for a fenced code block.
|
|
126
|
+
|
|
127
|
+
Per CommonMark Spec 0.31.2, Section 4.5:
|
|
128
|
+
https://spec.commonmark.org/0.31.2/#fenced-code-blocks
|
|
129
|
+
"""
|
|
130
|
+
|
|
131
|
+
FENCED_CODE_BLOCK_MARKERS: Final[frozenset[str]] = frozenset({"~", "`"})
|
|
132
|
+
"""
|
|
133
|
+
Immutable set of marker characters that can open a fenced code block.
|
|
134
|
+
|
|
135
|
+
See Also:
|
|
136
|
+
CommonMark Spec 0.31.2, Section 4.5:
|
|
137
|
+
https://spec.commonmark.org/0.31.2/#fenced-code-blocks
|
|
138
|
+
"""
|
|
139
|
+
|
|
140
|
+
ATX_HEADING_MAX_LEVEL: Final[int] = 6
|
|
141
|
+
"""
|
|
142
|
+
Maximum heading level for an ATX heading, corresponding to six '#' characters.
|
|
143
|
+
|
|
144
|
+
See Also:
|
|
145
|
+
CommonMark Spec 0.31.2, Section 4.2 (ATX headings):
|
|
146
|
+
https://spec.commonmark.org/0.31.2/#atx-headings
|
|
147
|
+
"""
|
|
148
|
+
|
|
149
|
+
HTML_BLOCK_NAMES: Final[tuple[str, ...]] = (
|
|
150
|
+
"address",
|
|
151
|
+
"article",
|
|
152
|
+
"aside",
|
|
153
|
+
"base",
|
|
154
|
+
"basefont",
|
|
155
|
+
"blockquote",
|
|
156
|
+
"body",
|
|
157
|
+
"caption",
|
|
158
|
+
"center",
|
|
159
|
+
"col",
|
|
160
|
+
"colgroup",
|
|
161
|
+
"dd",
|
|
162
|
+
"details",
|
|
163
|
+
"dialog",
|
|
164
|
+
"dir",
|
|
165
|
+
"div",
|
|
166
|
+
"dl",
|
|
167
|
+
"dt",
|
|
168
|
+
"fieldset",
|
|
169
|
+
"figcaption",
|
|
170
|
+
"figure",
|
|
171
|
+
"footer",
|
|
172
|
+
"form",
|
|
173
|
+
"frame",
|
|
174
|
+
"frameset",
|
|
175
|
+
"h1",
|
|
176
|
+
"h2",
|
|
177
|
+
"h3",
|
|
178
|
+
"h4",
|
|
179
|
+
"h5",
|
|
180
|
+
"h6",
|
|
181
|
+
"head",
|
|
182
|
+
"header",
|
|
183
|
+
"hr",
|
|
184
|
+
"html",
|
|
185
|
+
"iframe",
|
|
186
|
+
"legend",
|
|
187
|
+
"li",
|
|
188
|
+
"link",
|
|
189
|
+
"main",
|
|
190
|
+
"menu",
|
|
191
|
+
"menuitem",
|
|
192
|
+
"nav",
|
|
193
|
+
"noframes",
|
|
194
|
+
"ol",
|
|
195
|
+
"optgroup",
|
|
196
|
+
"option",
|
|
197
|
+
"p",
|
|
198
|
+
"param",
|
|
199
|
+
"search",
|
|
200
|
+
"section",
|
|
201
|
+
"summary",
|
|
202
|
+
"table",
|
|
203
|
+
"tbody",
|
|
204
|
+
"td",
|
|
205
|
+
"tfoot",
|
|
206
|
+
"th",
|
|
207
|
+
"thead",
|
|
208
|
+
"title",
|
|
209
|
+
"tr",
|
|
210
|
+
"track",
|
|
211
|
+
"ul",
|
|
212
|
+
)
|
|
213
|
+
"""
|
|
214
|
+
List of HTML block-level tag names as defined by the CommonMark specification.
|
|
215
|
+
|
|
216
|
+
This list is used to construct the regex for matching block-level HTML tags
|
|
217
|
+
(CommonMark HTML block type 6). The tags are matched case-insensitively
|
|
218
|
+
(using `re.IGNORECASE` flag) because HTML tag names are case-insensitive.
|
|
219
|
+
|
|
220
|
+
Reference: https://spec.commonmark.org/0.31.2/#html-blocks
|
|
221
|
+
"""
|
|
222
|
+
|
|
223
|
+
SETEXT_HEADING_MARKERS: Final[frozenset[str]] = frozenset({"-", "="})
|
|
224
|
+
"""Markers that denote Setext headings (CommonMark section 4.3, version 0.31.2).
|
|
225
|
+
|
|
226
|
+
A line consisting entirely of `=` characters (optionally with trailing spaces)
|
|
227
|
+
indicates a level-1 heading. A line consisting entirely of `-` characters
|
|
228
|
+
indicates a level-2 heading. The marker line must appear immediately after the
|
|
229
|
+
heading text (with no blank line in between).
|
|
230
|
+
|
|
231
|
+
Reference: https://spec.commonmark.org/0.31.2/#setext-headings
|
|
232
|
+
"""
|
|
233
|
+
|
|
234
|
+
HYPHEN_MINUS_CHARACTER: Final[str] = "-"
|
|
235
|
+
"""
|
|
236
|
+
The hyphen-minus character '-' (Unicode U+002D).
|
|
237
|
+
|
|
238
|
+
This character is used as a hyphen, minus sign, or dash. In Markdown,
|
|
239
|
+
it appears in Setext headings (level-2), unordered lists, and horizontal rules.
|
|
240
|
+
"""
|
|
241
|
+
|
|
242
|
+
EQUALS_SIGN_CHARACTER: Final[str] = "="
|
|
243
|
+
"""
|
|
244
|
+
The equals sign '=' (Unicode U+003D).
|
|
245
|
+
|
|
246
|
+
In Markdown, this character forms the underline of level-1 Setext headings.
|
|
247
|
+
It does not delimit fenced code blocks (backticks or tildes do). It also appears in
|
|
248
|
+
HTML attributes and link definitions.
|
|
249
|
+
"""
|
|
250
|
+
|
|
251
|
+
MAX_LINK_DESTINATION_PARENTHESIS_DEPTH: Final[int] = 32
|
|
252
|
+
"""
|
|
253
|
+
Maximum allowed nesting depth of parentheses inside a bare link destination
|
|
254
|
+
(i.e., when not enclosed in `<` `>`). Exceeding this limit invalidates the
|
|
255
|
+
link destination and stops scanning.
|
|
256
|
+
|
|
257
|
+
This limit is a safety measure against pathological input; it is not mandated
|
|
258
|
+
by the CommonMark specification, which only requires balanced parentheses.
|
|
259
|
+
The value 32 is high enough for all practical URLs yet low enough to prevent
|
|
260
|
+
excessive CPU consumption.
|
|
261
|
+
"""
|
|
262
|
+
|
|
263
|
+
DOUBLE_QUOTE_CHARACTER: Final[str] = '"'
|
|
264
|
+
"""
|
|
265
|
+
Double quotation mark " (U+0022).
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
SINGLE_QUOTE_CHARACTER: Final[str] = "'"
|
|
269
|
+
"""
|
|
270
|
+
Apostrophe / single quote ' (U+0027).
|
|
271
|
+
"""
|
|
272
|
+
|
|
273
|
+
BULLET_LIST_MARKERS: Final[frozenset[str]] = frozenset({"*", "-", "+"})
|
|
274
|
+
"""Bullet list markers as defined in CommonMark section 5.2 (version 0.31.2).
|
|
275
|
+
|
|
276
|
+
A line beginning with one of these characters, followed by a space or tab,
|
|
277
|
+
starts a bullet list item. The marker may be preceded by up to three spaces
|
|
278
|
+
of indentation.
|
|
279
|
+
|
|
280
|
+
References:
|
|
281
|
+
- https://spec.commonmark.org/0.31.2/#list-items
|
|
282
|
+
- https://spec.commonmark.org/0.31.2/#bullet-list-marker
|
|
283
|
+
"""
|
|
284
|
+
|
|
285
|
+
ORDERED_LIST_MARKER_DELIMITERS: Final[frozenset[str]] = frozenset({".", ")"})
|
|
286
|
+
"""Ordered list marker delimiters as defined in CommonMark section 5.2 (version 0.31.2).
|
|
287
|
+
|
|
288
|
+
An ordered list marker consists of a positive integer followed by a delimiter
|
|
289
|
+
character: either a period (`.`) or a right parenthesis (`)`). This constant
|
|
290
|
+
holds the two allowed delimiter characters.
|
|
291
|
+
|
|
292
|
+
Reference: https://spec.commonmark.org/0.31.2/#list-items
|
|
293
|
+
"""
|
|
294
|
+
|
|
295
|
+
# Declared Final, like the other constants in this module, so that type
# checkers reject any reassignment of this sentinel.
EMPTY_STRING: Final[str] = ""
|
|
296
|
+
"""An immutable sentinel representing the empty string.
|
|
297
|
+
|
|
298
|
+
Use this constant instead of the literal `""` when the empty string serves as
|
|
299
|
+
a default value, a placeholder, or a well-known marker in public APIs or
|
|
300
|
+
repeated logic. This improves readability and centralises the concept.
|
|
301
|
+
"""
|
|
302
|
+
|
|
303
|
+
# Declared Final, like the other constants in this module, so that type
# checkers reject any reassignment of this limit.
MAX_ORDERED_LIST_MARKER_DIGITS: Final[int] = 9
|
|
304
|
+
"""
|
|
305
|
+
The maximum number of digits permitted in an ordered list marker.
|
|
306
|
+
|
|
307
|
+
The CommonMark Spec (0.30+) limits ordered list markers (e.g., '1.', '999999999.')
|
|
308
|
+
to at most 9 digits. This limit prevents integer overflows in browsers that use
|
|
309
|
+
signed 32-bit integers for list indexing.
|
|
310
|
+
|
|
311
|
+
Any marker exceeding this length (e.g., '1000000000.') is invalid and must not
|
|
312
|
+
be recognized as a list marker by a conforming implementation.
|
|
313
|
+
|
|
314
|
+
Source: CommonMark Spec 0.30, Section 5.2 - List Items.
|
|
315
|
+
"""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from puku_markdown._utils.constants import COMMONMARK_TAB_STOP, TAB_CHARACTER
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def commonmark_char_width(start_colno: int, character: str) -> int:
    """
    Compute how many visual columns ``character`` occupies under CommonMark rules.

    A tab stretches from ``start_colno`` up to the next tab stop, where tab
    stops sit at every multiple of ``COMMONMARK_TAB_STOP``. Every other
    character occupies exactly one column.

    Args:
        start_colno: Zero-based visual column at which the character starts.
        character: The character being measured.

    Returns:
        The number of columns by which the character advances the visual position.

    Reference:
        https://spec.commonmark.org/0.31.2/#tabs
    """
    # Anything that is not a tab has a fixed width of one column.
    if character != TAB_CHARACTER:
        return 1

    # A tab fills the distance remaining to the next tab stop; when already
    # on a stop it spans a full COMMONMARK_TAB_STOP columns.
    columns_past_stop = start_colno % COMMONMARK_TAB_STOP
    return COMMONMARK_TAB_STOP - columns_past_stop
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from puku_markdown._utils.constants import TAB_CHARACTER, SPACE_CHARACTER
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def is_space_or_tab(character: str) -> bool:
    """
    Tell whether ``character`` is an ASCII space (U+0020) or a tab (U+0009).

    Only these two characters are accepted; any other character, including
    other Unicode whitespace such as a non-breaking space, yields False.

    Args:
        character: The character to classify.

    Returns:
        True for a space or a tab, False otherwise.

    Reference: https://spec.commonmark.org/0.31.2/#tabs
    """
    if character == TAB_CHARACTER:
        return True
    return character == SPACE_CHARACTER
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def is_ascii_control(character: str) -> bool:
    """
    Tell whether ``character`` is an ASCII control character.

    The ASCII control characters are the C0 control codes (U+0000-U+001F)
    together with DEL (U+007F).

    Args:
        character: A single character; it is passed to ``ord``.

    Returns:
        True if the code point is below U+0020 or equals U+007F.
    """
    codepoint = ord(character)
    # DEL sits apart from the contiguous C0 range, so test it on its own.
    if codepoint == 0x7F:
        return True
    return codepoint < 0x20
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def is_ascii_digit(character: str) -> bool:
    """
    Return True if ``character`` is a single ASCII digit (U+0030-U+0039).

    Unlike ``str.isdigit``, non-ASCII digits (for example Arabic-Indic
    digits) are rejected.

    Args:
        character: The character to classify.

    Returns:
        True only for exactly one character in the range ``'0'``-``'9'``;
        False for anything else, including the empty string and
        multi-character strings.

    Reference: https://spec.commonmark.org/0.31.2/#list-items
    """
    # String comparison is lexicographic, so without the length guard a
    # multi-character string such as "10" or "5x" would fall between "0"
    # and "9" and be reported as a digit.
    return len(character) == 1 and "0" <= character <= "9"
|