docxrender 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docxrender/markdown.py ADDED
@@ -0,0 +1,177 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+
6
+
7
+ @dataclass(frozen=True, slots=True)
8
+ class MarkdownHeading:
9
+ level: int
10
+ text: str
11
+
12
+
13
+ @dataclass(frozen=True, slots=True)
14
+ class MarkdownParagraph:
15
+ text: str
16
+
17
+
18
+ @dataclass(frozen=True, slots=True)
19
+ class MarkdownOrderedList:
20
+ items: tuple[str, ...]
21
+
22
+
23
+ @dataclass(frozen=True, slots=True)
24
+ class MarkdownTable:
25
+ rows: tuple[tuple[str, ...], ...]
26
+
27
+
28
+ @dataclass(frozen=True, slots=True)
29
+ class MarkdownImage:
30
+ caption: str
31
+ path: str
32
+ width_pct: float
33
+
34
+
35
+ @dataclass(frozen=True, slots=True)
36
+ class MarkdownPageBreak:
37
+ pass
38
+
39
+
40
+ @dataclass(frozen=True, slots=True)
41
+ class MarkdownSpacer:
42
+ pass
43
+
44
+
45
# Union of every block type that parse_markdown_blocks can place in its result.
MarkdownBlock = (
    MarkdownHeading | MarkdownParagraph | MarkdownOrderedList
    | MarkdownTable | MarkdownImage
    | MarkdownPageBreak | MarkdownSpacer
)
54
+
55
# One to six "#" characters, at least one whitespace character, then the
# heading text (group 2). Group 1 holds the "#" run.
RE_HEADING = re.compile(r"^(#{1,6})\s+(.*)$")

# "<digits>. <text>"; group 1 is the item text after the marker.
RE_ORDERED_LIST_ITEM = re.compile(r"^\d+\.\s+(.*)$")

# "![caption](path)" optionally followed by a "{...}" attribute block that
# contains "width=<number>%". The number may carry a decimal fraction
# (e.g. "62.5") so that fractional percentages are recognised instead of
# making the whole line fail to match as an image.
RE_IMAGE = re.compile(
    r"^!\[(?P<caption>.*?)\]\((?P<path>.*?)\)"
    r"(?:\{[^}]*width=(?P<width>\d+(?:\.\d+)?)%[^}]*\})?\s*$"
)
61
+
62
+
63
def parse_markdown_blocks(markdown_body: str) -> tuple[MarkdownBlock, ...]:
    r"""Split *markdown_body* into a flat tuple of block objects.

    Lines are examined top to bottom after stripping surrounding whitespace.
    Blank lines are skipped. Each remaining line is classified by the first
    rule that applies:

    1. exactly ``\newpage``  -> ``MarkdownPageBreak``
    2. exactly ``\vspace``   -> ``MarkdownSpacer``
    3. ``RE_HEADING``        -> ``MarkdownHeading``
    4. ``RE_IMAGE``          -> ``MarkdownImage`` (width defaults to 90.0)
    5. ``RE_ORDERED_LIST_ITEM`` -> ``MarkdownOrderedList`` built from this
       and all directly following list-item lines
    6. starts with ``|``     -> ``MarkdownTable`` built from this and all
       directly following ``|`` lines
    7. anything else         -> ``MarkdownParagraph`` built from this and the
       following lines up to a blank or special line

    Args:
        markdown_body: The Markdown source text.

    Returns:
        The parsed blocks in source order; empty for empty or blank input.
    """
    lines = markdown_body.splitlines()
    blocks: list[MarkdownBlock] = []
    idx = 0
    while idx < len(lines):
        text = lines[idx].strip()
        if not text:
            idx += 1
            continue
        if text == r"\newpage":
            blocks.append(MarkdownPageBreak())
            idx += 1
            continue
        if text == r"\vspace":
            blocks.append(MarkdownSpacer())
            idx += 1
            continue

        match_heading = RE_HEADING.match(text)
        if match_heading is not None:
            blocks.append(
                MarkdownHeading(
                    level=len(match_heading.group(1)),
                    text=match_heading.group(2).strip(),
                )
            )
            idx += 1
            continue

        match_image = RE_IMAGE.match(text)
        if match_image is not None:
            width_raw = match_image.group("width")
            blocks.append(
                MarkdownImage(
                    caption=match_image.group("caption"),
                    path=match_image.group("path").strip(),
                    width_pct=float(width_raw) if width_raw is not None else 90.0,
                )
            )
            idx += 1
            continue

        # Multi-line constructs: each collector returns the finished block and
        # the index of the first line it did not consume.
        block: MarkdownBlock
        if RE_ORDERED_LIST_ITEM.match(text) is not None:
            block, idx = _collect_ordered_list(lines, idx)
        elif text.startswith("|"):
            block, idx = _collect_table(lines, idx)
        else:
            block, idx = _collect_paragraph(lines, idx)
        blocks.append(block)
    return tuple(blocks)


def _collect_ordered_list(
    lines: list[str], start: int
) -> tuple[MarkdownOrderedList, int]:
    """Gather consecutive ordered-list lines beginning at ``lines[start]``."""
    items: list[str] = []
    idx = start
    while idx < len(lines):
        match_item = RE_ORDERED_LIST_ITEM.match(lines[idx].strip())
        if match_item is None:
            break
        items.append(match_item.group(1).strip())
        idx += 1
    return MarkdownOrderedList(items=tuple(items)), idx


def _collect_table(lines: list[str], start: int) -> tuple[MarkdownTable, int]:
    """Gather consecutive ``|`` lines beginning at ``lines[start]`` into a table."""
    rows: list[tuple[str, ...]] = []
    idx = start
    while idx < len(lines):
        text = lines[idx].strip()
        if not text.startswith("|"):
            break
        row = tuple(cell.strip() for cell in text.strip("|").split("|"))
        # Only the second table line can be the header delimiter row. Every
        # cell must contain a hyphen: without that requirement a row of empty
        # cells would also pass the subset test (the empty set is a subset of
        # anything) and a legitimate blank data row would be dropped.
        is_header_separator = idx - start == 1 and all(
            "-" in cell and set(cell) <= {"-", ":"} for cell in row
        )
        if not is_header_separator:
            rows.append(row)
        idx += 1
    return MarkdownTable(rows=tuple(rows)), idx


def _collect_paragraph(
    lines: list[str], start: int
) -> tuple[MarkdownParagraph, int]:
    """Gather paragraph lines beginning at ``lines[start]``.

    The line at *start* is always consumed so the caller is guaranteed to make
    progress; later lines end the paragraph when blank or special.
    """
    parts: list[tuple[str, bool]] = []
    idx = start
    while idx < len(lines):
        line = lines[idx]
        text = line.strip()
        if idx > start and (not text or _is_special_line(text)):
            break
        # The hard-break check needs the unstripped line: trailing whitespace
        # is what signals the break.
        line_text, has_hard_break = _strip_hard_break(line)
        parts.append((line_text.strip(), has_hard_break))
        idx += 1
    return MarkdownParagraph(text=_join_paragraph_parts(parts)), idx
153
+
154
+
155
+ def _strip_hard_break(line: str) -> tuple[str, bool]:
156
+ has_hard_break = line.endswith(" ")
157
+ return line.rstrip(), has_hard_break
158
+
159
+
160
+ def _join_paragraph_parts(parts: list[tuple[str, bool]]) -> str:
161
+ if not parts:
162
+ return ""
163
+ text = parts[0][0]
164
+ for previous, current in zip(parts, parts[1:], strict=False):
165
+ text += ("\n" if previous[1] else " ") + current[0]
166
+ return text
167
+
168
+
169
def _is_special_line(text: str) -> bool:
    r"""Tell whether *text* starts a non-paragraph construct.

    A line is special when it is exactly ``\newpage`` or ``\vspace``, begins
    with ``|``, or matches the heading, image or ordered-list pattern.
    *text* is matched as given; callers in this module pass it already
    stripped.
    """
    if text in (r"\newpage", r"\vspace"):
        return True
    if text.startswith("|"):
        return True
    return any(
        pattern.match(text) is not None
        for pattern in (RE_HEADING, RE_IMAGE, RE_ORDERED_LIST_ITEM)
    )