text2markdown 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.3
2
+ Name: text2markdown
3
+ Version: 0.1.0
4
+ Summary: A Python library for intelligently converting text into Markdown.
5
+ Author: Isaacus
6
+ Author-email: Isaacus <support@isaacus.com>
7
+ Requires-Dist: isaacus
8
+ Requires-Python: >=3.10
9
+ Project-URL: Homepage, https://docs.isaacus.com/text2markdown
10
+ Project-URL: Documentation, https://github.com/isaacus-dev/text2markdown/blob/main/README.md
11
+ Project-URL: Issues, https://github.com/isaacus-dev/text2markdown/issues
12
+ Project-URL: Source, https://github.com/isaacus-dev/text2markdown
13
+ Description-Content-Type: text/markdown
14
+
15
+ # text2markdown 📝
16
+ **text2markdown** is a Python library for intelligently converting plain text into Markdown.
17
+
18
+ text2markdown is powered by the [Isaacus enrichment API](https://docs.isaacus.com/capabilities/enrichment), which converts unstructured documents into rich, highly structured knowledge graphs that can easily be transformed into Markdown.
19
+
20
+ In all, text2markdown is capable of:
21
+ - Identifying and formatting headings.
22
+ - Segmenting text into nested sections.
23
+ - Hyperlinking cross-references within texts to other sections.
24
+ - Italicizing cited documents.
25
+ - Detecting and formatting block quotations.
26
+ - Striking through junk text.
27
+
28
+ ## Setup 📦
29
+ text2markdown can be installed with `pip` (or `uv`):
30
+ ```bash
31
+ pip install text2markdown
32
+ ```
33
+
34
+ An [Isaacus API key](https://platform.isaacus.com/accounts/signup) is also required to use this library.
35
+
36
+ ## Usage 👩‍💻
37
+ The code snippet below demonstrates how you might use `text2markdown()` to intelligently convert a short document into Markdown.
38
+ ```python
39
+ from text2markdown import text2markdown
40
+
41
+ text = """\
42
+ The Smallest Document In The World
43
+ This is a generic document.
44
+
45
+ Section 1 - Background
46
+ Once upon a time, there was a mayor who said:
47
+ We love Markdown so much that everyone should and must use it for everything.
48
+
49
+ Section 2 - Problem
50
+ The mayor's directive, as stated in Section 1, was sadly too difficult to enforce."""
51
+
52
+ output = text2markdown(text)
53
+ print(output)
54
+ ```
55
+
56
+ The output should look something like this:
57
+ ```markdown
58
+ # The Smallest Document In The World
59
+
60
+ This is a generic document.
61
+
62
+ ## <a id="seg-1"></a>Section 1 - Background
63
+
64
+ Once upon a time, there was a mayor who said:
65
+
66
+ > We love Markdown so much that everyone should and must use it for everything.
67
+
68
+ ## Section 2 - Problem
69
+
70
+ The mayor's directive, as stated in [Section 1](#seg-1), was sadly too difficult to enforce.
71
+ ```
72
+
73
+ An asynchronous version of `text2markdown()` is also available, supporting all of the same features and arguments as its synchronous equivalent. It can be used like so:
74
+ ```python
75
+ from text2markdown import text2markdown_async
76
+
77
+ output = await text2markdown_async(text)
78
+ print(output)
79
+ ```
80
+
81
+ All of the various capabilities of text2markdown can be toggled on or off using optional Boolean parameters, as shown below:
82
+ ```python
83
+ from text2markdown import text2markdown
84
+
85
+ from isaacus import Isaacus
86
+
87
+ output = text2markdown(
88
+ text,
89
+ link_xrefs=True,
90
+ strike_junk=True,
91
+ block_quotes=True,
92
+ italicize_refs=True,
93
+ enrichment_model="kanon-2-enricher",
94
+ isaacus_client=Isaacus(),
95
+ )
96
+ print(output)
97
+ ```
98
+
99
+ ## License 📜
100
+ This library is licensed under the [MIT License](https://github.com/isaacus-dev/text2markdown/blob/main/LICENCE).
@@ -0,0 +1,86 @@
1
+ # text2markdown 📝
2
+ **text2markdown** is a Python library for intelligently converting plain text into Markdown.
3
+
4
+ text2markdown is powered by the [Isaacus enrichment API](https://docs.isaacus.com/capabilities/enrichment), which converts unstructured documents into rich, highly structured knowledge graphs that can easily be transformed into Markdown.
5
+
6
+ In all, text2markdown is capable of:
7
+ - Identifying and formatting headings.
8
+ - Segmenting text into nested sections.
9
+ - Hyperlinking cross-references within texts to other sections.
10
+ - Italicizing cited documents.
11
+ - Detecting and formatting block quotations.
12
+ - Striking through junk text.
13
+
14
+ ## Setup 📦
15
+ text2markdown can be installed with `pip` (or `uv`):
16
+ ```bash
17
+ pip install text2markdown
18
+ ```
19
+
20
+ An [Isaacus API key](https://platform.isaacus.com/accounts/signup) is also required to use this library.
21
+
22
+ ## Usage 👩‍💻
23
+ The code snippet below demonstrates how you might use `text2markdown()` to intelligently convert a short document into Markdown.
24
+ ```python
25
+ from text2markdown import text2markdown
26
+
27
+ text = """\
28
+ The Smallest Document In The World
29
+ This is a generic document.
30
+
31
+ Section 1 - Background
32
+ Once upon a time, there was a mayor who said:
33
+ We love Markdown so much that everyone should and must use it for everything.
34
+
35
+ Section 2 - Problem
36
+ The mayor's directive, as stated in Section 1, was sadly too difficult to enforce."""
37
+
38
+ output = text2markdown(text)
39
+ print(output)
40
+ ```
41
+
42
+ The output should look something like this:
43
+ ```markdown
44
+ # The Smallest Document In The World
45
+
46
+ This is a generic document.
47
+
48
+ ## <a id="seg-1"></a>Section 1 - Background
49
+
50
+ Once upon a time, there was a mayor who said:
51
+
52
+ > We love Markdown so much that everyone should and must use it for everything.
53
+
54
+ ## Section 2 - Problem
55
+
56
+ The mayor's directive, as stated in [Section 1](#seg-1), was sadly too difficult to enforce.
57
+ ```
58
+
59
+ An asynchronous version of `text2markdown()` is also available, supporting all of the same features and arguments as its synchronous equivalent. It can be used like so:
60
+ ```python
61
+ from text2markdown import text2markdown_async
62
+
63
+ output = await text2markdown_async(text)
64
+ print(output)
65
+ ```
66
+
67
+ All of the various capabilities of text2markdown can be toggled on or off using optional Boolean parameters, as shown below:
68
+ ```python
69
+ from text2markdown import text2markdown
70
+
71
+ from isaacus import Isaacus
72
+
73
+ output = text2markdown(
74
+ text,
75
+ link_xrefs=True,
76
+ strike_junk=True,
77
+ block_quotes=True,
78
+ italicize_refs=True,
79
+ enrichment_model="kanon-2-enricher",
80
+ isaacus_client=Isaacus(),
81
+ )
82
+ print(output)
83
+ ```
84
+
85
+ ## License 📜
86
+ This library is licensed under the [MIT License](https://github.com/isaacus-dev/text2markdown/blob/main/LICENCE).
@@ -0,0 +1,113 @@
1
+ [project]
2
+ name = "text2markdown"
3
+ version = "0.1.0"
4
+ authors = [{ name = "Isaacus", email = "support@isaacus.com" }]
5
+ description = "A Python library for intelligently converting text into Markdown."
6
+ readme = "README.md"
7
+ requires-python = ">=3.10"
8
+ dependencies = [
9
+ "isaacus",
10
+ ]
11
+
12
+ [dependency-groups]
13
+ dev = ["ipykernel"]
14
+
15
+ [project.urls]
16
+ Homepage = "https://docs.isaacus.com/text2markdown"
17
+ Documentation = "https://github.com/isaacus-dev/text2markdown/blob/main/README.md"
18
+ Issues = "https://github.com/isaacus-dev/text2markdown/issues"
19
+ Source = "https://github.com/isaacus-dev/text2markdown"
20
+
21
+ [tool.ruff]
22
+ exclude = [
23
+ "__pycache__",
24
+ "develop-eggs",
25
+ "eggs",
26
+ ".eggs",
27
+ "wheels",
28
+ "htmlcov",
29
+ ".tox",
30
+ ".nox",
31
+ ".coverage",
32
+ ".cache",
33
+ ".pytest_cache",
34
+ ".ipynb_checkpoints",
35
+ ".mypy_cache",
36
+ ".pybuilder",
37
+ "__pypackages__",
38
+ ".env",
39
+ ".venv",
40
+ "venv",
41
+ "env",
42
+ "ENV",
43
+ "env.bak",
44
+ "venv.bak",
45
+ ".archive",
46
+ ".persist_cache",
47
+ "site-packages",
48
+ "node_modules",
49
+ "dist",
50
+ "build",
51
+ "dist-info",
52
+ "egg-info",
53
+ ".hatchling",
54
+ ".bzr",
55
+ ".direnv",
56
+ ".git",
57
+ ".git-rewrite",
58
+ ".hg",
59
+ ".pants.d",
60
+ ".pytype",
61
+ ".ruff_cache",
62
+ ".svn",
63
+ ".vscode",
64
+ "_build",
65
+ "buck-out",
66
+ "migrations",
67
+ "target",
68
+ "bin",
69
+ "lib",
70
+ "lib64",
71
+ "include",
72
+ "share",
73
+ "var",
74
+ "tmp",
75
+ "temp",
76
+ "logs",
77
+ ]
78
+ line-length = 120
79
+ indent-width = 4
80
+ target-version = "py312"
81
+
82
+ [tool.ruff.lint]
83
+ select = ["E4", "E7", "E9", "F", "I"]
84
+ fixable = ["ALL"]
85
+ unfixable = []
86
+ ignore = ["E741"]
87
+
88
+ [tool.ruff.lint.isort]
89
+ length-sort = true
90
+ section-order = [
91
+ "future",
92
+ "standard-library",
93
+ "first-party",
94
+ "third-party",
95
+ "local-folder",
96
+ ]
97
+ lines-between-types = 1
98
+ order-by-type = false
99
+ combine-as-imports = true
100
+ known-first-party = ['_parent']
101
+
102
+ [tool.ruff.lint.per-file-ignores]
103
+ "__init__.py" = ["F401"]
104
+
105
+ [tool.ruff.format]
106
+ quote-style = "double"
107
+ indent-style = "space"
108
+ skip-magic-trailing-comma = false
109
+ line-ending = "auto"
110
+
111
+ [build-system]
112
+ requires = ["uv_build>=0.9.16,<0.10.0"]
113
+ build-backend = "uv_build"
@@ -0,0 +1,4 @@
1
+ """A Python library for intelligently converting text into Markdown."""
2
+
3
+ from .text2markdown import text2markdown
4
+ from .async_text2markdown import text2markdown_async
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ import isaacus
4
+
5
+ from isaacus.types.ilgs.v1.document import Document as ILGSDocument
6
+
7
+ from .text2markdown import text2markdown
8
+
9
+
10
async def text2markdown_async(
    text: str | ILGSDocument,
    *,
    link_xrefs: bool = True,
    strike_junk: bool = True,
    block_quotes: bool = True,
    escape_lists: bool = True,
    italicize_refs: bool = True,
    italicize_terms: bool = True,
    enrichment_model: str = "kanon-2-enricher",
    isaacus_client: isaacus.AsyncIsaacus | None = None,
) -> str:
    """Asynchronous counterpart of `text2markdown()`.

    Only the enrichment request is awaited here. Once an Isaacus Legal Graph Schema (ILGS) Document is
    available, the Markdown rendering itself is delegated to the synchronous `text2markdown()`.

    Args:
        text (str | ILGSDocument): Input to be converted into Markdown. A string is first enriched with an Isaacus enrichment model; an ILGS Document is rendered directly without any enrichment request.

        link_xrefs (bool, optional): Whether to link cross-references in the input text to their targets, for example, linking "as mentioned in Section 2.1" to the relevant section.

        strike_junk (bool, optional): Whether to strike out junk text.

        block_quotes (bool, optional): Whether to transform non-inline quotes into Markdown block quotes.

        escape_lists (bool, optional): Whether to escape list-like lines (lines starting with "-", "*", "+", or numbered lists).

        italicize_refs (bool, optional): Whether to italicize the names of any referenced documents, for example, "as mentioned in *Smith v. Jones*".

        italicize_terms (bool, optional): Whether to italicize the names of any defined terms.

        enrichment_model (str, optional): The name of the Isaacus enrichment model used when `text` is a string.

        isaacus_client (isaacus.AsyncIsaacus, optional): Asynchronous Isaacus API client used for enrichment when `text` is a string. If `None`, a new `isaacus.AsyncIsaacus` instance is created when one is needed.

    Returns:
        str: The Markdown produced by `text2markdown()` for the (enriched) document.

    Raises:
        ValueError: If `isaacus_client` is a synchronous `isaacus.Isaacus` client.
    """

    # A synchronous client cannot be awaited, so it is rejected before any work is done.
    if isinstance(isaacus_client, isaacus.Isaacus):
        raise ValueError("""\
`text2markdown_async()` requires an asynchronous Isaacus client, but a synchronous Isaacus client was provided. Please supply an `isaacus.AsyncIsaacus` client or set `isaacus_client` to `None` to have an asynchronous client created automatically.""")

    if isinstance(text, str):
        # Plain text has to be enriched into an ILGS Document first.
        client = isaacus.AsyncIsaacus() if isaacus_client is None else isaacus_client
        enrichment = await client.enrichments.create(model=enrichment_model, texts=text, overflow_strategy="auto")
        document = enrichment.results[0].document
    else:
        # Already an ILGS Document; nothing to enrich.
        document = text

    # The renderer always receives a Document here, so it never needs a client of its own.
    return text2markdown(
        document,
        link_xrefs=link_xrefs,
        strike_junk=strike_junk,
        block_quotes=block_quotes,
        escape_lists=escape_lists,
        italicize_refs=italicize_refs,
        italicize_terms=italicize_terms,
        enrichment_model=enrichment_model,
        isaacus_client=None,
    )
@@ -0,0 +1,462 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from typing import Literal, Iterable, NamedTuple
6
+ from collections import deque
7
+ from dataclasses import dataclass
8
+
9
+ import isaacus
10
+
11
+ from isaacus.types.ilgs.v1.segment import Segment
12
+ from isaacus.types.ilgs.v1.document import Document as ILGSDocument
13
+
14
# Compiled patterns recognising the start of a line that Markdown would render as a list item.
# Each pattern tolerates up to three leading whitespace characters before the list marker and
# requires at least one whitespace character after it.
_LIST_PATTERNS = [
    re.compile(r"^\s{0,3}[-+*]\s+"),  # Unordered lists: -, *, +
    re.compile(r"^\s{0,3}\d+\.\s+"),  # Ordered lists: 1. 2. 10.
    re.compile(r"^\s{0,3}\d+\)\s+"),  # Ordered lists with parentheses: 1) 2)
]

# The closed set of values accepted for `_Annotation.kind`.
_AnnotationKind = Literal[
    "heading",
    "xref",  # Cross referencing another annotation
    "junk",
    "quote",
    "ext_ref",  # External references
    "src_ref",  # Pointed to by a xref
    "terms",  # Defined terms
]
29
+
30
+
31
+ @dataclass
32
+ class _Annotation:
33
+ start: int # Annotation starting index
34
+ end: int # Annotation ending index
35
+ kind: _AnnotationKind
36
+ force_blank_line: bool = False
37
+ level: int | None = None # not `None` for `kind==heading` only
38
+ start_id: str | None = None # not `None` for `kind==xref` or `src_ref` only
39
+
40
+ _static_tags = { # Markdown tags to attach to each `_Annotation` kind
41
+ "junk": ("~~", "~~"),
42
+ "quote": ("> ", None),
43
+ "ext_ref": ("*", "*"),
44
+ "terms": ("*", "*"),
45
+ }
46
+
47
+ @property
48
+ def tags(self) -> tuple[str, str | None]:
49
+ """Returns the markdown/html tags that need to be added at the `start` and `end` index of this `_Annotation`, respectively."""
50
+ match self.kind:
51
+ case "heading":
52
+ return (f"\n{'#' * min(6, self.level)} ", None)
53
+
54
+ case "xref":
55
+ return ("[", f"](#{self.start_id.replace(':', '-')})")
56
+
57
+ case "src_ref":
58
+ return (f"""<a id="{self.start_id.replace(":", "-")}"></a>""", None)
59
+
60
+ return self._static_tags[self.kind]
61
+
62
+ def __hash__(self):
63
+ return hash((self.start, self.end, self.kind, self.force_blank_line, self.level, self.start_id))
64
+
65
+
66
class _Event(NamedTuple):
    """The opening or closing edge of an `_Annotation`, located at an index in the document text."""

    position: int  # Index in the document text at which the event takes effect
    time: Literal["start", "end"]  # Whether this event opens or closes `annotation`
    annotation: _Annotation  # The annotation this event belongs to
70
+
71
+
72
+ # ==== START HELPER FUNCTIONS ====
73
+
74
+
75
def _is_list_line(line: str) -> bool:
    """Reports whether `line` starts like a Markdown list item, i.e. matches any of `_LIST_PATTERNS`."""
    for pattern in _LIST_PATTERNS:
        if pattern.match(line):
            return True
    return False
78
+
79
+
80
def _annotate_each_line(
    full_annotation: _Annotation, doc_text: str, add_newlines: bool = False
) -> Iterable[_Annotation]:
    """Splits `full_annotation` into one `_Annotation` per non-blank line of the text it covers.

    Every yielded annotation copies `kind`, `level` and `start_id` from `full_annotation`. Its span
    covers a single line of `doc_text`, including that line's line ending when there is one.
    Whitespace-only lines produce no annotation. When `add_newlines` is set, `force_blank_line` is
    requested on the annotation of the final line of the span.
    """
    lines = doc_text[full_annotation.start : full_annotation.end].splitlines(keepends=True)
    last_index = len(lines) - 1
    cursor = full_annotation.start  # absolute index in `doc_text` of the next line's first character

    for index, line in enumerate(lines):
        line_start, cursor = cursor, cursor + len(line)

        # Whitespace-only lines still advance the cursor but are never annotated.
        if not line.strip():
            continue

        yield _Annotation(
            line_start,
            cursor,
            kind=full_annotation.kind,
            level=full_annotation.level,
            start_id=full_annotation.start_id,
            force_blank_line=(index == last_index) and add_newlines,
        )
106
+
107
+
108
+ def _safe_append_tag(md: list[str], tag: str | None):
109
+ """Safely appends `tag` to the last non-newline/whitespace entry of `md`, preserving
110
+ trailing and leading newlines/whitespaces.
111
+ """
112
+ if tag is None:
113
+ return
114
+
115
+ i = len(md) - 1
116
+ while i > 0 and not md[i].strip():
117
+ i -= 1
118
+
119
+ text_to_tag = md[i]
120
+ stripped = text_to_tag.rstrip()
121
+ md[i] = text_to_tag[: len(stripped)] + tag + text_to_tag[len(stripped) :]
122
+
123
+
124
+ def _filter_events(events: list[_Event]) -> list[_Event]:
125
+ """Filters `events`, removing overlapping annotations which could break the markdown output."""
126
+ priority = {
127
+ "junk": 0, # Lower value = lower priority
128
+ "ext_ref": 1,
129
+ "terms": 1,
130
+ "xref": 2,
131
+ }
132
+ active: list[_Annotation] = [] # stack of active annotations
133
+ filtered_events: list[_Event] = []
134
+
135
+ for e in events:
136
+ ann = e.annotation
137
+ kind = ann.kind
138
+ if kind not in priority.keys():
139
+ filtered_events.append(e)
140
+ continue
141
+
142
+ if e.time == "start":
143
+ # Check conflict with currently active annotations
144
+ to_remove = []
145
+ discard = False
146
+
147
+ for a in active:
148
+ # overlap condition
149
+ if ann.start < a.end and ann.end > a.start:
150
+ if priority[kind] > priority[a.kind]:
151
+ to_remove.append(a)
152
+ else:
153
+ discard = True
154
+ break
155
+
156
+ if discard:
157
+ continue
158
+
159
+ # Remove weaker overlapping annotations
160
+ if to_remove:
161
+ active = [a for a in active if a not in to_remove]
162
+ filtered_events = [ev for ev in filtered_events if ev.annotation not in to_remove]
163
+
164
+ active.append(ann)
165
+ filtered_events.append(e)
166
+
167
+ else: # time == end
168
+ # only append if the start has already been seen
169
+ if ann in active:
170
+ active.remove(ann)
171
+ filtered_events.append(e)
172
+
173
+ # replace events with filtered version
174
+ return filtered_events
175
+
176
+
177
+ def _merge_annotations(anns: list[_Annotation], kinds: set[_AnnotationKind]) -> Iterable[_Annotation]:
178
+ """Merges annotations with `kind` in `kinds` if they have the same start and end indices, returning the merged list of annotations."""
179
+ anns = sorted(anns, key=lambda a: (a.start, a.end, a.kind in kinds))
180
+ skip_next = False
181
+ skipped_ann: _Annotation | None = None
182
+ for i in range(len(anns) - 1):
183
+ a1, a2 = anns[i], anns[i + 1]
184
+ if skip_next:
185
+ # Continue skipping if needed
186
+ skip_next = a2.kind in kinds and skipped_ann and (skipped_ann.start, skipped_ann.end) == (a2.start, a2.end)
187
+ continue
188
+
189
+ skip_next = a1.kind in kinds and a2.kind in kinds and (a1.start, a2.start) == (a1.end, a2.end)
190
+ skipped_ann = a2
191
+ yield a1
192
+
193
+ if not skip_next:
194
+ yield anns[-1]
195
+
196
+
197
+ # ==== END HELPER FUNCTIONS ====
198
+
199
+
200
def text2markdown(
    text: str | ILGSDocument,
    *,
    link_xrefs: bool = True,
    strike_junk: bool = True,
    block_quotes: bool = True,
    escape_lists: bool = True,
    italicize_refs: bool = True,
    italicize_terms: bool = True,
    enrichment_model: str = "kanon-2-enricher",
    isaacus_client: isaacus.Isaacus | None = None,
) -> str:
    """Intelligently converts plain text into Markdown.

    Args:
        text (str | ILGSDocument): Input to be converted into Markdown. If an Isaacus Legal Graph Schema (ILGS) Document is supplied, this function will convert the Document's text into Markdown without needing to enrich it first with an Isaacus enrichment model.

        link_xrefs (bool, optional): Whether to link cross-references in the input text to their targets, for example, linking "as mentioned in Section 2.1" to the relevant section.

        strike_junk (bool, optional): Whether to strike out junk text.

        block_quotes (bool, optional): Whether to transform non-inline quotes into Markdown block quotes.

        escape_lists (bool, optional): Whether to escape list-like lines (lines starting with "-", "*", "+", or numbered lists). This leads to nicer rendering at the cost of cleaner Markdown source code.

        italicize_refs (bool, optional): Whether to italicize the names of any referenced documents, for example, "as mentioned in *Smith v. Jones*".

        italicize_terms (bool, optional): Whether to italicize any terms defined in the document.

        enrichment_model (str, optional): The name of the Isaacus enrichment model to use for converting the input text into Markdown. Defaults to the latest and most advanced Isaacus enrichment model, currently `kanon-2-enricher`.

        isaacus_client (isaacus.Isaacus, optional): An Isaacus API client to use for enriching the input text with an Isaacus enrichment model if the input is not already an Isaacus Legal Graph Schema (ILGS) Document. If `None`, a new instance will be created instead where necessary.

    Returns:
        str: The Markdown rendering of the document's text, with leading and trailing whitespace stripped.
    """

    # Convert the input text into an Isaacus Legal Graph Schema (ILGS) Document if it is not one already.
    if isinstance(text, str):
        if isaacus_client is None:
            isaacus_client = isaacus.Isaacus()

        response = isaacus_client.enrichments.create(model=enrichment_model, texts=text, overflow_strategy="auto")
        doc = response.results[0].document

    else:
        doc = text

    # From here on `text` is the document's own text; every span index below refers to it.
    text = doc.text

    # Idea: Gather all annotations to queue, build a hierarchy of events ordered by index,
    # then perform the necessary plain text -> markdown transformations
    # as we iterate over the input text
    anns: set[_Annotation] = set()
    # Headings whose text is whitespace-only are dropped; the rest are consumed front-to-back in start order.
    headings = deque(sorted([h for h in doc.headings if h.decode(text).strip()], key=lambda span: span.start))
    # Segments ordered by start; for equal starts the one ending later comes first.
    segs = sorted(doc.segments, key=lambda s: (s.span.start, -s.span.end))
    num_segs = len(segs)

    # we want to 'disjointify' our span segments. If we have segment spans [[25, 40], [30, 50]],
    # then it is desirable to have a representation in the form [[25, 30], [30, 50]]. If we have it in this form,
    # we can say the heading [30, 40] belongs to the segment [30, 50] because it is uniquely contained in it
    # in the disjoint representation
    # NOTE: the list is built while walking `segs` backwards, so entry `i` belongs to `segs[num_segs - i - 1]`.
    disjoint_seg_spans: list[tuple[int, int]] = []
    for seg in reversed(segs):
        dj_start = seg.span.start
        if disjoint_seg_spans and seg.span.end >= disjoint_seg_spans[-1][0]:
            # this segment ends after the start of the next segment; cut off the intersection
            dj_end = disjoint_seg_spans[-1][0]
        else:
            dj_end = seg.span.end

        disjoint_seg_spans.append((dj_start, dj_end))

    # Check for title; level 1 heading "#" is reserved for the title heading
    if (title := doc.title) and headings and headings[0].start <= title.start < headings[0].end:
        h = headings.popleft()
        anns.add(_Annotation(h.start, h.end, kind="heading", level=1))

    # The `None -> None` entry lets the parent walk below stop once a segment's `parent` is `None`.
    id_to_seg: dict[str | None, Segment | None] = {None: None}
    # Original (non-disjoint) spans of the segments for which at least one heading was found.
    has_heading: set[tuple[int, int]] = set()

    # Find headings and add their annotations with levels
    for idx, seg in enumerate(segs):
        id_to_seg[seg.id] = seg

        span_start, span_end = disjoint_seg_spans[num_segs - idx - 1]  # disjoint span interval
        if span_end - span_start <= 0:
            continue

        curr_level = seg.level + 2  # offset counting to start from 2 instead of 0 (number of #'s in markdown format)
        while headings and headings[0].start < span_start:
            h = headings.popleft()
            # Default segmentless headings' level
            anns.add(_Annotation(h.start, h.end, kind="heading", level=curr_level))

        annotations: list[tuple[int, int, int]] = []
        # annotate headings in segment
        # Each further heading found inside the same segment is placed one level deeper than the previous one.
        lev = curr_level
        while headings and span_start <= headings[0].start < span_end:
            h = headings.popleft()
            annotations.append((h.start, h.end, lev))
            lev += 1

        if not annotations:
            # no heading in this segment
            continue

        for ann in annotations:
            ann_start, ann_end, ann_level = ann

            # ensure heading depth is with respect to parents with headings
            curr = id_to_seg[seg.parent]
            while curr is not None:
                # decrement level for each parent segment missing a heading
                if (curr.span.start, curr.span.end) not in has_heading:
                    ann_level -= 1
                curr = id_to_seg[curr.parent]
            # Level 1 is reserved for the title, so segment headings never go above level 2.
            anns.update(
                _annotate_each_line(_Annotation(ann_start, ann_end, kind="heading", level=max(2, ann_level)), text)
            )

        has_heading.add((seg.span.start, seg.span.end))

    # Add any remaining headings which come after the last segment
    for heading in headings:
        anns.add(_Annotation(heading.start, heading.end, kind="heading", level=2))

    # We've annotated all headings, now gather annotations for the optional parameters.
    # Maps each annotation kind to (the document's items of that kind, whether the caller enabled it).
    optional_annotators = {
        "xref": (doc.crossreferences, link_xrefs),
        "junk": (doc.junk, strike_junk),
        "quote": (doc.quotes, block_quotes),
        "ext_ref": (doc.external_documents, italicize_refs),
        "terms": (doc.terms, italicize_terms),
    }
    for kind, (annotators, asked_to_implement) in optional_annotators.items():
        if not asked_to_implement:
            continue

        for ann in annotators:
            match kind:
                case "xref":
                    start_id = ann.start  # references' start segment id
                    # Add annotations for the text itself (indicated by ann.span)
                    anns.update(
                        _annotate_each_line(
                            _Annotation(ann.span.start, ann.span.end, kind=kind, start_id=start_id), text
                        )
                    )

                    # need to add in annotations for the source reference as well, for anchoring
                    # NOTE(review): assumes `start_id` is the id of a segment in `doc.segments`; otherwise this
                    # lookup fails -- confirm against the ILGS schema.
                    start_seg_span = id_to_seg[start_id].span
                    anns.add(_Annotation(start_seg_span.start, start_seg_span.end, kind="src_ref", start_id=start_id))

                case "junk":
                    anns.update(_annotate_each_line(_Annotation(ann.start, ann.end, kind=kind), text))

                case "quote":
                    if ann.span.start > 0 and text[ann.span.start - 1] != "\n":
                        # Only annotate block quotes; must be preceded with '\n' char
                        continue
                    anns.update(
                        _annotate_each_line(
                            _Annotation(ann.span.start, ann.span.end, kind=kind), text, add_newlines=True
                        )
                    )

                case "ext_ref":
                    # Each external reference has an array of mentions we want to annotate.
                    for mention in ann.mentions:
                        anns.update(_annotate_each_line(_Annotation(mention.start, mention.end, kind=kind), text))

                case "terms":
                    anns.update(_annotate_each_line(_Annotation(ann.name.start, ann.name.end, kind=kind), text))

    # ext_ref and terms both use italics, ensure they are merged to avoid duplication
    # NOTE: from here on `anns` is the iterable returned by `_merge_annotations`, no longer a set.
    anns = _merge_annotations(list(anns), kinds={"ext_ref", "terms"})

    # Turn every annotation into an opening event and, where applicable, a closing event.
    events: list[_Event] = []
    for ann in anns:
        events.append(_Event(ann.start, "start", ann))
        # Don't need end events for some annotation types
        if ann.kind != "src_ref":
            events.append(_Event(ann.end, "end", ann))

    # Tie-break weights used by `event_sort_key` for events that share a position.
    kind_priority = {
        "heading": 6,
        "quote": 5,
        "ext_ref": 4,
        "terms": 4,
        "junk": 3,
        "xref": 2,
        "subtitle": 1,
        "src_ref": 0,
    }
    # Kinds that only insert markup at their start position and are ordered as if they had no length.
    zero_length_annotations = {"src_ref"}

    def event_sort_key(e: _Event):
        """Determines behaviour if two events occur at the same index."""
        kind, start, end = e.annotation.kind, e.annotation.start, e.annotation.end
        if e.time == "start":
            # Opening events sort after closing events at the same position; longer annotations open
            # first, then higher-priority kinds.
            start_first = 1
            kind_order = -kind_priority[kind]
            length_order = -(end - start) if kind not in zero_length_annotations else 1

        else:
            # Closing events mirror the opening order: shorter annotations close first, then
            # lower-priority kinds.
            start_first = 0
            kind_order = kind_priority[kind]
            length_order = end - start if kind not in zero_length_annotations else -1

        return (e.position, start_first, length_order, kind_order)

    events.sort(key=event_sort_key)
    events = _filter_events(events)

    # ===== Process events =====
    md: list[str] = []  # Output markdown
    curr_idx = 0  # Index in `text` up to which the input has already been copied into `md`
    for pos, t, ann in events:
        kind = ann.kind
        if curr_idx != pos:
            # Copy the untouched text between the previous event and this one.
            md.append(text[curr_idx:pos])

        if t == "start":
            md.append(ann.tags[0])

        else:
            # Closing tags are attached before any trailing whitespace of the text emitted so far.
            _safe_append_tag(md, ann.tags[1])
            if ann.force_blank_line:
                md.append("\n\n")

        curr_idx = pos

    # Copy whatever text remains after the last event.
    md.append(text[curr_idx:])
    raw = "".join(md)

    # We have some post-processing to do
    # Add an extra newline after every line that starts with "#".
    newlines_added = (f"{line}\n" if line.startswith("#") else line for line in raw.splitlines(True))

    # ensure every line in the output is surrounded by exactly one blank line before and after,
    # except for quotations. Additionally, preserve indentation by using html tags.
    prev_is_blank = False
    blank_removed: list[str] = []
    for line in "".join(newlines_added).splitlines():
        if prev_is_blank and not line.strip():
            # second blank in a row
            continue
        prev_is_blank = not line.strip()

        # prevent markdown list rendering
        # Only unindented list-like lines are escaped, by prefixing the `&#8203;` character reference.
        if _is_list_line(line) and line.lstrip() == line and escape_lists:
            line = f"&#8203;{line}"

        # Convert leading tabs/whitespace to html indent flags
        # Each full group of four leading whitespace characters becomes `&emsp;`, a following pair
        # becomes `&ensp;`, and one further leading whitespace character becomes `&nbsp;`.
        line = line.expandtabs(4)
        line = re.sub(r"^(?:\s{4})+", lambda m: "&emsp;" * (len(m.group(0)) // 4), line)
        line = re.sub(r"^((?:&emsp;)*)\s{2}", r"\1&ensp;", line)
        line = re.sub(r"^((?:&emsp;|&ensp;)*)\s", r"\1&nbsp;", line)

        if not line.startswith("> "):
            # Lines other than block-quote lines get a newline here (and a second one below when they
            # are not blank); marking the previous line as blank drops blank lines that follow.
            line = line.rstrip("\n") + "\n"
            prev_is_blank = True

        blank_removed.append(line + "\n" if line.strip() else line)

    return "".join(blank_removed).strip()