text2markdown 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- text2markdown-0.1.0/PKG-INFO +100 -0
- text2markdown-0.1.0/README.md +86 -0
- text2markdown-0.1.0/pyproject.toml +113 -0
- text2markdown-0.1.0/src/text2markdown/__init__.py +4 -0
- text2markdown-0.1.0/src/text2markdown/async_text2markdown.py +69 -0
- text2markdown-0.1.0/src/text2markdown/text2markdown.py +462 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: text2markdown
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python library for intelligently converting text into Markdown.
|
|
5
|
+
Author: Isaacus
|
|
6
|
+
Author-email: Isaacus <support@isaacus.com>
|
|
7
|
+
Requires-Dist: isaacus
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Project-URL: Homepage, https://docs.isaacus.com/text2markdown
|
|
10
|
+
Project-URL: Documentation, https://github.com/isaacus-dev/text2markdown/blob/main/README.md
|
|
11
|
+
Project-URL: Issues, https://github.com/isaacus-dev/text2markdown/issues
|
|
12
|
+
Project-URL: Source, https://github.com/isaacus-dev/text2markdown
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# text2markdown 📝
|
|
16
|
+
**text2markdown** is a Python library for intelligently converting plain text into Markdown.
|
|
17
|
+
|
|
18
|
+
text2markdown is powered by the [Isaacus enrichment API](https://docs.isaacus.com/capabilities/enrichment), which converts unstructured documents into rich, highly structured knowledge graphs that can easily be transformed into Markdown.
|
|
19
|
+
|
|
20
|
+
In all, text2markdown is capable of:
|
|
21
|
+
- Identifying and formatting headings.
|
|
22
|
+
- Segmenting text into nested sections.
|
|
23
|
+
- Hyperlinking cross-references within texts to other sections.
|
|
24
|
+
- Italicizing cited documents.
|
|
25
|
+
- Detecting and formatting block quotations.
|
|
26
|
+
- Striking through junk text.
|
|
27
|
+
|
|
28
|
+
## Setup 📦
|
|
29
|
+
text2markdown can be installed with `pip` (or `uv`):
|
|
30
|
+
```bash
|
|
31
|
+
pip install text2markdown
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
An [Isaacus API key](https://platform.isaacus.com/accounts/signup) is also required to use this library.
|
|
35
|
+
|
|
36
|
+
## Usage 👩💻
|
|
37
|
+
The code snippet below demonstrates how you might use `text2markdown()` to intelligently convert a short document into Markdown.
|
|
38
|
+
```python
|
|
39
|
+
from text2markdown import text2markdown
|
|
40
|
+
|
|
41
|
+
text = """\
|
|
42
|
+
The Smallest Document In The World
|
|
43
|
+
This is a generic document.
|
|
44
|
+
|
|
45
|
+
Section 1 - Background
|
|
46
|
+
Once upon a time, there was a mayor who said:
|
|
47
|
+
We love Markdown so much that everyone should and must use it for everything.
|
|
48
|
+
|
|
49
|
+
Section 2 - Problem
|
|
50
|
+
The mayor's directive, as stated in Section 1, was sadly too difficult to enforce."""
|
|
51
|
+
|
|
52
|
+
output = text2markdown(text)
|
|
53
|
+
print(output)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
The output should look something like this:
|
|
57
|
+
```markdown
|
|
58
|
+
# The Smallest Document In The World
|
|
59
|
+
|
|
60
|
+
This is a generic document.
|
|
61
|
+
|
|
62
|
+
## <a id="seg-1"></a>Section 1 - Background
|
|
63
|
+
|
|
64
|
+
Once upon a time, there was a mayor who said:
|
|
65
|
+
|
|
66
|
+
> We love Markdown so much that everyone should and must use it for everything.
|
|
67
|
+
|
|
68
|
+
## Section 2 - Problem
|
|
69
|
+
|
|
70
|
+
The mayor's directive, as stated in [Section 1](#seg-1), was sadly too difficult to enforce.
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
An asynchronous version of `text2markdown()` is also available, supporting all of the same features and arguments as its synchronous equivalent. It can be used like so:
|
|
74
|
+
```python
|
|
75
|
+
from text2markdown import text2markdown_async
|
|
76
|
+
|
|
77
|
+
output = await text2markdown_async(text)
|
|
78
|
+
print(output)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
All of the various capabilities of text2markdown can be toggled on or off using optional Boolean parameters, as shown below:
|
|
82
|
+
```python
|
|
83
|
+
from text2markdown import text2markdown
|
|
84
|
+
|
|
85
|
+
from isaacus import Isaacus
|
|
86
|
+
|
|
87
|
+
output = text2markdown(
|
|
88
|
+
text,
|
|
89
|
+
link_xrefs=True,
|
|
90
|
+
strike_junk=True,
|
|
91
|
+
block_quotes=True,
|
|
92
|
+
italicize_refs=True,
|
|
93
|
+
enrichment_model="kanon-2-enricher",
|
|
94
|
+
isaacus_client=Isaacus(),
|
|
95
|
+
)
|
|
96
|
+
print(output)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## License 📜
|
|
100
|
+
This library is licensed under the [MIT License](https://github.com/isaacus-dev/text2markdown/blob/main/LICENCE).
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# text2markdown 📝
|
|
2
|
+
**text2markdown** is a Python library for intelligently converting plain text into Markdown.
|
|
3
|
+
|
|
4
|
+
text2markdown is powered by the [Isaacus enrichment API](https://docs.isaacus.com/capabilities/enrichment), which converts unstructured documents into rich, highly structured knowledge graphs that can easily be transformed into Markdown.
|
|
5
|
+
|
|
6
|
+
In all, text2markdown is capable of:
|
|
7
|
+
- Identifying and formatting headings.
|
|
8
|
+
- Segmenting text into nested sections.
|
|
9
|
+
- Hyperlinking cross-references within texts to other sections.
|
|
10
|
+
- Italicizing cited documents.
|
|
11
|
+
- Detecting and formatting block quotations.
|
|
12
|
+
- Striking through junk text.
|
|
13
|
+
|
|
14
|
+
## Setup 📦
|
|
15
|
+
text2markdown can be installed with `pip` (or `uv`):
|
|
16
|
+
```bash
|
|
17
|
+
pip install text2markdown
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
An [Isaacus API key](https://platform.isaacus.com/accounts/signup) is also required to use this library.
|
|
21
|
+
|
|
22
|
+
## Usage 👩💻
|
|
23
|
+
The code snippet below demonstrates how you might use `text2markdown()` to intelligently convert a short document into Markdown.
|
|
24
|
+
```python
|
|
25
|
+
from text2markdown import text2markdown
|
|
26
|
+
|
|
27
|
+
text = """\
|
|
28
|
+
The Smallest Document In The World
|
|
29
|
+
This is a generic document.
|
|
30
|
+
|
|
31
|
+
Section 1 - Background
|
|
32
|
+
Once upon a time, there was a mayor who said:
|
|
33
|
+
We love Markdown so much that everyone should and must use it for everything.
|
|
34
|
+
|
|
35
|
+
Section 2 - Problem
|
|
36
|
+
The mayor's directive, as stated in Section 1, was sadly too difficult to enforce."""
|
|
37
|
+
|
|
38
|
+
output = text2markdown(text)
|
|
39
|
+
print(output)
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
The output should look something like this:
|
|
43
|
+
```markdown
|
|
44
|
+
# The Smallest Document In The World
|
|
45
|
+
|
|
46
|
+
This is a generic document.
|
|
47
|
+
|
|
48
|
+
## <a id="seg-1"></a>Section 1 - Background
|
|
49
|
+
|
|
50
|
+
Once upon a time, there was a mayor who said:
|
|
51
|
+
|
|
52
|
+
> We love Markdown so much that everyone should and must use it for everything.
|
|
53
|
+
|
|
54
|
+
## Section 2 - Problem
|
|
55
|
+
|
|
56
|
+
The mayor's directive, as stated in [Section 1](#seg-1), was sadly too difficult to enforce.
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
An asynchronous version of `text2markdown()` is also available, supporting all of the same features and arguments as its synchronous equivalent. It can be used like so:
|
|
60
|
+
```python
|
|
61
|
+
from text2markdown import text2markdown_async
|
|
62
|
+
|
|
63
|
+
output = await text2markdown_async(text)
|
|
64
|
+
print(output)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
All of the various capabilities of text2markdown can be toggled on or off using optional Boolean parameters, as shown below:
|
|
68
|
+
```python
|
|
69
|
+
from text2markdown import text2markdown
|
|
70
|
+
|
|
71
|
+
from isaacus import Isaacus
|
|
72
|
+
|
|
73
|
+
output = text2markdown(
|
|
74
|
+
text,
|
|
75
|
+
link_xrefs=True,
|
|
76
|
+
strike_junk=True,
|
|
77
|
+
block_quotes=True,
|
|
78
|
+
italicize_refs=True,
|
|
79
|
+
enrichment_model="kanon-2-enricher",
|
|
80
|
+
isaacus_client=Isaacus(),
|
|
81
|
+
)
|
|
82
|
+
print(output)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## License 📜
|
|
86
|
+
This library is licensed under the [MIT License](https://github.com/isaacus-dev/text2markdown/blob/main/LICENCE).
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "text2markdown"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
authors = [{ name = "Isaacus", email = "support@isaacus.com" }]
|
|
5
|
+
description = "A Python library for intelligently converting text into Markdown."
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
requires-python = ">=3.10"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"isaacus",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[dependency-groups]
|
|
13
|
+
dev = ["ipykernel"]
|
|
14
|
+
|
|
15
|
+
[project.urls]
|
|
16
|
+
Homepage = "https://docs.isaacus.com/text2markdown"
|
|
17
|
+
Documentation = "https://github.com/isaacus-dev/text2markdown/blob/main/README.md"
|
|
18
|
+
Issues = "https://github.com/isaacus-dev/text2markdown/issues"
|
|
19
|
+
Source = "https://github.com/isaacus-dev/text2markdown"
|
|
20
|
+
|
|
21
|
+
[tool.ruff]
|
|
22
|
+
exclude = [
|
|
23
|
+
"__pycache__",
|
|
24
|
+
"develop-eggs",
|
|
25
|
+
"eggs",
|
|
26
|
+
".eggs",
|
|
27
|
+
"wheels",
|
|
28
|
+
"htmlcov",
|
|
29
|
+
".tox",
|
|
30
|
+
".nox",
|
|
31
|
+
".coverage",
|
|
32
|
+
".cache",
|
|
33
|
+
".pytest_cache",
|
|
34
|
+
".ipynb_checkpoints",
|
|
35
|
+
".mypy_cache",
|
|
36
|
+
".pybuilder",
|
|
37
|
+
"__pypackages__",
|
|
38
|
+
".env",
|
|
39
|
+
".venv",
|
|
40
|
+
"venv",
|
|
41
|
+
"env",
|
|
42
|
+
"ENV",
|
|
43
|
+
"env.bak",
|
|
44
|
+
"venv.bak",
|
|
45
|
+
".archive",
|
|
46
|
+
".persist_cache",
|
|
47
|
+
"site-packages",
|
|
48
|
+
"node_modules",
|
|
49
|
+
"dist",
|
|
50
|
+
"build",
|
|
51
|
+
"dist-info",
|
|
52
|
+
"egg-info",
|
|
53
|
+
".hatchling",
|
|
54
|
+
".bzr",
|
|
55
|
+
".direnv",
|
|
56
|
+
".git",
|
|
57
|
+
".git-rewrite",
|
|
58
|
+
".hg",
|
|
59
|
+
".pants.d",
|
|
60
|
+
".pytype",
|
|
61
|
+
".ruff_cache",
|
|
62
|
+
".svn",
|
|
63
|
+
".vscode",
|
|
64
|
+
"_build",
|
|
65
|
+
"buck-out",
|
|
66
|
+
"migrations",
|
|
67
|
+
"target",
|
|
68
|
+
"bin",
|
|
69
|
+
"lib",
|
|
70
|
+
"lib64",
|
|
71
|
+
"include",
|
|
72
|
+
"share",
|
|
73
|
+
"var",
|
|
74
|
+
"tmp",
|
|
75
|
+
"temp",
|
|
76
|
+
"logs",
|
|
77
|
+
]
|
|
78
|
+
line-length = 120
|
|
79
|
+
indent-width = 4
|
|
80
|
+
target-version = "py312"
|
|
81
|
+
|
|
82
|
+
[tool.ruff.lint]
|
|
83
|
+
select = ["E4", "E7", "E9", "F", "I"]
|
|
84
|
+
fixable = ["ALL"]
|
|
85
|
+
unfixable = []
|
|
86
|
+
ignore = ["E741"]
|
|
87
|
+
|
|
88
|
+
[tool.ruff.lint.isort]
|
|
89
|
+
length-sort = true
|
|
90
|
+
section-order = [
|
|
91
|
+
"future",
|
|
92
|
+
"standard-library",
|
|
93
|
+
"first-party",
|
|
94
|
+
"third-party",
|
|
95
|
+
"local-folder",
|
|
96
|
+
]
|
|
97
|
+
lines-between-types = 1
|
|
98
|
+
order-by-type = false
|
|
99
|
+
combine-as-imports = true
|
|
100
|
+
known-first-party = ['_parent']
|
|
101
|
+
|
|
102
|
+
[tool.ruff.lint.per-file-ignores]
|
|
103
|
+
"__init__.py" = ["F401"]
|
|
104
|
+
|
|
105
|
+
[tool.ruff.format]
|
|
106
|
+
quote-style = "double"
|
|
107
|
+
indent-style = "space"
|
|
108
|
+
skip-magic-trailing-comma = false
|
|
109
|
+
line-ending = "auto"
|
|
110
|
+
|
|
111
|
+
[build-system]
|
|
112
|
+
requires = ["uv_build>=0.9.16,<0.10.0"]
|
|
113
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import isaacus
|
|
4
|
+
|
|
5
|
+
from isaacus.types.ilgs.v1.document import Document as ILGSDocument
|
|
6
|
+
|
|
7
|
+
from .text2markdown import text2markdown
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def text2markdown_async(
    text: str | ILGSDocument,
    *,
    link_xrefs: bool = True,
    strike_junk: bool = True,
    block_quotes: bool = True,
    escape_lists: bool = True,
    italicize_refs: bool = True,
    italicize_terms: bool = True,
    enrichment_model: str = "kanon-2-enricher",
    isaacus_client: isaacus.AsyncIsaacus | None = None,
) -> str:
    """Intelligently converts plain text into Markdown asynchronously.

    Args:
        text (str | ILGSDocument): Input to be converted into Markdown. When an Isaacus Legal Graph Schema (ILGS) Document is passed, its text is converted directly, with no enrichment call to the Isaacus API.

        link_xrefs (bool, optional): Whether to hyperlink cross-references (for example, "as mentioned in Section 2.1") to the sections they target.

        strike_junk (bool, optional): Whether to strike out junk text.

        block_quotes (bool, optional): Whether to render non-inline quotations as Markdown block quotes.

        escape_lists (bool, optional): Whether to escape list-like lines (lines beginning with "-", "*", "+", or a number). This improves rendering at the cost of less clean Markdown source.

        italicize_refs (bool, optional): Whether to italicize the names of referenced documents, for example, "as mentioned in *Smith v. Jones*".

        italicize_terms (bool, optional): Whether to italicize the names of any defined terms.

        enrichment_model (str, optional): Name of the Isaacus enrichment model used to enrich plain-text input. Defaults to `kanon-2-enricher`.

        isaacus_client (isaacus.AsyncIsaacus, optional): Asynchronous Isaacus API client used when the input still needs enriching. When `None`, a client is constructed on demand.

    Returns:
        str: The Markdown rendering of the input text.

    Raises:
        ValueError: If a synchronous `isaacus.Isaacus` client is supplied.
    """
    # A synchronous client cannot be awaited, so reject it up front with a clear message.
    if isinstance(isaacus_client, isaacus.Isaacus):
        raise ValueError("""\
`text2markdown_async()` requires an asynchronous Isaacus client, but a synchronous Isaacus client was provided. Please supply an `isaacus.AsyncIsaacus` client or set `isaacus_client` to `None` to have an asynchronous client created automatically.""")

    if isinstance(text, str):
        # Plain text must first be enriched into an ILGS Document via the API.
        client = isaacus.AsyncIsaacus() if isaacus_client is None else isaacus_client
        response = await client.enrichments.create(model=enrichment_model, texts=text, overflow_strategy="auto")
        document = response.results[0].document
    else:
        # Already an ILGS Document: no enrichment (and hence no client) needed.
        document = text

    # The synchronous converter never touches the API when handed a Document,
    # so delegating to it here with `isaacus_client=None` performs no blocking I/O.
    return text2markdown(
        document,
        link_xrefs=link_xrefs,
        strike_junk=strike_junk,
        block_quotes=block_quotes,
        escape_lists=escape_lists,
        italicize_refs=italicize_refs,
        italicize_terms=italicize_terms,
        enrichment_model=enrichment_model,
        isaacus_client=None,
    )
|
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from typing import Literal, Iterable, NamedTuple
|
|
6
|
+
from collections import deque
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
import isaacus
|
|
10
|
+
|
|
11
|
+
from isaacus.types.ilgs.v1.segment import Segment
|
|
12
|
+
from isaacus.types.ilgs.v1.document import Document as ILGSDocument
|
|
13
|
+
|
|
14
|
+
# Regexes matching lines that Markdown would render as list items.
# Used to decide which lines need escaping when `escape_lists` is enabled.
_LIST_PATTERNS = [
    re.compile(r"^\s{0,3}[-+*]\s+"),  # Unordered lists: -, *, +
    re.compile(r"^\s{0,3}\d+\.\s+"),  # Ordered lists: 1. 2. 10.
    re.compile(r"^\s{0,3}\d+\)\s+"),  # Ordered lists with parentheses: 1) 2)
]

# The closed set of annotation categories this module attaches to spans of the input text.
_AnnotationKind = Literal[
    "heading",
    "xref",  # Cross-reference pointing at another annotation
    "junk",  # Junk text to be struck through
    "quote",  # Block quotation
    "ext_ref",  # External document reference (italicized)
    "src_ref",  # Anchor target pointed to by an xref
    "terms",  # Defined terms (italicized)
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
|
|
32
|
+
class _Annotation:
|
|
33
|
+
start: int # Annotation starting index
|
|
34
|
+
end: int # Annotation ending index
|
|
35
|
+
kind: _AnnotationKind
|
|
36
|
+
force_blank_line: bool = False
|
|
37
|
+
level: int | None = None # not `None` for `kind==heading` only
|
|
38
|
+
start_id: str | None = None # not `None` for `kind==xref` or `src_ref` only
|
|
39
|
+
|
|
40
|
+
_static_tags = { # Markdown tags to attach to each `_Annotation` kind
|
|
41
|
+
"junk": ("~~", "~~"),
|
|
42
|
+
"quote": ("> ", None),
|
|
43
|
+
"ext_ref": ("*", "*"),
|
|
44
|
+
"terms": ("*", "*"),
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
@property
|
|
48
|
+
def tags(self) -> tuple[str, str | None]:
|
|
49
|
+
"""Returns the markdown/html tags that need to be added at the `start` and `end` index of this `_Annotation`, respectively."""
|
|
50
|
+
match self.kind:
|
|
51
|
+
case "heading":
|
|
52
|
+
return (f"\n{'#' * min(6, self.level)} ", None)
|
|
53
|
+
|
|
54
|
+
case "xref":
|
|
55
|
+
return ("[", f"](#{self.start_id.replace(':', '-')})")
|
|
56
|
+
|
|
57
|
+
case "src_ref":
|
|
58
|
+
return (f"""<a id="{self.start_id.replace(":", "-")}"></a>""", None)
|
|
59
|
+
|
|
60
|
+
return self._static_tags[self.kind]
|
|
61
|
+
|
|
62
|
+
def __hash__(self):
|
|
63
|
+
return hash((self.start, self.end, self.kind, self.force_blank_line, self.level, self.start_id))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class _Event(NamedTuple):
    """A start or end boundary of an `_Annotation`, for sweep-line processing."""

    position: int  # Character index in the document text where the event occurs.
    time: Literal["start", "end"]  # Whether the event opens or closes the annotation.
    annotation: _Annotation  # The annotation this boundary belongs to.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ==== START HELPER FUNCTIONS ====
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _is_list_line(line: str) -> bool:
    """Reports whether Markdown would render `line` as a list item."""
    for pattern in _LIST_PATTERNS:
        if pattern.match(line):
            return True
    return False
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _annotate_each_line(
    full_annotation: _Annotation, doc_text: str, add_newlines: bool = False
) -> Iterable[_Annotation]:
    """Splits `full_annotation` into one `_Annotation` per non-blank line it covers.

    Every yielded annotation carries over the kind, level, and start id of the
    original. When `add_newlines` is true, the annotation covering the span's
    final line is marked `force_blank_line` so that a blank line is emitted
    after the whole group.
    """
    covered_lines = doc_text[full_annotation.start : full_annotation.end].splitlines(keepends=True)
    last_index = len(covered_lines) - 1
    cursor = full_annotation.start

    for index, covered_line in enumerate(covered_lines):
        line_end = cursor + len(covered_line)

        # Whitespace-only lines get no annotation, but still advance the cursor.
        if covered_line.strip():
            yield _Annotation(
                cursor,
                line_end,
                kind=full_annotation.kind,
                level=full_annotation.level,
                start_id=full_annotation.start_id,
                force_blank_line=add_newlines and index == last_index,
            )

        cursor = line_end
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _safe_append_tag(md: list[str], tag: str | None):
|
|
109
|
+
"""Safely appends `tag` to the last non-newline/whitespace entry of `md`, preserving
|
|
110
|
+
trailing and leading newlines/whitespaces.
|
|
111
|
+
"""
|
|
112
|
+
if tag is None:
|
|
113
|
+
return
|
|
114
|
+
|
|
115
|
+
i = len(md) - 1
|
|
116
|
+
while i > 0 and not md[i].strip():
|
|
117
|
+
i -= 1
|
|
118
|
+
|
|
119
|
+
text_to_tag = md[i]
|
|
120
|
+
stripped = text_to_tag.rstrip()
|
|
121
|
+
md[i] = text_to_tag[: len(stripped)] + tag + text_to_tag[len(stripped) :]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _filter_events(events: list[_Event]) -> list[_Event]:
|
|
125
|
+
"""Filters `events`, removing overlapping annotations which could break the markdown output."""
|
|
126
|
+
priority = {
|
|
127
|
+
"junk": 0, # Lower value = lower priority
|
|
128
|
+
"ext_ref": 1,
|
|
129
|
+
"terms": 1,
|
|
130
|
+
"xref": 2,
|
|
131
|
+
}
|
|
132
|
+
active: list[_Annotation] = [] # stack of active annotations
|
|
133
|
+
filtered_events: list[_Event] = []
|
|
134
|
+
|
|
135
|
+
for e in events:
|
|
136
|
+
ann = e.annotation
|
|
137
|
+
kind = ann.kind
|
|
138
|
+
if kind not in priority.keys():
|
|
139
|
+
filtered_events.append(e)
|
|
140
|
+
continue
|
|
141
|
+
|
|
142
|
+
if e.time == "start":
|
|
143
|
+
# Check conflict with currently active annotations
|
|
144
|
+
to_remove = []
|
|
145
|
+
discard = False
|
|
146
|
+
|
|
147
|
+
for a in active:
|
|
148
|
+
# overlap condition
|
|
149
|
+
if ann.start < a.end and ann.end > a.start:
|
|
150
|
+
if priority[kind] > priority[a.kind]:
|
|
151
|
+
to_remove.append(a)
|
|
152
|
+
else:
|
|
153
|
+
discard = True
|
|
154
|
+
break
|
|
155
|
+
|
|
156
|
+
if discard:
|
|
157
|
+
continue
|
|
158
|
+
|
|
159
|
+
# Remove weaker overlapping annotations
|
|
160
|
+
if to_remove:
|
|
161
|
+
active = [a for a in active if a not in to_remove]
|
|
162
|
+
filtered_events = [ev for ev in filtered_events if ev.annotation not in to_remove]
|
|
163
|
+
|
|
164
|
+
active.append(ann)
|
|
165
|
+
filtered_events.append(e)
|
|
166
|
+
|
|
167
|
+
else: # time == end
|
|
168
|
+
# only append if the start has already been seen
|
|
169
|
+
if ann in active:
|
|
170
|
+
active.remove(ann)
|
|
171
|
+
filtered_events.append(e)
|
|
172
|
+
|
|
173
|
+
# replace events with filtered version
|
|
174
|
+
return filtered_events
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _merge_annotations(anns: list[_Annotation], kinds: set[_AnnotationKind]) -> Iterable[_Annotation]:
|
|
178
|
+
"""Merges annotations with `kind` in `kinds` if they have the same start and end indices, returning the merged list of annotations."""
|
|
179
|
+
anns = sorted(anns, key=lambda a: (a.start, a.end, a.kind in kinds))
|
|
180
|
+
skip_next = False
|
|
181
|
+
skipped_ann: _Annotation | None = None
|
|
182
|
+
for i in range(len(anns) - 1):
|
|
183
|
+
a1, a2 = anns[i], anns[i + 1]
|
|
184
|
+
if skip_next:
|
|
185
|
+
# Continue skipping if needed
|
|
186
|
+
skip_next = a2.kind in kinds and skipped_ann and (skipped_ann.start, skipped_ann.end) == (a2.start, a2.end)
|
|
187
|
+
continue
|
|
188
|
+
|
|
189
|
+
skip_next = a1.kind in kinds and a2.kind in kinds and (a1.start, a2.start) == (a1.end, a2.end)
|
|
190
|
+
skipped_ann = a2
|
|
191
|
+
yield a1
|
|
192
|
+
|
|
193
|
+
if not skip_next:
|
|
194
|
+
yield anns[-1]
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ==== END HELPER FUNCTIONS ====
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def text2markdown(
    text: str | ILGSDocument,
    *,
    link_xrefs: bool = True,
    strike_junk: bool = True,
    block_quotes: bool = True,
    escape_lists: bool = True,
    italicize_refs: bool = True,
    italicize_terms: bool = True,
    enrichment_model: str = "kanon-2-enricher",
    isaacus_client: isaacus.Isaacus | None = None,
) -> str:
    """Intelligently converts plain text into Markdown.

    Args:
        text (str | ILGSDocument): Input to be converted into Markdown. If an Isaacus Legal Graph Schema (ILGS) Document is supplied, this function will convert the Document's text into Markdown without needing to enrich it first with an Isaacus enrichment model.

        link_xrefs (bool, optional): Whether to link cross-references in the input text to their targets, for example, linking "as mentioned in Section 2.1" to the relevant section.

        strike_junk (bool, optional): Whether to strike out junk text.

        block_quotes (bool, optional): Whether to transform non-inline quotes into Markdown block quotes.

        escape_lists (bool, optional): Whether to escape list-like lines (lines starting with "-", "*", "+", or numbered lists). This leads to nicer rendering at the cost of cleaner Markdown source code.

        italicize_refs (bool, optional): Whether to italicize the names of any referenced documents, for example, "as mentioned in *Smith v. Jones*".

        italicize_terms (bool, optional): Whether to italicize any terms defined in the document.

        enrichment_model (str, optional): The name of the Isaacus enrichment model to use for converting the input text into Markdown. Defaults to the latest and most advanced Isaacus enrichment model, currently `kanon-2-enricher`.

        isaacus_client (isaacus.Isaacus, optional): An Isaacus API client to use for enriching the input text with an Isaacus enrichment model if the input is not already an Isaacus Legal Graph Schema (ILGS) Document. If `None`, a new instance will be created instead where necessary.

    Returns:
        str: The Markdown rendering of the input text.
    """

    # Convert the input text into an Isaacus Legal Graph Schema (ILGS) Document if it is not one already.
    if isinstance(text, str):
        if isaacus_client is None:
            isaacus_client = isaacus.Isaacus()

        response = isaacus_client.enrichments.create(model=enrichment_model, texts=text, overflow_strategy="auto")
        doc = response.results[0].document

    else:
        doc = text

    # From here on, `text` is the Document's raw text; all annotation indices refer into it.
    text = doc.text

    # Strategy: gather all annotations into a set, turn them into start/end events
    # ordered by index, then perform the plain text -> Markdown transformations in a
    # single sweep over the input text.
    anns: set[_Annotation] = set()
    headings = deque(sorted([h for h in doc.headings if h.decode(text).strip()], key=lambda span: span.start))
    segs = sorted(doc.segments, key=lambda s: (s.span.start, -s.span.end))
    num_segs = len(segs)

    # 'Disjointify' the segment spans. Given segment spans [[25, 40], [30, 50]],
    # we want the representation [[25, 30], [30, 50]]: in that form, a heading at
    # [30, 40] can be attributed to the segment [30, 50] because it is uniquely
    # contained in it.
    # Built back-to-front, so disjoint_seg_spans[-1] is always the span of the NEXT
    # segment in document order.
    disjoint_seg_spans: list[tuple[int, int]] = []
    for seg in reversed(segs):
        dj_start = seg.span.start
        if disjoint_seg_spans and seg.span.end >= disjoint_seg_spans[-1][0]:
            # This segment ends after the start of the next segment; cut off the intersection.
            dj_end = disjoint_seg_spans[-1][0]
        else:
            dj_end = seg.span.end

        disjoint_seg_spans.append((dj_start, dj_end))

    # Check for a title; heading level 1 ("#") is reserved for the title heading.
    if (title := doc.title) and headings and headings[0].start <= title.start < headings[0].end:
        h = headings.popleft()
        anns.add(_Annotation(h.start, h.end, kind="heading", level=1))

    # `None` maps to `None` so that walking `seg.parent` chains terminates cleanly.
    id_to_seg: dict[str | None, Segment | None] = {None: None}
    has_heading: set[tuple[int, int]] = set()

    # Find headings and add their annotations with levels.
    for idx, seg in enumerate(segs):
        id_to_seg[seg.id] = seg

        # disjoint_seg_spans was built in reverse order, hence the mirrored index.
        span_start, span_end = disjoint_seg_spans[num_segs - idx - 1]  # disjoint span interval
        if span_end - span_start <= 0:
            continue

        curr_level = seg.level + 2  # offset counting to start from 2 instead of 0 (number of #'s in markdown format)
        # Headings that fall before this segment belong to no segment; give them
        # the current level as a default.
        while headings and headings[0].start < span_start:
            h = headings.popleft()
            # Default segmentless headings' level
            anns.add(_Annotation(h.start, h.end, kind="heading", level=curr_level))

        annotations: list[tuple[int, int, int]] = []
        # Annotate headings that fall inside this segment's disjoint span; each
        # successive heading within the same segment gets one level deeper.
        lev = curr_level
        while headings and span_start <= headings[0].start < span_end:
            h = headings.popleft()
            annotations.append((h.start, h.end, lev))
            lev += 1

        if not annotations:
            # No heading in this segment.
            continue

        for ann in annotations:
            ann_start, ann_end, ann_level = ann

            # Ensure heading depth is computed with respect to ancestors that
            # actually have headings of their own.
            curr = id_to_seg[seg.parent]
            while curr is not None:
                # Decrement the level for each parent segment missing a heading.
                if (curr.span.start, curr.span.end) not in has_heading:
                    ann_level -= 1
                curr = id_to_seg[curr.parent]
            # Non-title headings are clamped to level 2 at minimum ("##").
            anns.update(
                _annotate_each_line(_Annotation(ann_start, ann_end, kind="heading", level=max(2, ann_level)), text)
            )

        has_heading.add((seg.span.start, seg.span.end))

    # Add any remaining headings which come after the last segment.
    for heading in headings:
        anns.add(_Annotation(heading.start, heading.end, kind="heading", level=2))

    # All headings are annotated; now gather annotations for the optional features.
    # Maps annotation kind -> (source annotations on the Document, enabled flag).
    optional_annotators = {
        "xref": (doc.crossreferences, link_xrefs),
        "junk": (doc.junk, strike_junk),
        "quote": (doc.quotes, block_quotes),
        "ext_ref": (doc.external_documents, italicize_refs),
        "terms": (doc.terms, italicize_terms),
    }
    for kind, (annotators, asked_to_implement) in optional_annotators.items():
        if not asked_to_implement:
            continue

        for ann in annotators:
            match kind:
                case "xref":
                    start_id = ann.start  # the referenced segment's id
                    # Annotate the cross-reference text itself (indicated by ann.span).
                    anns.update(
                        _annotate_each_line(
                            _Annotation(ann.span.start, ann.span.end, kind=kind, start_id=start_id), text
                        )
                    )

                    # Also annotate the referenced segment itself, so the link has an anchor target.
                    start_seg_span = id_to_seg[start_id].span
                    anns.add(_Annotation(start_seg_span.start, start_seg_span.end, kind="src_ref", start_id=start_id))

                case "junk":
                    anns.update(_annotate_each_line(_Annotation(ann.start, ann.end, kind=kind), text))

                case "quote":
                    if ann.span.start > 0 and text[ann.span.start - 1] != "\n":
                        # Only annotate block quotes; they must be preceded by a '\n' character.
                        continue
                    anns.update(
                        _annotate_each_line(
                            _Annotation(ann.span.start, ann.span.end, kind=kind), text, add_newlines=True
                        )
                    )

                case "ext_ref":
                    # Each external reference carries an array of mentions to annotate.
                    for mention in ann.mentions:
                        anns.update(_annotate_each_line(_Annotation(mention.start, mention.end, kind=kind), text))

                case "terms":
                    anns.update(_annotate_each_line(_Annotation(ann.name.start, ann.name.end, kind=kind), text))

    # ext_ref and terms both use italics; merge same-span duplicates to avoid
    # doubled markers. (`anns` is an iterable of annotations from here on.)
    anns = _merge_annotations(list(anns), kinds={"ext_ref", "terms"})

    # Expand each annotation into start/end boundary events for the sweep.
    events: list[_Event] = []
    for ann in anns:
        events.append(_Event(ann.start, "start", ann))
        # Some annotation kinds emit no closing markup, so they need no end event.
        if ann.kind != "src_ref":
            events.append(_Event(ann.end, "end", ann))

    # Higher value = opens earlier / closes later when events share a position,
    # keeping the markup properly nested.
    # NOTE(review): "subtitle" is not a member of `_AnnotationKind`; presumably a
    # leftover from an earlier revision — confirm whether it can ever occur.
    kind_priority = {
        "heading": 6,
        "quote": 5,
        "ext_ref": 4,
        "terms": 4,
        "junk": 3,
        "xref": 2,
        "subtitle": 1,
        "src_ref": 0,
    }
    zero_length_annotations = {"src_ref"}

    def event_sort_key(e: _Event):
        """Determines behaviour if two events occur at the same index."""
        kind, start, end = e.annotation.kind, e.annotation.start, e.annotation.end
        if e.time == "start":
            start_first = 1
            kind_order = -kind_priority[kind]
            length_order = -(end - start) if kind not in zero_length_annotations else 1

        else:
            start_first = 0
            kind_order = kind_priority[kind]
            length_order = end - start if kind not in zero_length_annotations else -1

        return (e.position, start_first, length_order, kind_order)

    events.sort(key=event_sort_key)
    events = _filter_events(events)

    # ===== Process events: sweep the text, splicing in tags at each boundary =====
    md: list[str] = []  # Output markdown fragments
    curr_idx = 0  # Index into `text` of the first character not yet emitted
    for pos, t, ann in events:
        kind = ann.kind
        if curr_idx != pos:
            md.append(text[curr_idx:pos])

        if t == "start":
            md.append(ann.tags[0])

        else:
            # Closing tags are attached to the last non-whitespace fragment so
            # markers like `~~` do not end up after a newline.
            _safe_append_tag(md, ann.tags[1])
            if ann.force_blank_line:
                md.append("\n\n")

        curr_idx = pos

    md.append(text[curr_idx:])
    raw = "".join(md)

    # Post-processing: every heading line must be followed by a blank line.
    newlines_added = (f"{line}\n" if line.startswith("#") else line for line in raw.splitlines(True))

    # Ensure every line in the output is surrounded by exactly one blank line
    # before and after, except for quotations. Additionally, preserve
    # indentation by converting leading whitespace to HTML space entities.
    prev_is_blank = False
    blank_removed: list[str] = []
    for line in "".join(newlines_added).splitlines():
        if prev_is_blank and not line.strip():
            # Second blank line in a row; drop it.
            continue
        prev_is_blank = not line.strip()

        # Prevent Markdown list rendering by prefixing a zero-width space.
        if _is_list_line(line) and line.lstrip() == line and escape_lists:
            line = f"​{line}"

        # Convert leading tabs/whitespace to HTML indentation.
        # NOTE(review): the replacement strings below appear to use Unicode
        # space characters (presumably em/en/non-breaking spaces, or mangled
        # &emsp;/&ensp;/&nbsp; entities) — confirm they survived encoding.
        line = line.expandtabs(4)
        line = re.sub(r"^(?:\s{4})+", lambda m: " " * (len(m.group(0)) // 4), line)
        line = re.sub(r"^((?: )*)\s{2}", r"\1 ", line)
        line = re.sub(r"^((?: | )*)\s", r"\1 ", line)

        # Non-quote lines are each followed by a blank line (quote lines stay
        # contiguous so the block quote is not broken up).
        if not line.startswith("> "):
            line = line.rstrip("\n") + "\n"
            prev_is_blank = True

        blank_removed.append(line + "\n" if line.strip() else line)

    return "".join(blank_removed).strip()
|