inscriptis 2.5.3__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {inscriptis-2.5.3 → inscriptis-2.6.0}/PKG-INFO +7 -5
- {inscriptis-2.5.3 → inscriptis-2.6.0}/README.rst +4 -2
- {inscriptis-2.5.3 → inscriptis-2.6.0}/pyproject.toml +6 -3
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/__init__.py +3 -2
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/html.py +14 -30
- inscriptis-2.6.0/src/inscriptis/annotation/output/xml.py +30 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/html_engine.py +3 -1
- inscriptis-2.5.3/src/inscriptis/annotation/output/xml.py +0 -49
- {inscriptis-2.5.3 → inscriptis-2.6.0}/AUTHORS +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/LICENSE +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/surface.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/parser.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/cli/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/cli/inscript.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/css_profiles.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/html_properties.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/metadata.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/attribute.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/block.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/prefix.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/config.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/css.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/html_document_state.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/html_element.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/table.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/a_tag.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/br_tag.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/img_tag.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/list_tag.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/table_tag.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/service/__init__.py +0 -0
- {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/service/web.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: inscriptis
|
|
3
|
-
Version: 2.5.3
|
|
3
|
+
Version: 2.6.0
|
|
4
4
|
Summary: inscriptis - HTML to text converter.
|
|
5
5
|
Home-page: https://github.com/weblyzard/inscriptis
|
|
6
6
|
License: Apache-2.0
|
|
@@ -21,10 +21,10 @@ Classifier: Topic :: Text Processing
|
|
|
21
21
|
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
22
22
|
Classifier: Topic :: Utilities
|
|
23
23
|
Provides-Extra: web-service
|
|
24
|
-
Requires-Dist: fastapi (>=0.
|
|
24
|
+
Requires-Dist: fastapi (>=0.115.11,<0.116.0) ; extra == "web-service"
|
|
25
25
|
Requires-Dist: lxml (>=4.9.3)
|
|
26
26
|
Requires-Dist: requests (>=2.32.2)
|
|
27
|
-
Requires-Dist: uvicorn (>=0.
|
|
27
|
+
Requires-Dist: uvicorn (>=0.34.0,<0.35.0) ; extra == "web-service"
|
|
28
28
|
Project-URL: Documentation, https://inscriptis.readthedocs.io/en
|
|
29
29
|
Project-URL: Repository, https://github.com/weblyzard/inscriptis
|
|
30
30
|
Description-Content-Type: text/x-rst
|
|
@@ -267,7 +267,7 @@ that are suitable for your particular application. Post processors can be
|
|
|
267
267
|
specified with the ``-p`` or ``--postprocessor`` command line argument::
|
|
268
268
|
|
|
269
269
|
$ inscript https://www.fhgr.ch \
|
|
270
|
-
-r ./
|
|
270
|
+
-r ./examples/annotation/annotation-profile.json \
|
|
271
271
|
-p surface
|
|
272
272
|
|
|
273
273
|
|
|
@@ -296,10 +296,12 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
296
296
|
- xml: returns an additional annotated text version::
|
|
297
297
|
|
|
298
298
|
<?xml version="1.0" encoding="UTF-8" ?>
|
|
299
|
+
<content>
|
|
299
300
|
<heading>Chur</heading>
|
|
300
301
|
|
|
301
302
|
<emphasis>Chur</emphasis> is the capital and largest town of the Swiss
|
|
302
303
|
canton of the Grisons and lies in the Grisonian Rhine Valley.
|
|
304
|
+
</content>
|
|
303
305
|
|
|
304
306
|
- html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
|
|
305
307
|
|
|
@@ -313,7 +315,7 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
313
315
|
|
|
314
316
|
inscript --annotation-rules ./wikipedia.json \
|
|
315
317
|
--postprocessor html \
|
|
316
|
-
https://en.wikipedia.org/wiki/Chur
|
|
318
|
+
https://en.wikipedia.org/wiki/Chur
|
|
317
319
|
|
|
318
320
|
Annotation rules encoded in the ``wikipedia.json`` file:
|
|
319
321
|
|
|
@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
|
|
|
236
236
|
specified with the ``-p`` or ``--postprocessor`` command line argument::
|
|
237
237
|
|
|
238
238
|
$ inscript https://www.fhgr.ch \
|
|
239
|
-
-r ./
|
|
239
|
+
-r ./examples/annotation/annotation-profile.json \
|
|
240
240
|
-p surface
|
|
241
241
|
|
|
242
242
|
|
|
@@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
265
265
|
- xml: returns an additional annotated text version::
|
|
266
266
|
|
|
267
267
|
<?xml version="1.0" encoding="UTF-8" ?>
|
|
268
|
+
<content>
|
|
268
269
|
<heading>Chur</heading>
|
|
269
270
|
|
|
270
271
|
<emphasis>Chur</emphasis> is the capital and largest town of the Swiss
|
|
271
272
|
canton of the Grisons and lies in the Grisonian Rhine Valley.
|
|
273
|
+
</content>
|
|
272
274
|
|
|
273
275
|
- html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
|
|
274
276
|
|
|
@@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
282
284
|
|
|
283
285
|
inscript --annotation-rules ./wikipedia.json \
|
|
284
286
|
--postprocessor html \
|
|
285
|
-
https://en.wikipedia.org/wiki/Chur
|
|
287
|
+
https://en.wikipedia.org/wiki/Chur
|
|
286
288
|
|
|
287
289
|
Annotation rules encoded in the ``wikipedia.json`` file:
|
|
288
290
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "inscriptis"
|
|
3
|
-
version = "2.5.3"
|
|
3
|
+
version = "2.6.0"
|
|
4
4
|
authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
|
|
5
5
|
description = "inscriptis - HTML to text converter."
|
|
6
6
|
keywords = ["HTML", "converter", "text"]
|
|
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
|
|
|
44
44
|
lxml = ">=4.9.3"
|
|
45
45
|
|
|
46
46
|
# optional dependencies
|
|
47
|
-
fastapi = { version = "^0.
|
|
48
|
-
uvicorn = { version = "^0.
|
|
47
|
+
fastapi = { version = "^0.115.11", optional = true }
|
|
48
|
+
uvicorn = { version = "^0.34.0", optional = true }
|
|
49
|
+
|
|
50
|
+
[tool.poetry.group.dev.dependencies]
|
|
51
|
+
pytest = "^8.3.5"
|
|
49
52
|
|
|
50
53
|
|
|
51
54
|
[build-system]
|
|
@@ -10,9 +10,9 @@ by overwrite the class's :meth:`AnnotationProcessor.__call__` method.
|
|
|
10
10
|
2. The overwritten :meth:`__call__` method may either extend the original
|
|
11
11
|
dictionary which contains the extracted text and annotations (e.g.,
|
|
12
12
|
:class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
|
|
13
|
-
may replace it with
|
|
13
|
+
may replace it with a custom output (e.g.,
|
|
14
14
|
:class:`~inscriptis.annotation.output.html.HtmlExtractor` and
|
|
15
|
-
:class:`~inscriptis.annotation.output.xml.XmlExtractor
|
|
15
|
+
:class:`~inscriptis.annotation.output.xml.XmlExtractor`).
|
|
16
16
|
|
|
17
17
|
Currently, Inscriptis supports the following built-in AnnotationProcessors:
|
|
18
18
|
|
|
@@ -25,6 +25,7 @@ Currently, Inscriptis supports the following built-in AnnotationProcessors:
|
|
|
25
25
|
of the extracted annotations.
|
|
26
26
|
|
|
27
27
|
"""
|
|
28
|
+
|
|
28
29
|
from typing import Dict, Any
|
|
29
30
|
|
|
30
31
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""HTML Annotation Processor."""
|
|
2
|
+
|
|
2
3
|
from collections import defaultdict
|
|
3
4
|
from itertools import cycle
|
|
4
5
|
from typing import Dict, Any, List
|
|
@@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
|
|
|
18
19
|
verbatim = True
|
|
19
20
|
|
|
20
21
|
def __call__(self, annotated_text: Dict[str, Any]) -> str:
    """Provide an HTML version of the given text and annotations.

    Args:
        annotated_text: a dictionary containing the plain text (key
            ``text``) and the extracted annotations (key ``label``), the
            latter given as ``(start, end, label)`` triples.

    Returns:
        A string with the HTML page that shows the text and highlights
        the annotations.
    """
    # Maps a text index to the markup that has to be emitted at that index.
    tag_dict = defaultdict(list)

    for start, end, label in reversed(annotated_text["label"]):
        tag_dict[start].append(
            f'<span class="{label}-label">{label}</span><span class="{label}">'
        )
        # Closing tags go to the front so that, at a shared index, they are
        # written before any span that opens there.
        tag_dict[end].insert(0, "</span>")

    tagged_content = [
        "<html><head><style>",
        self._get_css(annotated_text["label"]),
        "</style></head><body><pre>",
    ]

    # Every line break ends the current <pre> block and starts a new one.
    # The same replacement must be used for the text between annotations
    # and for the remaining text after the last annotation; otherwise the
    # trailing lines would emit unbalanced </pre> tags.
    line_break = "</pre>\n<pre>"

    text = annotated_text["text"]
    current_idx = 0
    for idx, tags in sorted(tag_dict.items()):
        tagged_content.append(text[current_idx:idx].replace("\n", line_break))
        current_idx = idx
        tagged_content.extend(tags)
    tagged_content.append(text[current_idx:].replace("\n", line_break))
    return "".join(tagged_content) + "</pre></body></html>"
|
|
60
44
|
|
|
61
45
|
@staticmethod
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""XML Annotation processor."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Dict, Any
|
|
5
|
+
|
|
6
|
+
from inscriptis.annotation.output import AnnotationProcessor
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class XmlExtractor(AnnotationProcessor):
    """Provide the converted text with XML-style annotations."""

    verbatim = True

    def __call__(
        self, annotated_text: Dict[str, Any], root_element: str = "content"
    ) -> str:
        """Provide an XML version of the given text and annotations.

        Args:
            annotated_text: a dictionary containing the plain text (key
                ``text``) and the extracted annotations (key ``label``),
                the latter given as ``(start, end, tag)`` triples.
            root_element: name of the element that wraps the annotated
                text.

        Returns:
            A string with the XML version of the content.
        """
        # Maps a text index to the tags that have to be emitted at that index.
        tag_dict = defaultdict(list)
        for start, end, tag in reversed(annotated_text["label"]):
            tag_dict[start].append(f"<{tag}>")
            # Closing tags go to the front so that, at a shared index, they
            # are written before any element that opens there.
            tag_dict[end].insert(0, f"</{tag}>")

        current_idx = 0
        text = annotated_text["text"]
        tagged_content = [
            '<?xml version="1.0" encoding="UTF-8" ?>\n',
            f"<{root_element}>\n",
        ]
        for idx, tags in sorted(tag_dict.items()):
            # NOTE(review): the text is copied as-is, i.e. characters such
            # as "<" or "&" are not XML-escaped - confirm that this is the
            # intended behavior for the consumers of this output.
            tagged_content.append(text[current_idx:idx])
            current_idx = idx
            tagged_content.extend(tags)

        tagged_content.append(text[current_idx:])
        tagged_content.append(f"\n</{root_element}>")
        return "".join(tagged_content)
|
|
@@ -51,7 +51,9 @@ class Inscriptis:
|
|
|
51
51
|
text = parser.get_text()
|
|
52
52
|
"""
|
|
53
53
|
|
|
54
|
-
def __init__(
|
|
54
|
+
def __init__(
|
|
55
|
+
self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
|
|
56
|
+
) -> None:
|
|
55
57
|
# use the default configuration, if no config object is provided
|
|
56
58
|
config = config or ParserConfig()
|
|
57
59
|
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
"""XML Annotation processor."""
|
|
2
|
-
from collections import defaultdict
|
|
3
|
-
from typing import Dict, Any
|
|
4
|
-
|
|
5
|
-
from inscriptis.annotation.output import AnnotationProcessor
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class XmlExtractor(AnnotationProcessor):
|
|
9
|
-
"""Provide the converted text with XML-style annotations."""
|
|
10
|
-
|
|
11
|
-
verbatim = True
|
|
12
|
-
|
|
13
|
-
def __call__(self, annotated_text: Dict[str, Any]) -> str:
|
|
14
|
-
"""Provide an XML version of the given text and annotations.
|
|
15
|
-
|
|
16
|
-
Args:
|
|
17
|
-
annotated_text: a dictionary containing the plain text and the
|
|
18
|
-
extracted annotations.
|
|
19
|
-
|
|
20
|
-
Returns:
|
|
21
|
-
A string with the XML-version of the content.
|
|
22
|
-
"""
|
|
23
|
-
tag_indices = defaultdict(list)
|
|
24
|
-
|
|
25
|
-
for start, end, label in sorted(annotated_text["label"]):
|
|
26
|
-
tag_indices[start].append(label)
|
|
27
|
-
tag_indices[end].append("/" + label)
|
|
28
|
-
|
|
29
|
-
current_idx = 0
|
|
30
|
-
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
|
|
31
|
-
text = annotated_text["text"]
|
|
32
|
-
for index, tags in sorted(tag_indices.items()):
|
|
33
|
-
tagged_content.append(text[current_idx:index])
|
|
34
|
-
# close tags
|
|
35
|
-
tagged_content.extend(
|
|
36
|
-
[
|
|
37
|
-
"<" + tag + ">"
|
|
38
|
-
for tag in sorted(tags, reverse=True)
|
|
39
|
-
if tag.startswith("/")
|
|
40
|
-
]
|
|
41
|
-
)
|
|
42
|
-
# open tags
|
|
43
|
-
tagged_content.extend(
|
|
44
|
-
["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
|
|
45
|
-
)
|
|
46
|
-
current_idx = index
|
|
47
|
-
tagged_content.append(text[current_idx:])
|
|
48
|
-
|
|
49
|
-
return "".join(tagged_content)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|