PyPI - inscriptis - Versions diffs - 2.5.3__tar.gz → 2.6.0__tar.gz - Mend

inscriptis 2.5.3__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{inscriptis-2.5.3 → inscriptis-2.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: inscriptis
-Version: 2.5.3
+Version: 2.6.0
 Summary: inscriptis - HTML to text converter.
 Home-page: https://github.com/weblyzard/inscriptis
 License: Apache-2.0
@@ -21,10 +21,10 @@ Classifier: Topic :: Text Processing
 Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Utilities
 Provides-Extra: web-service
-Requires-Dist: fastapi (>=0.109.1,<0.110.0) ; extra == "web-service"
+Requires-Dist: fastapi (>=0.115.11,<0.116.0) ; extra == "web-service"
 Requires-Dist: lxml (>=4.9.3)
 Requires-Dist: requests (>=2.32.2)
-Requires-Dist: uvicorn (>=0.27.1,<0.28.0) ; extra == "web-service"
+Requires-Dist: uvicorn (>=0.34.0,<0.35.0) ; extra == "web-service"
 Project-URL: Documentation, https://inscriptis.readthedocs.io/en
 Project-URL: Repository, https://github.com/weblyzard/inscriptis
 Description-Content-Type: text/x-rst
@@ -267,7 +267,7 @@ that are suitable for your particular application. Post processors can be
 specified with the ``-p`` or ``--postprocessor`` command line argument::
   $ inscript https://www.fhgr.ch \
-          -r ./annotation/examples/annotation-profile.json \
+          -r ./examples/annotation/annotation-profile.json \
           -p surface
@@ -296,10 +296,12 @@ Currently, inscriptis supports the following postprocessors:
 - xml: returns an additional annotated text version::
     <?xml version="1.0" encoding="UTF-8" ?>
+    <content>
     <heading>Chur</heading>
     <emphasis>Chur</emphasis> is the capital and largest town of the Swiss
     canton of the Grisons and lies in the Grisonian Rhine Valley.
+    </content>
 - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
@@ -313,7 +315,7 @@ Currently, inscriptis supports the following postprocessors:
       inscript --annotation-rules ./wikipedia.json \
                   --postprocessor html \
-                  https://en.wikipedia.org/wiki/Chur.html
+                  https://en.wikipedia.org/wiki/Chur
    Annotation rules encoded in the ``wikipedia.json`` file:

{inscriptis-2.5.3 → inscriptis-2.6.0}/README.rst RENAMED Viewed

@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
 specified with the ``-p`` or ``--postprocessor`` command line argument::
   $ inscript https://www.fhgr.ch \
-          -r ./annotation/examples/annotation-profile.json \
+          -r ./examples/annotation/annotation-profile.json \
           -p surface
@@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
 - xml: returns an additional annotated text version::
     <?xml version="1.0" encoding="UTF-8" ?>
+    <content>
     <heading>Chur</heading>
     <emphasis>Chur</emphasis> is the capital and largest town of the Swiss
     canton of the Grisons and lies in the Grisonian Rhine Valley.
+    </content>
 - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
@@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:
       inscript --annotation-rules ./wikipedia.json \
                   --postprocessor html \
-                  https://en.wikipedia.org/wiki/Chur.html
+                  https://en.wikipedia.org/wiki/Chur
    Annotation rules encoded in the ``wikipedia.json`` file:

{inscriptis-2.5.3 → inscriptis-2.6.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "inscriptis"
-version = "2.5.3"
+version = "2.6.0"
 authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
 description = "inscriptis - HTML to text converter."
 keywords = ["HTML", "converter", "text"]
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
 lxml = ">=4.9.3"
 # optional dependencies
-fastapi = { version = "^0.109.1", optional = true }
-uvicorn = { version = "^0.27.1", optional = true }
+fastapi = { version = "^0.115.11", optional = true }
+uvicorn = { version = "^0.34.0", optional = true }
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.5"
 [build-system]

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/__init__.py RENAMED Viewed

@@ -10,9 +10,9 @@ by overwrite the class's :meth:`AnnotationProcessor.__call__` method.
     2. The overwritten :meth:`__call__` method may either extend the original
        dictionary which contains the extracted text and annotations (e.g.,
        :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
-       may replace it with an custom output (e.g.,
+       may replace it with a custom output (e.g.,
        :class:`~inscriptis.annotation.output.html.HtmlExtractor` and
-       :class:`~inscriptis.annotation.output.xml.XmlExtractor`.
+       :class:`~inscriptis.annotation.output.xml.XmlExtractor`).
 Currently, Inscriptis supports the following built-in AnnotationProcessors:
@@ -25,6 +25,7 @@ Currently, Inscriptis supports the following built-in AnnotationProcessors:
     of the extracted annotations.
 """
 from typing import Dict, Any

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/html.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """HTML Annotation Processor."""
 from collections import defaultdict
 from itertools import cycle
 from typing import Dict, Any, List
@@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
     verbatim = True
     def __call__(self, annotated_text: Dict[str, Any]) -> str:
-        tag_indices = defaultdict(list)
+        tag_dict = defaultdict(list)
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
+        for start, end, label in reversed(annotated_text["label"]):
+            tag_dict[start].append(
+                f'<span class="{label}-label">{label}</span><span class="{label}">'
+            )
+            tag_dict[end].insert(0, "</span>")
-        open_tags = []
         tagged_content = [
             "<html><head><style>",
             self._get_css(annotated_text["label"]),
             "</style></head><body><pre>",
         ]
-        for idx, ch in enumerate(annotated_text["text"]):
-            if idx in tag_indices:
-                tags = tag_indices[idx]
-                # close tags:
-                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
-                    open_tags.pop()
-                    tagged_content.append("</span>")
-                # open tags
-                for tag in (
-                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
-                ):
-                    open_tags.append(tag)
-                    tagged_content.append(
-                        '<span class="{tag}-label">{tag}</span>'
-                        '<span class="{tag}">'.format(tag=tag)
-                    )
-            if ch == "\n":
-                tagged_content.extend(["</span>" for _ in open_tags])
-                tagged_content.append("</pre>\n<pre>")
-                tagged_content.extend(
-                    ['<span class="{tag}">'.format(tag=tag) for tag in open_tags]
-                )
-            else:
-                tagged_content.append(ch)
+        text = annotated_text["text"]
+        current_idx = 0
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
+            current_idx = idx
+            tagged_content.extend(tags)
+        tagged_content.append(text[current_idx:].replace("\n", "</pre>\n<pre>"))
         return "".join(tagged_content) + "</pre></body></html>"
     @staticmethod

inscriptis-2.6.0/src/inscriptis/annotation/output/xml.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""XML Annotation processor."""
+from collections import defaultdict
+from typing import Dict, Any
+from inscriptis.annotation.output import AnnotationProcessor
+class XmlExtractor(AnnotationProcessor):
+    """Provide the converted text with XML-style annotations."""
+    verbatim = True
+    def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
+        tag_dict = defaultdict(list)
+        for start, end, tag in reversed(annotated_text["label"]):
+            tag_dict[start].append(f"<{tag}>")
+            tag_dict[end].insert(0, f"</{tag}>")
+        current_idx = 0
+        text = annotated_text["text"]
+        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx])
+            current_idx = idx
+            tagged_content.extend(tags)
+        tagged_content.append(text[current_idx:])
+        tagged_content.append("\n</content>")
+        return "".join(tagged_content)

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/html_engine.py RENAMED Viewed

@@ -51,7 +51,9 @@ class Inscriptis:
       text = parser.get_text()
     """
-    def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
+    def __init__(
+        self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
+    ) -> None:
         # use the default configuration, if no config object is provided
         config = config or ParserConfig()

inscriptis-2.5.3/src/inscriptis/annotation/output/xml.py DELETED Viewed

@@ -1,49 +0,0 @@
-"""XML Annotation processor."""
-from collections import defaultdict
-from typing import Dict, Any
-from inscriptis.annotation.output import AnnotationProcessor
-class XmlExtractor(AnnotationProcessor):
-    """Provide the converted text with XML-style annotations."""
-    verbatim = True
-    def __call__(self, annotated_text: Dict[str, Any]) -> str:
-        """Provide an XML version of the given text and annotations.
-        Args:
-            annotated_text: a dictionary containing the plain text and the
-                            extracted annotations.
-        Returns:
-            A string with the XML-version of the content.
-        """
-        tag_indices = defaultdict(list)
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
-        current_idx = 0
-        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
-        text = annotated_text["text"]
-        for index, tags in sorted(tag_indices.items()):
-            tagged_content.append(text[current_idx:index])
-            # close tags
-            tagged_content.extend(
-                [
-                    "<" + tag + ">"
-                    for tag in sorted(tags, reverse=True)
-                    if tag.startswith("/")
-                ]
-            )
-            # open tags
-            tagged_content.extend(
-                ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
-            )
-            current_idx = index
-        tagged_content.append(text[current_idx:])
-        return "".join(tagged_content)

{inscriptis-2.5.3 → inscriptis-2.6.0}/AUTHORS RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/LICENSE RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/surface.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/parser.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/cli/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/cli/inscript.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/css_profiles.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/html_properties.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/metadata.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/attribute.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/block.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/prefix.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/config.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/css.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/html_document_state.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/html_element.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/table.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/a_tag.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/br_tag.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/img_tag.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/list_tag.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/table_tag.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/service/__init__.py RENAMED Viewed

File without changes

{inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/service/web.py RENAMED Viewed

File without changes

inscriptis 2.5.3__tar.gz → 2.6.0__tar.gz

inscriptis 2.5.3__tar.gz → 2.6.0__tar.gz