inscriptis 2.5.3__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {inscriptis-2.5.3 → inscriptis-2.6.0}/PKG-INFO +7 -5
  2. {inscriptis-2.5.3 → inscriptis-2.6.0}/README.rst +4 -2
  3. {inscriptis-2.5.3 → inscriptis-2.6.0}/pyproject.toml +6 -3
  4. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/__init__.py +3 -2
  5. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/html.py +14 -30
  6. inscriptis-2.6.0/src/inscriptis/annotation/output/xml.py +30 -0
  7. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/html_engine.py +3 -1
  8. inscriptis-2.5.3/src/inscriptis/annotation/output/xml.py +0 -49
  9. {inscriptis-2.5.3 → inscriptis-2.6.0}/AUTHORS +0 -0
  10. {inscriptis-2.5.3 → inscriptis-2.6.0}/LICENSE +0 -0
  11. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/__init__.py +0 -0
  12. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/__init__.py +0 -0
  13. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/output/surface.py +0 -0
  14. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/annotation/parser.py +0 -0
  15. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/cli/__init__.py +0 -0
  16. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/cli/inscript.py +0 -0
  17. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/css_profiles.py +0 -0
  18. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/html_properties.py +0 -0
  19. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/metadata.py +0 -0
  20. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/__init__.py +0 -0
  21. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/attribute.py +0 -0
  22. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/__init__.py +0 -0
  23. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/block.py +0 -0
  24. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/canvas/prefix.py +0 -0
  25. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/config.py +0 -0
  26. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/css.py +0 -0
  27. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/html_document_state.py +0 -0
  28. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/html_element.py +0 -0
  29. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/table.py +0 -0
  30. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/__init__.py +0 -0
  31. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/a_tag.py +0 -0
  32. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/br_tag.py +0 -0
  33. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/img_tag.py +0 -0
  34. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/list_tag.py +0 -0
  35. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/model/tag/table_tag.py +0 -0
  36. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/service/__init__.py +0 -0
  37. {inscriptis-2.5.3 → inscriptis-2.6.0}/src/inscriptis/service/web.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: inscriptis
3
- Version: 2.5.3
3
+ Version: 2.6.0
4
4
  Summary: inscriptis - HTML to text converter.
5
5
  Home-page: https://github.com/weblyzard/inscriptis
6
6
  License: Apache-2.0
@@ -21,10 +21,10 @@ Classifier: Topic :: Text Processing
21
21
  Classifier: Topic :: Text Processing :: Markup :: HTML
22
22
  Classifier: Topic :: Utilities
23
23
  Provides-Extra: web-service
24
- Requires-Dist: fastapi (>=0.109.1,<0.110.0) ; extra == "web-service"
24
+ Requires-Dist: fastapi (>=0.115.11,<0.116.0) ; extra == "web-service"
25
25
  Requires-Dist: lxml (>=4.9.3)
26
26
  Requires-Dist: requests (>=2.32.2)
27
- Requires-Dist: uvicorn (>=0.27.1,<0.28.0) ; extra == "web-service"
27
+ Requires-Dist: uvicorn (>=0.34.0,<0.35.0) ; extra == "web-service"
28
28
  Project-URL: Documentation, https://inscriptis.readthedocs.io/en
29
29
  Project-URL: Repository, https://github.com/weblyzard/inscriptis
30
30
  Description-Content-Type: text/x-rst
@@ -267,7 +267,7 @@ that are suitable for your particular application. Post processors can be
267
267
  specified with the ``-p`` or ``--postprocessor`` command line argument::
268
268
 
269
269
  $ inscript https://www.fhgr.ch \
270
- -r ./annotation/examples/annotation-profile.json \
270
+ -r ./examples/annotation/annotation-profile.json \
271
271
  -p surface
272
272
 
273
273
 
@@ -296,10 +296,12 @@ Currently, inscriptis supports the following postprocessors:
296
296
  - xml: returns an additional annotated text version::
297
297
 
298
298
  <?xml version="1.0" encoding="UTF-8" ?>
299
+ <content>
299
300
  <heading>Chur</heading>
300
301
 
301
302
  <emphasis>Chur</emphasis> is the capital and largest town of the Swiss
302
303
  canton of the Grisons and lies in the Grisonian Rhine Valley.
304
+ </content>
303
305
 
304
306
  - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
305
307
 
@@ -313,7 +315,7 @@ Currently, inscriptis supports the following postprocessors:
313
315
 
314
316
  inscript --annotation-rules ./wikipedia.json \
315
317
  --postprocessor html \
316
- https://en.wikipedia.org/wiki/Chur.html
318
+ https://en.wikipedia.org/wiki/Chur
317
319
 
318
320
  Annotation rules encoded in the ``wikipedia.json`` file:
319
321
 
@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
236
236
  specified with the ``-p`` or ``--postprocessor`` command line argument::
237
237
 
238
238
  $ inscript https://www.fhgr.ch \
239
- -r ./annotation/examples/annotation-profile.json \
239
+ -r ./examples/annotation/annotation-profile.json \
240
240
  -p surface
241
241
 
242
242
 
@@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
265
265
  - xml: returns an additional annotated text version::
266
266
 
267
267
  <?xml version="1.0" encoding="UTF-8" ?>
268
+ <content>
268
269
  <heading>Chur</heading>
269
270
 
270
271
  <emphasis>Chur</emphasis> is the capital and largest town of the Swiss
271
272
  canton of the Grisons and lies in the Grisonian Rhine Valley.
273
+ </content>
272
274
 
273
275
  - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
274
276
 
@@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:
282
284
 
283
285
  inscript --annotation-rules ./wikipedia.json \
284
286
  --postprocessor html \
285
- https://en.wikipedia.org/wiki/Chur.html
287
+ https://en.wikipedia.org/wiki/Chur
286
288
 
287
289
  Annotation rules encoded in the ``wikipedia.json`` file:
288
290
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "inscriptis"
3
- version = "2.5.3"
3
+ version = "2.6.0"
4
4
  authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
5
5
  description = "inscriptis - HTML to text converter."
6
6
  keywords = ["HTML", "converter", "text"]
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
44
44
  lxml = ">=4.9.3"
45
45
 
46
46
  # optional dependencies
47
- fastapi = { version = "^0.109.1", optional = true }
48
- uvicorn = { version = "^0.27.1", optional = true }
47
+ fastapi = { version = "^0.115.11", optional = true }
48
+ uvicorn = { version = "^0.34.0", optional = true }
49
+
50
+ [tool.poetry.group.dev.dependencies]
51
+ pytest = "^8.3.5"
49
52
 
50
53
 
51
54
  [build-system]
@@ -10,9 +10,9 @@ by overwrite the class's :meth:`AnnotationProcessor.__call__` method.
10
10
  2. The overwritten :meth:`__call__` method may either extend the original
11
11
  dictionary which contains the extracted text and annotations (e.g.,
12
12
  :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
13
- may replace it with an custom output (e.g.,
13
+ may replace it with a custom output (e.g.,
14
14
  :class:`~inscriptis.annotation.output.html.HtmlExtractor` and
15
- :class:`~inscriptis.annotation.output.xml.XmlExtractor`.
15
+ :class:`~inscriptis.annotation.output.xml.XmlExtractor`).
16
16
 
17
17
  Currently, Inscriptis supports the following built-in AnnotationProcessors:
18
18
 
@@ -25,6 +25,7 @@ Currently, Inscriptis supports the following built-in AnnotationProcessors:
25
25
  of the extracted annotations.
26
26
 
27
27
  """
28
+
28
29
  from typing import Dict, Any
29
30
 
30
31
 
@@ -1,4 +1,5 @@
1
1
  """HTML Annotation Processor."""
2
+
2
3
  from collections import defaultdict
3
4
  from itertools import cycle
4
5
  from typing import Dict, Any, List
@@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
18
19
  verbatim = True
19
20
 
20
21
  def __call__(self, annotated_text: Dict[str, Any]) -> str:
21
- tag_indices = defaultdict(list)
22
+ tag_dict = defaultdict(list)
22
23
 
23
- for start, end, label in sorted(annotated_text["label"]):
24
- tag_indices[start].append(label)
25
- tag_indices[end].append("/" + label)
24
+ for start, end, label in reversed(annotated_text["label"]):
25
+ tag_dict[start].append(
26
+ f'<span class="{label}-label">{label}</span><span class="{label}">'
27
+ )
28
+ tag_dict[end].insert(0, "</span>")
26
29
 
27
- open_tags = []
28
30
  tagged_content = [
29
31
  "<html><head><style>",
30
32
  self._get_css(annotated_text["label"]),
31
33
  "</style></head><body><pre>",
32
34
  ]
33
- for idx, ch in enumerate(annotated_text["text"]):
34
- if idx in tag_indices:
35
- tags = tag_indices[idx]
36
- # close tags:
37
- for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
38
- open_tags.pop()
39
- tagged_content.append("</span>")
40
- # open tags
41
- for tag in (
42
- t for t in sorted(tags, reverse=True) if not t.startswith("/")
43
- ):
44
- open_tags.append(tag)
45
- tagged_content.append(
46
- '<span class="{tag}-label">{tag}</span>'
47
- '<span class="{tag}">'.format(tag=tag)
48
- )
49
-
50
- if ch == "\n":
51
- tagged_content.extend(["</span>" for _ in open_tags])
52
- tagged_content.append("</pre>\n<pre>")
53
- tagged_content.extend(
54
- ['<span class="{tag}">'.format(tag=tag) for tag in open_tags]
55
- )
56
- else:
57
- tagged_content.append(ch)
58
35
 
36
+ text = annotated_text["text"]
37
+ current_idx = 0
38
+ for idx, tags in sorted(tag_dict.items()):
39
+ tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
40
+ current_idx = idx
41
+ tagged_content.extend(tags)
42
+ tagged_content.append(text[current_idx:].replace("\n", "</pre>\n</pre>"))
59
43
  return "".join(tagged_content) + "</pre></body></html>"
60
44
 
61
45
  @staticmethod
@@ -0,0 +1,30 @@
1
+ """XML Annotation processor."""
2
+
3
+ from collections import defaultdict
4
+ from typing import Dict, Any
5
+
6
+ from inscriptis.annotation.output import AnnotationProcessor
7
+
8
+
9
+ class XmlExtractor(AnnotationProcessor):
10
+ """Provide the converted text with XML-style annotations."""
11
+
12
+ verbatim = True
13
+
14
+ def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
15
+ tag_dict = defaultdict(list)
16
+ for start, end, tag in reversed(annotated_text["label"]):
17
+ tag_dict[start].append(f"<{tag}>")
18
+ tag_dict[end].insert(0, f"</{tag}>")
19
+
20
+ current_idx = 0
21
+ text = annotated_text["text"]
22
+ tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
23
+ for idx, tags in sorted(tag_dict.items()):
24
+ tagged_content.append(text[current_idx:idx])
25
+ current_idx = idx
26
+ tagged_content.extend(tags)
27
+
28
+ tagged_content.append(text[current_idx:])
29
+ tagged_content.append("\n</content>")
30
+ return "".join(tagged_content)
@@ -51,7 +51,9 @@ class Inscriptis:
51
51
  text = parser.get_text()
52
52
  """
53
53
 
54
- def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
54
+ def __init__(
55
+ self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
56
+ ) -> None:
55
57
  # use the default configuration, if no config object is provided
56
58
  config = config or ParserConfig()
57
59
 
@@ -1,49 +0,0 @@
1
- """XML Annotation processor."""
2
- from collections import defaultdict
3
- from typing import Dict, Any
4
-
5
- from inscriptis.annotation.output import AnnotationProcessor
6
-
7
-
8
- class XmlExtractor(AnnotationProcessor):
9
- """Provide the converted text with XML-style annotations."""
10
-
11
- verbatim = True
12
-
13
- def __call__(self, annotated_text: Dict[str, Any]) -> str:
14
- """Provide an XML version of the given text and annotations.
15
-
16
- Args:
17
- annotated_text: a dictionary containing the plain text and the
18
- extracted annotations.
19
-
20
- Returns:
21
- A string with the XML-version of the content.
22
- """
23
- tag_indices = defaultdict(list)
24
-
25
- for start, end, label in sorted(annotated_text["label"]):
26
- tag_indices[start].append(label)
27
- tag_indices[end].append("/" + label)
28
-
29
- current_idx = 0
30
- tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
31
- text = annotated_text["text"]
32
- for index, tags in sorted(tag_indices.items()):
33
- tagged_content.append(text[current_idx:index])
34
- # close tags
35
- tagged_content.extend(
36
- [
37
- "<" + tag + ">"
38
- for tag in sorted(tags, reverse=True)
39
- if tag.startswith("/")
40
- ]
41
- )
42
- # open tags
43
- tagged_content.extend(
44
- ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
45
- )
46
- current_idx = index
47
- tagged_content.append(text[current_idx:])
48
-
49
- return "".join(tagged_content)
File without changes
File without changes