inscriptis 2.4.0.1__tar.gz → 2.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. inscriptis-2.5.2/AUTHORS +5 -0
  2. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/PKG-INFO +23 -19
  3. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/README.rst +17 -13
  4. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/pyproject.toml +8 -8
  5. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/__init__.py +6 -5
  6. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/__init__.py +3 -3
  7. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/parser.py +2 -1
  8. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/cli/inscript.py +6 -7
  9. inscriptis-2.5.2/src/inscriptis/html_engine.py +132 -0
  10. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/html_properties.py +1 -1
  11. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/metadata.py +1 -1
  12. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/attribute.py +4 -8
  13. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/canvas/__init__.py +10 -10
  14. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/canvas/block.py +8 -1
  15. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/canvas/prefix.py +6 -6
  16. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/config.py +9 -3
  17. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/css.py +1 -0
  18. inscriptis-2.5.2/src/inscriptis/model/html_document_state.py +52 -0
  19. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/html_element.py +4 -4
  20. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/table.py +5 -5
  21. inscriptis-2.5.2/src/inscriptis/model/tag/__init__.py +20 -0
  22. inscriptis-2.5.2/src/inscriptis/model/tag/a_tag.py +22 -0
  23. inscriptis-2.5.2/src/inscriptis/model/tag/br_tag.py +9 -0
  24. inscriptis-2.5.2/src/inscriptis/model/tag/img_tag.py +14 -0
  25. inscriptis-2.5.2/src/inscriptis/model/tag/list_tag.py +44 -0
  26. inscriptis-2.5.2/src/inscriptis/model/tag/table_tag.py +68 -0
  27. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/service/web.py +4 -4
  28. inscriptis-2.4.0.1/src/inscriptis/html_engine.py +0 -265
  29. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/LICENSE +0 -0
  30. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/__init__.py +0 -0
  31. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/html.py +0 -0
  32. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/surface.py +0 -0
  33. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/xml.py +0 -0
  34. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/cli/__init__.py +0 -0
  35. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/css_profiles.py +1 -1
  36. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/__init__.py +0 -0
  37. {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/service/__init__.py +0 -0
@@ -0,0 +1,5 @@
1
+ Albert Weichselbraun <albert.weichselbraun@fhgr.ch>
2
+ Fabian Odoni <fabian.odoni@fhgr.ch>
3
+
4
+ The design of inscriptis has originally been inspired by SpiffWikiMarkup
5
+ developed by Samuel Abels <spam2, debain, org>.
@@ -1,30 +1,30 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: inscriptis
3
- Version: 2.4.0.1
3
+ Version: 2.5.2
4
4
  Summary: inscriptis - HTML to text converter.
5
5
  Home-page: https://github.com/weblyzard/inscriptis
6
6
  License: Apache-2.0
7
7
  Keywords: HTML,converter,text
8
8
  Author: Albert Weichselbraun
9
9
  Author-email: albert.weichselbraun@fhgr.ch
10
- Requires-Python: >=3.8,<4.0
10
+ Requires-Python: >=3.9,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: License :: OSI Approved :: Apache Software License
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.8
16
15
  Classifier: Programming Language :: Python :: 3.9
17
16
  Classifier: Programming Language :: Python :: 3.10
18
17
  Classifier: Programming Language :: Python :: 3.11
19
18
  Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
20
  Classifier: Topic :: Text Processing
21
21
  Classifier: Topic :: Text Processing :: Markup :: HTML
22
22
  Classifier: Topic :: Utilities
23
23
  Provides-Extra: web-service
24
- Requires-Dist: fastapi (>=0.109.0,<0.110.0) ; extra == "web-service"
24
+ Requires-Dist: fastapi (>=0.109.1,<0.110.0) ; extra == "web-service"
25
25
  Requires-Dist: lxml (>=4.9.3)
26
- Requires-Dist: requests (>=2.31.0)
27
- Requires-Dist: uvicorn (>=0.25.0,<0.26.0) ; extra == "web-service"
26
+ Requires-Dist: requests (==2.32.2)
27
+ Requires-Dist: uvicorn (>=0.27.1,<0.28.0) ; extra == "web-service"
28
28
  Project-URL: Documentation, https://inscriptis.readthedocs.io/en
29
29
  Project-URL: Repository, https://github.com/weblyzard/inscriptis
30
30
  Description-Content-Type: text/x-rst
@@ -129,12 +129,6 @@ Or, if you don't have pip installed::
129
129
 
130
130
  $ easy_install inscriptis
131
131
 
132
- If you want to install from the latest sources, you can do::
133
-
134
- $ git clone https://github.com/weblyzard/inscriptis.git
135
- $ cd inscriptis
136
- $ python setup.py install
137
-
138
132
 
139
133
  Python library
140
134
  ==============
@@ -222,7 +216,7 @@ HTML to annotated text conversion
222
216
  ---------------------------------
223
217
  convert and annotate HTML from a Web page using the provided annotation rules.
224
218
 
225
- Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
219
+ Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::
226
220
 
227
221
  $ inscript https://www.fhgr.ch -r annotation-profile.json
228
222
 
@@ -273,7 +267,7 @@ that are suitable for your particular application. Post processors can be
273
267
  specified with the ``-p`` or ``--postprocessor`` command line argument::
274
268
 
275
269
  $ inscript https://www.fhgr.ch \
276
- -r ./examples/annotation-profile.json \
270
+ -r ./examples/annotation/annotation-profile.json \
277
271
  -p surface
278
272
 
279
273
 
@@ -511,7 +505,8 @@ be used within a program:
511
505
  .. code-block:: python
512
506
 
513
507
  import urllib.request
514
- from inscriptis import get_annotated_text, ParserConfig
508
+ from inscriptis import get_annotated_text
509
+ from inscriptis.model.config import ParserConfig
515
510
 
516
511
  url = "https://www.fhgr.ch"
517
512
  html = urllib.request.urlopen(url).read().decode('utf-8')
@@ -570,15 +565,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
570
565
 
571
566
  .. code-block:: python
572
567
 
573
- inscriptis = Inscriptis(html, config)
568
+ from inscriptis import ParserConfig
569
+ from inscriptis.html_engine import Inscriptis
570
+ from inscriptis.model.tag import CustomHtmlTagHandlerMapping
574
571
 
575
- inscriptis.start_tag_handler_dict['a'] = my_handle_start_a
576
- inscriptis.end_tag_handler_dict['a'] = my_handle_end_a
572
+ my_mapping = CustomHtmlTagHandlerMapping(
573
+ start_tag_mapping={'a': my_handle_start_a},
574
+ end_tag_mapping={'a': my_handle_end_a}
575
+ )
576
+ inscriptis = Inscriptis(html_tree,
577
+ ParserConfig(custom_html_tag_handler_mapping=my_mapping))
577
578
  text = inscriptis.get_text()
578
579
 
579
580
 
580
581
  In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
581
- You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``.
582
+ You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
583
+
584
+ Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
585
+ The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
582
586
 
583
587
  Optimizing memory consumption
584
588
  -----------------------------
@@ -98,12 +98,6 @@ Or, if you don't have pip installed::
98
98
 
99
99
  $ easy_install inscriptis
100
100
 
101
- If you want to install from the latest sources, you can do::
102
-
103
- $ git clone https://github.com/weblyzard/inscriptis.git
104
- $ cd inscriptis
105
- $ python setup.py install
106
-
107
101
 
108
102
  Python library
109
103
  ==============
@@ -191,7 +185,7 @@ HTML to annotated text conversion
191
185
  ---------------------------------
192
186
  convert and annotate HTML from a Web page using the provided annotation rules.
193
187
 
194
- Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
188
+ Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::
195
189
 
196
190
  $ inscript https://www.fhgr.ch -r annotation-profile.json
197
191
 
@@ -242,7 +236,7 @@ that are suitable for your particular application. Post processors can be
242
236
  specified with the ``-p`` or ``--postprocessor`` command line argument::
243
237
 
244
238
  $ inscript https://www.fhgr.ch \
245
- -r ./examples/annotation-profile.json \
239
+ -r ./examples/annotation/annotation-profile.json \
246
240
  -p surface
247
241
 
248
242
 
@@ -480,7 +474,8 @@ be used within a program:
480
474
  .. code-block:: python
481
475
 
482
476
  import urllib.request
483
- from inscriptis import get_annotated_text, ParserConfig
477
+ from inscriptis import get_annotated_text
478
+ from inscriptis.model.config import ParserConfig
484
479
 
485
480
  url = "https://www.fhgr.ch"
486
481
  html = urllib.request.urlopen(url).read().decode('utf-8')
@@ -539,15 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
539
534
 
540
535
  .. code-block:: python
541
536
 
542
- inscriptis = Inscriptis(html, config)
537
+ from inscriptis import ParserConfig
538
+ from inscriptis.html_engine import Inscriptis
539
+ from inscriptis.model.tag import CustomHtmlTagHandlerMapping
543
540
 
544
- inscriptis.start_tag_handler_dict['a'] = my_handle_start_a
545
- inscriptis.end_tag_handler_dict['a'] = my_handle_end_a
541
+ my_mapping = CustomHtmlTagHandlerMapping(
542
+ start_tag_mapping={'a': my_handle_start_a},
543
+ end_tag_mapping={'a': my_handle_end_a}
544
+ )
545
+ inscriptis = Inscriptis(html_tree,
546
+ ParserConfig(custom_html_tag_handler_mapping=my_mapping))
546
547
  text = inscriptis.get_text()
547
548
 
548
549
 
549
550
  In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
550
- You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``.
551
+ You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
552
+
553
+ Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
554
+ The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
551
555
 
552
556
  Optimizing memory consumption
553
557
  -----------------------------
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "inscriptis"
3
- version = "2.4.0.1"
3
+ version = "2.5.2"
4
4
  authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
5
5
  description = "inscriptis - HTML to text converter."
6
6
  keywords = ["HTML", "converter", "text"]
@@ -12,11 +12,11 @@ classifiers = [
12
12
  'Topic :: Text Processing :: Markup :: HTML',
13
13
  'Topic :: Utilities',
14
14
  'Programming Language :: Python :: 3',
15
- 'Programming Language :: Python :: 3.8',
16
15
  'Programming Language :: Python :: 3.9',
17
16
  'Programming Language :: Python :: 3.10',
18
17
  'Programming Language :: Python :: 3.11',
19
18
  'Programming Language :: Python :: 3.12',
19
+ 'Programming Language :: Python :: 3.13',
20
20
  ]
21
21
  homepage = "https://github.com/weblyzard/inscriptis"
22
22
  repository = "https://github.com/weblyzard/inscriptis"
@@ -39,13 +39,13 @@ web-service = ["fastapi", "uvicorn"]
39
39
 
40
40
 
41
41
  [tool.poetry.dependencies]
42
- python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12"
43
- requests = ">=2.31.0"
42
+ python = "^3.9 || ^3.10 || ^3.11 || ^3.12 || ^3.13"
43
+ requests = "2.32.2"
44
44
  lxml = ">=4.9.3"
45
45
 
46
46
  # optional dependencies
47
- fastapi = { version = "^0.109.0", optional = true }
48
- uvicorn = { version = "^0.25.0", optional = true }
47
+ fastapi = { version = "^0.109.1", optional = true }
48
+ uvicorn = { version = "^0.27.1", optional = true }
49
49
 
50
50
 
51
51
  [build-system]
@@ -56,8 +56,8 @@ build-backend = "poetry.core.masonry.api"
56
56
  # code formatting with black
57
57
  [tool.black]
58
58
  line-length = 88
59
- target-version = ["py38", "py39", "py310", "py311", "py312"]
59
+ target-version = ["py39", "py310", "py311", "py312", "py313"]
60
60
  extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
61
61
  include = '''
62
- ^/src/|^/tests/|^/benchmarking/
62
+ ^/src/|^/tests/|^/benchmarking/|^/examples/
63
63
  '''
@@ -60,12 +60,12 @@ Annotations in the `label` field are returned as a list of triples with
60
60
  """
61
61
 
62
62
  import re
63
- from lxml.html import fromstring, HtmlElement
64
- from lxml.etree import ParserError
65
-
66
63
  from typing import Dict, Optional, Any
67
-
68
64
  from inscriptis.model.config import ParserConfig
65
+
66
+ from lxml.etree import ParserError
67
+ from lxml.html import fromstring, HtmlElement
68
+
69
69
  from inscriptis.html_engine import Inscriptis
70
70
 
71
71
  RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
@@ -132,5 +132,6 @@ def get_annotated_text(
132
132
  return {}
133
133
 
134
134
  inscriptis = Inscriptis(html_tree, config)
135
+ text = inscriptis.get_text()
135
136
  labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
136
- return {"text": inscriptis.get_text(), "label": labels}
137
+ return {"text": text, "label": labels}
@@ -1,7 +1,7 @@
1
1
  """The model used for saving annotations."""
2
2
 
3
- from typing import NamedTuple, Tuple
4
3
  from typing import List
4
+ from typing import NamedTuple
5
5
 
6
6
  from inscriptis.html_properties import HorizontalAlignment
7
7
 
@@ -25,8 +25,8 @@ class Annotation(NamedTuple):
25
25
  """the annotation's start index within the text output."""
26
26
  end: int
27
27
  """the annotation's end index within the text output."""
28
- metadata: Tuple[str]
29
- """a tuple of tags to be attached to the annotation."""
28
+ metadata: str
29
+ """the tag to be attached to the annotation."""
30
30
 
31
31
 
32
32
  def horizontal_shift(
@@ -18,6 +18,7 @@ Example::
18
18
  """
19
19
  from collections import defaultdict
20
20
  from copy import copy
21
+ from typing import Dict, Tuple, List
21
22
 
22
23
  from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT
23
24
 
@@ -85,7 +86,7 @@ class AnnotationModel:
85
86
  self.css = css_profile
86
87
 
87
88
  @staticmethod
88
- def _parse(model: dict) -> "AnnotationModel":
89
+ def _parse(model: dict) -> Tuple[Dict, List]:
89
90
  """Compute the AnnotationModel from a model dictionary.
90
91
 
91
92
  Returns:
@@ -5,14 +5,14 @@
5
5
  import argparse
6
6
  import sys
7
7
  from json import load, dumps
8
- from typing import Optional
9
8
  from pathlib import Path
9
+ from typing import Optional
10
10
 
11
11
  import requests
12
12
 
13
13
  from inscriptis import get_text, get_annotated_text
14
- from inscriptis.metadata import __version__, __copyright__, __license__
15
14
  from inscriptis.css_profiles import CSS_PROFILES
15
+ from inscriptis.metadata import __version__, __copyright__, __license__
16
16
  from inscriptis.model.config import ParserConfig
17
17
 
18
18
  DEFAULT_ENCODING = "utf8"
@@ -148,6 +148,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
148
148
  Args:
149
149
  url: URL to the HTML content, or None if the content is obtained from stdin.
150
150
  encoding: used encoding.
151
+ timeout: timeout in seconds for retrieving the URL.
151
152
 
152
153
  Returns:
153
154
  The html_content or None, if no content could be extracted.
@@ -155,17 +156,15 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
155
156
  """
156
157
  if not url:
157
158
  return sys.stdin.read()
158
- elif Path(url).is_file():
159
- with Path(url).open(
160
- encoding=encoding or DEFAULT_ENCODING, errors="ignore"
161
- ) as f:
159
+ elif (p := Path(url)).is_file():
160
+ with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
162
161
  return f.read()
163
162
  elif url.startswith("http://") or url.startswith("https://"):
164
163
  req = requests.get(url, timeout=timeout)
165
164
  return req.content.decode(encoding or req.encoding)
166
165
 
167
166
 
168
- def cli():
167
+ def cli() -> None:
169
168
  """Run the inscript command line client."""
170
169
  args = parse_command_line()
171
170
  if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python
2
+ # coding:utf-8
3
+ """The HTML Engine is responsible for converting HTML to text."""
4
+ from typing import List, Dict, Callable
5
+
6
+ import lxml.html
7
+ from lxml.etree import Comment
8
+
9
+ from inscriptis.annotation import Annotation
10
+ from inscriptis.model.canvas import Canvas
11
+ from inscriptis.model.config import ParserConfig
12
+ from inscriptis.model.html_document_state import HtmlDocumentState
13
+ from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
14
+ from inscriptis.model.tag.br_tag import br_start_handler
15
+ from inscriptis.model.tag.img_tag import img_start_handler
16
+ from inscriptis.model.tag.list_tag import (
17
+ ul_start_handler,
18
+ ol_start_handler,
19
+ li_start_handler,
20
+ ul_end_handler,
21
+ ol_end_handler,
22
+ )
23
+ from inscriptis.model.tag.table_tag import (
24
+ table_start_handler,
25
+ tr_start_handler,
26
+ td_start_handler,
27
+ table_end_handler,
28
+ td_end_handler,
29
+ )
30
+
31
+
32
+ class Inscriptis:
33
+ """Translate an lxml HTML tree to the corresponding text representation.
34
+
35
+ Args:
36
+ html_tree: the lxml HTML tree to convert.
37
+ config: an optional ParserConfig configuration object.
38
+
39
+ Example::
40
+
41
+ from lxml.html import fromstring
42
+ from inscriptis.html_engine import Inscriptis
43
+
44
+ html_content = "<html><body><h1>Test</h1></body></html>"
45
+
46
+ # create an HTML tree from the HTML content.
47
+ html_tree = fromstring(html_content)
48
+
49
+ # transform the HTML tree to text.
50
+ parser = Inscriptis(html_tree)
51
+ text = parser.get_text()
52
+ """
53
+
54
+ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
55
+ # use the default configuration, if no config object is provided
56
+ config = config or ParserConfig()
57
+
58
+ # setup start and end tag call tables
59
+ self.start_tag_handler_dict: Dict[
60
+ str, Callable[[HtmlDocumentState, Dict], None]
61
+ ] = {
62
+ "table": table_start_handler,
63
+ "tr": tr_start_handler,
64
+ "td": td_start_handler,
65
+ "th": td_start_handler,
66
+ "ul": ul_start_handler,
67
+ "ol": ol_start_handler,
68
+ "li": li_start_handler,
69
+ "br": br_start_handler,
70
+ "a": a_start_handler if config.parse_a() else None,
71
+ "img": img_start_handler if config.display_images else None,
72
+ }
73
+ self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = {
74
+ "table": table_end_handler,
75
+ "ul": ul_end_handler,
76
+ "ol": ol_end_handler,
77
+ "td": td_end_handler,
78
+ "th": td_end_handler,
79
+ "a": a_end_handler if config.parse_a() else None,
80
+ }
81
+
82
+ if config.custom_html_tag_handler_mapping:
83
+ self.start_tag_handler_dict.update(
84
+ config.custom_html_tag_handler_mapping.start_tag_mapping
85
+ )
86
+ self.end_tag_handler_dict.update(
87
+ config.custom_html_tag_handler_mapping.end_tag_mapping
88
+ )
89
+
90
+ # parse the HTML tree
91
+ self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)
92
+
93
+ def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
94
+ """Parse the HTML tree.
95
+
96
+ Args:
97
+ tree: the HTML tree to parse.
98
+ """
99
+ if isinstance(tree.tag, str):
100
+ state.apply_starttag_layout(tree.tag, tree.attrib)
101
+
102
+ if handler := self.start_tag_handler_dict.get(tree.tag):
103
+ handler(state, tree.attrib)
104
+ cur = state.tags[-1]
105
+ cur.canvas.open_tag(cur)
106
+
107
+ state.tags[-1].write(tree.text)
108
+
109
+ for node in tree:
110
+ self._parse_html_tree(state, node)
111
+
112
+ # handle the endtag
113
+ if handler := self.end_tag_handler_dict.get(tree.tag):
114
+ handler(state)
115
+ prev = state.tags.pop()
116
+ prev.canvas.close_tag(prev)
117
+
118
+ # write the tail text to the element's container
119
+ state.tags[-1].write(tree.tail)
120
+
121
+ elif tree.tag is Comment and tree.tail:
122
+ state.tags[-1].canvas.write(state.tags[-1], tree.tail)
123
+
124
+ return state.canvas
125
+
126
+ def get_text(self) -> str:
127
+ """Return the text extracted from the HTML page."""
128
+ return self.canvas.get_text()
129
+
130
+ def get_annotations(self) -> List[Annotation]:
131
+ """Return the annotations extracted from the HTML page."""
132
+ return self.canvas.annotations
@@ -4,7 +4,7 @@ Supported attributes::
4
4
  1. :class:`Display` properties.
5
5
  2. :class:`WhiteSpace` properties.
6
6
  3. :class:`HorizontalAlignment` properties.
7
- 4. :class:`VerticalAlignment` properites.
7
+ 4. :class:`VerticalAlignment` properties.
8
8
  """
9
9
 
10
10
  from enum import Enum
@@ -8,7 +8,7 @@ __author__ = "Albert Weichselbraun, Fabian Odoni"
8
8
  __author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch"
9
9
  __copyright__ = (
10
10
  f"{metadata.metadata(PACKAGE)['Name']} "
11
- + f"{metadata.metadata(PACKAGE)['Version']} © 2016-2023 {__author__}"
11
+ + f"{metadata.metadata(PACKAGE)['Version']} © 2016-2025 {__author__}"
12
12
  )
13
13
  __license__ = metadata.metadata(PACKAGE)["License"]
14
14
  __version__ = metadata.metadata(PACKAGE)["Version"]
@@ -57,16 +57,12 @@ class Attribute:
57
57
  attributes: the list of attributes
58
58
  html_element: the HTML element for which the attributes are parsed
59
59
  """
60
- supported_attributes = (
61
- (name, val)
62
- for name, val in attributes.items()
63
- if name in self.attribute_mapping
64
- )
65
- for attr_name, attr_value in supported_attributes:
66
- self.attribute_mapping[attr_name](attr_value, html_element)
60
+ for attr_name, attr_value in attributes.items():
61
+ if attr_name in self.attribute_mapping:
62
+ self.attribute_mapping[attr_name](attr_value, html_element)
67
63
  return html_element
68
64
 
69
- def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
65
+ def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None:
70
66
  attributes = copy(self.attribute_mapping)
71
67
  for a in annotations:
72
68
  attributes[a.attr] = (
@@ -17,8 +17,8 @@ textual content to the canvas which is managed by the following three classes:
17
17
  from inscriptis.annotation import Annotation
18
18
  from inscriptis.html_properties import WhiteSpace, Display
19
19
  from inscriptis.model.canvas.block import Block
20
- from inscriptis.model.html_element import HtmlElement
21
20
  from inscriptis.model.canvas.prefix import Prefix
21
+ from inscriptis.model.html_element import HtmlElement
22
22
 
23
23
 
24
24
  class Canvas:
@@ -64,10 +64,10 @@ class Canvas:
64
64
  if tag.display == Display.block:
65
65
  self.open_block(tag)
66
66
 
67
- def open_block(self, tag: HtmlElement):
67
+ def open_block(self, tag: HtmlElement) -> None:
68
68
  """Open an HTML block element."""
69
69
  # write missing bullets, if no content has been written
70
- if not self._flush_inline() and tag.list_bullet:
70
+ if not self.flush_inline() and tag.list_bullet:
71
71
  self.write_unconsumed_bullet()
72
72
  self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet)
73
73
 
@@ -79,7 +79,7 @@ class Canvas:
79
79
  self.blocks.append("\n" * (required_newlines - 1))
80
80
  self.margin = required_margin
81
81
 
82
- def write_unconsumed_bullet(self):
82
+ def write_unconsumed_bullet(self) -> None:
83
83
  """Write unconsumed bullets to the blocks list."""
84
84
  bullet = self.current_block.prefix.unconsumed_bullet
85
85
  if bullet:
@@ -100,7 +100,7 @@ class Canvas:
100
100
  """
101
101
  if tag.display == Display.block:
102
102
  # write missing bullets, if no content has been written so far.
103
- if not self._flush_inline() and tag.list_bullet:
103
+ if not self.flush_inline() and tag.list_bullet:
104
104
  self.write_unconsumed_bullet()
105
105
  self.current_block.prefix.remove_last_prefix()
106
106
  self.close_block(tag)
@@ -116,7 +116,7 @@ class Canvas:
116
116
  Annotation(start_idx, self.current_block.idx, annotation)
117
117
  )
118
118
 
119
- def close_block(self, tag: HtmlElement):
119
+ def close_block(self, tag: HtmlElement) -> None:
120
120
  """Close the given HtmlElement by writing its bottom margin.
121
121
 
122
122
  Args:
@@ -128,17 +128,17 @@ class Canvas:
128
128
  self.blocks.append("\n" * (required_newlines - 1))
129
129
  self.margin = tag.margin_after
130
130
 
131
- def write_newline(self):
132
- if not self._flush_inline():
131
+ def write_newline(self) -> None:
132
+ if not self.flush_inline():
133
133
  self.blocks.append("")
134
134
  self.current_block = self.current_block.new_block()
135
135
 
136
136
  def get_text(self) -> str:
137
137
  """Provide a text representation of the Canvas."""
138
- self._flush_inline()
138
+ self.flush_inline()
139
139
  return "\n".join(self.blocks)
140
140
 
141
- def _flush_inline(self) -> bool:
141
+ def flush_inline(self) -> bool:
142
142
  """Attempt to flush the content in self.current_block into a new block.
143
143
 
144
144
  Notes:
@@ -1,7 +1,14 @@
1
1
  """Representation of a text block within the HTML canvas."""
2
+ from __future__ import annotations
3
+
2
4
  from html import unescape
5
+ from typing import TYPE_CHECKING
6
+
3
7
  from inscriptis.html_properties import WhiteSpace
4
8
 
9
+ if TYPE_CHECKING:
10
+ from inscriptis.model.canvas import Prefix
11
+
5
12
 
6
13
  class Block:
7
14
  """The current block of text.
@@ -19,7 +26,7 @@ class Block:
19
26
 
20
27
  __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace")
21
28
 
22
- def __init__(self, idx: int, prefix: str):
29
+ def __init__(self, idx: int, prefix: Prefix):
23
30
  self.idx = idx
24
31
  self.prefix = prefix
25
32
  self._content = ""
@@ -22,7 +22,7 @@ class Prefix:
22
22
  self.bullets = []
23
23
  self.consumed = False
24
24
 
25
- def register_prefix(self, padding_inline, bullet):
25
+ def register_prefix(self, padding_inline: int, bullet: str) -> None:
26
26
  """Register the given prefix.
27
27
 
28
28
  Args:
@@ -33,13 +33,13 @@ class Prefix:
33
33
  self.paddings.append(padding_inline)
34
34
  self.bullets.append(bullet if bullet else "")
35
35
 
36
- def remove_last_prefix(self):
36
+ def remove_last_prefix(self) -> None:
37
37
  """Remove the last prefix from the list."""
38
38
  with suppress(IndexError):
39
39
  self.current_padding -= self.paddings.pop()
40
40
  del self.bullets[-1]
41
41
 
42
- def pop_next_bullet(self):
42
+ def pop_next_bullet(self) -> str:
43
43
  """Pop the next bullet to use, if any bullet is available."""
44
44
  next_bullet_idx = (
45
45
  next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
@@ -53,7 +53,7 @@ class Prefix:
53
53
  return bullet
54
54
 
55
55
  @property
56
- def first(self):
56
+ def first(self) -> str:
57
57
  """Return the prefix used at the beginning of a tag.
58
58
 
59
59
  Note::
@@ -69,7 +69,7 @@ class Prefix:
69
69
  return " " * (self.current_padding - len(bullet)) + bullet
70
70
 
71
71
  @property
72
- def unconsumed_bullet(self):
72
+ def unconsumed_bullet(self) -> str:
73
73
  """Yield any yet unconsumed bullet.
74
74
 
75
75
  Note::
@@ -87,7 +87,7 @@ class Prefix:
87
87
  return " " * (padding - len(bullet)) + bullet
88
88
 
89
89
  @property
90
- def rest(self):
90
+ def rest(self) -> str:
91
91
  """Return the prefix used for new lines within a block.
92
92
 
93
93
  This prefix is used for pre-text that contains newlines. The lines