inscriptis 2.4.0.1__tar.gz → 2.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inscriptis-2.5.2/AUTHORS +5 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/PKG-INFO +23 -19
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/README.rst +17 -13
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/pyproject.toml +8 -8
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/__init__.py +6 -5
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/__init__.py +3 -3
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/parser.py +2 -1
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/cli/inscript.py +6 -7
- inscriptis-2.5.2/src/inscriptis/html_engine.py +132 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/html_properties.py +1 -1
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/metadata.py +1 -1
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/attribute.py +4 -8
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/canvas/__init__.py +10 -10
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/canvas/block.py +8 -1
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/canvas/prefix.py +6 -6
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/config.py +9 -3
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/css.py +1 -0
- inscriptis-2.5.2/src/inscriptis/model/html_document_state.py +52 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/html_element.py +4 -4
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/table.py +5 -5
- inscriptis-2.5.2/src/inscriptis/model/tag/__init__.py +20 -0
- inscriptis-2.5.2/src/inscriptis/model/tag/a_tag.py +22 -0
- inscriptis-2.5.2/src/inscriptis/model/tag/br_tag.py +9 -0
- inscriptis-2.5.2/src/inscriptis/model/tag/img_tag.py +14 -0
- inscriptis-2.5.2/src/inscriptis/model/tag/list_tag.py +44 -0
- inscriptis-2.5.2/src/inscriptis/model/tag/table_tag.py +68 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/service/web.py +4 -4
- inscriptis-2.4.0.1/src/inscriptis/html_engine.py +0 -265
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/LICENSE +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/__init__.py +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/html.py +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/surface.py +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/annotation/output/xml.py +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/cli/__init__.py +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/css_profiles.py +1 -1
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/model/__init__.py +0 -0
- {inscriptis-2.4.0.1 → inscriptis-2.5.2}/src/inscriptis/service/__init__.py +0 -0
inscriptis-2.5.2/AUTHORS
ADDED
|
@@ -1,30 +1,30 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: inscriptis
|
|
3
|
-
Version: 2.4.0.1
|
|
3
|
+
Version: 2.5.2
|
|
4
4
|
Summary: inscriptis - HTML to text converter.
|
|
5
5
|
Home-page: https://github.com/weblyzard/inscriptis
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Keywords: HTML,converter,text
|
|
8
8
|
Author: Albert Weichselbraun
|
|
9
9
|
Author-email: albert.weichselbraun@fhgr.ch
|
|
10
|
-
Requires-Python: >=3.
|
|
10
|
+
Requires-Python: >=3.9,<4.0
|
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
14
|
Classifier: Programming Language :: Python :: 3
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.9
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.10
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
20
|
Classifier: Topic :: Text Processing
|
|
21
21
|
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
22
22
|
Classifier: Topic :: Utilities
|
|
23
23
|
Provides-Extra: web-service
|
|
24
|
-
Requires-Dist: fastapi (>=0.109.
|
|
24
|
+
Requires-Dist: fastapi (>=0.109.1,<0.110.0) ; extra == "web-service"
|
|
25
25
|
Requires-Dist: lxml (>=4.9.3)
|
|
26
|
-
Requires-Dist: requests (
|
|
27
|
-
Requires-Dist: uvicorn (>=0.
|
|
26
|
+
Requires-Dist: requests (==2.32.2)
|
|
27
|
+
Requires-Dist: uvicorn (>=0.27.1,<0.28.0) ; extra == "web-service"
|
|
28
28
|
Project-URL: Documentation, https://inscriptis.readthedocs.io/en
|
|
29
29
|
Project-URL: Repository, https://github.com/weblyzard/inscriptis
|
|
30
30
|
Description-Content-Type: text/x-rst
|
|
@@ -129,12 +129,6 @@ Or, if you don't have pip installed::
|
|
|
129
129
|
|
|
130
130
|
$ easy_install inscriptis
|
|
131
131
|
|
|
132
|
-
If you want to install from the latest sources, you can do::
|
|
133
|
-
|
|
134
|
-
$ git clone https://github.com/weblyzard/inscriptis.git
|
|
135
|
-
$ cd inscriptis
|
|
136
|
-
$ python setup.py install
|
|
137
|
-
|
|
138
132
|
|
|
139
133
|
Python library
|
|
140
134
|
==============
|
|
@@ -222,7 +216,7 @@ HTML to annotated text conversion
|
|
|
222
216
|
---------------------------------
|
|
223
217
|
convert and annotate HTML from a Web page using the provided annotation rules.
|
|
224
218
|
|
|
225
|
-
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
|
|
219
|
+
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::
|
|
226
220
|
|
|
227
221
|
$ inscript https://www.fhgr.ch -r annotation-profile.json
|
|
228
222
|
|
|
@@ -273,7 +267,7 @@ that are suitable for your particular application. Post processors can be
|
|
|
273
267
|
specified with the ``-p`` or ``--postprocessor`` command line argument::
|
|
274
268
|
|
|
275
269
|
$ inscript https://www.fhgr.ch \
|
|
276
|
-
-r ./examples/annotation-profile.json \
|
|
270
|
+
-r ./examples/annotation/annotation-profile.json \
|
|
277
271
|
-p surface
|
|
278
272
|
|
|
279
273
|
|
|
@@ -511,7 +505,8 @@ be used within a program:
|
|
|
511
505
|
.. code-block:: python
|
|
512
506
|
|
|
513
507
|
import urllib.request
|
|
514
|
-
from inscriptis import get_annotated_text
|
|
508
|
+
from inscriptis import get_annotated_text
|
|
509
|
+
from inscriptis.model.config import ParserConfig
|
|
515
510
|
|
|
516
511
|
url = "https://www.fhgr.ch"
|
|
517
512
|
html = urllib.request.urlopen(url).read().decode('utf-8')
|
|
@@ -570,15 +565,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
|
|
|
570
565
|
|
|
571
566
|
.. code-block:: python
|
|
572
567
|
|
|
573
|
-
inscriptis
|
|
568
|
+
from inscriptis import ParserConfig
|
|
569
|
+
from inscriptis.html_engine import Inscriptis
|
|
570
|
+
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
|
|
574
571
|
|
|
575
|
-
|
|
576
|
-
|
|
572
|
+
my_mapping = CustomHtmlTagHandlerMapping(
|
|
573
|
+
start_tag_mapping={'a': my_handle_start_a},
|
|
574
|
+
end_tag_mapping={'a': my_handle_end_a}
|
|
575
|
+
)
|
|
576
|
+
inscriptis = Inscriptis(html_tree,
|
|
577
|
+
ParserConfig(custom_html_tag_handler_mapping=my_mapping))
|
|
577
578
|
text = inscriptis.get_text()
|
|
578
579
|
|
|
579
580
|
|
|
580
581
|
In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
|
|
581
|
-
You may define custom handlers for any tag, regardless of whether it already exists in
|
|
582
|
+
You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
|
|
583
|
+
|
|
584
|
+
Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
|
|
585
|
+
The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
|
|
582
586
|
|
|
583
587
|
Optimizing memory consumption
|
|
584
588
|
-----------------------------
|
|
@@ -98,12 +98,6 @@ Or, if you don't have pip installed::
|
|
|
98
98
|
|
|
99
99
|
$ easy_install inscriptis
|
|
100
100
|
|
|
101
|
-
If you want to install from the latest sources, you can do::
|
|
102
|
-
|
|
103
|
-
$ git clone https://github.com/weblyzard/inscriptis.git
|
|
104
|
-
$ cd inscriptis
|
|
105
|
-
$ python setup.py install
|
|
106
|
-
|
|
107
101
|
|
|
108
102
|
Python library
|
|
109
103
|
==============
|
|
@@ -191,7 +185,7 @@ HTML to annotated text conversion
|
|
|
191
185
|
---------------------------------
|
|
192
186
|
convert and annotate HTML from a Web page using the provided annotation rules.
|
|
193
187
|
|
|
194
|
-
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
|
|
188
|
+
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::
|
|
195
189
|
|
|
196
190
|
$ inscript https://www.fhgr.ch -r annotation-profile.json
|
|
197
191
|
|
|
@@ -242,7 +236,7 @@ that are suitable for your particular application. Post processors can be
|
|
|
242
236
|
specified with the ``-p`` or ``--postprocessor`` command line argument::
|
|
243
237
|
|
|
244
238
|
$ inscript https://www.fhgr.ch \
|
|
245
|
-
-r ./examples/annotation-profile.json \
|
|
239
|
+
-r ./examples/annotation/annotation-profile.json \
|
|
246
240
|
-p surface
|
|
247
241
|
|
|
248
242
|
|
|
@@ -480,7 +474,8 @@ be used within a program:
|
|
|
480
474
|
.. code-block:: python
|
|
481
475
|
|
|
482
476
|
import urllib.request
|
|
483
|
-
from inscriptis import get_annotated_text
|
|
477
|
+
from inscriptis import get_annotated_text
|
|
478
|
+
from inscriptis.model.config import ParserConfig
|
|
484
479
|
|
|
485
480
|
url = "https://www.fhgr.ch"
|
|
486
481
|
html = urllib.request.urlopen(url).read().decode('utf-8')
|
|
@@ -539,15 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
|
|
|
539
534
|
|
|
540
535
|
.. code-block:: python
|
|
541
536
|
|
|
542
|
-
inscriptis
|
|
537
|
+
from inscriptis import ParserConfig
|
|
538
|
+
from inscriptis.html_engine import Inscriptis
|
|
539
|
+
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
|
|
543
540
|
|
|
544
|
-
|
|
545
|
-
|
|
541
|
+
my_mapping = CustomHtmlTagHandlerMapping(
|
|
542
|
+
start_tag_mapping={'a': my_handle_start_a},
|
|
543
|
+
end_tag_mapping={'a': my_handle_end_a}
|
|
544
|
+
)
|
|
545
|
+
inscriptis = Inscriptis(html_tree,
|
|
546
|
+
ParserConfig(custom_html_tag_handler_mapping=my_mapping))
|
|
546
547
|
text = inscriptis.get_text()
|
|
547
548
|
|
|
548
549
|
|
|
549
550
|
In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
|
|
550
|
-
You may define custom handlers for any tag, regardless of whether it already exists in
|
|
551
|
+
You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
|
|
552
|
+
|
|
553
|
+
Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
|
|
554
|
+
The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
|
|
551
555
|
|
|
552
556
|
Optimizing memory consumption
|
|
553
557
|
-----------------------------
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "inscriptis"
|
|
3
|
-
version = "2.4.0.1"
|
|
3
|
+
version = "2.5.2"
|
|
4
4
|
authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
|
|
5
5
|
description = "inscriptis - HTML to text converter."
|
|
6
6
|
keywords = ["HTML", "converter", "text"]
|
|
@@ -12,11 +12,11 @@ classifiers = [
|
|
|
12
12
|
'Topic :: Text Processing :: Markup :: HTML',
|
|
13
13
|
'Topic :: Utilities',
|
|
14
14
|
'Programming Language :: Python :: 3',
|
|
15
|
-
'Programming Language :: Python :: 3.8',
|
|
16
15
|
'Programming Language :: Python :: 3.9',
|
|
17
16
|
'Programming Language :: Python :: 3.10',
|
|
18
17
|
'Programming Language :: Python :: 3.11',
|
|
19
18
|
'Programming Language :: Python :: 3.12',
|
|
19
|
+
'Programming Language :: Python :: 3.13',
|
|
20
20
|
]
|
|
21
21
|
homepage = "https://github.com/weblyzard/inscriptis"
|
|
22
22
|
repository = "https://github.com/weblyzard/inscriptis"
|
|
@@ -39,13 +39,13 @@ web-service = ["fastapi", "uvicorn"]
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
[tool.poetry.dependencies]
|
|
42
|
-
python = "^3.
|
|
43
|
-
requests = "
|
|
42
|
+
python = "^3.9 || ^3.10 || ^3.11 || ^3.12 || ^3.13"
|
|
43
|
+
requests = "2.32.2"
|
|
44
44
|
lxml = ">=4.9.3"
|
|
45
45
|
|
|
46
46
|
# optional dependencies
|
|
47
|
-
fastapi = { version = "^0.109.
|
|
48
|
-
uvicorn = { version = "^0.
|
|
47
|
+
fastapi = { version = "^0.109.1", optional = true }
|
|
48
|
+
uvicorn = { version = "^0.27.1", optional = true }
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
[build-system]
|
|
@@ -56,8 +56,8 @@ build-backend = "poetry.core.masonry.api"
|
|
|
56
56
|
# code formatting with black
|
|
57
57
|
[tool.black]
|
|
58
58
|
line-length = 88
|
|
59
|
-
target-version = ["
|
|
59
|
+
target-version = ["py39", "py310", "py311", "py312", "py313"]
|
|
60
60
|
extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
|
|
61
61
|
include = '''
|
|
62
|
-
^/src/|^/tests/|^/benchmarking/
|
|
62
|
+
^/src/|^/tests/|^/benchmarking/|^/examples/
|
|
63
63
|
'''
|
|
@@ -60,12 +60,12 @@ Annotations in the `label` field are returned as a list of triples with
|
|
|
60
60
|
"""
|
|
61
61
|
|
|
62
62
|
import re
|
|
63
|
-
from lxml.html import fromstring, HtmlElement
|
|
64
|
-
from lxml.etree import ParserError
|
|
65
|
-
|
|
66
63
|
from typing import Dict, Optional, Any
|
|
67
|
-
|
|
68
64
|
from inscriptis.model.config import ParserConfig
|
|
65
|
+
|
|
66
|
+
from lxml.etree import ParserError
|
|
67
|
+
from lxml.html import fromstring, HtmlElement
|
|
68
|
+
|
|
69
69
|
from inscriptis.html_engine import Inscriptis
|
|
70
70
|
|
|
71
71
|
RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
|
|
@@ -132,5 +132,6 @@ def get_annotated_text(
|
|
|
132
132
|
return {}
|
|
133
133
|
|
|
134
134
|
inscriptis = Inscriptis(html_tree, config)
|
|
135
|
+
text = inscriptis.get_text()
|
|
135
136
|
labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
|
|
136
|
-
return {"text":
|
|
137
|
+
return {"text": text, "label": labels}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""The model used for saving annotations."""
|
|
2
2
|
|
|
3
|
-
from typing import NamedTuple, Tuple
|
|
4
3
|
from typing import List
|
|
4
|
+
from typing import NamedTuple
|
|
5
5
|
|
|
6
6
|
from inscriptis.html_properties import HorizontalAlignment
|
|
7
7
|
|
|
@@ -25,8 +25,8 @@ class Annotation(NamedTuple):
|
|
|
25
25
|
"""the annotation's start index within the text output."""
|
|
26
26
|
end: int
|
|
27
27
|
"""the annotation's end index within the text output."""
|
|
28
|
-
metadata:
|
|
29
|
-
"""
|
|
28
|
+
metadata: str
|
|
29
|
+
"""the tag to be attached to the annotation."""
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
def horizontal_shift(
|
|
@@ -18,6 +18,7 @@ Example::
|
|
|
18
18
|
"""
|
|
19
19
|
from collections import defaultdict
|
|
20
20
|
from copy import copy
|
|
21
|
+
from typing import Dict, Tuple, List
|
|
21
22
|
|
|
22
23
|
from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT
|
|
23
24
|
|
|
@@ -85,7 +86,7 @@ class AnnotationModel:
|
|
|
85
86
|
self.css = css_profile
|
|
86
87
|
|
|
87
88
|
@staticmethod
|
|
88
|
-
def _parse(model: dict) ->
|
|
89
|
+
def _parse(model: dict) -> Tuple[Dict, List]:
|
|
89
90
|
"""Compute the AnnotationModel from a model dictionary.
|
|
90
91
|
|
|
91
92
|
Returns:
|
|
@@ -5,14 +5,14 @@
|
|
|
5
5
|
import argparse
|
|
6
6
|
import sys
|
|
7
7
|
from json import load, dumps
|
|
8
|
-
from typing import Optional
|
|
9
8
|
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
10
|
|
|
11
11
|
import requests
|
|
12
12
|
|
|
13
13
|
from inscriptis import get_text, get_annotated_text
|
|
14
|
-
from inscriptis.metadata import __version__, __copyright__, __license__
|
|
15
14
|
from inscriptis.css_profiles import CSS_PROFILES
|
|
15
|
+
from inscriptis.metadata import __version__, __copyright__, __license__
|
|
16
16
|
from inscriptis.model.config import ParserConfig
|
|
17
17
|
|
|
18
18
|
DEFAULT_ENCODING = "utf8"
|
|
@@ -148,6 +148,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
|
|
|
148
148
|
Args:
|
|
149
149
|
url: URL to the HTML content, or None if the content is obtained from stdin.
|
|
150
150
|
encoding: used encoding.
|
|
151
|
+
timeout: timeout in seconds for retrieving the URL.
|
|
151
152
|
|
|
152
153
|
Returns:
|
|
153
154
|
The html_content or None, if no content could be extracted.
|
|
@@ -155,17 +156,15 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
|
|
|
155
156
|
"""
|
|
156
157
|
if not url:
|
|
157
158
|
return sys.stdin.read()
|
|
158
|
-
elif Path(url).is_file():
|
|
159
|
-
with
|
|
160
|
-
encoding=encoding or DEFAULT_ENCODING, errors="ignore"
|
|
161
|
-
) as f:
|
|
159
|
+
elif (p := Path(url)).is_file():
|
|
160
|
+
with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
|
|
162
161
|
return f.read()
|
|
163
162
|
elif url.startswith("http://") or url.startswith("https://"):
|
|
164
163
|
req = requests.get(url, timeout=timeout)
|
|
165
164
|
return req.content.decode(encoding or req.encoding)
|
|
166
165
|
|
|
167
166
|
|
|
168
|
-
def cli():
|
|
167
|
+
def cli() -> None:
|
|
169
168
|
"""Run the inscript command line client."""
|
|
170
169
|
args = parse_command_line()
|
|
171
170
|
if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# coding:utf-8
|
|
3
|
+
"""The HTML Engine is responsible for converting HTML to text."""
|
|
4
|
+
from typing import List, Dict, Callable
|
|
5
|
+
|
|
6
|
+
import lxml.html
|
|
7
|
+
from lxml.etree import Comment
|
|
8
|
+
|
|
9
|
+
from inscriptis.annotation import Annotation
|
|
10
|
+
from inscriptis.model.canvas import Canvas
|
|
11
|
+
from inscriptis.model.config import ParserConfig
|
|
12
|
+
from inscriptis.model.html_document_state import HtmlDocumentState
|
|
13
|
+
from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
|
|
14
|
+
from inscriptis.model.tag.br_tag import br_start_handler
|
|
15
|
+
from inscriptis.model.tag.img_tag import img_start_handler
|
|
16
|
+
from inscriptis.model.tag.list_tag import (
|
|
17
|
+
ul_start_handler,
|
|
18
|
+
ol_start_handler,
|
|
19
|
+
li_start_handler,
|
|
20
|
+
ul_end_handler,
|
|
21
|
+
ol_end_handler,
|
|
22
|
+
)
|
|
23
|
+
from inscriptis.model.tag.table_tag import (
|
|
24
|
+
table_start_handler,
|
|
25
|
+
tr_start_handler,
|
|
26
|
+
td_start_handler,
|
|
27
|
+
table_end_handler,
|
|
28
|
+
td_end_handler,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Inscriptis:
|
|
33
|
+
"""Translate an lxml HTML tree to the corresponding text representation.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
html_tree: the lxml HTML tree to convert.
|
|
37
|
+
config: an optional ParserConfig configuration object.
|
|
38
|
+
|
|
39
|
+
Example::
|
|
40
|
+
|
|
41
|
+
from lxml.html import fromstring
|
|
42
|
+
from inscriptis.html_engine import Inscriptis
|
|
43
|
+
|
|
44
|
+
html_content = "<html><body><h1>Test</h1></body></html>"
|
|
45
|
+
|
|
46
|
+
# create an HTML tree from the HTML content.
|
|
47
|
+
html_tree = fromstring(html_content)
|
|
48
|
+
|
|
49
|
+
# transform the HTML tree to text.
|
|
50
|
+
parser = Inscriptis(html_tree)
|
|
51
|
+
text = parser.get_text()
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
|
|
55
|
+
# use the default configuration, if no config object is provided
|
|
56
|
+
config = config or ParserConfig()
|
|
57
|
+
|
|
58
|
+
# setup start and end tag call tables
|
|
59
|
+
self.start_tag_handler_dict: Dict[
|
|
60
|
+
str, Callable[[HtmlDocumentState, Dict], None]
|
|
61
|
+
] = {
|
|
62
|
+
"table": table_start_handler,
|
|
63
|
+
"tr": tr_start_handler,
|
|
64
|
+
"td": td_start_handler,
|
|
65
|
+
"th": td_start_handler,
|
|
66
|
+
"ul": ul_start_handler,
|
|
67
|
+
"ol": ol_start_handler,
|
|
68
|
+
"li": li_start_handler,
|
|
69
|
+
"br": br_start_handler,
|
|
70
|
+
"a": a_start_handler if config.parse_a() else None,
|
|
71
|
+
"img": img_start_handler if config.display_images else None,
|
|
72
|
+
}
|
|
73
|
+
self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = {
|
|
74
|
+
"table": table_end_handler,
|
|
75
|
+
"ul": ul_end_handler,
|
|
76
|
+
"ol": ol_end_handler,
|
|
77
|
+
"td": td_end_handler,
|
|
78
|
+
"th": td_end_handler,
|
|
79
|
+
"a": a_end_handler if config.parse_a() else None,
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
if config.custom_html_tag_handler_mapping:
|
|
83
|
+
self.start_tag_handler_dict.update(
|
|
84
|
+
config.custom_html_tag_handler_mapping.start_tag_mapping
|
|
85
|
+
)
|
|
86
|
+
self.end_tag_handler_dict.update(
|
|
87
|
+
config.custom_html_tag_handler_mapping.end_tag_mapping
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# parse the HTML tree
|
|
91
|
+
self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)
|
|
92
|
+
|
|
93
|
+
def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
|
|
94
|
+
"""Parse the HTML tree.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
tree: the HTML tree to parse.
|
|
98
|
+
"""
|
|
99
|
+
if isinstance(tree.tag, str):
|
|
100
|
+
state.apply_starttag_layout(tree.tag, tree.attrib)
|
|
101
|
+
|
|
102
|
+
if handler := self.start_tag_handler_dict.get(tree.tag):
|
|
103
|
+
handler(state, tree.attrib)
|
|
104
|
+
cur = state.tags[-1]
|
|
105
|
+
cur.canvas.open_tag(cur)
|
|
106
|
+
|
|
107
|
+
state.tags[-1].write(tree.text)
|
|
108
|
+
|
|
109
|
+
for node in tree:
|
|
110
|
+
self._parse_html_tree(state, node)
|
|
111
|
+
|
|
112
|
+
# handle the endtag
|
|
113
|
+
if handler := self.end_tag_handler_dict.get(tree.tag):
|
|
114
|
+
handler(state)
|
|
115
|
+
prev = state.tags.pop()
|
|
116
|
+
prev.canvas.close_tag(prev)
|
|
117
|
+
|
|
118
|
+
# write the tail text to the element's container
|
|
119
|
+
state.tags[-1].write(tree.tail)
|
|
120
|
+
|
|
121
|
+
elif tree.tag is Comment and tree.tail:
|
|
122
|
+
state.tags[-1].canvas.write(state.tags[-1], tree.tail)
|
|
123
|
+
|
|
124
|
+
return state.canvas
|
|
125
|
+
|
|
126
|
+
def get_text(self) -> str:
|
|
127
|
+
"""Return the text extracted from the HTML page."""
|
|
128
|
+
return self.canvas.get_text()
|
|
129
|
+
|
|
130
|
+
def get_annotations(self) -> List[Annotation]:
|
|
131
|
+
"""Return the annotations extracted from the HTML page."""
|
|
132
|
+
return self.canvas.annotations
|
|
@@ -8,7 +8,7 @@ __author__ = "Albert Weichselbraun, Fabian Odoni"
|
|
|
8
8
|
__author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch"
|
|
9
9
|
__copyright__ = (
|
|
10
10
|
f"{metadata.metadata(PACKAGE)['Name']} "
|
|
11
|
-
+ f"{metadata.metadata(PACKAGE)['Version']} © 2016-
|
|
11
|
+
+ f"{metadata.metadata(PACKAGE)['Version']} © 2016-2025 {__author__}"
|
|
12
12
|
)
|
|
13
13
|
__license__ = metadata.metadata(PACKAGE)["License"]
|
|
14
14
|
__version__ = metadata.metadata(PACKAGE)["Version"]
|
|
@@ -57,16 +57,12 @@ class Attribute:
|
|
|
57
57
|
attributes: the list of attributes
|
|
58
58
|
html_element: the HTML element for which the attributes are parsed
|
|
59
59
|
"""
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
if name in self.attribute_mapping
|
|
64
|
-
)
|
|
65
|
-
for attr_name, attr_value in supported_attributes:
|
|
66
|
-
self.attribute_mapping[attr_name](attr_value, html_element)
|
|
60
|
+
for attr_name, attr_value in attributes.items():
|
|
61
|
+
if attr_name in self.attribute_mapping:
|
|
62
|
+
self.attribute_mapping[attr_name](attr_value, html_element)
|
|
67
63
|
return html_element
|
|
68
64
|
|
|
69
|
-
def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
|
|
65
|
+
def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None:
|
|
70
66
|
attributes = copy(self.attribute_mapping)
|
|
71
67
|
for a in annotations:
|
|
72
68
|
attributes[a.attr] = (
|
|
@@ -17,8 +17,8 @@ textual content to the canvas which is managed by the following three classes:
|
|
|
17
17
|
from inscriptis.annotation import Annotation
|
|
18
18
|
from inscriptis.html_properties import WhiteSpace, Display
|
|
19
19
|
from inscriptis.model.canvas.block import Block
|
|
20
|
-
from inscriptis.model.html_element import HtmlElement
|
|
21
20
|
from inscriptis.model.canvas.prefix import Prefix
|
|
21
|
+
from inscriptis.model.html_element import HtmlElement
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class Canvas:
|
|
@@ -64,10 +64,10 @@ class Canvas:
|
|
|
64
64
|
if tag.display == Display.block:
|
|
65
65
|
self.open_block(tag)
|
|
66
66
|
|
|
67
|
-
def open_block(self, tag: HtmlElement):
|
|
67
|
+
def open_block(self, tag: HtmlElement) -> None:
|
|
68
68
|
"""Open an HTML block element."""
|
|
69
69
|
# write missing bullets, if no content has been written
|
|
70
|
-
if not self.
|
|
70
|
+
if not self.flush_inline() and tag.list_bullet:
|
|
71
71
|
self.write_unconsumed_bullet()
|
|
72
72
|
self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet)
|
|
73
73
|
|
|
@@ -79,7 +79,7 @@ class Canvas:
|
|
|
79
79
|
self.blocks.append("\n" * (required_newlines - 1))
|
|
80
80
|
self.margin = required_margin
|
|
81
81
|
|
|
82
|
-
def write_unconsumed_bullet(self):
|
|
82
|
+
def write_unconsumed_bullet(self) -> None:
|
|
83
83
|
"""Write unconsumed bullets to the blocks list."""
|
|
84
84
|
bullet = self.current_block.prefix.unconsumed_bullet
|
|
85
85
|
if bullet:
|
|
@@ -100,7 +100,7 @@ class Canvas:
|
|
|
100
100
|
"""
|
|
101
101
|
if tag.display == Display.block:
|
|
102
102
|
# write missing bullets, if no content has been written so far.
|
|
103
|
-
if not self.
|
|
103
|
+
if not self.flush_inline() and tag.list_bullet:
|
|
104
104
|
self.write_unconsumed_bullet()
|
|
105
105
|
self.current_block.prefix.remove_last_prefix()
|
|
106
106
|
self.close_block(tag)
|
|
@@ -116,7 +116,7 @@ class Canvas:
|
|
|
116
116
|
Annotation(start_idx, self.current_block.idx, annotation)
|
|
117
117
|
)
|
|
118
118
|
|
|
119
|
-
def close_block(self, tag: HtmlElement):
|
|
119
|
+
def close_block(self, tag: HtmlElement) -> None:
|
|
120
120
|
"""Close the given HtmlElement by writing its bottom margin.
|
|
121
121
|
|
|
122
122
|
Args:
|
|
@@ -128,17 +128,17 @@ class Canvas:
|
|
|
128
128
|
self.blocks.append("\n" * (required_newlines - 1))
|
|
129
129
|
self.margin = tag.margin_after
|
|
130
130
|
|
|
131
|
-
def write_newline(self):
|
|
132
|
-
if not self.
|
|
131
|
+
def write_newline(self) -> None:
|
|
132
|
+
if not self.flush_inline():
|
|
133
133
|
self.blocks.append("")
|
|
134
134
|
self.current_block = self.current_block.new_block()
|
|
135
135
|
|
|
136
136
|
def get_text(self) -> str:
|
|
137
137
|
"""Provide a text representation of the Canvas."""
|
|
138
|
-
self.
|
|
138
|
+
self.flush_inline()
|
|
139
139
|
return "\n".join(self.blocks)
|
|
140
140
|
|
|
141
|
-
def
|
|
141
|
+
def flush_inline(self) -> bool:
|
|
142
142
|
"""Attempt to flush the content in self.current_block into a new block.
|
|
143
143
|
|
|
144
144
|
Notes:
|
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
"""Representation of a text block within the HTML canvas."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
2
4
|
from html import unescape
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
3
7
|
from inscriptis.html_properties import WhiteSpace
|
|
4
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from inscriptis.model.canvas import Prefix
|
|
11
|
+
|
|
5
12
|
|
|
6
13
|
class Block:
|
|
7
14
|
"""The current block of text.
|
|
@@ -19,7 +26,7 @@ class Block:
|
|
|
19
26
|
|
|
20
27
|
__slots__ = ("idx", "prefix", "_content", "collapsable_whitespace")
|
|
21
28
|
|
|
22
|
-
def __init__(self, idx: int, prefix:
|
|
29
|
+
def __init__(self, idx: int, prefix: Prefix):
|
|
23
30
|
self.idx = idx
|
|
24
31
|
self.prefix = prefix
|
|
25
32
|
self._content = ""
|
|
@@ -22,7 +22,7 @@ class Prefix:
|
|
|
22
22
|
self.bullets = []
|
|
23
23
|
self.consumed = False
|
|
24
24
|
|
|
25
|
-
def register_prefix(self, padding_inline, bullet):
|
|
25
|
+
def register_prefix(self, padding_inline: int, bullet: str) -> None:
|
|
26
26
|
"""Register the given prefix.
|
|
27
27
|
|
|
28
28
|
Args:
|
|
@@ -33,13 +33,13 @@ class Prefix:
|
|
|
33
33
|
self.paddings.append(padding_inline)
|
|
34
34
|
self.bullets.append(bullet if bullet else "")
|
|
35
35
|
|
|
36
|
-
def remove_last_prefix(self):
|
|
36
|
+
def remove_last_prefix(self) -> None:
|
|
37
37
|
"""Remove the last prefix from the list."""
|
|
38
38
|
with suppress(IndexError):
|
|
39
39
|
self.current_padding -= self.paddings.pop()
|
|
40
40
|
del self.bullets[-1]
|
|
41
41
|
|
|
42
|
-
def pop_next_bullet(self):
|
|
42
|
+
def pop_next_bullet(self) -> str:
|
|
43
43
|
"""Pop the next bullet to use, if any bullet is available."""
|
|
44
44
|
next_bullet_idx = (
|
|
45
45
|
next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
|
|
@@ -53,7 +53,7 @@ class Prefix:
|
|
|
53
53
|
return bullet
|
|
54
54
|
|
|
55
55
|
@property
|
|
56
|
-
def first(self):
|
|
56
|
+
def first(self) -> str:
|
|
57
57
|
"""Return the prefix used at the beginning of a tag.
|
|
58
58
|
|
|
59
59
|
Note::
|
|
@@ -69,7 +69,7 @@ class Prefix:
|
|
|
69
69
|
return " " * (self.current_padding - len(bullet)) + bullet
|
|
70
70
|
|
|
71
71
|
@property
|
|
72
|
-
def unconsumed_bullet(self):
|
|
72
|
+
def unconsumed_bullet(self) -> str:
|
|
73
73
|
"""Yield any yet unconsumed bullet.
|
|
74
74
|
|
|
75
75
|
Note::
|
|
@@ -87,7 +87,7 @@ class Prefix:
|
|
|
87
87
|
return " " * (padding - len(bullet)) + bullet
|
|
88
88
|
|
|
89
89
|
@property
|
|
90
|
-
def rest(self):
|
|
90
|
+
def rest(self) -> str:
|
|
91
91
|
"""Return the prefix used for new lines within a block.
|
|
92
92
|
|
|
93
93
|
This prefix is used for pre-text that contains newlines. The lines
|