inscriptis 2.3.2__tar.gz → 2.4.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {inscriptis-2.3.2/src/inscriptis.egg-info → inscriptis-2.4.0.1}/PKG-INFO +86 -58
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/README.rst +69 -44
- inscriptis-2.4.0.1/pyproject.toml +63 -0
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/__init__.py +10 -12
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/__init__.py +10 -5
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/html.py +39 -38
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/surface.py +5 -3
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/xml.py +14 -9
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/parser.py +22 -17
- inscriptis-2.4.0.1/src/inscriptis/cli/__init__.py +1 -0
- inscriptis-2.4.0.1/src/inscriptis/cli/inscript.py +214 -0
- inscriptis-2.4.0.1/src/inscriptis/css_profiles.py +64 -0
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/html_engine.py +57 -47
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/html_properties.py +3 -3
- inscriptis-2.4.0.1/src/inscriptis/metadata.py +14 -0
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/attribute.py +18 -9
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/canvas/__init__.py +15 -11
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/canvas/block.py +15 -9
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/canvas/prefix.py +13 -15
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/config.py +13 -12
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/css.py +18 -14
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/html_element.py +56 -36
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/table.py +67 -45
- inscriptis-2.4.0.1/src/inscriptis/service/web.py +55 -0
- inscriptis-2.3.2/AUTHORS +0 -5
- inscriptis-2.3.2/PKG-INFO +0 -647
- inscriptis-2.3.2/scripts/inscript.py +0 -139
- inscriptis-2.3.2/setup.cfg +0 -4
- inscriptis-2.3.2/setup.py +0 -61
- inscriptis-2.3.2/src/inscriptis/css_profiles.py +0 -84
- inscriptis-2.3.2/src/inscriptis/metadata.py +0 -7
- inscriptis-2.3.2/src/inscriptis/service/web.py +0 -44
- inscriptis-2.3.2/src/inscriptis.egg-info/SOURCES.txt +0 -32
- inscriptis-2.3.2/src/inscriptis.egg-info/dependency_links.txt +0 -1
- inscriptis-2.3.2/src/inscriptis.egg-info/requires.txt +0 -2
- inscriptis-2.3.2/src/inscriptis.egg-info/top_level.txt +0 -1
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/LICENSE +0 -0
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/__init__.py +0 -0
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/__init__.py +0 -0
- {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/service/__init__.py +0 -0
|
@@ -1,29 +1,33 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: inscriptis
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.4.0.1
|
|
4
4
|
Summary: inscriptis - HTML to text converter.
|
|
5
5
|
Home-page: https://github.com/weblyzard/inscriptis
|
|
6
|
-
|
|
7
|
-
Author-email: albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch
|
|
8
|
-
License: Apache 2.0
|
|
6
|
+
License: Apache-2.0
|
|
9
7
|
Keywords: HTML,converter,text
|
|
10
|
-
|
|
8
|
+
Author: Albert Weichselbraun
|
|
9
|
+
Author-email: albert.weichselbraun@fhgr.ch
|
|
10
|
+
Requires-Python: >=3.8,<4.0
|
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
-
Classifier: Topic :: Text Processing
|
|
15
|
-
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
16
|
-
Classifier: Topic :: Utilities
|
|
17
14
|
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.6
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.7
|
|
20
15
|
Classifier: Programming Language :: Python :: 3.8
|
|
21
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
22
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
23
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Text Processing
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
22
|
+
Classifier: Topic :: Utilities
|
|
23
|
+
Provides-Extra: web-service
|
|
24
|
+
Requires-Dist: fastapi (>=0.109.0,<0.110.0) ; extra == "web-service"
|
|
25
|
+
Requires-Dist: lxml (>=4.9.3)
|
|
26
|
+
Requires-Dist: requests (>=2.31.0)
|
|
27
|
+
Requires-Dist: uvicorn (>=0.25.0,<0.26.0) ; extra == "web-service"
|
|
28
|
+
Project-URL: Documentation, https://inscriptis.readthedocs.io/en
|
|
29
|
+
Project-URL: Repository, https://github.com/weblyzard/inscriptis
|
|
30
|
+
Description-Content-Type: text/x-rst
|
|
27
31
|
|
|
28
32
|
==================================================================================
|
|
29
33
|
inscriptis -- HTML to text conversion library, command line client and Web service
|
|
@@ -158,9 +162,9 @@ the corresponding text representation.
|
|
|
158
162
|
Command line parameters
|
|
159
163
|
-----------------------
|
|
160
164
|
|
|
161
|
-
The inscript
|
|
165
|
+
The inscript command line client supports the following parameters::
|
|
162
166
|
|
|
163
|
-
usage: inscript
|
|
167
|
+
usage: inscript [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
|
|
164
168
|
[--table-cell-separator TABLE_CELL_SEPARATOR] [-v]
|
|
165
169
|
[input]
|
|
166
170
|
|
|
@@ -199,19 +203,19 @@ HTML to text conversion
|
|
|
199
203
|
-----------------------
|
|
200
204
|
convert the given page to text and output the result to the screen::
|
|
201
205
|
|
|
202
|
-
$ inscript
|
|
206
|
+
$ inscript https://www.fhgr.ch
|
|
203
207
|
|
|
204
208
|
convert the file to text and save the output to fhgr.txt::
|
|
205
209
|
|
|
206
|
-
$ inscript
|
|
210
|
+
$ inscript fhgr.html -o fhgr.txt
|
|
207
211
|
|
|
208
212
|
convert the file using strict indentation (i.e., minimize indentation and extra spaces) and save the output to fhgr-layout-optimized.txt::
|
|
209
213
|
|
|
210
|
-
$ inscript
|
|
214
|
+
$ inscript --indentation strict fhgr.html -o fhgr-layout-optimized.txt
|
|
211
215
|
|
|
212
216
|
convert HTML provided via stdin and save the output to output.txt::
|
|
213
217
|
|
|
214
|
-
$ echo "<body><p>Make it so!</p></body>" | inscript
|
|
218
|
+
$ echo "<body><p>Make it so!</p></body>" | inscript -o output.txt
|
|
215
219
|
|
|
216
220
|
|
|
217
221
|
HTML to annotated text conversion
|
|
@@ -220,7 +224,7 @@ convert and annotate HTML from a Web page using the provided annotation rules.
|
|
|
220
224
|
|
|
221
225
|
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
|
|
222
226
|
|
|
223
|
-
$ inscript
|
|
227
|
+
$ inscript https://www.fhgr.ch -r annotation-profile.json
|
|
224
228
|
|
|
225
229
|
The annotation rules are specified in `annotation-profile.json`:
|
|
226
230
|
|
|
@@ -268,7 +272,7 @@ Annotation postprocessors enable the post processing of annotations to formats
|
|
|
268
272
|
that are suitable for your particular application. Post processors can be
|
|
269
273
|
specified with the ``-p`` or ``--postprocessor`` command line argument::
|
|
270
274
|
|
|
271
|
-
$ inscript
|
|
275
|
+
$ inscript https://www.fhgr.ch \
|
|
272
276
|
-r ./examples/annotation-profile.json \
|
|
273
277
|
-p surface
|
|
274
278
|
|
|
@@ -313,7 +317,7 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
313
317
|
|
|
314
318
|
.. code-block:: bash
|
|
315
319
|
|
|
316
|
-
inscript
|
|
320
|
+
inscript --annotation-rules ./wikipedia.json \
|
|
317
321
|
--postprocessor html \
|
|
318
322
|
https://en.wikipedia.org/wiki/Chur.html
|
|
319
323
|
|
|
@@ -338,14 +342,18 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
338
342
|
Web Service
|
|
339
343
|
===========
|
|
340
344
|
|
|
341
|
-
|
|
345
|
+
A FastAPI-based Web Service that uses Inscriptis for translating HTML pages to plain text.
|
|
342
346
|
|
|
343
347
|
Run the Web Service on your host system
|
|
344
348
|
---------------------------------------
|
|
345
|
-
|
|
349
|
+
Install the optional feature ``web-service`` for inscriptis::
|
|
350
|
+
|
|
351
|
+
$ pip install inscriptis[web-service]
|
|
352
|
+
|
|
353
|
+
Start the Inscriptis Web service with the following command::
|
|
354
|
+
|
|
355
|
+
$ uvicorn inscriptis.service.web:app --port 5000 --host 127.0.0.1
|
|
346
356
|
|
|
347
|
-
$ export FLASK_APP="inscriptis.service.web"
|
|
348
|
-
$ python3 -m flask run
|
|
349
357
|
|
|
350
358
|
Run the Web Service with Docker
|
|
351
359
|
-------------------------------
|
|
@@ -526,7 +534,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
|
|
|
526
534
|
1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the
|
|
527
535
|
parameter ``indentation='extended'`` to also use indentation for tags such as
|
|
528
536
|
``<div>`` and ``<span>`` that do not provide indentation in their standard
|
|
529
|
-
definition. This strategy is the default in ``inscript
|
|
537
|
+
definition. This strategy is the default in ``inscript`` and many other
|
|
530
538
|
tools such as Lynx. If you do not want extended indentation you can use the
|
|
531
539
|
parameter ``indentation='standard'`` instead.
|
|
532
540
|
|
|
@@ -551,36 +559,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
|
|
|
551
559
|
html_tree = fromstring(html)
|
|
552
560
|
# create a parser using a custom css
|
|
553
561
|
config = ParserConfig(css=css)
|
|
554
|
-
parser = Inscriptis(html_tree, config)
|
|
555
|
-
[--indentation INDENTATION] [-v]
|
|
556
|
-
[input]
|
|
557
|
-
|
|
558
|
-
Convert the given HTML document to text.
|
|
559
|
-
|
|
560
|
-
positional arguments:
|
|
561
|
-
input Html input either from a file or a URL (default:stdin).
|
|
562
|
-
|
|
563
|
-
optional arguments:
|
|
564
|
-
-h, --help show this help message and exit
|
|
565
|
-
-o OUTPUT, --output OUTPUT
|
|
566
|
-
Output file (default:stdout).
|
|
567
|
-
-e ENCODING, --encoding ENCODING
|
|
568
|
-
Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
|
|
569
|
-
-i, --display-image-captions
|
|
570
|
-
Display image captions (default:false).
|
|
571
|
-
-d, --deduplicate-image-captions
|
|
572
|
-
Deduplicate image captions (default:false).
|
|
573
|
-
-l, --display-link-targets
|
|
574
|
-
Display link targets (default:false).
|
|
575
|
-
-a, --display-anchor-urls
|
|
576
|
-
Display anchor URLs (default:false).
|
|
577
|
-
-r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
|
|
578
|
-
Path to an optional JSON file containing rules for annotating the retrieved text.
|
|
579
|
-
-p POSTPROCESSOR, --postprocessor POSTPROCESSOR
|
|
580
|
-
Optional component for postprocessing the result (html, surface, xml).
|
|
581
|
-
--indentation INDENTATION
|
|
582
|
-
How to handle indentation (extended or strict; default: extended).
|
|
583
|
-
-v, --version display version information
|
|
562
|
+
parser = Inscriptis(html_tree, config)
|
|
584
563
|
text = parser.get_text()
|
|
585
564
|
|
|
586
565
|
|
|
@@ -616,6 +595,56 @@ The following code mitigates this problem on Unix systems by manually forcing lx
|
|
|
616
595
|
return libc.malloc_trim(0)
|
|
617
596
|
|
|
618
597
|
|
|
598
|
+
Examples
|
|
599
|
+
========
|
|
600
|
+
|
|
601
|
+
Strict indentation handling
|
|
602
|
+
---------------------------
|
|
603
|
+
|
|
604
|
+
The following example demonstrates modifying ``ParserConfig`` for strict indentation handling.
|
|
605
|
+
|
|
606
|
+
.. code-block:: python
|
|
607
|
+
|
|
608
|
+
from inscriptis import get_text
|
|
609
|
+
from inscriptis.css_profiles import CSS_PROFILES
|
|
610
|
+
from inscriptis.model.config import ParserConfig
|
|
611
|
+
|
|
612
|
+
config = ParserConfig(css=CSS_PROFILES['strict'].copy())
|
|
613
|
+
text = get_text('fi<span>r</span>st', config)
|
|
614
|
+
print(text)
|
|
615
|
+
|
|
616
|
+
Ignore elements during parsing
|
|
617
|
+
------------------------------
|
|
618
|
+
|
|
619
|
+
Overwriting the default CSS profile also allows changing the rendering of selected elements.
|
|
620
|
+
The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``.
|
|
621
|
+
|
|
622
|
+
.. code-block:: python
|
|
623
|
+
|
|
624
|
+
from inscriptis import get_text
|
|
625
|
+
from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
|
|
626
|
+
from inscriptis.html_properties import Display
|
|
627
|
+
from inscriptis.model.config import ParserConfig
|
|
628
|
+
|
|
629
|
+
# create a custom CSS based on the default style sheet and change the
|
|
630
|
+
# rendering of `form` elements
|
|
631
|
+
css = CSS_PROFILES['strict'].copy()
|
|
632
|
+
css['form'] = HtmlElement(display=Display.none)
|
|
633
|
+
|
|
634
|
+
# create a parser configuration using a custom css
|
|
635
|
+
html = """First line.
|
|
636
|
+
<form>
|
|
637
|
+
User data
|
|
638
|
+
<label for="name">Name:</label><br>
|
|
639
|
+
<input type="text" id="name" name="name"><br>
|
|
640
|
+
<label for="pass">Password:</label><br>
|
|
641
|
+
<input type="hidden" id="pass" name="pass">
|
|
642
|
+
</form>"""
|
|
643
|
+
config = ParserConfig(css=css)
|
|
644
|
+
text = get_text(html, config)
|
|
645
|
+
print(text)
|
|
646
|
+
|
|
647
|
+
|
|
619
648
|
Citation
|
|
620
649
|
========
|
|
621
650
|
|
|
@@ -644,4 +673,3 @@ A full list of changes can be found in the
|
|
|
644
673
|
`release notes <https://github.com/weblyzard/inscriptis/releases>`_.
|
|
645
674
|
|
|
646
675
|
|
|
647
|
-
|
|
@@ -131,9 +131,9 @@ the corresponding text representation.
|
|
|
131
131
|
Command line parameters
|
|
132
132
|
-----------------------
|
|
133
133
|
|
|
134
|
-
The inscript
|
|
134
|
+
The inscript command line client supports the following parameters::
|
|
135
135
|
|
|
136
|
-
usage: inscript
|
|
136
|
+
usage: inscript [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
|
|
137
137
|
[--table-cell-separator TABLE_CELL_SEPARATOR] [-v]
|
|
138
138
|
[input]
|
|
139
139
|
|
|
@@ -172,19 +172,19 @@ HTML to text conversion
|
|
|
172
172
|
-----------------------
|
|
173
173
|
convert the given page to text and output the result to the screen::
|
|
174
174
|
|
|
175
|
-
$ inscript
|
|
175
|
+
$ inscript https://www.fhgr.ch
|
|
176
176
|
|
|
177
177
|
convert the file to text and save the output to fhgr.txt::
|
|
178
178
|
|
|
179
|
-
$ inscript
|
|
179
|
+
$ inscript fhgr.html -o fhgr.txt
|
|
180
180
|
|
|
181
181
|
convert the file using strict indentation (i.e., minimize indentation and extra spaces) and save the output to fhgr-layout-optimized.txt::
|
|
182
182
|
|
|
183
|
-
$ inscript
|
|
183
|
+
$ inscript --indentation strict fhgr.html -o fhgr-layout-optimized.txt
|
|
184
184
|
|
|
185
185
|
convert HTML provided via stdin and save the output to output.txt::
|
|
186
186
|
|
|
187
|
-
$ echo "<body><p>Make it so!</p></body>" | inscript
|
|
187
|
+
$ echo "<body><p>Make it so!</p></body>" | inscript -o output.txt
|
|
188
188
|
|
|
189
189
|
|
|
190
190
|
HTML to annotated text conversion
|
|
@@ -193,7 +193,7 @@ convert and annotate HTML from a Web page using the provided annotation rules.
|
|
|
193
193
|
|
|
194
194
|
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
|
|
195
195
|
|
|
196
|
-
$ inscript
|
|
196
|
+
$ inscript https://www.fhgr.ch -r annotation-profile.json
|
|
197
197
|
|
|
198
198
|
The annotation rules are specified in `annotation-profile.json`:
|
|
199
199
|
|
|
@@ -241,7 +241,7 @@ Annotation postprocessors enable the post processing of annotations to formats
|
|
|
241
241
|
that are suitable for your particular application. Post processors can be
|
|
242
242
|
specified with the ``-p`` or ``--postprocessor`` command line argument::
|
|
243
243
|
|
|
244
|
-
$ inscript
|
|
244
|
+
$ inscript https://www.fhgr.ch \
|
|
245
245
|
-r ./examples/annotation-profile.json \
|
|
246
246
|
-p surface
|
|
247
247
|
|
|
@@ -286,7 +286,7 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
286
286
|
|
|
287
287
|
.. code-block:: bash
|
|
288
288
|
|
|
289
|
-
inscript
|
|
289
|
+
inscript --annotation-rules ./wikipedia.json \
|
|
290
290
|
--postprocessor html \
|
|
291
291
|
https://en.wikipedia.org/wiki/Chur.html
|
|
292
292
|
|
|
@@ -311,14 +311,18 @@ Currently, inscriptis supports the following postprocessors:
|
|
|
311
311
|
Web Service
|
|
312
312
|
===========
|
|
313
313
|
|
|
314
|
-
|
|
314
|
+
A FastAPI-based Web Service that uses Inscriptis for translating HTML pages to plain text.
|
|
315
315
|
|
|
316
316
|
Run the Web Service on your host system
|
|
317
317
|
---------------------------------------
|
|
318
|
-
|
|
318
|
+
Install the optional feature ``web-service`` for inscriptis::
|
|
319
|
+
|
|
320
|
+
$ pip install inscriptis[web-service]
|
|
321
|
+
|
|
322
|
+
Start the Inscriptis Web service with the following command::
|
|
323
|
+
|
|
324
|
+
$ uvicorn inscriptis.service.web:app --port 5000 --host 127.0.0.1
|
|
319
325
|
|
|
320
|
-
$ export FLASK_APP="inscriptis.service.web"
|
|
321
|
-
$ python3 -m flask run
|
|
322
326
|
|
|
323
327
|
Run the Web Service with Docker
|
|
324
328
|
-------------------------------
|
|
@@ -499,7 +503,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
|
|
|
499
503
|
1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the
|
|
500
504
|
parameter ``indentation='extended'`` to also use indentation for tags such as
|
|
501
505
|
``<div>`` and ``<span>`` that do not provide indentation in their standard
|
|
502
|
-
definition. This strategy is the default in ``inscript
|
|
506
|
+
definition. This strategy is the default in ``inscript`` and many other
|
|
503
507
|
tools such as Lynx. If you do not want extended indentation you can use the
|
|
504
508
|
parameter ``indentation='standard'`` instead.
|
|
505
509
|
|
|
@@ -524,36 +528,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
|
|
|
524
528
|
html_tree = fromstring(html)
|
|
525
529
|
# create a parser using a custom css
|
|
526
530
|
config = ParserConfig(css=css)
|
|
527
|
-
parser = Inscriptis(html_tree, config)
|
|
528
|
-
[--indentation INDENTATION] [-v]
|
|
529
|
-
[input]
|
|
530
|
-
|
|
531
|
-
Convert the given HTML document to text.
|
|
532
|
-
|
|
533
|
-
positional arguments:
|
|
534
|
-
input Html input either from a file or a URL (default:stdin).
|
|
535
|
-
|
|
536
|
-
optional arguments:
|
|
537
|
-
-h, --help show this help message and exit
|
|
538
|
-
-o OUTPUT, --output OUTPUT
|
|
539
|
-
Output file (default:stdout).
|
|
540
|
-
-e ENCODING, --encoding ENCODING
|
|
541
|
-
Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
|
|
542
|
-
-i, --display-image-captions
|
|
543
|
-
Display image captions (default:false).
|
|
544
|
-
-d, --deduplicate-image-captions
|
|
545
|
-
Deduplicate image captions (default:false).
|
|
546
|
-
-l, --display-link-targets
|
|
547
|
-
Display link targets (default:false).
|
|
548
|
-
-a, --display-anchor-urls
|
|
549
|
-
Display anchor URLs (default:false).
|
|
550
|
-
-r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
|
|
551
|
-
Path to an optional JSON file containing rules for annotating the retrieved text.
|
|
552
|
-
-p POSTPROCESSOR, --postprocessor POSTPROCESSOR
|
|
553
|
-
Optional component for postprocessing the result (html, surface, xml).
|
|
554
|
-
--indentation INDENTATION
|
|
555
|
-
How to handle indentation (extended or strict; default: extended).
|
|
556
|
-
-v, --version display version information
|
|
531
|
+
parser = Inscriptis(html_tree, config)
|
|
557
532
|
text = parser.get_text()
|
|
558
533
|
|
|
559
534
|
|
|
@@ -589,6 +564,56 @@ The following code mitigates this problem on Unix systems by manually forcing lx
|
|
|
589
564
|
return libc.malloc_trim(0)
|
|
590
565
|
|
|
591
566
|
|
|
567
|
+
Examples
|
|
568
|
+
========
|
|
569
|
+
|
|
570
|
+
Strict indentation handling
|
|
571
|
+
---------------------------
|
|
572
|
+
|
|
573
|
+
The following example demonstrates modifying ``ParserConfig`` for strict indentation handling.
|
|
574
|
+
|
|
575
|
+
.. code-block:: python
|
|
576
|
+
|
|
577
|
+
from inscriptis import get_text
|
|
578
|
+
from inscriptis.css_profiles import CSS_PROFILES
|
|
579
|
+
from inscriptis.model.config import ParserConfig
|
|
580
|
+
|
|
581
|
+
config = ParserConfig(css=CSS_PROFILES['strict'].copy())
|
|
582
|
+
text = get_text('fi<span>r</span>st', config)
|
|
583
|
+
print(text)
|
|
584
|
+
|
|
585
|
+
Ignore elements during parsing
|
|
586
|
+
------------------------------
|
|
587
|
+
|
|
588
|
+
Overwriting the default CSS profile also allows changing the rendering of selected elements.
|
|
589
|
+
The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``.
|
|
590
|
+
|
|
591
|
+
.. code-block:: python
|
|
592
|
+
|
|
593
|
+
from inscriptis import get_text
|
|
594
|
+
from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
|
|
595
|
+
from inscriptis.html_properties import Display
|
|
596
|
+
from inscriptis.model.config import ParserConfig
|
|
597
|
+
|
|
598
|
+
# create a custom CSS based on the default style sheet and change the
|
|
599
|
+
# rendering of `form` elements
|
|
600
|
+
css = CSS_PROFILES['strict'].copy()
|
|
601
|
+
css['form'] = HtmlElement(display=Display.none)
|
|
602
|
+
|
|
603
|
+
# create a parser configuration using a custom css
|
|
604
|
+
html = """First line.
|
|
605
|
+
<form>
|
|
606
|
+
User data
|
|
607
|
+
<label for="name">Name:</label><br>
|
|
608
|
+
<input type="text" id="name" name="name"><br>
|
|
609
|
+
<label for="pass">Password:</label><br>
|
|
610
|
+
<input type="hidden" id="pass" name="pass">
|
|
611
|
+
</form>"""
|
|
612
|
+
config = ParserConfig(css=css)
|
|
613
|
+
text = get_text(html, config)
|
|
614
|
+
print(text)
|
|
615
|
+
|
|
616
|
+
|
|
592
617
|
Citation
|
|
593
618
|
========
|
|
594
619
|
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "inscriptis"
|
|
3
|
+
version = "2.4.0.1"
|
|
4
|
+
authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
|
|
5
|
+
description = "inscriptis - HTML to text converter."
|
|
6
|
+
keywords = ["HTML", "converter", "text"]
|
|
7
|
+
classifiers = [
|
|
8
|
+
'Development Status :: 5 - Production/Stable',
|
|
9
|
+
'Intended Audience :: Developers',
|
|
10
|
+
'License :: OSI Approved :: Apache Software License',
|
|
11
|
+
'Topic :: Text Processing',
|
|
12
|
+
'Topic :: Text Processing :: Markup :: HTML',
|
|
13
|
+
'Topic :: Utilities',
|
|
14
|
+
'Programming Language :: Python :: 3',
|
|
15
|
+
'Programming Language :: Python :: 3.8',
|
|
16
|
+
'Programming Language :: Python :: 3.9',
|
|
17
|
+
'Programming Language :: Python :: 3.10',
|
|
18
|
+
'Programming Language :: Python :: 3.11',
|
|
19
|
+
'Programming Language :: Python :: 3.12',
|
|
20
|
+
]
|
|
21
|
+
homepage = "https://github.com/weblyzard/inscriptis"
|
|
22
|
+
repository = "https://github.com/weblyzard/inscriptis"
|
|
23
|
+
documentation = "https://inscriptis.readthedocs.io/en"
|
|
24
|
+
license = "Apache-2.0"
|
|
25
|
+
readme = "README.rst"
|
|
26
|
+
|
|
27
|
+
packages = [
|
|
28
|
+
{include = "inscriptis", from="src"},
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
[tool.poetry.scripts]
|
|
33
|
+
inscript = "inscriptis.cli.inscript:cli"
|
|
34
|
+
inscriptis-api = "inscriptis.service.web:start"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
[tool.poetry.extras]
|
|
38
|
+
web-service = ["fastapi", "uvicorn"]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
[tool.poetry.dependencies]
|
|
42
|
+
python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12"
|
|
43
|
+
requests = ">=2.31.0"
|
|
44
|
+
lxml = ">=4.9.3"
|
|
45
|
+
|
|
46
|
+
# optional dependencies
|
|
47
|
+
fastapi = { version = "^0.109.0", optional = true }
|
|
48
|
+
uvicorn = { version = "^0.25.0", optional = true }
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
[build-system]
|
|
52
|
+
requires = ["poetry-core"]
|
|
53
|
+
build-backend = "poetry.core.masonry.api"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# code formatting with black
|
|
57
|
+
[tool.black]
|
|
58
|
+
line-length = 88
|
|
59
|
+
target-version = ["py38", "py39", "py310", "py311", "py312"]
|
|
60
|
+
extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
|
|
61
|
+
include = '''
|
|
62
|
+
^/src/|^/tests/|^/benchmarking/
|
|
63
|
+
'''
|
|
@@ -68,7 +68,7 @@ from typing import Dict, Optional, Any
|
|
|
68
68
|
from inscriptis.model.config import ParserConfig
|
|
69
69
|
from inscriptis.html_engine import Inscriptis
|
|
70
70
|
|
|
71
|
-
RE_STRIP_XML_DECLARATION = re.compile(r
|
|
71
|
+
RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
|
|
72
72
|
|
|
73
73
|
|
|
74
74
|
def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
|
|
@@ -85,13 +85,13 @@ def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
|
|
|
85
85
|
return None
|
|
86
86
|
|
|
87
87
|
# strip XML declaration, if necessary
|
|
88
|
-
if html_content.startswith(
|
|
89
|
-
html_content = RE_STRIP_XML_DECLARATION.sub(
|
|
88
|
+
if html_content.startswith("<?xml "):
|
|
89
|
+
html_content = RE_STRIP_XML_DECLARATION.sub("", html_content, count=1)
|
|
90
90
|
|
|
91
91
|
try:
|
|
92
92
|
return fromstring(html_content)
|
|
93
93
|
except ParserError:
|
|
94
|
-
return fromstring(
|
|
94
|
+
return fromstring("<pre>" + html_content + "</pre>")
|
|
95
95
|
|
|
96
96
|
|
|
97
97
|
def get_text(html_content: str, config: ParserConfig = None) -> str:
|
|
@@ -105,12 +105,12 @@ def get_text(html_content: str, config: ParserConfig = None) -> str:
|
|
|
105
105
|
The text representation of the HTML content.
|
|
106
106
|
"""
|
|
107
107
|
html_tree = _get_html_tree(html_content)
|
|
108
|
-
return Inscriptis(html_tree, config).get_text() if html_tree is not None
|
|
109
|
-
else ''
|
|
108
|
+
return Inscriptis(html_tree, config).get_text() if html_tree is not None else ""
|
|
110
109
|
|
|
111
110
|
|
|
112
|
-
def get_annotated_text(
|
|
113
|
-
|
|
111
|
+
def get_annotated_text(
|
|
112
|
+
html_content: str, config: ParserConfig = None
|
|
113
|
+
) -> Dict[str, Any]:
|
|
114
114
|
"""Return a dictionary of the extracted text and annotations.
|
|
115
115
|
|
|
116
116
|
Notes:
|
|
@@ -132,7 +132,5 @@ def get_annotated_text(html_content: str,
|
|
|
132
132
|
return {}
|
|
133
133
|
|
|
134
134
|
inscriptis = Inscriptis(html_tree, config)
|
|
135
|
-
labels = [(a.start, a.end, a.metadata)
|
|
136
|
-
|
|
137
|
-
return {'text': inscriptis.get_text(),
|
|
138
|
-
'label': labels}
|
|
135
|
+
labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
|
|
136
|
+
return {"text": inscriptis.get_text(), "label": labels}
|
|
@@ -29,9 +29,13 @@ class Annotation(NamedTuple):
|
|
|
29
29
|
"""a tuple of tags to be attached to the annotation."""
|
|
30
30
|
|
|
31
31
|
|
|
32
|
-
def horizontal_shift(
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
def horizontal_shift(
|
|
33
|
+
annotations: List[Annotation],
|
|
34
|
+
content_width: int,
|
|
35
|
+
line_width: int,
|
|
36
|
+
align: HorizontalAlignment,
|
|
37
|
+
shift: int = 0,
|
|
38
|
+
) -> List[Annotation]:
|
|
35
39
|
r"""Shift annotations based on the given line's formatting.
|
|
36
40
|
|
|
37
41
|
Adjusts the start and end indices of annotations based on the line's
|
|
@@ -56,5 +60,6 @@ def horizontal_shift(annotations: List[Annotation], content_width: int,
|
|
|
56
60
|
else:
|
|
57
61
|
h_align = shift + (line_width - content_width) // 2
|
|
58
62
|
|
|
59
|
-
return [
|
|
60
|
-
|
|
63
|
+
return [
|
|
64
|
+
Annotation(a.start + h_align, a.end + h_align, a.metadata) for a in annotations
|
|
65
|
+
]
|