inscriptis 2.3.2__tar.gz → 2.4.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {inscriptis-2.3.2/src/inscriptis.egg-info → inscriptis-2.4.0.1}/PKG-INFO +86 -58
  2. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/README.rst +69 -44
  3. inscriptis-2.4.0.1/pyproject.toml +63 -0
  4. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/__init__.py +10 -12
  5. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/__init__.py +10 -5
  6. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/html.py +39 -38
  7. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/surface.py +5 -3
  8. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/xml.py +14 -9
  9. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/parser.py +22 -17
  10. inscriptis-2.4.0.1/src/inscriptis/cli/__init__.py +1 -0
  11. inscriptis-2.4.0.1/src/inscriptis/cli/inscript.py +214 -0
  12. inscriptis-2.4.0.1/src/inscriptis/css_profiles.py +64 -0
  13. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/html_engine.py +57 -47
  14. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/html_properties.py +3 -3
  15. inscriptis-2.4.0.1/src/inscriptis/metadata.py +14 -0
  16. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/attribute.py +18 -9
  17. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/canvas/__init__.py +15 -11
  18. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/canvas/block.py +15 -9
  19. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/canvas/prefix.py +13 -15
  20. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/config.py +13 -12
  21. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/css.py +18 -14
  22. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/html_element.py +56 -36
  23. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/table.py +67 -45
  24. inscriptis-2.4.0.1/src/inscriptis/service/web.py +55 -0
  25. inscriptis-2.3.2/AUTHORS +0 -5
  26. inscriptis-2.3.2/PKG-INFO +0 -647
  27. inscriptis-2.3.2/scripts/inscript.py +0 -139
  28. inscriptis-2.3.2/setup.cfg +0 -4
  29. inscriptis-2.3.2/setup.py +0 -61
  30. inscriptis-2.3.2/src/inscriptis/css_profiles.py +0 -84
  31. inscriptis-2.3.2/src/inscriptis/metadata.py +0 -7
  32. inscriptis-2.3.2/src/inscriptis/service/web.py +0 -44
  33. inscriptis-2.3.2/src/inscriptis.egg-info/SOURCES.txt +0 -32
  34. inscriptis-2.3.2/src/inscriptis.egg-info/dependency_links.txt +0 -1
  35. inscriptis-2.3.2/src/inscriptis.egg-info/requires.txt +0 -2
  36. inscriptis-2.3.2/src/inscriptis.egg-info/top_level.txt +0 -1
  37. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/LICENSE +0 -0
  38. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/annotation/output/__init__.py +0 -0
  39. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/model/__init__.py +0 -0
  40. {inscriptis-2.3.2 → inscriptis-2.4.0.1}/src/inscriptis/service/__init__.py +0 -0
@@ -1,29 +1,33 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: inscriptis
3
- Version: 2.3.2
3
+ Version: 2.4.0.1
4
4
  Summary: inscriptis - HTML to text converter.
5
5
  Home-page: https://github.com/weblyzard/inscriptis
6
- Author: Albert Weichselbraun, Fabian Odoni
7
- Author-email: albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch
8
- License: Apache 2.0
6
+ License: Apache-2.0
9
7
  Keywords: HTML,converter,text
10
- Platform: UNKNOWN
8
+ Author: Albert Weichselbraun
9
+ Author-email: albert.weichselbraun@fhgr.ch
10
+ Requires-Python: >=3.8,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: License :: OSI Approved :: Apache Software License
14
- Classifier: Topic :: Text Processing
15
- Classifier: Topic :: Text Processing :: Markup :: HTML
16
- Classifier: Topic :: Utilities
17
14
  Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.6
19
- Classifier: Programming Language :: Python :: 3.7
20
15
  Classifier: Programming Language :: Python :: 3.8
21
16
  Classifier: Programming Language :: Python :: 3.9
22
17
  Classifier: Programming Language :: Python :: 3.10
23
18
  Classifier: Programming Language :: Python :: 3.11
24
- Requires-Python: >=3.6
25
- License-File: LICENSE
26
- License-File: AUTHORS
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Text Processing
21
+ Classifier: Topic :: Text Processing :: Markup :: HTML
22
+ Classifier: Topic :: Utilities
23
+ Provides-Extra: web-service
24
+ Requires-Dist: fastapi (>=0.109.0,<0.110.0) ; extra == "web-service"
25
+ Requires-Dist: lxml (>=4.9.3)
26
+ Requires-Dist: requests (>=2.31.0)
27
+ Requires-Dist: uvicorn (>=0.25.0,<0.26.0) ; extra == "web-service"
28
+ Project-URL: Documentation, https://inscriptis.readthedocs.io/en
29
+ Project-URL: Repository, https://github.com/weblyzard/inscriptis
30
+ Description-Content-Type: text/x-rst
27
31
 
28
32
  ==================================================================================
29
33
  inscriptis -- HTML to text conversion library, command line client and Web service
@@ -158,9 +162,9 @@ the corresponding text representation.
158
162
  Command line parameters
159
163
  -----------------------
160
164
 
161
- The inscript.py command line client supports the following parameters::
165
+ The inscript command line client supports the following parameters::
162
166
 
163
- usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
167
+ usage: inscript [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
164
168
  [--table-cell-separator TABLE_CELL_SEPARATOR] [-v]
165
169
  [input]
166
170
 
@@ -199,19 +203,19 @@ HTML to text conversion
199
203
  -----------------------
200
204
  convert the given page to text and output the result to the screen::
201
205
 
202
- $ inscript.py https://www.fhgr.ch
206
+ $ inscript https://www.fhgr.ch
203
207
 
204
208
  convert the file to text and save the output to fhgr.txt::
205
209
 
206
- $ inscript.py fhgr.html -o fhgr.txt
210
+ $ inscript fhgr.html -o fhgr.txt
207
211
 
208
212
  convert the file using strict indentation (i.e., minimize indentation and extra spaces) and save the output to fhgr-layout-optimized.txt::
209
213
 
210
- $ inscript.py --indentation strict fhgr.html -o fhgr-layout-optimized.txt
214
+ $ inscript --indentation strict fhgr.html -o fhgr-layout-optimized.txt
211
215
 
212
216
  convert HTML provided via stdin and save the output to output.txt::
213
217
 
214
- $ echo "<body><p>Make it so!</p></body>" | inscript.py -o output.txt
218
+ $ echo "<body><p>Make it so!</p></body>" | inscript -o output.txt
215
219
 
216
220
 
217
221
  HTML to annotated text conversion
@@ -220,7 +224,7 @@ convert and annotate HTML from a Web page using the provided annotation rules.
220
224
 
221
225
  Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
222
226
 
223
- $ inscript.py https://www.fhgr.ch -r annotation-profile.json
227
+ $ inscript https://www.fhgr.ch -r annotation-profile.json
224
228
 
225
229
  The annotation rules are specified in `annotation-profile.json`:
226
230
 
@@ -268,7 +272,7 @@ Annotation postprocessors enable the post processing of annotations to formats
268
272
  that are suitable for your particular application. Post processors can be
269
273
  specified with the ``-p`` or ``--postprocessor`` command line argument::
270
274
 
271
- $ inscript.py https://www.fhgr.ch \
275
+ $ inscript https://www.fhgr.ch \
272
276
  -r ./examples/annotation-profile.json \
273
277
  -p surface
274
278
 
@@ -313,7 +317,7 @@ Currently, inscriptis supports the following postprocessors:
313
317
 
314
318
  .. code-block:: bash
315
319
 
316
- inscript.py --annotation-rules ./wikipedia.json \
320
+ inscript --annotation-rules ./wikipedia.json \
317
321
  --postprocessor html \
318
322
  https://en.wikipedia.org/wiki/Chur.html
319
323
 
@@ -338,14 +342,18 @@ Currently, inscriptis supports the following postprocessors:
338
342
  Web Service
339
343
  ===========
340
344
 
341
- The Flask Web Service translates HTML pages to the corresponding plain text.
345
+ A FastAPI-based Web Service that uses Inscriptis for translating HTML pages to plain text.
342
346
 
343
347
  Run the Web Service on your host system
344
348
  ---------------------------------------
345
- Provide additional requirement `python3-flask <https://flask.palletsprojects.com/en/2.2.x/>`_, then start the inscriptis Web service with the following command::
349
+ Install the optional feature ``web-service`` for inscriptis::
350
+
351
+ $ pip install inscriptis[web-service]
352
+
353
+ Start the Inscriptis Web service with the following command::
354
+
355
+ $ uvicorn inscriptis.service.web:app --port 5000 --host 127.0.0.1
346
356
 
347
- $ export FLASK_APP="inscriptis.service.web"
348
- $ python3 -m flask run
349
357
 
350
358
  Run the Web Service with Docker
351
359
  -------------------------------
@@ -526,7 +534,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
526
534
  1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the
527
535
  parameter ``indentation='extended'`` to also use indentation for tags such as
528
536
  ``<div>`` and ``<span>`` that do not provide indentation in their standard
529
- definition. This strategy is the default in ``inscript.py`` and many other
537
+ definition. This strategy is the default in ``inscript`` and many other
530
538
  tools such as Lynx. If you do not want extended indentation you can use the
531
539
  parameter ``indentation='standard'`` instead.
532
540
 
@@ -551,36 +559,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
551
559
  html_tree = fromstring(html)
552
560
  # create a parser using a custom css
553
561
  config = ParserConfig(css=css)
554
- parser = Inscriptis(html_tree, config) usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR]
555
- [--indentation INDENTATION] [-v]
556
- [input]
557
-
558
- Convert the given HTML document to text.
559
-
560
- positional arguments:
561
- input Html input either from a file or a URL (default:stdin).
562
-
563
- optional arguments:
564
- -h, --help show this help message and exit
565
- -o OUTPUT, --output OUTPUT
566
- Output file (default:stdout).
567
- -e ENCODING, --encoding ENCODING
568
- Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
569
- -i, --display-image-captions
570
- Display image captions (default:false).
571
- -d, --deduplicate-image-captions
572
- Deduplicate image captions (default:false).
573
- -l, --display-link-targets
574
- Display link targets (default:false).
575
- -a, --display-anchor-urls
576
- Display anchor URLs (default:false).
577
- -r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
578
- Path to an optional JSON file containing rules for annotating the retrieved text.
579
- -p POSTPROCESSOR, --postprocessor POSTPROCESSOR
580
- Optional component for postprocessing the result (html, surface, xml).
581
- --indentation INDENTATION
582
- How to handle indentation (extended or strict; default: extended).
583
- -v, --version display version information
562
+ parser = Inscriptis(html_tree, config)
584
563
  text = parser.get_text()
585
564
 
586
565
 
@@ -616,6 +595,56 @@ The following code mitigates this problem on Unix systems by manually forcing lx
616
595
  return libc.malloc_trim(0)
617
596
 
618
597
 
598
+ Examples
599
+ ========
600
+
601
+ Strict indentation handling
602
+ ---------------------------
603
+
604
+ The following example demonstrates modifying ``ParserConfig`` for strict indentation handling.
605
+
606
+ .. code-block:: python
607
+
608
+ from inscriptis import get_text
609
+ from inscriptis.css_profiles import CSS_PROFILES
610
+ from inscriptis.model.config import ParserConfig
611
+
612
+ config = ParserConfig(css=CSS_PROFILES['strict'].copy())
613
+ text = get_text('fi<span>r</span>st', config)
614
+ print(text)
615
+
616
+ Ignore elements during parsing
617
+ ------------------------------
618
+
619
+ Overwriting the default CSS profile also allows changing the rendering of selected elements.
620
+ The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``.
621
+
622
+ .. code-block:: python
623
+
624
+ from inscriptis import get_text
625
+ from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
626
+ from inscriptis.html_properties import Display
627
+ from inscriptis.model.config import ParserConfig
628
+
629
+ # create a custom CSS based on the default style sheet and change the
630
+ # rendering of `form` elements
631
+ css = CSS_PROFILES['strict'].copy()
632
+ css['form'] = HtmlElement(display=Display.none)
633
+
634
+ # create a parser configuration using a custom css
635
+ html = """First line.
636
+ <form>
637
+ User data
638
+ <label for="name">Name:</label><br>
639
+ <input type="text" id="name" name="name"><br>
640
+ <label for="pass">Password:</label><br>
641
+ <input type="hidden" id="pass" name="pass">
642
+ </form>"""
643
+ config = ParserConfig(css=css)
644
+ text = get_text(html, config)
645
+ print(text)
646
+
647
+
619
648
  Citation
620
649
  ========
621
650
 
@@ -644,4 +673,3 @@ A full list of changes can be found in the
644
673
  `release notes <https://github.com/weblyzard/inscriptis/releases>`_.
645
674
 
646
675
 
647
-
@@ -131,9 +131,9 @@ the corresponding text representation.
131
131
  Command line parameters
132
132
  -----------------------
133
133
 
134
- The inscript.py command line client supports the following parameters::
134
+ The inscript command line client supports the following parameters::
135
135
 
136
- usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
136
+ usage: inscript [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION]
137
137
  [--table-cell-separator TABLE_CELL_SEPARATOR] [-v]
138
138
  [input]
139
139
 
@@ -172,19 +172,19 @@ HTML to text conversion
172
172
  -----------------------
173
173
  convert the given page to text and output the result to the screen::
174
174
 
175
- $ inscript.py https://www.fhgr.ch
175
+ $ inscript https://www.fhgr.ch
176
176
 
177
177
  convert the file to text and save the output to fhgr.txt::
178
178
 
179
- $ inscript.py fhgr.html -o fhgr.txt
179
+ $ inscript fhgr.html -o fhgr.txt
180
180
 
181
181
  convert the file using strict indentation (i.e., minimize indentation and extra spaces) and save the output to fhgr-layout-optimized.txt::
182
182
 
183
- $ inscript.py --indentation strict fhgr.html -o fhgr-layout-optimized.txt
183
+ $ inscript --indentation strict fhgr.html -o fhgr-layout-optimized.txt
184
184
 
185
185
  convert HTML provided via stdin and save the output to output.txt::
186
186
 
187
- $ echo "<body><p>Make it so!</p></body>" | inscript.py -o output.txt
187
+ $ echo "<body><p>Make it so!</p></body>" | inscript -o output.txt
188
188
 
189
189
 
190
190
  HTML to annotated text conversion
@@ -193,7 +193,7 @@ convert and annotate HTML from a Web page using the provided annotation rules.
193
193
 
194
194
  Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
195
195
 
196
- $ inscript.py https://www.fhgr.ch -r annotation-profile.json
196
+ $ inscript https://www.fhgr.ch -r annotation-profile.json
197
197
 
198
198
  The annotation rules are specified in `annotation-profile.json`:
199
199
 
@@ -241,7 +241,7 @@ Annotation postprocessors enable the post processing of annotations to formats
241
241
  that are suitable for your particular application. Post processors can be
242
242
  specified with the ``-p`` or ``--postprocessor`` command line argument::
243
243
 
244
- $ inscript.py https://www.fhgr.ch \
244
+ $ inscript https://www.fhgr.ch \
245
245
  -r ./examples/annotation-profile.json \
246
246
  -p surface
247
247
 
@@ -286,7 +286,7 @@ Currently, inscriptis supports the following postprocessors:
286
286
 
287
287
  .. code-block:: bash
288
288
 
289
- inscript.py --annotation-rules ./wikipedia.json \
289
+ inscript --annotation-rules ./wikipedia.json \
290
290
  --postprocessor html \
291
291
  https://en.wikipedia.org/wiki/Chur.html
292
292
 
@@ -311,14 +311,18 @@ Currently, inscriptis supports the following postprocessors:
311
311
  Web Service
312
312
  ===========
313
313
 
314
- The Flask Web Service translates HTML pages to the corresponding plain text.
314
+ A FastAPI-based Web Service that uses Inscriptis for translating HTML pages to plain text.
315
315
 
316
316
  Run the Web Service on your host system
317
317
  ---------------------------------------
318
- Provide additional requirement `python3-flask <https://flask.palletsprojects.com/en/2.2.x/>`_, then start the inscriptis Web service with the following command::
318
+ Install the optional feature ``web-service`` for inscriptis::
319
+
320
+ $ pip install inscriptis[web-service]
321
+
322
+ Start the Inscriptis Web service with the following command::
323
+
324
+ $ uvicorn inscriptis.service.web:app --port 5000 --host 127.0.0.1
319
325
 
320
- $ export FLASK_APP="inscriptis.service.web"
321
- $ python3 -m flask run
322
326
 
323
327
  Run the Web Service with Docker
324
328
  -------------------------------
@@ -499,7 +503,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
499
503
  1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the
500
504
  parameter ``indentation='extended'`` to also use indentation for tags such as
501
505
  ``<div>`` and ``<span>`` that do not provide indentation in their standard
502
- definition. This strategy is the default in ``inscript.py`` and many other
506
+ definition. This strategy is the default in ``inscript`` and many other
503
507
  tools such as Lynx. If you do not want extended indentation you can use the
504
508
  parameter ``indentation='standard'`` instead.
505
509
 
@@ -524,36 +528,7 @@ The following options are available for fine tuning inscriptis' HTML rendering:
524
528
  html_tree = fromstring(html)
525
529
  # create a parser using a custom css
526
530
  config = ParserConfig(css=css)
527
- parser = Inscriptis(html_tree, config) usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR]
528
- [--indentation INDENTATION] [-v]
529
- [input]
530
-
531
- Convert the given HTML document to text.
532
-
533
- positional arguments:
534
- input Html input either from a file or a URL (default:stdin).
535
-
536
- optional arguments:
537
- -h, --help show this help message and exit
538
- -o OUTPUT, --output OUTPUT
539
- Output file (default:stdout).
540
- -e ENCODING, --encoding ENCODING
541
- Input encoding to use (default:utf-8 for files; detected server encoding for Web URLs).
542
- -i, --display-image-captions
543
- Display image captions (default:false).
544
- -d, --deduplicate-image-captions
545
- Deduplicate image captions (default:false).
546
- -l, --display-link-targets
547
- Display link targets (default:false).
548
- -a, --display-anchor-urls
549
- Display anchor URLs (default:false).
550
- -r ANNOTATION_RULES, --annotation-rules ANNOTATION_RULES
551
- Path to an optional JSON file containing rules for annotating the retrieved text.
552
- -p POSTPROCESSOR, --postprocessor POSTPROCESSOR
553
- Optional component for postprocessing the result (html, surface, xml).
554
- --indentation INDENTATION
555
- How to handle indentation (extended or strict; default: extended).
556
- -v, --version display version information
531
+ parser = Inscriptis(html_tree, config)
557
532
  text = parser.get_text()
558
533
 
559
534
 
@@ -589,6 +564,56 @@ The following code mitigates this problem on Unix systems by manually forcing lx
589
564
  return libc.malloc_trim(0)
590
565
 
591
566
 
567
+ Examples
568
+ ========
569
+
570
+ Strict indentation handling
571
+ ---------------------------
572
+
573
+ The following example demonstrates modifying ``ParserConfig`` for strict indentation handling.
574
+
575
+ .. code-block:: python
576
+
577
+ from inscriptis import get_text
578
+ from inscriptis.css_profiles import CSS_PROFILES
579
+ from inscriptis.model.config import ParserConfig
580
+
581
+ config = ParserConfig(css=CSS_PROFILES['strict'].copy())
582
+ text = get_text('fi<span>r</span>st', config)
583
+ print(text)
584
+
585
+ Ignore elements during parsing
586
+ ------------------------------
587
+
588
+ Overwriting the default CSS profile also allows changing the rendering of selected elements.
589
+ The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``.
590
+
591
+ .. code-block:: python
592
+
593
+ from inscriptis import get_text
594
+ from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
595
+ from inscriptis.html_properties import Display
596
+ from inscriptis.model.config import ParserConfig
597
+
598
+ # create a custom CSS based on the default style sheet and change the
599
+ # rendering of `form` elements
600
+ css = CSS_PROFILES['strict'].copy()
601
+ css['form'] = HtmlElement(display=Display.none)
602
+
603
+ # create a parser configuration using a custom css
604
+ html = """First line.
605
+ <form>
606
+ User data
607
+ <label for="name">Name:</label><br>
608
+ <input type="text" id="name" name="name"><br>
609
+ <label for="pass">Password:</label><br>
610
+ <input type="hidden" id="pass" name="pass">
611
+ </form>"""
612
+ config = ParserConfig(css=css)
613
+ text = get_text(html, config)
614
+ print(text)
615
+
616
+
592
617
  Citation
593
618
  ========
594
619
 
@@ -0,0 +1,63 @@
1
+ [tool.poetry]
2
+ name = "inscriptis"
3
+ version = "2.4.0.1"
4
+ authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
5
+ description = "inscriptis - HTML to text converter."
6
+ keywords = ["HTML", "converter", "text"]
7
+ classifiers = [
8
+ 'Development Status :: 5 - Production/Stable',
9
+ 'Intended Audience :: Developers',
10
+ 'License :: OSI Approved :: Apache Software License',
11
+ 'Topic :: Text Processing',
12
+ 'Topic :: Text Processing :: Markup :: HTML',
13
+ 'Topic :: Utilities',
14
+ 'Programming Language :: Python :: 3',
15
+ 'Programming Language :: Python :: 3.8',
16
+ 'Programming Language :: Python :: 3.9',
17
+ 'Programming Language :: Python :: 3.10',
18
+ 'Programming Language :: Python :: 3.11',
19
+ 'Programming Language :: Python :: 3.12',
20
+ ]
21
+ homepage = "https://github.com/weblyzard/inscriptis"
22
+ repository = "https://github.com/weblyzard/inscriptis"
23
+ documentation = "https://inscriptis.readthedocs.io/en"
24
+ license = "Apache-2.0"
25
+ readme = "README.rst"
26
+
27
+ packages = [
28
+ {include = "inscriptis", from="src"},
29
+ ]
30
+
31
+
32
+ [tool.poetry.scripts]
33
+ inscript = "inscriptis.cli.inscript:cli"
34
+ inscriptis-api = "inscriptis.service.web:start"
35
+
36
+
37
+ [tool.poetry.extras]
38
+ web-service = ["fastapi", "uvicorn"]
39
+
40
+
41
+ [tool.poetry.dependencies]
42
+ python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12"
43
+ requests = ">=2.31.0"
44
+ lxml = ">=4.9.3"
45
+
46
+ # optional dependencies
47
+ fastapi = { version = "^0.109.0", optional = true }
48
+ uvicorn = { version = "^0.25.0", optional = true }
49
+
50
+
51
+ [build-system]
52
+ requires = ["poetry-core"]
53
+ build-backend = "poetry.core.masonry.api"
54
+
55
+
56
+ # code formatting with black
57
+ [tool.black]
58
+ line-length = 88
59
+ target-version = ["py38", "py39", "py310", "py311", "py312"]
60
+ extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
61
+ include = '''
62
+ ^/src/|^/tests/|^/benchmarking/
63
+ '''
@@ -68,7 +68,7 @@ from typing import Dict, Optional, Any
68
68
  from inscriptis.model.config import ParserConfig
69
69
  from inscriptis.html_engine import Inscriptis
70
70
 
71
- RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')
71
+ RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
72
72
 
73
73
 
74
74
  def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
@@ -85,13 +85,13 @@ def _get_html_tree(html_content: str) -> Optional[HtmlElement]:
85
85
  return None
86
86
 
87
87
  # strip XML declaration, if necessary
88
- if html_content.startswith('<?xml '):
89
- html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)
88
+ if html_content.startswith("<?xml "):
89
+ html_content = RE_STRIP_XML_DECLARATION.sub("", html_content, count=1)
90
90
 
91
91
  try:
92
92
  return fromstring(html_content)
93
93
  except ParserError:
94
- return fromstring('<pre>' + html_content + '</pre>')
94
+ return fromstring("<pre>" + html_content + "</pre>")
95
95
 
96
96
 
97
97
  def get_text(html_content: str, config: ParserConfig = None) -> str:
@@ -105,12 +105,12 @@ def get_text(html_content: str, config: ParserConfig = None) -> str:
105
105
  The text representation of the HTML content.
106
106
  """
107
107
  html_tree = _get_html_tree(html_content)
108
- return Inscriptis(html_tree, config).get_text() if html_tree is not None \
109
- else ''
108
+ return Inscriptis(html_tree, config).get_text() if html_tree is not None else ""
110
109
 
111
110
 
112
- def get_annotated_text(html_content: str,
113
- config: ParserConfig = None) -> Dict[str, Any]:
111
+ def get_annotated_text(
112
+ html_content: str, config: ParserConfig = None
113
+ ) -> Dict[str, Any]:
114
114
  """Return a dictionary of the extracted text and annotations.
115
115
 
116
116
  Notes:
@@ -132,7 +132,5 @@ def get_annotated_text(html_content: str,
132
132
  return {}
133
133
 
134
134
  inscriptis = Inscriptis(html_tree, config)
135
- labels = [(a.start, a.end, a.metadata)
136
- for a in inscriptis.get_annotations()]
137
- return {'text': inscriptis.get_text(),
138
- 'label': labels}
135
+ labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
136
+ return {"text": inscriptis.get_text(), "label": labels}
@@ -29,9 +29,13 @@ class Annotation(NamedTuple):
29
29
  """a tuple of tags to be attached to the annotation."""
30
30
 
31
31
 
32
- def horizontal_shift(annotations: List[Annotation], content_width: int,
33
- line_width: int, align: HorizontalAlignment,
34
- shift: int = 0) -> List[Annotation]:
32
+ def horizontal_shift(
33
+ annotations: List[Annotation],
34
+ content_width: int,
35
+ line_width: int,
36
+ align: HorizontalAlignment,
37
+ shift: int = 0,
38
+ ) -> List[Annotation]:
35
39
  r"""Shift annotations based on the given line's formatting.
36
40
 
37
41
  Adjusts the start and end indices of annotations based on the line's
@@ -56,5 +60,6 @@ def horizontal_shift(annotations: List[Annotation], content_width: int,
56
60
  else:
57
61
  h_align = shift + (line_width - content_width) // 2
58
62
 
59
- return [Annotation(a.start + h_align, a.end + h_align, a.metadata)
60
- for a in annotations]
63
+ return [
64
+ Annotation(a.start + h_align, a.end + h_align, a.metadata) for a in annotations
65
+ ]