html-to-markdown 1.14.1__tar.gz → 1.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/PKG-INFO +61 -1
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/README.md +60 -0
- html_to_markdown-1.16.0/html_to_markdown/hocr_processor.py +128 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/preprocessor.py +49 -25
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/processing.py +23 -3
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/PKG-INFO +61 -1
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/SOURCES.txt +1 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/pyproject.toml +2 -2
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/LICENSE +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/converters.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown/whitespace.py +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/requires.txt +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.14.1
|
|
3
|
+
Version: 1.16.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -55,6 +55,7 @@ Your support helps maintain and improve this library for the community.
|
|
|
55
55
|
## Features
|
|
56
56
|
|
|
57
57
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
58
|
+
- **HOCR Support**: Automatic detection and processing of HOCR (HTML-based OCR) documents with clean text extraction and proper spacing
|
|
58
59
|
- **Table Support**: Advanced handling of complex tables with rowspan/colspan support
|
|
59
60
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
60
61
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
@@ -266,6 +267,63 @@ markdown = convert_to_markdown(html, list_indent_type="tabs")
|
|
|
266
267
|
html_to_markdown --list-indent-type tabs input.html
|
|
267
268
|
```
|
|
268
269
|
|
|
270
|
+
### Working with HOCR Documents
|
|
271
|
+
|
|
272
|
+
HOCR (HTML-based OCR) is a standard format used by OCR software like Tesseract to output structured text with positioning and confidence information. The library automatically detects and processes HOCR documents, extracting clean text while preserving proper spacing and structure.
|
|
273
|
+
|
|
274
|
+
**Python:**
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
from html_to_markdown import convert_to_markdown
|
|
278
|
+
|
|
279
|
+
# HOCR from Tesseract OCR
|
|
280
|
+
hocr_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
281
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
282
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
283
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
284
|
+
<head>
|
|
285
|
+
<meta name='ocr-system' content='tesseract 5.5.1' />
|
|
286
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
287
|
+
</head>
|
|
288
|
+
<body>
|
|
289
|
+
<div class='ocr_page' id='page_1'>
|
|
290
|
+
<div class='ocr_carea' id='block_1_1'>
|
|
291
|
+
<p class='ocr_par' id='par_1_1'>
|
|
292
|
+
<span class='ocr_line' id='line_1_1'>
|
|
293
|
+
<span class='ocrx_word' id='word_1_1'>Hello</span>
|
|
294
|
+
<span class='ocrx_word' id='word_1_2'>world</span>
|
|
295
|
+
</span>
|
|
296
|
+
</p>
|
|
297
|
+
</div>
|
|
298
|
+
</div>
|
|
299
|
+
</body>
|
|
300
|
+
</html>"""
|
|
301
|
+
|
|
302
|
+
# Automatically detected as HOCR and converted to clean text
|
|
303
|
+
markdown = convert_to_markdown(hocr_content)
|
|
304
|
+
print(markdown) # Output: "Hello world"
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
**CLI:**
|
|
308
|
+
|
|
309
|
+
```shell
|
|
310
|
+
# Process HOCR files directly
|
|
311
|
+
tesseract image.png output hocr
|
|
312
|
+
html_to_markdown output.hocr
|
|
313
|
+
|
|
314
|
+
# Or pipe directly from Tesseract
|
|
315
|
+
tesseract image.png - hocr | html_to_markdown
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
**Features:**
|
|
319
|
+
|
|
320
|
+
- **Automatic Detection**: No configuration needed - HOCR documents are detected automatically
|
|
321
|
+
- **Clean Output**: Removes OCR metadata, bounding boxes, and confidence scores
|
|
322
|
+
- **Proper Spacing**: Maintains correct word spacing and text structure
|
|
323
|
+
- **Multi-language Support**: Works with HOCR output in any language
|
|
324
|
+
- **Performance Optimized**: Efficient processing of large OCR documents
|
|
325
|
+
- **Error Resilient**: Handles malformed or incomplete HOCR gracefully
|
|
326
|
+
|
|
269
327
|
## Advanced Usage
|
|
270
328
|
|
|
271
329
|
### Configuration Example
|
|
@@ -627,6 +685,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
627
685
|
- `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
|
|
628
686
|
- `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
|
|
629
687
|
- `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
|
|
688
|
+
- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
|
|
689
|
+
- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
|
|
630
690
|
|
|
631
691
|
## Contribution
|
|
632
692
|
|
|
@@ -15,6 +15,7 @@ Your support helps maintain and improve this library for the community.
|
|
|
15
15
|
## Features
|
|
16
16
|
|
|
17
17
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
18
|
+
- **HOCR Support**: Automatic detection and processing of HOCR (HTML-based OCR) documents with clean text extraction and proper spacing
|
|
18
19
|
- **Table Support**: Advanced handling of complex tables with rowspan/colspan support
|
|
19
20
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
20
21
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
@@ -226,6 +227,63 @@ markdown = convert_to_markdown(html, list_indent_type="tabs")
|
|
|
226
227
|
html_to_markdown --list-indent-type tabs input.html
|
|
227
228
|
```
|
|
228
229
|
|
|
230
|
+
### Working with HOCR Documents
|
|
231
|
+
|
|
232
|
+
HOCR (HTML-based OCR) is a standard format used by OCR software like Tesseract to output structured text with positioning and confidence information. The library automatically detects and processes HOCR documents, extracting clean text while preserving proper spacing and structure.
|
|
233
|
+
|
|
234
|
+
**Python:**
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from html_to_markdown import convert_to_markdown
|
|
238
|
+
|
|
239
|
+
# HOCR from Tesseract OCR
|
|
240
|
+
hocr_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
241
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
242
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
243
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
244
|
+
<head>
|
|
245
|
+
<meta name='ocr-system' content='tesseract 5.5.1' />
|
|
246
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
247
|
+
</head>
|
|
248
|
+
<body>
|
|
249
|
+
<div class='ocr_page' id='page_1'>
|
|
250
|
+
<div class='ocr_carea' id='block_1_1'>
|
|
251
|
+
<p class='ocr_par' id='par_1_1'>
|
|
252
|
+
<span class='ocr_line' id='line_1_1'>
|
|
253
|
+
<span class='ocrx_word' id='word_1_1'>Hello</span>
|
|
254
|
+
<span class='ocrx_word' id='word_1_2'>world</span>
|
|
255
|
+
</span>
|
|
256
|
+
</p>
|
|
257
|
+
</div>
|
|
258
|
+
</div>
|
|
259
|
+
</body>
|
|
260
|
+
</html>"""
|
|
261
|
+
|
|
262
|
+
# Automatically detected as HOCR and converted to clean text
|
|
263
|
+
markdown = convert_to_markdown(hocr_content)
|
|
264
|
+
print(markdown) # Output: "Hello world"
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**CLI:**
|
|
268
|
+
|
|
269
|
+
```shell
|
|
270
|
+
# Process HOCR files directly
|
|
271
|
+
tesseract image.png output hocr
|
|
272
|
+
html_to_markdown output.hocr
|
|
273
|
+
|
|
274
|
+
# Or pipe directly from Tesseract
|
|
275
|
+
tesseract image.png - hocr | html_to_markdown
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
**Features:**
|
|
279
|
+
|
|
280
|
+
- **Automatic Detection**: No configuration needed - HOCR documents are detected automatically
|
|
281
|
+
- **Clean Output**: Removes OCR metadata, bounding boxes, and confidence scores
|
|
282
|
+
- **Proper Spacing**: Maintains correct word spacing and text structure
|
|
283
|
+
- **Multi-language Support**: Works with HOCR output in any language
|
|
284
|
+
- **Performance Optimized**: Efficient processing of large OCR documents
|
|
285
|
+
- **Error Resilient**: Handles malformed or incomplete HOCR gracefully
|
|
286
|
+
|
|
229
287
|
## Advanced Usage
|
|
230
288
|
|
|
231
289
|
### Configuration Example
|
|
@@ -587,6 +645,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
587
645
|
- `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
|
|
588
646
|
- `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
|
|
589
647
|
- `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
|
|
648
|
+
- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
|
|
649
|
+
- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
|
|
590
650
|
|
|
591
651
|
## Contribution
|
|
592
652
|
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""HOCR (HTML-based OCR) document processing utilities.
|
|
2
|
+
|
|
3
|
+
This module handles the conversion of HOCR documents to clean markdown text,
|
|
4
|
+
including proper spacing, layout preservation, and metadata suppression.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
11
|
+
|
|
12
|
+
from bs4 import Tag
|
|
13
|
+
from bs4.element import NavigableString, PageElement
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from bs4 import BeautifulSoup
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HOCRProcessor:
|
|
20
|
+
"""Handles HOCR-specific document processing."""
|
|
21
|
+
|
|
22
|
+
_HOCR_PATTERNS: ClassVar[list[re.Pattern[str]]] = [
|
|
23
|
+
re.compile(r'class\s*=\s*["\'].*?ocr_page.*?["\']', re.IGNORECASE),
|
|
24
|
+
re.compile(r'class\s*=\s*["\'].*?ocrx_word.*?["\']', re.IGNORECASE),
|
|
25
|
+
re.compile(r'name\s*=\s*["\']ocr-system["\']', re.IGNORECASE),
|
|
26
|
+
re.compile(r'class\s*=\s*["\'].*?ocr_carea.*?["\']', re.IGNORECASE),
|
|
27
|
+
re.compile(r'class\s*=\s*["\'].*?ocr_par.*?["\']', re.IGNORECASE),
|
|
28
|
+
re.compile(r'class\s*=\s*["\'].*?ocr_line.*?["\']', re.IGNORECASE),
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
@classmethod
|
|
32
|
+
def is_hocr_document(cls, content: str) -> bool:
|
|
33
|
+
"""Check if content is an HOCR document.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
content: Raw HTML/XML content to check
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
True if content appears to be HOCR format
|
|
40
|
+
|
|
41
|
+
Raises:
|
|
42
|
+
ValueError: If content is too large (>10MB)
|
|
43
|
+
"""
|
|
44
|
+
if len(content) > 10_000_000:
|
|
45
|
+
raise ValueError("Document too large for HOCR processing")
|
|
46
|
+
|
|
47
|
+
content_sample = content[:50000]
|
|
48
|
+
|
|
49
|
+
return any(pattern.search(content_sample) for pattern in cls._HOCR_PATTERNS)
|
|
50
|
+
|
|
51
|
+
@classmethod
|
|
52
|
+
def is_hocr_word_element(cls, tag: Tag | None) -> bool:
|
|
53
|
+
"""Check if a tag is an HOCR word element.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
tag: BeautifulSoup tag to check
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
True if tag is a span with ocrx_word class
|
|
60
|
+
"""
|
|
61
|
+
if not tag or tag.name != "span":
|
|
62
|
+
return False
|
|
63
|
+
|
|
64
|
+
class_attr = tag.get("class")
|
|
65
|
+
if isinstance(class_attr, list):
|
|
66
|
+
return "ocrx_word" in class_attr
|
|
67
|
+
return class_attr == "ocrx_word"
|
|
68
|
+
|
|
69
|
+
@classmethod
|
|
70
|
+
def should_add_space_before_word(cls, children: list[PageElement], current_index: int) -> bool:
|
|
71
|
+
"""Determine if space should be added before an HOCR word.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
children: List of child elements
|
|
75
|
+
current_index: Index of current element
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
True if a space should be added before this word
|
|
79
|
+
"""
|
|
80
|
+
if not (0 < current_index < len(children)):
|
|
81
|
+
return False
|
|
82
|
+
|
|
83
|
+
prev_element = children[current_index - 1]
|
|
84
|
+
|
|
85
|
+
if isinstance(prev_element, NavigableString):
|
|
86
|
+
text_content = str(prev_element)
|
|
87
|
+
return not (text_content.strip() or " " in text_content)
|
|
88
|
+
|
|
89
|
+
return isinstance(prev_element, Tag) and cls.is_hocr_word_element(prev_element)
|
|
90
|
+
|
|
91
|
+
@classmethod
|
|
92
|
+
def is_hocr_element_in_soup(cls, soup: BeautifulSoup) -> bool:
|
|
93
|
+
"""Check if parsed soup contains HOCR elements.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
soup: Parsed BeautifulSoup document
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
True if soup contains HOCR elements
|
|
100
|
+
"""
|
|
101
|
+
return bool(
|
|
102
|
+
soup.find("meta", attrs={"name": "ocr-system"})
|
|
103
|
+
or soup.find("meta", attrs={"name": "ocr-capabilities"})
|
|
104
|
+
or soup.find(class_="ocr_page")
|
|
105
|
+
or soup.find(class_="ocrx_word")
|
|
106
|
+
or soup.find(class_="ocr_carea")
|
|
107
|
+
or soup.find(class_="ocr_par")
|
|
108
|
+
or soup.find(class_="ocr_line")
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
@classmethod
|
|
112
|
+
def get_optimal_parser(cls, content: str, lxml_available: bool) -> str:
|
|
113
|
+
"""Get optimal parser for HOCR content.
|
|
114
|
+
|
|
115
|
+
Args:
|
|
116
|
+
content: Document content
|
|
117
|
+
lxml_available: Whether lxml is available
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
Parser name to use ('xml', 'lxml', or 'html.parser')
|
|
121
|
+
"""
|
|
122
|
+
try:
|
|
123
|
+
if cls.is_hocr_document(content) and lxml_available:
|
|
124
|
+
return "xml"
|
|
125
|
+
except ValueError:
|
|
126
|
+
pass
|
|
127
|
+
|
|
128
|
+
return "lxml" if lxml_available else "html.parser"
|
|
@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
|
|
|
97
97
|
}
|
|
98
98
|
)
|
|
99
99
|
|
|
100
|
+
DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
|
|
101
|
+
{
|
|
102
|
+
"vector-header",
|
|
103
|
+
"vector-main-menu",
|
|
104
|
+
"vector-page-tools",
|
|
105
|
+
"vector-toc",
|
|
106
|
+
"mw-jump-link",
|
|
107
|
+
"mw-navigation",
|
|
108
|
+
"navbox",
|
|
109
|
+
"navigation-box",
|
|
110
|
+
"sidebar",
|
|
111
|
+
"nav",
|
|
112
|
+
"header",
|
|
113
|
+
"footer",
|
|
114
|
+
"menu",
|
|
115
|
+
"breadcrumb",
|
|
116
|
+
"topbar",
|
|
117
|
+
"toolbar",
|
|
118
|
+
}
|
|
119
|
+
)
|
|
120
|
+
|
|
100
121
|
|
|
101
122
|
def preprocess_html(
|
|
102
123
|
html: str,
|
|
@@ -111,11 +132,18 @@ def preprocess_html(
|
|
|
111
132
|
preserve_media: bool = True,
|
|
112
133
|
custom_tags_to_remove: set[str] | None = None,
|
|
113
134
|
custom_attributes_to_remove: set[str] | None = None,
|
|
135
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
136
|
+
extra_navigation_classes: set[str] | None = None,
|
|
114
137
|
) -> str:
|
|
115
138
|
if not html or not html.strip(): # pragma: no cover
|
|
116
139
|
return html
|
|
117
140
|
|
|
118
|
-
html = _remove_class_based_navigation(
|
|
141
|
+
html = _remove_class_based_navigation(
|
|
142
|
+
html,
|
|
143
|
+
remove_navigation,
|
|
144
|
+
excluded_navigation_classes,
|
|
145
|
+
extra_navigation_classes,
|
|
146
|
+
)
|
|
119
147
|
|
|
120
148
|
nh3_config = _configure_cleaning_rules(
|
|
121
149
|
remove_navigation=remove_navigation,
|
|
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
|
|
|
242
270
|
}
|
|
243
271
|
|
|
244
272
|
|
|
245
|
-
def _remove_class_based_navigation(
|
|
273
|
+
def _remove_class_based_navigation(
|
|
274
|
+
html: str,
|
|
275
|
+
remove_navigation: bool,
|
|
276
|
+
excluded_navigation_classes: set[str] | None,
|
|
277
|
+
extra_navigation_classes: set[str] | None,
|
|
278
|
+
) -> str:
|
|
246
279
|
if not remove_navigation:
|
|
247
280
|
return html
|
|
248
281
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
r'nav[^"]*',
|
|
260
|
-
r'header[^"]*',
|
|
261
|
-
r'footer[^"]*',
|
|
262
|
-
r'menu[^"]*',
|
|
263
|
-
r'breadcrumb[^"]*',
|
|
264
|
-
r'topbar[^"]*',
|
|
265
|
-
r'toolbar[^"]*',
|
|
266
|
-
]
|
|
282
|
+
class_names = set(DEFAULT_NAVIGATION_CLASSES)
|
|
283
|
+
|
|
284
|
+
if excluded_navigation_classes:
|
|
285
|
+
class_names.difference_update(excluded_navigation_classes)
|
|
286
|
+
|
|
287
|
+
if extra_navigation_classes:
|
|
288
|
+
class_names.update(extra_navigation_classes)
|
|
289
|
+
|
|
290
|
+
for class_name in class_names:
|
|
291
|
+
class_pattern = rf'{re.escape(class_name)}[^"]*'
|
|
267
292
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
293
|
+
block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
|
|
294
|
+
html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
271
295
|
|
|
272
|
-
|
|
273
|
-
html = re.sub(
|
|
296
|
+
self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
|
|
297
|
+
html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
|
|
274
298
|
|
|
275
299
|
return html
|
|
276
300
|
|
|
@@ -38,6 +38,7 @@ from html_to_markdown.constants import (
|
|
|
38
38
|
)
|
|
39
39
|
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
40
40
|
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
|
|
41
|
+
from html_to_markdown.hocr_processor import HOCRProcessor
|
|
41
42
|
from html_to_markdown.utils import escape
|
|
42
43
|
from html_to_markdown.whitespace import WhitespaceHandler
|
|
43
44
|
|
|
@@ -150,6 +151,11 @@ def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
|
|
|
150
151
|
return " " * list_indent_width
|
|
151
152
|
|
|
152
153
|
|
|
154
|
+
_is_hocr_document = HOCRProcessor.is_hocr_document
|
|
155
|
+
_is_hocr_word_element = HOCRProcessor.is_hocr_word_element
|
|
156
|
+
_should_add_space_before_hocr_word = HOCRProcessor.should_add_space_before_word
|
|
157
|
+
|
|
158
|
+
|
|
153
159
|
def _is_nested_tag(el: PageElement) -> bool:
|
|
154
160
|
return isinstance(el, Tag) and el.name in {
|
|
155
161
|
"ol",
|
|
@@ -244,6 +250,10 @@ def _process_tag(
|
|
|
244
250
|
)
|
|
245
251
|
elif isinstance(el, Tag):
|
|
246
252
|
current_text = "".join(text_parts)
|
|
253
|
+
|
|
254
|
+
if _is_hocr_word_element(el) and _should_add_space_before_hocr_word(children, i):
|
|
255
|
+
text_parts.append(" ")
|
|
256
|
+
|
|
247
257
|
text_parts.append(
|
|
248
258
|
_process_tag(
|
|
249
259
|
el,
|
|
@@ -477,6 +487,8 @@ def convert_to_markdown(
|
|
|
477
487
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
478
488
|
remove_forms: bool = True,
|
|
479
489
|
remove_navigation: bool = True,
|
|
490
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
491
|
+
extra_navigation_classes: set[str] | None = None,
|
|
480
492
|
strip: str | Iterable[str] | None = None,
|
|
481
493
|
strip_newlines: bool = False,
|
|
482
494
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -521,6 +533,8 @@ def convert_to_markdown(
|
|
|
521
533
|
preprocessing_preset: Preprocessing aggressiveness level.
|
|
522
534
|
remove_forms: Remove form elements during preprocessing.
|
|
523
535
|
remove_navigation: Remove navigation elements during preprocessing.
|
|
536
|
+
excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
|
|
537
|
+
extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
|
|
524
538
|
strip: HTML tags to strip from output.
|
|
525
539
|
strip_newlines: Remove newlines from HTML before processing.
|
|
526
540
|
strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
|
|
@@ -576,13 +590,15 @@ def convert_to_markdown(
|
|
|
576
590
|
config = create_preprocessor(
|
|
577
591
|
preset=preprocessing_preset,
|
|
578
592
|
remove_navigation=remove_navigation,
|
|
593
|
+
excluded_navigation_classes=excluded_navigation_classes,
|
|
594
|
+
extra_navigation_classes=extra_navigation_classes,
|
|
579
595
|
remove_forms=remove_forms,
|
|
580
596
|
)
|
|
581
597
|
source = preprocess_fn(source, **config)
|
|
582
598
|
|
|
583
599
|
if "".join(source.split("\n")):
|
|
584
600
|
if parser is None:
|
|
585
|
-
parser =
|
|
601
|
+
parser = HOCRProcessor.get_optimal_parser(source, LXML_AVAILABLE)
|
|
586
602
|
|
|
587
603
|
if parser == "lxml" and not LXML_AVAILABLE:
|
|
588
604
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
@@ -943,7 +959,7 @@ def _process_html_core(
|
|
|
943
959
|
|
|
944
960
|
if "".join(source.split("\n")):
|
|
945
961
|
if parser is None:
|
|
946
|
-
parser =
|
|
962
|
+
parser = HOCRProcessor.get_optimal_parser(source, LXML_AVAILABLE)
|
|
947
963
|
|
|
948
964
|
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
949
965
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
@@ -1006,7 +1022,7 @@ def _process_html_core(
|
|
|
1006
1022
|
if custom_converters:
|
|
1007
1023
|
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
1008
1024
|
|
|
1009
|
-
if extract_metadata and not convert_as_inline:
|
|
1025
|
+
if extract_metadata and not convert_as_inline and not HOCRProcessor.is_hocr_element_in_soup(source):
|
|
1010
1026
|
metadata = _extract_metadata(source)
|
|
1011
1027
|
metadata_comment = _format_metadata_comment(metadata)
|
|
1012
1028
|
if metadata_comment:
|
|
@@ -1078,6 +1094,8 @@ def convert_to_markdown_stream(
|
|
|
1078
1094
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
1079
1095
|
remove_forms: bool = True,
|
|
1080
1096
|
remove_navigation: bool = True,
|
|
1097
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
1098
|
+
extra_navigation_classes: set[str] | None = None,
|
|
1081
1099
|
strip: str | Iterable[str] | None = None,
|
|
1082
1100
|
strip_newlines: bool = False,
|
|
1083
1101
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -1096,6 +1114,8 @@ def convert_to_markdown_stream(
|
|
|
1096
1114
|
config = create_preprocessor(
|
|
1097
1115
|
preset=preprocessing_preset,
|
|
1098
1116
|
remove_navigation=remove_navigation,
|
|
1117
|
+
excluded_navigation_classes=excluded_navigation_classes,
|
|
1118
|
+
extra_navigation_classes=extra_navigation_classes,
|
|
1099
1119
|
remove_forms=remove_forms,
|
|
1100
1120
|
)
|
|
1101
1121
|
source = preprocess_fn(source, **config)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.14.1
|
|
3
|
+
Version: 1.16.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -55,6 +55,7 @@ Your support helps maintain and improve this library for the community.
|
|
|
55
55
|
## Features
|
|
56
56
|
|
|
57
57
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
58
|
+
- **HOCR Support**: Automatic detection and processing of HOCR (HTML-based OCR) documents with clean text extraction and proper spacing
|
|
58
59
|
- **Table Support**: Advanced handling of complex tables with rowspan/colspan support
|
|
59
60
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
60
61
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
@@ -266,6 +267,63 @@ markdown = convert_to_markdown(html, list_indent_type="tabs")
|
|
|
266
267
|
html_to_markdown --list-indent-type tabs input.html
|
|
267
268
|
```
|
|
268
269
|
|
|
270
|
+
### Working with HOCR Documents
|
|
271
|
+
|
|
272
|
+
HOCR (HTML-based OCR) is a standard format used by OCR software like Tesseract to output structured text with positioning and confidence information. The library automatically detects and processes HOCR documents, extracting clean text while preserving proper spacing and structure.
|
|
273
|
+
|
|
274
|
+
**Python:**
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
from html_to_markdown import convert_to_markdown
|
|
278
|
+
|
|
279
|
+
# HOCR from Tesseract OCR
|
|
280
|
+
hocr_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
281
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
282
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
283
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
284
|
+
<head>
|
|
285
|
+
<meta name='ocr-system' content='tesseract 5.5.1' />
|
|
286
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
287
|
+
</head>
|
|
288
|
+
<body>
|
|
289
|
+
<div class='ocr_page' id='page_1'>
|
|
290
|
+
<div class='ocr_carea' id='block_1_1'>
|
|
291
|
+
<p class='ocr_par' id='par_1_1'>
|
|
292
|
+
<span class='ocr_line' id='line_1_1'>
|
|
293
|
+
<span class='ocrx_word' id='word_1_1'>Hello</span>
|
|
294
|
+
<span class='ocrx_word' id='word_1_2'>world</span>
|
|
295
|
+
</span>
|
|
296
|
+
</p>
|
|
297
|
+
</div>
|
|
298
|
+
</div>
|
|
299
|
+
</body>
|
|
300
|
+
</html>"""
|
|
301
|
+
|
|
302
|
+
# Automatically detected as HOCR and converted to clean text
|
|
303
|
+
markdown = convert_to_markdown(hocr_content)
|
|
304
|
+
print(markdown) # Output: "Hello world"
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
**CLI:**
|
|
308
|
+
|
|
309
|
+
```shell
|
|
310
|
+
# Process HOCR files directly
|
|
311
|
+
tesseract image.png output hocr
|
|
312
|
+
html_to_markdown output.hocr
|
|
313
|
+
|
|
314
|
+
# Or pipe directly from Tesseract
|
|
315
|
+
tesseract image.png - hocr | html_to_markdown
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
**Features:**
|
|
319
|
+
|
|
320
|
+
- **Automatic Detection**: No configuration needed - HOCR documents are detected automatically
|
|
321
|
+
- **Clean Output**: Removes OCR metadata, bounding boxes, and confidence scores
|
|
322
|
+
- **Proper Spacing**: Maintains correct word spacing and text structure
|
|
323
|
+
- **Multi-language Support**: Works with HOCR output in any language
|
|
324
|
+
- **Performance Optimized**: Efficient processing of large OCR documents
|
|
325
|
+
- **Error Resilient**: Handles malformed or incomplete HOCR gracefully
|
|
326
|
+
|
|
269
327
|
## Advanced Usage
|
|
270
328
|
|
|
271
329
|
### Configuration Example
|
|
@@ -627,6 +685,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
627
685
|
- `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
|
|
628
686
|
- `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
|
|
629
687
|
- `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
|
|
688
|
+
- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
|
|
689
|
+
- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
|
|
630
690
|
|
|
631
691
|
## Contribution
|
|
632
692
|
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.14.1"
|
|
8
|
+
version = "1.16.0"
|
|
9
9
|
description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [
|
|
@@ -69,7 +69,7 @@ dev = [
|
|
|
69
69
|
"pytest-benchmark>=5.1",
|
|
70
70
|
"pytest-cov>=7",
|
|
71
71
|
"pytest-mock>=3.15.1",
|
|
72
|
-
"ruff>=0.13.
|
|
72
|
+
"ruff>=0.13.2",
|
|
73
73
|
"types-beautifulsoup4>=4.12.0.20250516",
|
|
74
74
|
"types-psutil>=7.0.0.20250822",
|
|
75
75
|
"uv-bump",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{html_to_markdown-1.14.1 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|