html-to-markdown 1.15.0__tar.gz → 1.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/PKG-INFO +59 -1
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/README.md +58 -0
- html_to_markdown-1.16.0/html_to_markdown/hocr_processor.py +128 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/processing.py +13 -3
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/PKG-INFO +59 -1
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/SOURCES.txt +1 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/pyproject.toml +2 -2
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/LICENSE +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/__init__.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/__main__.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/cli.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/constants.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/converters.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/exceptions.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/preprocessor.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/py.typed +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/utils.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown/whitespace.py +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/requires.txt +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/top_level.txt +0 -0
- {html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.15.0
|
|
3
|
+
Version: 1.16.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -55,6 +55,7 @@ Your support helps maintain and improve this library for the community.
|
|
|
55
55
|
## Features
|
|
56
56
|
|
|
57
57
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
58
|
+
- **HOCR Support**: Automatic detection and processing of HOCR (HTML-based OCR) documents with clean text extraction and proper spacing
|
|
58
59
|
- **Table Support**: Advanced handling of complex tables with rowspan/colspan support
|
|
59
60
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
60
61
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
@@ -266,6 +267,63 @@ markdown = convert_to_markdown(html, list_indent_type="tabs")
|
|
|
266
267
|
html_to_markdown --list-indent-type tabs input.html
|
|
267
268
|
```
|
|
268
269
|
|
|
270
|
+
### Working with HOCR Documents
|
|
271
|
+
|
|
272
|
+
HOCR (HTML-based OCR) is a standard format used by OCR software like Tesseract to output structured text with positioning and confidence information. The library automatically detects and processes HOCR documents, extracting clean text while preserving proper spacing and structure.
|
|
273
|
+
|
|
274
|
+
**Python:**
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
from html_to_markdown import convert_to_markdown
|
|
278
|
+
|
|
279
|
+
# HOCR from Tesseract OCR
|
|
280
|
+
hocr_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
281
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
282
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
283
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
284
|
+
<head>
|
|
285
|
+
<meta name='ocr-system' content='tesseract 5.5.1' />
|
|
286
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
287
|
+
</head>
|
|
288
|
+
<body>
|
|
289
|
+
<div class='ocr_page' id='page_1'>
|
|
290
|
+
<div class='ocr_carea' id='block_1_1'>
|
|
291
|
+
<p class='ocr_par' id='par_1_1'>
|
|
292
|
+
<span class='ocr_line' id='line_1_1'>
|
|
293
|
+
<span class='ocrx_word' id='word_1_1'>Hello</span>
|
|
294
|
+
<span class='ocrx_word' id='word_1_2'>world</span>
|
|
295
|
+
</span>
|
|
296
|
+
</p>
|
|
297
|
+
</div>
|
|
298
|
+
</div>
|
|
299
|
+
</body>
|
|
300
|
+
</html>"""
|
|
301
|
+
|
|
302
|
+
# Automatically detected as HOCR and converted to clean text
|
|
303
|
+
markdown = convert_to_markdown(hocr_content)
|
|
304
|
+
print(markdown) # Output: "Hello world"
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
**CLI:**
|
|
308
|
+
|
|
309
|
+
```shell
|
|
310
|
+
# Process HOCR files directly
|
|
311
|
+
tesseract image.png output hocr
|
|
312
|
+
html_to_markdown output.hocr
|
|
313
|
+
|
|
314
|
+
# Or pipe directly from Tesseract
|
|
315
|
+
tesseract image.png - hocr | html_to_markdown
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
**Features:**
|
|
319
|
+
|
|
320
|
+
- **Automatic Detection**: No configuration needed - HOCR documents are detected automatically
|
|
321
|
+
- **Clean Output**: Removes OCR metadata, bounding boxes, and confidence scores
|
|
322
|
+
- **Proper Spacing**: Maintains correct word spacing and text structure
|
|
323
|
+
- **Multi-language Support**: Works with HOCR output in any language
|
|
324
|
+
- **Performance Optimized**: Efficient processing of large OCR documents
|
|
325
|
+
- **Error Resilient**: Handles malformed or incomplete HOCR gracefully
|
|
326
|
+
|
|
269
327
|
## Advanced Usage
|
|
270
328
|
|
|
271
329
|
### Configuration Example
|
|
@@ -15,6 +15,7 @@ Your support helps maintain and improve this library for the community.
|
|
|
15
15
|
## Features
|
|
16
16
|
|
|
17
17
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
18
|
+
- **HOCR Support**: Automatic detection and processing of HOCR (HTML-based OCR) documents with clean text extraction and proper spacing
|
|
18
19
|
- **Table Support**: Advanced handling of complex tables with rowspan/colspan support
|
|
19
20
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
20
21
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
@@ -226,6 +227,63 @@ markdown = convert_to_markdown(html, list_indent_type="tabs")
|
|
|
226
227
|
html_to_markdown --list-indent-type tabs input.html
|
|
227
228
|
```
|
|
228
229
|
|
|
230
|
+
### Working with HOCR Documents
|
|
231
|
+
|
|
232
|
+
HOCR (HTML-based OCR) is a standard format used by OCR software like Tesseract to output structured text with positioning and confidence information. The library automatically detects and processes HOCR documents, extracting clean text while preserving proper spacing and structure.
|
|
233
|
+
|
|
234
|
+
**Python:**
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from html_to_markdown import convert_to_markdown
|
|
238
|
+
|
|
239
|
+
# HOCR from Tesseract OCR
|
|
240
|
+
hocr_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
241
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
242
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
243
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
244
|
+
<head>
|
|
245
|
+
<meta name='ocr-system' content='tesseract 5.5.1' />
|
|
246
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
247
|
+
</head>
|
|
248
|
+
<body>
|
|
249
|
+
<div class='ocr_page' id='page_1'>
|
|
250
|
+
<div class='ocr_carea' id='block_1_1'>
|
|
251
|
+
<p class='ocr_par' id='par_1_1'>
|
|
252
|
+
<span class='ocr_line' id='line_1_1'>
|
|
253
|
+
<span class='ocrx_word' id='word_1_1'>Hello</span>
|
|
254
|
+
<span class='ocrx_word' id='word_1_2'>world</span>
|
|
255
|
+
</span>
|
|
256
|
+
</p>
|
|
257
|
+
</div>
|
|
258
|
+
</div>
|
|
259
|
+
</body>
|
|
260
|
+
</html>"""
|
|
261
|
+
|
|
262
|
+
# Automatically detected as HOCR and converted to clean text
|
|
263
|
+
markdown = convert_to_markdown(hocr_content)
|
|
264
|
+
print(markdown) # Output: "Hello world"
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**CLI:**
|
|
268
|
+
|
|
269
|
+
```shell
|
|
270
|
+
# Process HOCR files directly
|
|
271
|
+
tesseract image.png output hocr
|
|
272
|
+
html_to_markdown output.hocr
|
|
273
|
+
|
|
274
|
+
# Or pipe directly from Tesseract
|
|
275
|
+
tesseract image.png - hocr | html_to_markdown
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
**Features:**
|
|
279
|
+
|
|
280
|
+
- **Automatic Detection**: No configuration needed - HOCR documents are detected automatically
|
|
281
|
+
- **Clean Output**: Removes OCR metadata, bounding boxes, and confidence scores
|
|
282
|
+
- **Proper Spacing**: Maintains correct word spacing and text structure
|
|
283
|
+
- **Multi-language Support**: Works with HOCR output in any language
|
|
284
|
+
- **Performance Optimized**: Efficient processing of large OCR documents
|
|
285
|
+
- **Error Resilient**: Handles malformed or incomplete HOCR gracefully
|
|
286
|
+
|
|
229
287
|
## Advanced Usage
|
|
230
288
|
|
|
231
289
|
### Configuration Example
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""HOCR (HTML-based OCR) document processing utilities.
|
|
2
|
+
|
|
3
|
+
This module handles the conversion of HOCR documents to clean markdown text,
|
|
4
|
+
including proper spacing, layout preservation, and metadata suppression.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import TYPE_CHECKING, ClassVar
|
|
11
|
+
|
|
12
|
+
from bs4 import Tag
|
|
13
|
+
from bs4.element import NavigableString, PageElement
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from bs4 import BeautifulSoup
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HOCRProcessor:
    """Handles HOCR-specific document processing.

    All helpers are classmethods; the class carries no instance state. It
    offers cheap text-level detection of HOCR content, detection on an
    already-parsed soup, word-spacing decisions, and parser selection.
    """

    # CSS class names whose presence marks a document as HOCR.
    _HOCR_CLASS_NAMES: ClassVar[tuple[str, ...]] = (
        "ocr_page",
        "ocrx_word",
        "ocr_carea",
        "ocr_par",
        "ocr_line",
    )
    # <meta name="..."> values whose presence marks a document as HOCR.
    _HOCR_META_NAMES: ClassVar[tuple[str, ...]] = ("ocr-system", "ocr-capabilities")
    # Upper bound, in characters, on content accepted by is_hocr_document.
    _MAX_CONTENT_LENGTH: ClassVar[int] = 10_000_000
    # Only this many leading characters are scanned by the regexes.
    _DETECTION_SAMPLE_LENGTH: ClassVar[int] = 50_000

    # Each class pattern matches the name only as a whole, whitespace-delimited
    # token inside a single quoted ``class`` value. ``[^"\']*`` keeps the match
    # from running past the closing quote into a neighbouring attribute, and
    # the surrounding ``\s`` guards stop ``ocr_par`` matching ``ocr_paragraph``.
    _HOCR_PATTERNS: ClassVar[list[re.Pattern[str]]] = [
        re.compile(r'name\s*=\s*["\']ocr-(?:system|capabilities)["\']', re.IGNORECASE),
        *(
            re.compile(
                rf'class\s*=\s*["\'](?:[^"\']*\s)?{class_name}(?:\s[^"\']*)?["\']',
                re.IGNORECASE,
            )
            for class_name in _HOCR_CLASS_NAMES
        ),
    ]

    @classmethod
    def is_hocr_document(cls, content: str) -> bool:
        """Check if content is an HOCR document.

        Only the first 50,000 characters are inspected, so markers that appear
        later in the document are not seen by this check.

        Args:
            content: Raw HTML/XML content to check

        Returns:
            True if content appears to be HOCR format

        Raises:
            ValueError: If content is longer than 10,000,000 characters
        """
        if len(content) > cls._MAX_CONTENT_LENGTH:
            raise ValueError("Document too large for HOCR processing")

        content_sample = content[: cls._DETECTION_SAMPLE_LENGTH]

        return any(pattern.search(content_sample) for pattern in cls._HOCR_PATTERNS)

    @classmethod
    def is_hocr_word_element(cls, tag: Tag | None) -> bool:
        """Check if a tag is an HOCR word element.

        Args:
            tag: BeautifulSoup tag to check

        Returns:
            True if tag is a span whose class list includes ocrx_word
        """
        if tag is None or tag.name != "span":
            return False

        class_attr = tag.get("class")
        # The "xml" parser (which get_optimal_parser selects for HOCR) leaves
        # ``class`` as one unsplit string, so tokenise it before the membership
        # test; otherwise ``class="ocrx_word bold"`` would be missed.
        if isinstance(class_attr, str):
            class_attr = class_attr.split()
        return isinstance(class_attr, list) and "ocrx_word" in class_attr

    @classmethod
    def should_add_space_before_word(cls, children: list[PageElement], current_index: int) -> bool:
        """Determine if space should be added before an HOCR word.

        Args:
            children: List of child elements
            current_index: Index of current element

        Returns:
            True if a space should be added before this word
        """
        # The first child (index 0) and out-of-range indexes have no
        # predecessor to separate from.
        if not (0 < current_index < len(children)):
            return False

        prev_element = children[current_index - 1]

        if isinstance(prev_element, NavigableString):
            text_content = str(prev_element)
            # A space is needed only when the preceding text is blank and does
            # not itself contain a space character (e.g. it is just "\n").
            return not (text_content.strip() or " " in text_content)

        # Two directly adjacent word spans need an explicit separator.
        return isinstance(prev_element, Tag) and cls.is_hocr_word_element(prev_element)

    @classmethod
    def is_hocr_element_in_soup(cls, soup: BeautifulSoup) -> bool:
        """Check if parsed soup contains HOCR elements.

        Args:
            soup: Parsed BeautifulSoup document

        Returns:
            True if soup contains an HOCR meta tag or an element carrying one
            of the HOCR class names
        """
        if any(soup.find("meta", attrs={"name": meta_name}) for meta_name in cls._HOCR_META_NAMES):
            return True
        return any(soup.find(class_=class_name) for class_name in cls._HOCR_CLASS_NAMES)

    @classmethod
    def get_optimal_parser(cls, content: str, lxml_available: bool) -> str:
        """Get optimal parser for HOCR content.

        Args:
            content: Document content
            lxml_available: Whether lxml is available

        Returns:
            Parser name to use ('xml', 'lxml', or 'html.parser')
        """
        try:
            looks_like_hocr = cls.is_hocr_document(content)
        except ValueError:
            # Deliberate best effort: content too large for HOCR detection is
            # simply treated as regular HTML instead of failing the conversion.
            looks_like_hocr = False

        if looks_like_hocr and lxml_available:
            return "xml"

        return "lxml" if lxml_available else "html.parser"
|
|
@@ -38,6 +38,7 @@ from html_to_markdown.constants import (
|
|
|
38
38
|
)
|
|
39
39
|
from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
|
|
40
40
|
from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
|
|
41
|
+
from html_to_markdown.hocr_processor import HOCRProcessor
|
|
41
42
|
from html_to_markdown.utils import escape
|
|
42
43
|
from html_to_markdown.whitespace import WhitespaceHandler
|
|
43
44
|
|
|
@@ -150,6 +151,11 @@ def _get_list_indent(list_indent_type: str, list_indent_width: int) -> str:
|
|
|
150
151
|
return " " * list_indent_width
|
|
151
152
|
|
|
152
153
|
|
|
154
|
+
_is_hocr_document = HOCRProcessor.is_hocr_document
|
|
155
|
+
_is_hocr_word_element = HOCRProcessor.is_hocr_word_element
|
|
156
|
+
_should_add_space_before_hocr_word = HOCRProcessor.should_add_space_before_word
|
|
157
|
+
|
|
158
|
+
|
|
153
159
|
def _is_nested_tag(el: PageElement) -> bool:
|
|
154
160
|
return isinstance(el, Tag) and el.name in {
|
|
155
161
|
"ol",
|
|
@@ -244,6 +250,10 @@ def _process_tag(
|
|
|
244
250
|
)
|
|
245
251
|
elif isinstance(el, Tag):
|
|
246
252
|
current_text = "".join(text_parts)
|
|
253
|
+
|
|
254
|
+
if _is_hocr_word_element(el) and _should_add_space_before_hocr_word(children, i):
|
|
255
|
+
text_parts.append(" ")
|
|
256
|
+
|
|
247
257
|
text_parts.append(
|
|
248
258
|
_process_tag(
|
|
249
259
|
el,
|
|
@@ -588,7 +598,7 @@ def convert_to_markdown(
|
|
|
588
598
|
|
|
589
599
|
if "".join(source.split("\n")):
|
|
590
600
|
if parser is None:
|
|
591
|
-
parser = "lxml" if LXML_AVAILABLE else "html.parser"
|
|
601
|
+
parser = HOCRProcessor.get_optimal_parser(source, LXML_AVAILABLE)
|
|
592
602
|
|
|
593
603
|
if parser == "lxml" and not LXML_AVAILABLE:
|
|
594
604
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
@@ -949,7 +959,7 @@ def _process_html_core(
|
|
|
949
959
|
|
|
950
960
|
if "".join(source.split("\n")):
|
|
951
961
|
if parser is None:
|
|
952
|
-
parser = "lxml" if LXML_AVAILABLE else "html.parser"
|
|
962
|
+
parser = HOCRProcessor.get_optimal_parser(source, LXML_AVAILABLE)
|
|
953
963
|
|
|
954
964
|
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
955
965
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
@@ -1012,7 +1022,7 @@ def _process_html_core(
|
|
|
1012
1022
|
if custom_converters:
|
|
1013
1023
|
converters_map.update(cast("ConvertersMap", custom_converters))
|
|
1014
1024
|
|
|
1015
|
-
if extract_metadata and not convert_as_inline:
|
|
1025
|
+
if extract_metadata and not convert_as_inline and not HOCRProcessor.is_hocr_element_in_soup(source):
|
|
1016
1026
|
metadata = _extract_metadata(source)
|
|
1017
1027
|
metadata_comment = _format_metadata_comment(metadata)
|
|
1018
1028
|
if metadata_comment:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.15.0
|
|
3
|
+
Version: 1.16.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -55,6 +55,7 @@ Your support helps maintain and improve this library for the community.
|
|
|
55
55
|
## Features
|
|
56
56
|
|
|
57
57
|
- **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
|
|
58
|
+
- **HOCR Support**: Automatic detection and processing of HOCR (HTML-based OCR) documents with clean text extraction and proper spacing
|
|
58
59
|
- **Table Support**: Advanced handling of complex tables with rowspan/colspan support
|
|
59
60
|
- **Type Safety**: Strict MyPy adherence with comprehensive type hints
|
|
60
61
|
- **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
|
|
@@ -266,6 +267,63 @@ markdown = convert_to_markdown(html, list_indent_type="tabs")
|
|
|
266
267
|
html_to_markdown --list-indent-type tabs input.html
|
|
267
268
|
```
|
|
268
269
|
|
|
270
|
+
### Working with HOCR Documents
|
|
271
|
+
|
|
272
|
+
HOCR (HTML-based OCR) is a standard format used by OCR software like Tesseract to output structured text with positioning and confidence information. The library automatically detects and processes HOCR documents, extracting clean text while preserving proper spacing and structure.
|
|
273
|
+
|
|
274
|
+
**Python:**
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
from html_to_markdown import convert_to_markdown
|
|
278
|
+
|
|
279
|
+
# HOCR from Tesseract OCR
|
|
280
|
+
hocr_content = """<?xml version="1.0" encoding="UTF-8"?>
|
|
281
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
282
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
283
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
284
|
+
<head>
|
|
285
|
+
<meta name='ocr-system' content='tesseract 5.5.1' />
|
|
286
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
|
287
|
+
</head>
|
|
288
|
+
<body>
|
|
289
|
+
<div class='ocr_page' id='page_1'>
|
|
290
|
+
<div class='ocr_carea' id='block_1_1'>
|
|
291
|
+
<p class='ocr_par' id='par_1_1'>
|
|
292
|
+
<span class='ocr_line' id='line_1_1'>
|
|
293
|
+
<span class='ocrx_word' id='word_1_1'>Hello</span>
|
|
294
|
+
<span class='ocrx_word' id='word_1_2'>world</span>
|
|
295
|
+
</span>
|
|
296
|
+
</p>
|
|
297
|
+
</div>
|
|
298
|
+
</div>
|
|
299
|
+
</body>
|
|
300
|
+
</html>"""
|
|
301
|
+
|
|
302
|
+
# Automatically detected as HOCR and converted to clean text
|
|
303
|
+
markdown = convert_to_markdown(hocr_content)
|
|
304
|
+
print(markdown) # Output: "Hello world"
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
**CLI:**
|
|
308
|
+
|
|
309
|
+
```shell
|
|
310
|
+
# Process HOCR files directly
|
|
311
|
+
tesseract image.png output hocr
|
|
312
|
+
html_to_markdown output.hocr
|
|
313
|
+
|
|
314
|
+
# Or pipe directly from Tesseract
|
|
315
|
+
tesseract image.png - hocr | html_to_markdown
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
**Features:**
|
|
319
|
+
|
|
320
|
+
- **Automatic Detection**: No configuration needed - HOCR documents are detected automatically
|
|
321
|
+
- **Clean Output**: Removes OCR metadata, bounding boxes, and confidence scores
|
|
322
|
+
- **Proper Spacing**: Maintains correct word spacing and text structure
|
|
323
|
+
- **Multi-language Support**: Works with HOCR output in any language
|
|
324
|
+
- **Performance Optimized**: Efficient processing of large OCR documents
|
|
325
|
+
- **Error Resilient**: Handles malformed or incomplete HOCR gracefully
|
|
326
|
+
|
|
269
327
|
## Advanced Usage
|
|
270
328
|
|
|
271
329
|
### Configuration Example
|
|
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "html-to-markdown"
|
|
8
|
-
version = "1.15.0"
|
|
8
|
+
version = "1.16.0"
|
|
9
9
|
description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
keywords = [
|
|
@@ -69,7 +69,7 @@ dev = [
|
|
|
69
69
|
"pytest-benchmark>=5.1",
|
|
70
70
|
"pytest-cov>=7",
|
|
71
71
|
"pytest-mock>=3.15.1",
|
|
72
|
-
"ruff>=0.13.
|
|
72
|
+
"ruff>=0.13.2",
|
|
73
73
|
"types-beautifulsoup4>=4.12.0.20250516",
|
|
74
74
|
"types-psutil>=7.0.0.20250822",
|
|
75
75
|
"uv-bump",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{html_to_markdown-1.15.0 → html_to_markdown-1.16.0}/html_to_markdown.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|