html-to-markdown 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of html-to-markdown might be problematic.
- html_to_markdown/__init__.py +3 -1
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +23 -86
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +111 -67
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/METADATA +2 -1
- html_to_markdown-1.8.0.dist-info/RECORD +16 -0
- html_to_markdown-1.6.0.dist-info/RECORD +0 -15
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING

 if TYPE_CHECKING:
     from collections.abc import Generator, Mapping
-
+
 import re
 from contextvars import ContextVar
 from io import StringIO

@@ -14,7 +14,13 @@ from typing import TYPE_CHECKING, Any, Callable, Literal, cast
 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement

-
+try:
+    from html_to_markdown.preprocessor import create_preprocessor
+    from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
+except ImportError:
+    create_preprocessor = None  # type: ignore[assignment]
+    preprocess_fn = None  # type: ignore[assignment]
+
 try:
     import importlib.util

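The new imports are wrapped so the preprocessor stays optional: if preprocessor.py (or a dependency it needs) is unavailable, both names fall back to None and the feature is skipped downstream. The same guard pattern in isolation, with hypothetical names:

```python
try:
    from my_package.optional_feature import enhance  # hypothetical optional module
except ImportError:
    enhance = None  # feature unavailable; callers must check for None

def process(data: str) -> str:
    # Degrade gracefully instead of raising at import time.
    if enhance is not None:
        data = enhance(data)
    return data
```
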
@@ -212,8 +218,7 @@ def _process_tag(
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
         )
-
-    # Edge case where the document starts with a \n and then a heading
+
     if is_heading and context_before not in {"", "\n"}:
         n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
         if n_eol_to_add > 0:

@@ -233,27 +238,37 @@ def _process_text(
 ) -> str:
     text = str(el) or ""

-    # Cache parent lookups to avoid repeated traversal
     parent = el.parent
     parent_name = parent.name if parent else None

-    # Build set of ancestor tag names for efficient lookup
-    # Only traverse once instead of multiple find_parent calls
     ancestor_names = set()
     current = parent
     while current and hasattr(current, "name"):
         if current.name:
             ancestor_names.add(current.name)
         current = getattr(current, "parent", None)
-
+
         if len(ancestor_names) > 10:
             break

-    # Check for pre ancestor (whitespace handling)
     if "pre" not in ancestor_names:
-
+        has_leading_space = text.startswith((" ", "\t"))
+
+        has_trailing_space = text.endswith((" ", "\t"))
+
+        middle_content = (
+            text[1:-1]
+            if has_leading_space and has_trailing_space
+            else text[1:]
+            if has_leading_space
+            else text[:-1]
+            if has_trailing_space
+            else text
+        )
+
+        middle_content = whitespace_re.sub(" ", middle_content.strip())
+        text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")

-    # Check for code-like ancestors (escaping)
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,

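The hunk above replaces the old whitespace pass in `_process_text` with explicit bookkeeping: remember whether the node had a leading or trailing space, collapse everything in between, then re-attach single spaces. A standalone sketch of the same idea (the `whitespace_re` pattern is an assumption approximating the module-level regex; the `text[1:-1]` slicing is elided here because `strip()` subsumes it):

```python
import re

# Assumption: approximates the library's module-level whitespace_re.
whitespace_re = re.compile(r"[\t \r\n]+")

def collapse_inline_whitespace(text: str) -> str:
    # Keep at most one leading and one trailing space; collapse interior
    # whitespace runs to single spaces, as the new branch above does.
    has_leading_space = text.startswith((" ", "\t"))
    has_trailing_space = text.endswith((" ", "\t"))
    middle = whitespace_re.sub(" ", text.strip())
    return (" " if has_leading_space else "") + middle + (" " if has_trailing_space else "")

print(repr(collapse_inline_whitespace("  hello\t\n  world ")))  # ' hello world '
```
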
@@ -262,14 +277,12 @@ def _process_text(
             escape_underscores=escape_underscores,
         )

-    # List item text processing
     if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()

     return text


-# Context variable for ancestor cache - automatically isolated per conversion
 _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)


@@ -281,7 +294,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
         cache = {}
         _ancestor_cache.set(cache)

-    # Check cache first
     if elem_id in cache:
         return cache[elem_id]

@@ -293,17 +305,14 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
         if hasattr(current, "name") and current.name:
             ancestor_names.add(current.name)

-        # Check if we've already cached this parent's ancestors
         parent_id = id(current)
         if parent_id in cache:
-            # Reuse cached ancestors
             ancestor_names.update(cache[parent_id])
             break

         current = getattr(current, "parent", None)
         depth += 1

-    # Cache the result
     cache[elem_id] = ancestor_names
     return ancestor_names

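`_get_ancestor_names` memoizes per element id into a dict held by the `_ancestor_cache` ContextVar, and `_process_html_core` (further down) sets a fresh dict per conversion and resets it in a `finally`. A minimal sketch of that isolation pattern, with hypothetical names:

```python
from contextvars import ContextVar, copy_context
from typing import Optional

_cache: ContextVar[Optional[dict]] = ContextVar("cache", default=None)

def convert(doc: str) -> str:
    token = _cache.set({})  # fresh cache for this conversion
    try:
        cache = _cache.get()
        cache[id(doc)] = doc.upper()  # stand-in for ancestor-name caching
        return cache[id(doc)]
    finally:
        _cache.reset(token)  # the cache never leaks past the conversion

print(copy_context().run(convert, "hello"))  # HELLO
print(_cache.get())  # None: the calling context never saw the cache
```

Because each thread or asyncio task gets its own context, concurrent conversions never share cache state without any locking.
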
@@ -345,33 +354,29 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     """
     metadata = {}

-    # Extract title
     title_tag = soup.find("title")
     if title_tag and isinstance(title_tag, Tag) and title_tag.string:
         metadata["title"] = title_tag.string.strip()

-    # Extract base href
     base_tag = soup.find("base", href=True)
     if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
         metadata["base-href"] = base_tag["href"]

-    # Extract meta tags
     for meta in soup.find_all("meta"):
-        # Handle name-based meta tags
         if meta.get("name") and meta.get("content") is not None:
             name = meta["name"]
             content = meta["content"]
             if isinstance(name, str) and isinstance(content, str):
                 key = f"meta-{name.lower()}"
                 metadata[key] = content
-
+
         elif meta.get("property") and meta.get("content") is not None:
             prop = meta["property"]
             content = meta["content"]
             if isinstance(prop, str) and isinstance(content, str):
                 key = f"meta-{prop.lower().replace(':', '-')}"
                 metadata[key] = content
-
+
         elif meta.get("http-equiv") and meta.get("content") is not None:
             equiv = meta["http-equiv"]
             content = meta["content"]

@@ -379,12 +384,10 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
                 key = f"meta-{equiv.lower()}"
                 metadata[key] = content

-    # Extract canonical link
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]

-    # Extract other important link relations
     for rel_type in ["author", "license", "alternate"]:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):

@@ -407,7 +410,6 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:

     lines = ["<!--"]
     for key, value in sorted(metadata.items()):
-        # Escape any potential comment closers in the value
         safe_value = value.replace("-->", "--&gt;")
         lines.append(f"{key}: {safe_value}")
     lines.append("-->")

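Taken together, `_extract_metadata` and `_format_metadata_comment` prepend an HTML comment to the Markdown output. A sketch of the expected shape for a small page, assuming `extract_metadata` remains enabled by default (this diff does not show the default):

```python
from html_to_markdown import convert_to_markdown

html = (
    "<html><head>"
    "<title>Example</title>"
    '<meta name="description" content="A page">'
    '<link rel="canonical" href="https://example.com/page">'
    "</head><body><p>Hi</p></body></html>"
)

# Per the sorted(metadata.items()) loop above, the output should begin:
# <!--
# canonical: https://example.com/page
# meta-description: A page
# title: Example
# -->
print(convert_to_markdown(html))
```
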
@@ -446,6 +448,10 @@ def convert_to_markdown(
     sup_symbol: str = "",
     wrap: bool = False,
     wrap_width: int = 80,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_navigation: bool = True,
+    remove_forms: bool = True,
 ) -> str:
     """Convert HTML to Markdown.

@@ -480,6 +486,10 @@ def convert_to_markdown(
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
         wrap: Wrap text to the specified width. Defaults to False.
         wrap_width: The number of characters at which to wrap text. Defaults to 80.
+        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
+        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
+        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
+        remove_forms: Remove form elements during preprocessing. Defaults to True.

     Raises:
         ConflictingOptionsError: If both 'strip' and 'convert' are specified.

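The four new keyword arguments expose the optional preprocessor added in this release. A usage sketch based only on the signature and docstring above (the actual cleanup behavior lives in the new preprocessor.py, presumably backed by the new nh3 dependency listed in METADATA below):

```python
from html_to_markdown import convert_to_markdown

html = """
<nav><a href="/">Home</a><a href="/about">About</a></nav>
<h1>Title</h1>
<p>Body text.</p>
<form><input name="q"></form>
"""

markdown = convert_to_markdown(
    html,
    preprocess_html=True,               # opt in; defaults to False
    preprocessing_preset="aggressive",  # "minimal" | "standard" | "aggressive"
    remove_navigation=True,             # drop navigation chrome
    remove_forms=True,                  # drop form elements
)
print(markdown)
```
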
@@ -499,27 +509,63 @@ def convert_to_markdown(
         return source

     if strip_newlines:
-        # Replace all newlines with spaces before parsing
         source = source.replace("\n", " ").replace("\r", " ")

+    # Fix lxml parsing of void elements like <wbr>
+    # lxml incorrectly treats them as container tags
+    source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
+
+    if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
+        config = create_preprocessor(
+            preset=preprocessing_preset,
+            remove_navigation=remove_navigation,
+            remove_forms=remove_forms,
+        )
+        source = preprocess_fn(source, **config)
+
     if "".join(source.split("\n")):
-        # Determine parser to use
         if parser is None:
-            # Auto-detect best available parser
             parser = "lxml" if LXML_AVAILABLE else "html.parser"

-        # Validate parser choice
         if parser == "lxml" and not LXML_AVAILABLE:
             raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")

+        original_source = source if isinstance(source, str) else str(source)
+        needs_leading_whitespace_fix = (
+            parser == "lxml" and isinstance(source, str) and original_source.startswith((" ", "\t", "\n", "\r"))
+        )
+
         source = BeautifulSoup(source, parser)
+
+        if parser == "lxml":
+            body = source.find("body")
+            if body and isinstance(body, Tag):
+                children = list(body.children)
+
+                if (
+                    len(children) == 1
+                    and isinstance(children[0], NavigableString)
+                    and original_source.startswith((" ", "\t", "\n", "\r"))
+                    and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
+                ):
+                    first_child = children[0]
+
+                    leading_ws = ""
+                    for char in original_source:
+                        if char in " \t":
+                            leading_ws += char
+                        else:
+                            break
+
+                    new_text = NavigableString(leading_ws + str(first_child))
+                    first_child.replace_with(new_text)
+                    needs_leading_space_fix = False
     else:
         raise EmptyHtmlError

     if strip is not None and convert is not None:
         raise ConflictingOptionsError("strip", "convert")

-    # Use streaming processing if requested
     if stream_processing:
         result_chunks = []
         for chunk in convert_to_markdown_stream(

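The `<wbr>` normalization in this hunk is a single regex; a quick check of what it rewrites (the IGNORECASE flag also catches variants like `<WBR >`):

```python
import re

source = "super<wbr>cali<WBR >fragilistic"
# Rewrite bare void tags to self-closing form before lxml parses them.
fixed = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
print(fixed)  # super<wbr />cali<wbr />fragilistic
```
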
@@ -555,19 +601,15 @@ def convert_to_markdown(
                 chunk_callback(chunk)
             result_chunks.append(chunk)

-        # Apply same post-processing as regular path
         result = "".join(result_chunks)

-        # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
         result = re.sub(r"\n{3,}", "\n\n", result)

-        # Strip all trailing newlines in inline mode
         if convert_as_inline:
             result = result.rstrip("\n")

         return result

-    # Use shared core with string sink for regular processing
     sink = StringSink()

     _process_html_core(

@@ -601,10 +643,42 @@ def convert_to_markdown(

     result = sink.get_result()

-
+    if (
+        "needs_leading_whitespace_fix" in locals()
+        and needs_leading_whitespace_fix
+        and not result.startswith((" ", "\t", "\n", "\r"))
+    ):
+        original_input = sink.original_source if hasattr(sink, "original_source") else original_source
+        leading_whitespace_match = re.match(r"^[\s]*", original_input)
+        if leading_whitespace_match:
+            leading_whitespace = leading_whitespace_match.group(0)
+
+            if any(tag in original_input for tag in ["<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"]):
+                leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
+                leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
+
+            if leading_whitespace:
+                result = leading_whitespace + result
+
     result = re.sub(r"\n{3,}", "\n\n", result)

-
+    def normalize_spaces_outside_code(text: str) -> str:
+        parts = text.split("```")
+        for i in range(0, len(parts), 2):
+            # Preserve definition list formatting (: followed by 3 spaces)
+            # Split by definition list patterns to preserve them
+            def_parts = re.split(r"(:\s{3})", parts[i])
+            for j in range(0, len(def_parts), 2):
+                # Only normalize non-definition-list parts
+                def_parts[j] = re.sub(r" {3,}", " ", def_parts[j])
+            parts[i] = "".join(def_parts)
+        return "```".join(parts)
+
+    result = normalize_spaces_outside_code(result)
+
+    result = re.sub(r"\*\* {2,}", "** ", result)
+    result = re.sub(r" {2,}\*\*", " **", result)
+
     if convert_as_inline:
         result = result.rstrip("\n")

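`normalize_spaces_outside_code` leans on a simple invariant: splitting on a triple-backtick fence leaves prose at even indices and fenced code at odd indices. A stripped-down sketch without the definition-list special case (the FENCE constant only avoids literal triple backticks inside this snippet):

```python
import re

FENCE = "`" * 3

def normalize_spaces_outside_code(text: str) -> str:
    # Even indices are prose between fences; odd indices are fenced code
    # and must keep their spacing untouched.
    parts = text.split(FENCE)
    for i in range(0, len(parts), 2):
        parts[i] = re.sub(r" {3,}", " ", parts[i])
    return FENCE.join(parts)

sample = f"a    b\n{FENCE}\nkeep    this    spacing\n{FENCE}\nc    d"
print(normalize_spaces_outside_code(sample))
```
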
@@ -654,25 +728,19 @@ class StreamingSink(OutputSink):
         if not text:
             return

-        # Use string concatenation instead of StringIO for better performance
         current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
         current_content += text

-        # Yield chunks when buffer is large enough
         while len(current_content) >= self.chunk_size:
-            # Find optimal split point (prefer after newlines)
             split_pos = self._find_split_position(current_content)

-            # Extract chunk and update remaining content
             chunk = current_content[:split_pos]
             current_content = current_content[split_pos:]

-            # Store chunk and update progress
             self.chunks.append(chunk)
             self.processed_bytes += len(chunk)
             self._update_progress()

-        # Update buffer with remaining content
         self.buffer = StringIO()
         if current_content:
             self.buffer.write(current_content)

@@ -692,7 +760,6 @@ class StreamingSink(OutputSink):

     def _find_split_position(self, content: str) -> int:
         """Find optimal position to split content for chunks."""
-        # Look for newline within reasonable distance of target size
         target = self.chunk_size
         lookahead = min(100, len(content) - target)

@@ -740,11 +807,9 @@ def _process_html_core(
     wrap_width: int,
 ) -> None:
     """Core HTML to Markdown processing logic shared by both regular and streaming."""
-    # Set up a fresh cache for this conversion
     token = _ancestor_cache.set({})

     try:
-        # Input validation and preprocessing
         if isinstance(source, str):
             if (
                 heading_style == UNDERLINED

@@ -759,12 +824,9 @@ def _process_html_core(
                 source = source.replace("\n", " ").replace("\r", " ")

         if "".join(source.split("\n")):
-            # Determine parser to use
             if parser is None:
-                # Auto-detect best available parser
                 parser = "lxml" if LXML_AVAILABLE else "html.parser"

-            # Validate parser choice
             if parser == "lxml" and not LXML_AVAILABLE:
                 raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")

@@ -775,7 +837,6 @@ def _process_html_core(
     if strip is not None and convert is not None:
         raise ConflictingOptionsError("strip", "convert")

-    # Create converters map
     converters_map = create_converters_map(
         autolinks=autolinks,
         bullets=bullets,

@@ -795,18 +856,15 @@ def _process_html_core(
     if custom_converters:
         converters_map.update(cast("ConvertersMap", custom_converters))

-    # Extract metadata if requested
     if extract_metadata and not convert_as_inline:
         metadata = _extract_metadata(source)
         metadata_comment = _format_metadata_comment(metadata)
         if metadata_comment:
             sink.write(metadata_comment)

-    # Find the body tag to process only its content
     body = source.find("body")
     elements_to_process = body.children if body and isinstance(body, Tag) else source.children

-    # Process elements using shared logic
     context = ""
     for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
         if isinstance(el, NavigableString):

@@ -833,10 +891,8 @@ def _process_html_core(
             sink.write(text)
             context += text

-        # Finalize output
         sink.finalize()
     finally:
-        # Reset context
         _ancestor_cache.reset(token)

@@ -909,16 +965,13 @@ def convert_to_markdown_stream(
     Yields:
         str: Chunks of Markdown-formatted text.
     """
-    # Use shared core with streaming sink
     sink = StreamingSink(chunk_size, progress_callback)

-    # Estimate total size for progress reporting
     if isinstance(source, str):
         sink.total_bytes = len(source)
     elif isinstance(source, BeautifulSoup):
         sink.total_bytes = len(str(source))

-    # Process using shared core
     _process_html_core(
         source,
         sink,

@@ -948,30 +1001,22 @@ def convert_to_markdown_stream(
         wrap_width=wrap_width,
     )

-    # Get all chunks from the sink and apply post-processing
    all_chunks = list(sink.get_chunks())
    combined_result = "".join(all_chunks)

-    # Apply same post-processing as regular conversion
-    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
    combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)

-    # Strip all trailing newlines in inline mode
    if convert_as_inline:
        combined_result = combined_result.rstrip("\n")

-    # Now split the post-processed result back into chunks at good boundaries
    if not combined_result:
        return

    pos = 0
    while pos < len(combined_result):
-        # Calculate chunk end position
        end_pos = min(pos + chunk_size, len(combined_result))

-        # If not at the end, try to find a good split point
        if end_pos < len(combined_result):
-            # Look for newline within reasonable distance
            search_start = max(pos, end_pos - 50)
            search_end = min(len(combined_result), end_pos + 50)
            search_area = combined_result[search_start:search_end]

@@ -980,7 +1025,6 @@ def convert_to_markdown_stream(
        if newline_pos > 0:
            end_pos = search_start + newline_pos + 1

-        # Yield the chunk
        chunk = combined_result[pos:end_pos]
        if chunk:
            yield chunk
html_to_markdown/utils.py
CHANGED
@@ -6,18 +6,25 @@ from html_to_markdown.constants import line_beginning_re


 def chomp(text: str) -> tuple[str, str, str]:
-    """
-
+    """Simplified whitespace handling for inline elements.
+
+    For semantic markdown output, preserves leading/trailing spaces as single spaces
+    and normalizes internal whitespace.

     Args:
         text: The text to chomp.

     Returns:
-        A tuple containing the prefix, suffix, and the
+        A tuple containing the prefix, suffix, and the normalized text.
     """
-
-
+    if not text:
+        return "", "", ""
+
+    prefix = " " if text.startswith((" ", "\t")) else ""
+    suffix = " " if text.endswith((" ", "\t")) else ""
+
     text = text.strip()
+
     return prefix, suffix, text

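The rewritten `chomp` is now a pure three-way split. Note that, per the new body above, only the ends are trimmed; interior spacing is left alone even though the new docstring mentions normalizing internal whitespace:

```python
from html_to_markdown.utils import chomp

prefix, suffix, text = chomp("  hello   world\t")
print(repr(prefix), repr(suffix), repr(text))  # ' ' ' ' 'hello   world'
```
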
{html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.6.0
+Version: 1.8.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT

@@ -32,6 +32,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: beautifulsoup4>=4.13.4
+Requires-Dist: nh3>=0.2.21
 Provides-Extra: lxml
 Requires-Dist: lxml>=5; extra == "lxml"
 Dynamic: license-file

html_to_markdown-1.8.0.dist-info/RECORD
ADDED

@@ -0,0 +1,16 @@
+html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
+html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
+html_to_markdown/cli.py,sha256=8xlgSEcnqsSM_dr1TCSgPDAo09YvUtO78PvDFivFFdg,6973
+html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
+html_to_markdown/converters.py,sha256=COC2KqPelJlMCY5eXUS5gdiPOG8Yzx0U719FeXPw3GA,55514
+html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
+html_to_markdown/preprocessor.py,sha256=S4S1ZfLC_hkJVgmA5atImTyWQDOxfHctPbaep2QtyrQ,11248
+html_to_markdown/processing.py,sha256=wkbhLg42U3aeVQSZFuzGt5irtN037XzRKpCE71QYZXI,36520
+html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html_to_markdown/utils.py,sha256=QgWPzmpZKFd6wDTe8IY3gbVT3xNzoGV3PBgd17J0O-w,2066
+html_to_markdown-1.8.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.8.0.dist-info/METADATA,sha256=6pgiK4p0A77axLfD8MH1EGgzifP06koVV8KWS_5-iYk,17175
+html_to_markdown-1.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.8.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.8.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.8.0.dist-info/RECORD,,

html_to_markdown-1.6.0.dist-info/RECORD
DELETED

@@ -1,15 +0,0 @@
-html_to_markdown/__init__.py,sha256=-JFtH1mquoU_FLgAvq2NUvaeI0HUWd2lnoinimh5wxM,586
-html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
-html_to_markdown/cli.py,sha256=WzQVr97jKECEZwW-xIJofSl3v4EhqU-De7XRQjmgc08,7179
-html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
-html_to_markdown/converters.py,sha256=z7vphGLAGKn1f8T3xJojfKCdGbzKdof3LyjKTTmwkQo,59694
-html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
-html_to_markdown/processing.py,sha256=S3EtjDG9xM4WcIzPEgVDrey04eT33OS2LOPwu6AhZT0,35107
-html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-html_to_markdown/utils.py,sha256=HJUDej5HSpXRtYv-OkCyD0hwnPnVfQCwY6rBRlIOt9s,1989
-html_to_markdown-1.6.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.6.0.dist-info/METADATA,sha256=xLpWliFQDooUVrxxN_SaA4gXy7GixPakOdJal0iC7RQ,17148
-html_to_markdown-1.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.6.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
-html_to_markdown-1.6.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.6.0.dist-info/RECORD,,

{html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/WHEEL
File without changes

{html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/entry_points.txt
File without changes

{html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/licenses/LICENSE
File without changes

{html_to_markdown-1.6.0.dist-info → html_to_markdown-1.8.0.dist-info}/top_level.txt
File without changes