html-to-markdown 1.6.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of html-to-markdown might be problematic.
- html_to_markdown/__init__.py +3 -1
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +375 -645
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +227 -87
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/METADATA +87 -14
- html_to_markdown-1.9.0.dist-info/RECORD +16 -0
- html_to_markdown-1.6.0.dist-info/RECORD +0 -15
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.6.0.dist-info → html_to_markdown-1.9.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED

```diff
@@ -3,18 +3,24 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Mapping
-
+    from collections.abc import Callable, Generator, Mapping
+
 import re
 from contextvars import ContextVar
 from io import StringIO
 from itertools import chain
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
 
-
+try:
+    from html_to_markdown.preprocessor import create_preprocessor
+    from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
+except ImportError:
+    create_preprocessor = None  # type: ignore[assignment]
+    preprocess_fn = None  # type: ignore[assignment]
+
 try:
     import importlib.util
 
```
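The guarded import makes the new `preprocessor` module (added in 1.9.0 with +407 lines) an optional dependency of `processing.py`. Its call shape is confirmed by how `convert_to_markdown` uses it later in this diff; a minimal sketch, assuming the import succeeds:

```python
from html_to_markdown.preprocessor import create_preprocessor, preprocess_html

# create_preprocessor returns a config mapping that preprocess_html accepts
# as keyword arguments -- the same pattern convert_to_markdown uses below.
config = create_preprocessor(preset="standard", remove_navigation=True, remove_forms=True)
cleaned = preprocess_html("<nav>menu</nav><p>content</p>", **config)
```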
```diff
@@ -170,7 +176,7 @@ def _process_tag(
     tag_name: SupportedTag | None = (
         cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
     )
-
+    text_parts: list[str] = []
 
     is_heading = html_heading_re.match(tag.name) is not None
     is_cell = tag_name in {"td", "th"}
@@ -187,33 +193,61 @@ def _process_tag(
     if can_extract and isinstance(el, NavigableString) and not el.strip():
         el.extract()
 
-
+    children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
+
+    # List of tags that return empty string when they have no content
+    empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
+
+    for i, el in enumerate(children):
         if isinstance(el, NavigableString):
-
-
-
-
-
+            # Check if this is whitespace between empty elements
+            if el.strip() == "" and i > 0 and i < len(children) - 1:
+                prev_el = children[i - 1]
+                next_el = children[i + 1]
+
+                # If previous element was a tag that produced empty output
+                # and next element is also a tag that could be empty, skip this whitespace
+                if (
+                    isinstance(prev_el, Tag)
+                    and isinstance(next_el, Tag)
+                    and prev_el.name.lower() in empty_when_no_content_tags
+                    and next_el.name.lower() in empty_when_no_content_tags
+                    and not prev_el.get_text().strip()
+                ):
+                    # Previous tag is empty and next could be empty too, skip this whitespace
+                    continue
+
+            text_parts.append(
+                _process_text(
+                    el=el,
+                    escape_misc=escape_misc,
+                    escape_asterisks=escape_asterisks,
+                    escape_underscores=escape_underscores,
+                )
             )
         elif isinstance(el, Tag):
-
-
-
-
-
-
-
-
-
-
+            current_text = "".join(text_parts)
+            text_parts.append(
+                _process_tag(
+                    el,
+                    converters_map,
+                    convert_as_inline=convert_children_as_inline,
+                    convert=convert,
+                    escape_asterisks=escape_asterisks,
+                    escape_misc=escape_misc,
+                    escape_underscores=escape_underscores,
+                    strip=strip,
+                    context_before=(context_before + current_text)[-2:],
+                )
             )
 
+    text = "".join(text_parts)
+
     if tag_name and should_convert_tag:
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
         )
-
-    # Edge case where the document starts with a \n and then a heading
+
     if is_heading and context_before not in {"", "\n"}:
         n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
         if n_eol_to_add > 0:
```
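The collected `text_parts` buffer replaces incremental string concatenation, and the new guard drops whitespace sandwiched between two inline elements from `empty_when_no_content_tags` when the first of them rendered to nothing. A hedged illustration of the intended effect (assuming the package root re-exports `convert_to_markdown`):

```python
from html_to_markdown import convert_to_markdown  # assumption: re-exported in __init__.py

# <abbr> and <var> are both in empty_when_no_content_tags; since the empty
# <abbr></abbr> converts to nothing, the space separating it from <var></var>
# is skipped too instead of surviving as stray leading whitespace.
print(convert_to_markdown("<p><abbr></abbr> <var></var>done</p>"))
```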
```diff
@@ -233,27 +267,90 @@ def _process_text(
 ) -> str:
     text = str(el) or ""
 
-    # Cache parent lookups to avoid repeated traversal
     parent = el.parent
     parent_name = parent.name if parent else None
 
-    # Build set of ancestor tag names for efficient lookup
-    # Only traverse once instead of multiple find_parent calls
     ancestor_names = set()
     current = parent
     while current and hasattr(current, "name"):
         if current.name:
             ancestor_names.add(current.name)
         current = getattr(current, "parent", None)
-
+
         if len(ancestor_names) > 10:
             break
 
-    # Check for pre ancestor (whitespace handling)
     if "pre" not in ancestor_names:
-
+        # Special case: if the text is only whitespace
+        if text.strip() == "":
+            # If it contains newlines, it's probably indentation whitespace, return empty
+            if "\n" in text:
+                text = ""
+            else:
+                # Check if this whitespace is between block elements
+                # Define block elements that should not have whitespace between them
+                block_elements = {
+                    "p",
+                    "ul",
+                    "ol",
+                    "div",
+                    "blockquote",
+                    "pre",
+                    "h1",
+                    "h2",
+                    "h3",
+                    "h4",
+                    "h5",
+                    "h6",
+                    "table",
+                    "dl",
+                    "hr",
+                    "figure",
+                    "article",
+                    "section",
+                    "nav",
+                    "aside",
+                    "header",
+                    "footer",
+                    "main",
+                    "form",
+                    "fieldset",
+                }
+
+                prev_sibling = el.previous_sibling
+                next_sibling = el.next_sibling
+
+                # Check if whitespace is between block elements
+                if (
+                    prev_sibling
+                    and hasattr(prev_sibling, "name")
+                    and prev_sibling.name in block_elements
+                    and next_sibling
+                    and hasattr(next_sibling, "name")
+                    and next_sibling.name in block_elements
+                ):
+                    # Remove whitespace between block elements
+                    text = ""
+                else:
+                    # Otherwise it's inline whitespace, normalize to single space
+                    text = " " if text else ""
+        else:
+            has_leading_space = text.startswith((" ", "\t"))
+            has_trailing_space = text.endswith((" ", "\t"))
+
+            middle_content = (
+                text[1:-1]
+                if has_leading_space and has_trailing_space
+                else text[1:]
+                if has_leading_space
+                else text[:-1]
+                if has_trailing_space
+                else text
+            )
+
+            middle_content = whitespace_re.sub(" ", middle_content.strip())
+            text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
 
-    # Check for code-like ancestors (escaping)
     if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,
```
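For non-empty text outside `<pre>`, the new branch keeps at most one leading and one trailing space and collapses interior runs. The same transformation as a standalone function (names are illustrative; `whitespace_re` is assumed to match runs of whitespace, as in `html_to_markdown.constants`):

```python
import re

whitespace_re = re.compile(r"\s+")  # assumption: mirrors the package's constant

def normalize_inline(text: str) -> str:
    # Remember whether sentinel spaces existed at either end.
    has_leading = text.startswith((" ", "\t"))
    has_trailing = text.endswith((" ", "\t"))
    middle = (
        text[1:-1]
        if has_leading and has_trailing
        else text[1:]
        if has_leading
        else text[:-1]
        if has_trailing
        else text
    )
    # Collapse interior runs, then re-attach at most one space per side.
    middle = whitespace_re.sub(" ", middle.strip())
    return (" " if has_leading else "") + middle + (" " if has_trailing else "")

assert normalize_inline("  foo\t  bar ") == " foo bar "
```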
```diff
@@ -262,14 +359,12 @@ def _process_text(
         escape_underscores=escape_underscores,
     )
 
-    # List item text processing
     if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()
 
     return text
 
 
-# Context variable for ancestor cache - automatically isolated per conversion
 _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)
 
 
@@ -281,7 +376,6 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
         cache = {}
         _ancestor_cache.set(cache)
 
-    # Check cache first
     if elem_id in cache:
         return cache[elem_id]
 
@@ -293,17 +387,14 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
         if hasattr(current, "name") and current.name:
             ancestor_names.add(current.name)
 
-        # Check if we've already cached this parent's ancestors
         parent_id = id(current)
         if parent_id in cache:
-            # Reuse cached ancestors
             ancestor_names.update(cache[parent_id])
             break
 
         current = getattr(current, "parent", None)
         depth += 1
 
-    # Cache the result
     cache[elem_id] = ancestor_names
     return ancestor_names
 
```
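`_get_ancestor_names` keeps its per-element memo in a `ContextVar`, so concurrent or reentrant conversions never share cache entries. The scheme in miniature (a sketch with a toy `Node`, not the package's types):

```python
from contextvars import ContextVar
from dataclasses import dataclass

@dataclass
class Node:
    name: str
    parent: "Node | None" = None

_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("cache", default=None)

def ancestor_names(node: Node) -> set[str]:
    # Same scheme as _get_ancestor_names: memo keyed by id(), with the dict
    # living in a ContextVar so each conversion context gets its own cache.
    cache = _cache.get()
    if cache is None:
        cache = {}
        _cache.set(cache)
    if id(node) in cache:
        return cache[id(node)]
    names: set[str] = set()
    current = node.parent
    while current is not None:
        names.add(current.name)
        if id(current) in cache:
            names.update(cache[id(current)])  # reuse the parent's cached set
            break
        current = current.parent
    cache[id(node)] = names
    return names

p = Node("p", Node("body", Node("html")))
assert ancestor_names(p) == {"body", "html"}
```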
```diff
@@ -345,33 +436,29 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     """
     metadata = {}
 
-    # Extract title
     title_tag = soup.find("title")
     if title_tag and isinstance(title_tag, Tag) and title_tag.string:
         metadata["title"] = title_tag.string.strip()
 
-    # Extract base href
     base_tag = soup.find("base", href=True)
     if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
         metadata["base-href"] = base_tag["href"]
 
-    # Extract meta tags
     for meta in soup.find_all("meta"):
-        # Handle name-based meta tags
         if meta.get("name") and meta.get("content") is not None:
             name = meta["name"]
             content = meta["content"]
             if isinstance(name, str) and isinstance(content, str):
                 key = f"meta-{name.lower()}"
                 metadata[key] = content
-
+
         elif meta.get("property") and meta.get("content") is not None:
             prop = meta["property"]
             content = meta["content"]
             if isinstance(prop, str) and isinstance(content, str):
                 key = f"meta-{prop.lower().replace(':', '-')}"
                 metadata[key] = content
-
+
         elif meta.get("http-equiv") and meta.get("content") is not None:
             equiv = meta["http-equiv"]
             content = meta["content"]
@@ -379,13 +466,13 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
             key = f"meta-{equiv.lower()}"
             metadata[key] = content
 
-    # Extract canonical link
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
 
-    # Extract
-
+    # Extract link relations
+    link_relations = {"author", "license", "alternate"}
+    for rel_type in link_relations:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):
             metadata[f"link-{rel_type}"] = link["href"]
@@ -407,7 +494,6 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
 
     lines = ["<!--"]
     for key, value in sorted(metadata.items()):
-        # Escape any potential comment closers in the value
         safe_value = value.replace("-->", "--&gt;")
         lines.append(f"{key}: {safe_value}")
     lines.append("-->")
```
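Functionally, `_extract_metadata` and `_format_metadata_comment` still emit one HTML comment ahead of the converted Markdown; 1.9.0 mainly generalizes the `<link rel=...>` handling to a set of relations. Expected shape for a small document, hand-derived from the code above (not captured output):

```python
html = """<html><head>
<title>Example</title>
<meta name="description" content="A demo page">
<link rel="canonical" href="https://example.com/demo">
<link rel="author" href="https://example.com/me">
</head><body><p>Hi</p></body></html>"""

# Keys are emitted sorted, one per line:
# <!--
# canonical: https://example.com/demo
# link-author: https://example.com/me
# meta-description: A demo page
# title: Example
# -->
```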
```diff
@@ -446,6 +532,10 @@ def convert_to_markdown(
     sup_symbol: str = "",
     wrap: bool = False,
     wrap_width: int = 80,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_navigation: bool = True,
+    remove_forms: bool = True,
 ) -> str:
     """Convert HTML to Markdown.
 
@@ -480,6 +570,10 @@ def convert_to_markdown(
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
         wrap: Wrap text to the specified width. Defaults to False.
         wrap_width: The number of characters at which to wrap text. Defaults to 80.
+        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
+        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
+        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
+        remove_forms: Remove form elements during preprocessing. Defaults to True.
 
     Raises:
         ConflictingOptionsError: If both 'strip' and 'convert' are specified.
```
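The four new keyword arguments are the public switchboard for the preprocessor. A usage sketch (assuming `convert_to_markdown` is importable from the package root):

```python
from html_to_markdown import convert_to_markdown

markdown = convert_to_markdown(
    "<nav><a href='/'>Home</a></nav><article><h1>Title</h1><p>Body</p></article>",
    preprocess_html=True,                # opt in; defaults to False
    preprocessing_preset="aggressive",   # "minimal" | "standard" | "aggressive"
    remove_navigation=True,              # drop navigation blocks during preprocessing
    remove_forms=True,                   # drop form elements during preprocessing
)
```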
```diff
@@ -499,27 +593,63 @@ def convert_to_markdown(
         return source
 
     if strip_newlines:
-        # Replace all newlines with spaces before parsing
         source = source.replace("\n", " ").replace("\r", " ")
 
+    # Fix lxml parsing of void elements like <wbr>
+    # lxml incorrectly treats them as container tags
+    source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
+
+    if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
+        config = create_preprocessor(
+            preset=preprocessing_preset,
+            remove_navigation=remove_navigation,
+            remove_forms=remove_forms,
+        )
+        source = preprocess_fn(source, **config)
+
     if "".join(source.split("\n")):
-        # Determine parser to use
         if parser is None:
-            # Auto-detect best available parser
             parser = "lxml" if LXML_AVAILABLE else "html.parser"
 
-        # Validate parser choice
         if parser == "lxml" and not LXML_AVAILABLE:
             raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
 
+        original_source = source if isinstance(source, str) else str(source)
+        needs_leading_whitespace_fix = (
+            parser == "lxml" and isinstance(source, str) and original_source.startswith((" ", "\t", "\n", "\r"))
+        )
+
         source = BeautifulSoup(source, parser)
+
+        if parser == "lxml":
+            body = source.find("body")
+            if body and isinstance(body, Tag):
+                children = list(body.children)
+
+                if (
+                    len(children) == 1
+                    and isinstance(children[0], NavigableString)
+                    and original_source.startswith((" ", "\t", "\n", "\r"))
+                    and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
+                ):
+                    first_child = children[0]
+
+                    leading_ws = ""
+                    for char in original_source:
+                        if char in " \t":
+                            leading_ws += char
+                        else:
+                            break
+
+                    new_text = NavigableString(leading_ws + str(first_child))
+                    first_child.replace_with(new_text)
+                    needs_leading_space_fix = False
     else:
         raise EmptyHtmlError
 
     if strip is not None and convert is not None:
         raise ConflictingOptionsError("strip", "convert")
 
-    # Use streaming processing if requested
     if stream_processing:
         result_chunks = []
         for chunk in convert_to_markdown_stream(
```
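The `<wbr>` fix is a plain regex pass over the raw HTML before parsing, so lxml never sees the bare void tag:

```python
import re

fixed = re.sub(r"<wbr\s*>", "<wbr />", "long<WBR>word", flags=re.IGNORECASE)
assert fixed == "long<wbr />word"
```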
```diff
@@ -555,19 +685,15 @@ def convert_to_markdown(
             chunk_callback(chunk)
             result_chunks.append(chunk)
 
-        # Apply same post-processing as regular path
         result = "".join(result_chunks)
 
-        # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
         result = re.sub(r"\n{3,}", "\n\n", result)
 
-        # Strip all trailing newlines in inline mode
         if convert_as_inline:
             result = result.rstrip("\n")
 
         return result
 
-    # Use shared core with string sink for regular processing
     sink = StringSink()
 
     _process_html_core(
```
```diff
@@ -601,10 +727,54 @@ def convert_to_markdown(
 
     result = sink.get_result()
 
-
+    if (
+        "needs_leading_whitespace_fix" in locals()
+        and needs_leading_whitespace_fix
+        and not result.startswith((" ", "\t", "\n", "\r"))
+    ):
+        original_input = sink.original_source if hasattr(sink, "original_source") else original_source
+        leading_whitespace_match = re.match(r"^[\s]*", original_input)
+        if leading_whitespace_match:
+            leading_whitespace = leading_whitespace_match.group(0)
+
+            # Check if input contains list or heading tags
+            list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
+            if any(tag in original_input for tag in list_heading_tags):
+                leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
+                leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
+
+            if leading_whitespace:
+                result = leading_whitespace + result
+
     result = re.sub(r"\n{3,}", "\n\n", result)
 
-
+    def normalize_spaces_outside_code(text: str) -> str:
+        parts = text.split("```")
+        for i in range(0, len(parts), 2):
+            # Process each line separately to preserve leading spaces
+            lines = parts[i].split("\n")
+            processed_lines = []
+            for line in lines:
+                # Preserve definition list formatting (: followed by 3 spaces)
+                def_parts = re.split(r"(:\s{3})", line)
+                for j in range(0, len(def_parts), 2):
+                    # Only normalize non-definition-list parts
+                    # Also preserve leading spaces (for list indentation)
+                    match = re.match(r"^(\s*)(.*)", def_parts[j])
+                    if match:
+                        leading_spaces, rest = match.groups()
+                        # Only normalize multiple spaces that are not at the beginning
+                        rest = re.sub(r" {3,}", " ", rest)
+                        def_parts[j] = leading_spaces + rest
+                processed_lines.append("".join(def_parts))
+            parts[i] = "\n".join(processed_lines)
+        return "```".join(parts)
+
+    result = normalize_spaces_outside_code(result)
+
+    result = re.sub(r"\*\* {2,}", "** ", result)
+    result = re.sub(r" {2,}\*\*", " **", result)
+
     if convert_as_inline:
         result = result.rstrip("\n")
 
```
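`normalize_spaces_outside_code` splits on triple-backtick fences and only rewrites the even-indexed (outside) segments, so fenced code blocks keep their spacing. Hand-traced behavior per the function above:

```python
text = "a   b\n```\nkeep    these   spaces\n```\nc   d"
# Outside the fences, runs of 3+ spaces collapse to one; inside they pass through:
# "a b\n```\nkeep    these   spaces\n```\nc d"
```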
```diff
@@ -654,25 +824,19 @@ class StreamingSink(OutputSink):
         if not text:
             return
 
-        # Use string concatenation instead of StringIO for better performance
         current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
         current_content += text
 
-        # Yield chunks when buffer is large enough
         while len(current_content) >= self.chunk_size:
-            # Find optimal split point (prefer after newlines)
             split_pos = self._find_split_position(current_content)
 
-            # Extract chunk and update remaining content
             chunk = current_content[:split_pos]
             current_content = current_content[split_pos:]
 
-            # Store chunk and update progress
             self.chunks.append(chunk)
             self.processed_bytes += len(chunk)
             self._update_progress()
 
-        # Update buffer with remaining content
         self.buffer = StringIO()
         if current_content:
             self.buffer.write(current_content)
@@ -692,7 +856,6 @@ class StreamingSink(OutputSink):
 
     def _find_split_position(self, content: str) -> int:
         """Find optimal position to split content for chunks."""
-        # Look for newline within reasonable distance of target size
         target = self.chunk_size
         lookahead = min(100, len(content) - target)
 
```
```diff
@@ -740,11 +903,9 @@ def _process_html_core(
     wrap_width: int,
 ) -> None:
     """Core HTML to Markdown processing logic shared by both regular and streaming."""
-    # Set up a fresh cache for this conversion
     token = _ancestor_cache.set({})
 
     try:
-        # Input validation and preprocessing
         if isinstance(source, str):
             if (
                 heading_style == UNDERLINED
@@ -759,12 +920,9 @@ def _process_html_core(
                 source = source.replace("\n", " ").replace("\r", " ")
 
             if "".join(source.split("\n")):
-                # Determine parser to use
                 if parser is None:
-                    # Auto-detect best available parser
                     parser = "lxml" if LXML_AVAILABLE else "html.parser"
 
-                # Validate parser choice
                 if parser == "lxml" and not LXML_AVAILABLE:
                     raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
 
@@ -775,7 +933,6 @@ def _process_html_core(
         if strip is not None and convert is not None:
             raise ConflictingOptionsError("strip", "convert")
 
-        # Create converters map
         converters_map = create_converters_map(
             autolinks=autolinks,
             bullets=bullets,
@@ -795,18 +952,15 @@ def _process_html_core(
         if custom_converters:
             converters_map.update(cast("ConvertersMap", custom_converters))
 
-        # Extract metadata if requested
         if extract_metadata and not convert_as_inline:
             metadata = _extract_metadata(source)
             metadata_comment = _format_metadata_comment(metadata)
             if metadata_comment:
                 sink.write(metadata_comment)
 
-        # Find the body tag to process only its content
         body = source.find("body")
         elements_to_process = body.children if body and isinstance(body, Tag) else source.children
 
-        # Process elements using shared logic
         context = ""
         for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
             if isinstance(el, NavigableString):
@@ -833,10 +987,8 @@ def _process_html_core(
                 sink.write(text)
                 context += text
 
-        # Finalize output
         sink.finalize()
     finally:
-        # Reset context
         _ancestor_cache.reset(token)
 
 
```
```diff
@@ -909,16 +1061,13 @@ def convert_to_markdown_stream(
     Yields:
         str: Chunks of Markdown-formatted text.
     """
-    # Use shared core with streaming sink
     sink = StreamingSink(chunk_size, progress_callback)
 
-    # Estimate total size for progress reporting
     if isinstance(source, str):
         sink.total_bytes = len(source)
     elif isinstance(source, BeautifulSoup):
         sink.total_bytes = len(str(source))
 
-    # Process using shared core
     _process_html_core(
         source,
         sink,
@@ -948,30 +1097,22 @@ def convert_to_markdown_stream(
         wrap_width=wrap_width,
     )
 
-    # Get all chunks from the sink and apply post-processing
     all_chunks = list(sink.get_chunks())
     combined_result = "".join(all_chunks)
 
-    # Apply same post-processing as regular conversion
-    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
    combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)
 
-    # Strip all trailing newlines in inline mode
     if convert_as_inline:
         combined_result = combined_result.rstrip("\n")
 
-    # Now split the post-processed result back into chunks at good boundaries
     if not combined_result:
         return
 
     pos = 0
     while pos < len(combined_result):
-        # Calculate chunk end position
         end_pos = min(pos + chunk_size, len(combined_result))
 
-        # If not at the end, try to find a good split point
         if end_pos < len(combined_result):
-            # Look for newline within reasonable distance
             search_start = max(pos, end_pos - 50)
             search_end = min(len(combined_result), end_pos + 50)
             search_area = combined_result[search_start:search_end]
@@ -980,7 +1121,6 @@ def convert_to_markdown_stream(
         if newline_pos > 0:
             end_pos = search_start + newline_pos + 1
 
-        # Yield the chunk
         chunk = combined_result[pos:end_pos]
         if chunk:
             yield chunk
```
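With the rewrite, post-processing happens once on the joined result, which is then re-chunked near newline boundaries. Consuming the stream is unchanged; a usage sketch (assuming `convert_to_markdown_stream` is re-exported from the package root):

```python
from html_to_markdown import convert_to_markdown_stream  # assumption: re-exported

pieces = []
for chunk in convert_to_markdown_stream("<h1>Big</h1>" + "<p>para</p>" * 1000, chunk_size=4096):
    pieces.append(chunk)  # chunks split preferentially just after a newline
markdown = "".join(pieces)
```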
html_to_markdown/utils.py
CHANGED

```diff
@@ -6,18 +6,25 @@ from html_to_markdown.constants import line_beginning_re
 
 
 def chomp(text: str) -> tuple[str, str, str]:
-    """
-
+    """Simplified whitespace handling for inline elements.
+
+    For semantic markdown output, preserves leading/trailing spaces as single spaces
+    and normalizes internal whitespace.
 
     Args:
         text: The text to chomp.
 
     Returns:
-        A tuple containing the prefix, suffix, and the
+        A tuple containing the prefix, suffix, and the normalized text.
     """
-
-
+    if not text:
+        return "", "", ""
+
+    prefix = " " if text.startswith((" ", "\t")) else ""
+    suffix = " " if text.endswith((" ", "\t")) else ""
+
     text = text.strip()
+
     return prefix, suffix, text
 
 
```
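After the rewrite, `chomp` reduces leading/trailing spaces or tabs to single-space prefix/suffix markers and strips them from the text itself; note the body leaves interior whitespace runs for callers to normalize. Expected behavior, hand-derived from the code above:

```python
from html_to_markdown.utils import chomp

assert chomp("") == ("", "", "")
assert chomp("  bold text ") == (" ", " ", "bold text")
assert chomp("\tcode") == (" ", "", "code")
```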