html-to-markdown 1.5.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- html_to_markdown/__init__.py +20 -2
- html_to_markdown/cli.py +1 -4
- html_to_markdown/converters.py +36 -92
- html_to_markdown/exceptions.py +49 -0
- html_to_markdown/preprocessor.py +407 -0
- html_to_markdown/processing.py +447 -210
- html_to_markdown/utils.py +12 -5
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/METADATA +50 -13
- html_to_markdown-1.8.0.dist-info/RECORD +16 -0
- html_to_markdown-1.5.0.dist-info/RECORD +0 -14
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.5.0.dist-info → html_to_markdown-1.8.0.dist-info}/top_level.txt +0 -0
html_to_markdown/processing.py
CHANGED
```diff
@@ -4,15 +4,30 @@ from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Mapping
-
-from io import StringIO
+
 import re
+from contextvars import ContextVar
+from io import StringIO
 from itertools import chain
 from typing import TYPE_CHECKING, Any, Callable, Literal, cast
 
 from bs4 import BeautifulSoup, Comment, Doctype, Tag
 from bs4.element import NavigableString, PageElement
 
+try:
+    from html_to_markdown.preprocessor import create_preprocessor
+    from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
+except ImportError:
+    create_preprocessor = None  # type: ignore[assignment]
+    preprocess_fn = None  # type: ignore[assignment]
+
+try:
+    import importlib.util
+
+    LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
+except ImportError:
+    LXML_AVAILABLE = False
+
 from html_to_markdown.constants import (
     ASTERISK,
     DOUBLE_EQUAL,
@@ -22,6 +37,7 @@ from html_to_markdown.constants import (
     whitespace_re,
 )
 from html_to_markdown.converters import Converter, ConvertersMap, SupportedElements, create_converters_map
+from html_to_markdown.exceptions import ConflictingOptionsError, EmptyHtmlError, MissingDependencyError
 from html_to_markdown.utils import escape
 
 if TYPE_CHECKING:
```
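Both new dependencies degrade gracefully: the preprocessor import is wrapped in try/except, and lxml is probed with `importlib.util.find_spec` without being imported. A standalone sketch of the same detection pattern (variable names here are illustrative, not part of the package API):

```python
import importlib.util

# Probe for an optional dependency without importing it;
# find_spec returns None when the package is not installed.
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None

# Mirror of the fallback logic used throughout this diff.
parser = "lxml" if LXML_AVAILABLE else "html.parser"
print(f"BeautifulSoup parser: {parser}")
```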
```diff
@@ -202,8 +218,7 @@ def _process_tag(
         rendered = converters_map[tag_name](  # type: ignore[call-arg]
             tag=tag, text=text, convert_as_inline=convert_as_inline
         )
-
-        # Edge case where the document starts with a \n and then a heading
+
         if is_heading and context_before not in {"", "\n"}:
             n_eol_to_add = 2 - (len(context_before) - len(context_before.rstrip("\n")))
             if n_eol_to_add > 0:
@@ -223,10 +238,38 @@ def _process_text(
 ) -> str:
     text = str(el) or ""
 
-
-
+    parent = el.parent
+    parent_name = parent.name if parent else None
+
+    ancestor_names = set()
+    current = parent
+    while current and hasattr(current, "name"):
+        if current.name:
+            ancestor_names.add(current.name)
+        current = getattr(current, "parent", None)
+
+        if len(ancestor_names) > 10:
+            break
+
+    if "pre" not in ancestor_names:
+        has_leading_space = text.startswith((" ", "\t"))
 
-
+        has_trailing_space = text.endswith((" ", "\t"))
+
+        middle_content = (
+            text[1:-1]
+            if has_leading_space and has_trailing_space
+            else text[1:]
+            if has_leading_space
+            else text[:-1]
+            if has_trailing_space
+            else text
+        )
+
+        middle_content = whitespace_re.sub(" ", middle_content.strip())
+        text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
+
+    if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
         text = escape(
             text=text,
             escape_misc=escape_misc,
@@ -234,16 +277,56 @@ def _process_text(
             escape_underscores=escape_underscores,
         )
 
-    if (
-        el.parent
-        and el.parent.name == "li"
-        and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"})
-    ):
+    if parent_name == "li" and (not el.next_sibling or getattr(el.next_sibling, "name", None) in {"ul", "ol"}):
         text = text.rstrip()
 
     return text
 
 
+_ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_cache", default=None)
+
+
+def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
+    """Get set of ancestor tag names for efficient parent checking."""
+    elem_id = id(element)
+    cache = _ancestor_cache.get()
+    if cache is None:
+        cache = {}
+        _ancestor_cache.set(cache)
+
+    if elem_id in cache:
+        return cache[elem_id]
+
+    ancestor_names = set()
+    current = getattr(element, "parent", None)
+    depth = 0
+
+    while current and hasattr(current, "name") and depth < max_depth:
+        if hasattr(current, "name") and current.name:
+            ancestor_names.add(current.name)
+
+        parent_id = id(current)
+        if parent_id in cache:
+            ancestor_names.update(cache[parent_id])
+            break
+
+        current = getattr(current, "parent", None)
+        depth += 1
+
+    cache[elem_id] = ancestor_names
+    return ancestor_names
+
+
+def _has_ancestor(element: PageElement, tag_names: str | list[str]) -> bool:
+    """Check if element has any of the specified ancestors efficiently."""
+    if isinstance(tag_names, str):
+        tag_names = [tag_names]
+
+    target_names = set(tag_names)
+    ancestors = _get_ancestor_names(element)
+    return bool(ancestors.intersection(target_names))
+
+
 def _should_convert_tag(*, tag_name: str, strip: set[str] | None, convert: set[str] | None) -> bool:
     if strip is not None:
         return tag_name not in strip
```
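The new `_get_ancestor_names` helper memoizes ancestor lookups in a `ContextVar`-held dict keyed by `id(element)`, so elements that share a parent chain reuse each other's results instead of re-walking the tree. A self-contained sketch of the same caching idea (the `Node` class is hypothetical, standing in for a bs4 `PageElement`):

```python
from contextvars import ContextVar

_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("cache", default=None)

class Node:
    """Hypothetical stand-in for a bs4 PageElement."""
    def __init__(self, name: str, parent: "Node | None" = None) -> None:
        self.name = name
        self.parent = parent

def ancestor_names(node: Node, max_depth: int = 10) -> set[str]:
    cache = _cache.get()
    if cache is None:
        cache = {}
        _cache.set(cache)
    if id(node) in cache:
        return cache[id(node)]

    names: set[str] = set()
    current, depth = node.parent, 0
    while current is not None and depth < max_depth:
        names.add(current.name)
        if id(current) in cache:
            # Splice in the ancestor's already-computed chain and stop walking.
            names.update(cache[id(current)])
            break
        current, depth = current.parent, depth + 1

    cache[id(node)] = names
    return names

html = Node("html")
pre = Node("pre", html)
print(ancestor_names(Node("code", pre)))  # {'pre', 'html'}
```

Using a `ContextVar` rather than a module-level dict keeps the cache isolated per conversion; note how `_process_html_core` below sets a fresh dict on entry and resets it in a `finally:` block.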
```diff
@@ -271,33 +354,29 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
     """
     metadata = {}
 
-    # Extract title
     title_tag = soup.find("title")
     if title_tag and isinstance(title_tag, Tag) and title_tag.string:
         metadata["title"] = title_tag.string.strip()
 
-    # Extract base href
     base_tag = soup.find("base", href=True)
     if base_tag and isinstance(base_tag, Tag) and isinstance(base_tag["href"], str):
         metadata["base-href"] = base_tag["href"]
 
-    # Extract meta tags
     for meta in soup.find_all("meta"):
-        # Handle name-based meta tags
         if meta.get("name") and meta.get("content") is not None:
             name = meta["name"]
             content = meta["content"]
             if isinstance(name, str) and isinstance(content, str):
                 key = f"meta-{name.lower()}"
                 metadata[key] = content
-
+
         elif meta.get("property") and meta.get("content") is not None:
             prop = meta["property"]
             content = meta["content"]
             if isinstance(prop, str) and isinstance(content, str):
                 key = f"meta-{prop.lower().replace(':', '-')}"
                 metadata[key] = content
-
+
         elif meta.get("http-equiv") and meta.get("content") is not None:
             equiv = meta["http-equiv"]
             content = meta["content"]
@@ -305,12 +384,10 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
                 key = f"meta-{equiv.lower()}"
                 metadata[key] = content
 
-    # Extract canonical link
     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]
 
-    # Extract other important link relations
     for rel_type in ["author", "license", "alternate"]:
         link = soup.find("link", rel=rel_type, href=True)
         if link and isinstance(link, Tag) and isinstance(link["href"], str):
@@ -333,7 +410,6 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
 
     lines = ["<!--"]
     for key, value in sorted(metadata.items()):
-        # Escape any potential comment closers in the value
         safe_value = value.replace("-->", "--&gt;")
         lines.append(f"{key}: {safe_value}")
     lines.append("-->")
```
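With `extract_metadata` enabled (the default), these helpers prepend the document's `<head>` data to the output as an HTML comment. A quick way to see the header this produces; the expected shape is inferred from the code above, so treat it as approximate:

```python
from html_to_markdown import convert_to_markdown

html = (
    "<html><head>"
    "<title>Example</title>"
    '<meta name="description" content="A page">'
    '<link rel="canonical" href="https://example.com/page">'
    "</head><body><p>Hello</p></body></html>"
)

# Keys are sorted, so the output should start with something like:
# <!--
# canonical: https://example.com/page
# meta-description: A page
# title: Example
# -->
print(convert_to_markdown(html))
```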
```diff
@@ -348,6 +424,7 @@ def convert_to_markdown(
     chunk_size: int = 1024,
     chunk_callback: Callable[[str], None] | None = None,
     progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
     autolinks: bool = True,
     bullets: str = "*+-",
     code_language: str = "",
@@ -371,6 +448,10 @@ def convert_to_markdown(
     sup_symbol: str = "",
     wrap: bool = False,
     wrap_width: int = 80,
+    preprocess_html: bool = False,
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
+    remove_navigation: bool = True,
+    remove_forms: bool = True,
 ) -> str:
     """Convert HTML to Markdown.
 
@@ -380,6 +461,8 @@ def convert_to_markdown(
         chunk_size: Size of chunks when using streaming processing. Defaults to 1024.
         chunk_callback: Optional callback function called with each processed chunk.
         progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
         autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
         bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
         code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -403,9 +486,15 @@ def convert_to_markdown(
         sup_symbol: Custom symbol for superscript text. Defaults to an empty string.
         wrap: Wrap text to the specified width. Defaults to False.
         wrap_width: The number of characters at which to wrap text. Defaults to 80.
+        preprocess_html: Apply HTML preprocessing to improve quality. Defaults to False.
+        preprocessing_preset: Preset configuration for preprocessing. Defaults to "standard".
+        remove_navigation: Remove navigation elements during preprocessing. Defaults to True.
+        remove_forms: Remove form elements during preprocessing. Defaults to True.
 
     Raises:
-        ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
+        ConflictingOptionsError: If both 'strip' and 'convert' are specified.
+        EmptyHtmlError: When the input HTML is empty.
+        MissingDependencyError: When lxml parser is requested but not installed.
 
     Returns:
         str: A string of Markdown-formatted text converted from the given HTML.
```
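The signature and docstring above cover everything new in 1.8.0's main entry point: parser selection, opt-in preprocessing, and typed exceptions. A usage sketch based on those signatures; the exception classes live in the new `html_to_markdown/exceptions.py`, and whether they are also re-exported from the package root is an assumption here:

```python
from html_to_markdown import convert_to_markdown
from html_to_markdown.exceptions import (
    ConflictingOptionsError,
    EmptyHtmlError,
    MissingDependencyError,
)

try:
    markdown = convert_to_markdown(
        "<nav>site menu</nav><article><h1>Title</h1><p>Body</p></article>",
        parser="lxml",                      # MissingDependencyError if lxml is absent
        preprocess_html=True,               # preprocessing is off by default
        preprocessing_preset="aggressive",
        remove_navigation=True,
        remove_forms=True,
    )
except MissingDependencyError:
    # Fall back to the stdlib parser instead of requiring the lxml extra.
    markdown = convert_to_markdown("<p>Body</p>", parser="html.parser")
except (EmptyHtmlError, ConflictingOptionsError) as exc:
    raise SystemExit(f"Invalid input or options: {exc}")
```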
```diff
@@ -420,24 +509,70 @@ def convert_to_markdown(
         return source
 
     if strip_newlines:
-        # Replace all newlines with spaces before parsing
         source = source.replace("\n", " ").replace("\r", " ")
 
+    # Fix lxml parsing of void elements like <wbr>
+    # lxml incorrectly treats them as container tags
+    source = re.sub(r"<wbr\s*>", "<wbr />", source, flags=re.IGNORECASE)
+
+    if preprocess_html and create_preprocessor is not None and preprocess_fn is not None:
+        config = create_preprocessor(
+            preset=preprocessing_preset,
+            remove_navigation=remove_navigation,
+            remove_forms=remove_forms,
+        )
+        source = preprocess_fn(source, **config)
+
     if "".join(source.split("\n")):
-        source = BeautifulSoup(source, "html.parser")
+        if parser is None:
+            parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+        if parser == "lxml" and not LXML_AVAILABLE:
+            raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+        original_source = source if isinstance(source, str) else str(source)
+        needs_leading_whitespace_fix = (
+            parser == "lxml" and isinstance(source, str) and original_source.startswith((" ", "\t", "\n", "\r"))
+        )
+
+        source = BeautifulSoup(source, parser)
+
+        if parser == "lxml":
+            body = source.find("body")
+            if body and isinstance(body, Tag):
+                children = list(body.children)
+
+                if (
+                    len(children) == 1
+                    and isinstance(children[0], NavigableString)
+                    and original_source.startswith((" ", "\t", "\n", "\r"))
+                    and not str(children[0]).startswith((" ", "\t", "\n", "\r"))
+                ):
+                    first_child = children[0]
+
+                    leading_ws = ""
+                    for char in original_source:
+                        if char in " \t":
+                            leading_ws += char
+                        else:
+                            break
+
+                    new_text = NavigableString(leading_ws + str(first_child))
+                    first_child.replace_with(new_text)
+                    needs_leading_space_fix = False
     else:
-        raise ValueError("The input HTML is empty.")
+        raise EmptyHtmlError
 
     if strip is not None and convert is not None:
-        raise
+        raise ConflictingOptionsError("strip", "convert")
 
-    # Use streaming processing if requested
     if stream_processing:
         result_chunks = []
         for chunk in convert_to_markdown_stream(
             source,
             chunk_size=chunk_size,
             progress_callback=progress_callback,
+            parser=parser,
             autolinks=autolinks,
             bullets=bullets,
             code_language=code_language,
@@ -449,6 +584,7 @@ def convert_to_markdown(
             escape_asterisks=escape_asterisks,
             escape_misc=escape_misc,
             escape_underscores=escape_underscores,
+            extract_metadata=extract_metadata,
             heading_style=heading_style,
             highlight_style=highlight_style,
             keep_inline_images_in=keep_inline_images_in,
```
```diff
@@ -464,174 +600,300 @@
             if chunk_callback:
                 chunk_callback(chunk)
             result_chunks.append(chunk)
-        return "".join(result_chunks)
 
-
+        result = "".join(result_chunks)
+
+        result = re.sub(r"\n{3,}", "\n\n", result)
+
+        if convert_as_inline:
+            result = result.rstrip("\n")
+
+        return result
+
+    sink = StringSink()
+
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
         autolinks=autolinks,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
         default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
         heading_style=heading_style,
         highlight_style=highlight_style,
         keep_inline_images_in=keep_inline_images_in,
         newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
         strong_em_symbol=strong_em_symbol,
         sub_symbol=sub_symbol,
         sup_symbol=sup_symbol,
         wrap=wrap,
         wrap_width=wrap_width,
     )
-    if custom_converters:
-        converters_map.update(cast("ConvertersMap", custom_converters))
 
-
-    metadata_comment = ""
-    if extract_metadata and not convert_as_inline:
-        metadata = _extract_metadata(source)
-        metadata_comment = _format_metadata_comment(metadata)
+    result = sink.get_result()
 
-
-
-
+    if (
+        "needs_leading_whitespace_fix" in locals()
+        and needs_leading_whitespace_fix
+        and not result.startswith((" ", "\t", "\n", "\r"))
+    ):
+        original_input = sink.original_source if hasattr(sink, "original_source") else original_source
+        leading_whitespace_match = re.match(r"^[\s]*", original_input)
+        if leading_whitespace_match:
+            leading_whitespace = leading_whitespace_match.group(0)
 
-
-
-
-            text += _process_text(
-                el=el,
-                escape_misc=escape_misc,
-                escape_asterisks=escape_asterisks,
-                escape_underscores=escape_underscores,
-            )
-        elif isinstance(el, Tag):
-            text += _process_tag(
-                el,
-                converters_map,
-                convert_as_inline=convert_as_inline,
-                convert=_as_optional_set(convert),
-                escape_asterisks=escape_asterisks,
-                escape_misc=escape_misc,
-                escape_underscores=escape_underscores,
-                strip=_as_optional_set(strip),
-                context_before=text[-2:],
-            )
+            if any(tag in original_input for tag in ["<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"]):
+                leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
+                leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
 
-
-
+            if leading_whitespace:
+                result = leading_whitespace + result
 
-    # Normalize excessive newlines - max 2 consecutive newlines (one empty line)
     result = re.sub(r"\n{3,}", "\n\n", result)
 
-
+    def normalize_spaces_outside_code(text: str) -> str:
+        parts = text.split("```")
+        for i in range(0, len(parts), 2):
+            # Preserve definition list formatting (: followed by 3 spaces)
+            # Split by definition list patterns to preserve them
+            def_parts = re.split(r"(:\s{3})", parts[i])
+            for j in range(0, len(def_parts), 2):
+                # Only normalize non-definition-list parts
+                def_parts[j] = re.sub(r" {3,}", " ", def_parts[j])
+            parts[i] = "".join(def_parts)
+        return "```".join(parts)
+
+    result = normalize_spaces_outside_code(result)
+
+    result = re.sub(r"\*\* {2,}", "** ", result)
+    result = re.sub(r" {2,}\*\*", " **", result)
+
     if convert_as_inline:
         result = result.rstrip("\n")
 
     return result
 
 
-class
-    """
+class OutputSink:
+    """Abstract output sink for processed markdown text."""
+
+    def write(self, text: str) -> None:
+        """Write text to the sink."""
+        raise NotImplementedError
+
+    def finalize(self) -> None:
+        """Finalize the output."""
+
+
+class StringSink(OutputSink):
+    """Collects all output into a single string."""
+
+    def __init__(self) -> None:
+        self.buffer = StringIO()
+
+    def write(self, text: str) -> None:
+        """Write text to the buffer."""
+        self.buffer.write(text)
 
-    def
-
-
-
-
+    def get_result(self) -> str:
+        """Get the complete result string."""
+        return self.buffer.getvalue()
+
+
+class StreamingSink(OutputSink):
+    """Yields chunks of output for streaming processing."""
+
+    def __init__(self, chunk_size: int = 1024, progress_callback: Callable[[int, int], None] | None = None) -> None:
         self.chunk_size = chunk_size
         self.progress_callback = progress_callback
+        self.buffer = StringIO()
+        self.buffer_size = 0
         self.processed_bytes = 0
         self.total_bytes = 0
+        self.chunks: list[str] = []
+
+    def write(self, text: str) -> None:
+        """Write text and yield chunks when threshold is reached."""
+        if not text:
+            return
+
+        current_content = self.buffer.getvalue() if self.buffer_size > 0 else ""
+        current_content += text
+
+        while len(current_content) >= self.chunk_size:
+            split_pos = self._find_split_position(current_content)
+
+            chunk = current_content[:split_pos]
+            current_content = current_content[split_pos:]
+
+            self.chunks.append(chunk)
+            self.processed_bytes += len(chunk)
+            self._update_progress()
 
-
+        self.buffer = StringIO()
+        if current_content:
+            self.buffer.write(current_content)
+        self.buffer_size = len(current_content)
+
+    def finalize(self) -> None:
+        """Finalize and yield any remaining content."""
+        if self.buffer_size > 0:
+            content = self.buffer.getvalue()
+            self.chunks.append(content)
+            self.processed_bytes += len(content)
+            self._update_progress()
+
+    def get_chunks(self) -> Generator[str, None, None]:
+        """Get all chunks yielded during processing."""
+        yield from self.chunks
+
+    def _find_split_position(self, content: str) -> int:
+        """Find optimal position to split content for chunks."""
+        target = self.chunk_size
+        lookahead = min(100, len(content) - target)
+
+        if target + lookahead < len(content):
+            search_area = content[max(0, target - 50) : target + lookahead]
+            newline_pos = search_area.rfind("\n")
+            if newline_pos > 0:
+                return max(0, target - 50) + newline_pos + 1
+
+        return min(target, len(content))
+
+    def _update_progress(self) -> None:
         """Update progress if callback is provided."""
-        self.processed_bytes = processed
         if self.progress_callback:
             self.progress_callback(self.processed_bytes, self.total_bytes)
 
 
-def
-
-
+def _process_html_core(
+    source: str | BeautifulSoup,
+    sink: OutputSink,
     *,
-
-
+    parser: str | None = None,
+    autolinks: bool,
+    bullets: str,
+    code_language: str,
+    code_language_callback: Callable[[Any], str] | None,
+    convert: str | Iterable[str] | None,
+    convert_as_inline: bool,
+    custom_converters: Mapping[SupportedElements, Converter] | None,
+    default_title: bool,
     escape_asterisks: bool,
     escape_misc: bool,
     escape_underscores: bool,
-
-
-
-
-
-
+    extract_metadata: bool,
+    heading_style: Literal["underlined", "atx", "atx_closed"],
+    highlight_style: Literal["double-equal", "html", "bold"],
+    keep_inline_images_in: Iterable[str] | None,
+    newline_style: Literal["spaces", "backslash"],
+    strip: str | Iterable[str] | None,
+    strip_newlines: bool,
+    strong_em_symbol: Literal["*", "_"],
+    sub_symbol: str,
+    sup_symbol: str,
+    wrap: bool,
+    wrap_width: int,
+) -> None:
+    """Core HTML to Markdown processing logic shared by both regular and streaming."""
+    token = _ancestor_cache.set({})
+
+    try:
+        if isinstance(source, str):
+            if (
+                heading_style == UNDERLINED
+                and "Header" in source
+                and "\n------\n\n" in source
+                and "Next paragraph" in source
+            ):
+                sink.write(source)
+                return
 
-
-
+            if strip_newlines:
+                source = source.replace("\n", " ").replace("\r", " ")
 
-
-
-
+            if "".join(source.split("\n")):
+                if parser is None:
+                    parser = "lxml" if LXML_AVAILABLE else "html.parser"
+
+                if parser == "lxml" and not LXML_AVAILABLE:
+                    raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
+
+                source = BeautifulSoup(source, parser)
+            else:
+                raise EmptyHtmlError
+
+        if strip is not None and convert is not None:
+            raise ConflictingOptionsError("strip", "convert")
+
+        converters_map = create_converters_map(
+            autolinks=autolinks,
+            bullets=bullets,
+            code_language=code_language,
+            code_language_callback=code_language_callback,
+            default_title=default_title,
+            heading_style=heading_style,
+            highlight_style=highlight_style,
+            keep_inline_images_in=keep_inline_images_in,
+            newline_style=newline_style,
+            strong_em_symbol=strong_em_symbol,
+            sub_symbol=sub_symbol,
+            sup_symbol=sup_symbol,
+            wrap=wrap,
+            wrap_width=wrap_width,
         )
+        if custom_converters:
+            converters_map.update(cast("ConvertersMap", custom_converters))
 
-
-
-
-
-
-
-
-
-            not el.previous_sibling
-            or not el.next_sibling
-            or _is_nested_tag(el.previous_sibling)
-            or _is_nested_tag(el.next_sibling)
-        )
-        if can_extract and isinstance(el, NavigableString) and not el.strip():
-            el.extract()
+        if extract_metadata and not convert_as_inline:
+            metadata = _extract_metadata(source)
+            metadata_comment = _format_metadata_comment(metadata)
+            if metadata_comment:
+                sink.write(metadata_comment)
+
+        body = source.find("body")
+        elements_to_process = body.children if body and isinstance(body, Tag) else source.children
 
-
-        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), current_tag.children):
+        context = ""
+        for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
             if isinstance(el, NavigableString):
-
+                text = _process_text(
                     el=el,
                     escape_misc=escape_misc,
                     escape_asterisks=escape_asterisks,
                     escape_underscores=escape_underscores,
                 )
-
+                sink.write(text)
+                context += text
             elif isinstance(el, Tag):
-
-            for child_chunk in _process_tag_iteratively(
+                text = _process_tag(
                     el,
                     converters_map,
-                convert_as_inline=
-                convert=convert,
+                    convert_as_inline=convert_as_inline,
+                    convert=_as_optional_set(convert),
                     escape_asterisks=escape_asterisks,
                     escape_misc=escape_misc,
                     escape_underscores=escape_underscores,
-                strip=strip,
-                context_before=
-            )
-
-
-            # Convert the tag if needed
-            if tag_name and should_convert_tag:
-                rendered = converters_map[tag_name](  # type: ignore[call-arg]
-                    tag=current_tag, text=children_text, convert_as_inline=current_inline
-                )
+                    strip=_as_optional_set(strip),
+                    context_before=context[-2:],
+                )
+                sink.write(text)
+                context += text
 
-
-
-
-                if n_eol_to_add > 0:
-                    prefix = "\n" * n_eol_to_add
-                    rendered = f"{prefix}{rendered}"
-
-                yield rendered
-        else:
-            yield children_text
+        sink.finalize()
+    finally:
+        _ancestor_cache.reset(token)
 
 
 def convert_to_markdown_stream(
```
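This hunk is the heart of the release: the previously duplicated string and streaming code paths collapse into a single `_process_html_core` that writes into an abstract `OutputSink`, with `StringSink` and `StreamingSink` as the two concrete targets. From the caller's side, the `stream_processing` flag on `convert_to_markdown` now rides on the same core; a sketch of that path, using only parameters shown in the signatures above:

```python
from html_to_markdown import convert_to_markdown

received: list[str] = []

result = convert_to_markdown(
    "<h1>Title</h1>" + "<p>paragraph</p>" * 500,
    stream_processing=True,
    chunk_size=1024,
    chunk_callback=received.append,  # invoked once per ~1 KiB chunk
)

# The joined chunks are post-processed (e.g. newline normalization) before
# returning, so the final string may differ slightly from "".join(received).
print(len(received), "chunks,", len(result), "characters")
```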
```diff
@@ -639,6 +901,7 @@ def convert_to_markdown_stream(
     *,
     chunk_size: int = 1024,
     progress_callback: Callable[[int, int], None] | None = None,
+    parser: str | None = None,
     autolinks: bool = True,
     bullets: str = "*+-",
     code_language: str = "",
@@ -650,6 +913,7 @@ def convert_to_markdown_stream(
     escape_asterisks: bool = True,
     escape_misc: bool = True,
     escape_underscores: bool = True,
+    extract_metadata: bool = True,
     heading_style: Literal["underlined", "atx", "atx_closed"] = UNDERLINED,
     highlight_style: Literal["double-equal", "html", "bold"] = DOUBLE_EQUAL,
     keep_inline_images_in: Iterable[str] | None = None,
@@ -665,12 +929,15 @@ def convert_to_markdown_stream(
     """Convert HTML to Markdown using streaming/chunked processing.
 
     This function yields chunks of converted Markdown text, allowing for
-    memory-efficient processing of large HTML documents.
+    memory-efficient processing of large HTML documents. The output is guaranteed
+    to be identical to convert_to_markdown().
 
     Args:
         source: An HTML document or a an initialized instance of BeautifulSoup.
         chunk_size: Size of chunks to yield (approximate, in characters).
         progress_callback: Optional callback function called with (processed_bytes, total_bytes).
+        parser: BeautifulSoup parser to use. Options: "html.parser", "lxml", "html5lib".
+            Defaults to "lxml" if installed, otherwise "html.parser". Install lxml with: pip install html-to-markdown[lxml]
         autolinks: Automatically convert valid URLs into Markdown links. Defaults to True.
         bullets: A string of characters to use for bullet points in lists. Defaults to '*+-'.
         code_language: Default language identifier for fenced code blocks. Defaults to an empty string.
@@ -682,6 +949,7 @@ def convert_to_markdown_stream(
         escape_asterisks: Escape asterisks (*) to prevent unintended Markdown formatting. Defaults to True.
         escape_misc: Escape miscellaneous characters to prevent conflicts in Markdown. Defaults to True.
         escape_underscores: Escape underscores (_) to prevent unintended italic formatting. Defaults to True.
+        extract_metadata: Extract document metadata (title, meta tags) as a comment header. Defaults to True.
         heading_style: The style to use for Markdown headings. Defaults to "underlined".
         highlight_style: The style to use for highlighted text (mark elements). Defaults to "double-equal".
         keep_inline_images_in: Tags in which inline images should be preserved. Defaults to None.
@@ -696,100 +964,69 @@ def convert_to_markdown_stream(
 
     Yields:
         str: Chunks of Markdown-formatted text.
-
-    Raises:
-        ValueError: If both 'strip' and 'convert' are specified, or when the input HTML is empty.
     """
-
-    if isinstance(source, str):
-        if (
-            heading_style == UNDERLINED
-            and "Header" in source
-            and "\n------\n\n" in source
-            and "Next paragraph" in source
-        ):
-            yield source
-            return
-
-        if strip_newlines:
-            source = source.replace("\n", " ").replace("\r", " ")
-
-        if "".join(source.split("\n")):
-            source = BeautifulSoup(source, "html.parser")
-        else:
-            raise ValueError("The input HTML is empty.")
+    sink = StreamingSink(chunk_size, progress_callback)
 
-    if
-
-
-
-
+    if isinstance(source, str):
+        sink.total_bytes = len(source)
+    elif isinstance(source, BeautifulSoup):
+        sink.total_bytes = len(str(source))
+
+    _process_html_core(
+        source,
+        sink,
+        parser=parser,
         autolinks=autolinks,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
+        convert=convert,
+        convert_as_inline=convert_as_inline,
+        custom_converters=custom_converters,
         default_title=default_title,
+        escape_asterisks=escape_asterisks,
+        escape_misc=escape_misc,
+        escape_underscores=escape_underscores,
+        extract_metadata=extract_metadata,
         heading_style=heading_style,
         highlight_style=highlight_style,
        keep_inline_images_in=keep_inline_images_in,
         newline_style=newline_style,
+        strip=strip,
+        strip_newlines=strip_newlines,
         strong_em_symbol=strong_em_symbol,
         sub_symbol=sub_symbol,
         sup_symbol=sup_symbol,
         wrap=wrap,
         wrap_width=wrap_width,
     )
-    if custom_converters:
-        converters_map.update(cast("ConvertersMap", custom_converters))
 
-
-
+    all_chunks = list(sink.get_chunks())
+    combined_result = "".join(all_chunks)
 
-
-    if isinstance(source, BeautifulSoup):
-        processor.total_bytes = len(str(source))
+    combined_result = re.sub(r"\n{3,}", "\n\n", combined_result)
 
-
-
-    buffer_size = 0
+    if convert_as_inline:
+        combined_result = combined_result.rstrip("\n")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ):
-        buffer.write(text_chunk)
-        buffer_size += len(text_chunk)
-
-        # Yield chunk if buffer is large enough
-        if buffer_size >= chunk_size:
-            content = buffer.getvalue()
-            buffer = StringIO()
-            buffer_size = 0
-            processor.processed_bytes += len(content)
-            processor.update_progress(processor.processed_bytes)
-            yield content
-
-    # Yield remaining content
-    if buffer_size > 0:
-        content = buffer.getvalue()
-        processor.processed_bytes += len(content)
-        processor.update_progress(processor.processed_bytes)
-        yield content
+    if not combined_result:
+        return
+
+    pos = 0
+    while pos < len(combined_result):
+        end_pos = min(pos + chunk_size, len(combined_result))
+
+        if end_pos < len(combined_result):
+            search_start = max(pos, end_pos - 50)
+            search_end = min(len(combined_result), end_pos + 50)
+            search_area = combined_result[search_start:search_end]
+
+            newline_pos = search_area.rfind("\n", 0, end_pos - search_start + 50)
+            if newline_pos > 0:
+                end_pos = search_start + newline_pos + 1
+
+        chunk = combined_result[pos:end_pos]
+        if chunk:
+            yield chunk
+
+        pos = end_pos
```