html-to-markdown 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -3,13 +3,13 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING
4
4
 
5
5
  if TYPE_CHECKING:
6
- from collections.abc import Generator, Mapping
6
+ from collections.abc import Callable, Generator, Mapping
7
7
 
8
8
  import re
9
9
  from contextvars import ContextVar
10
10
  from io import StringIO
11
11
  from itertools import chain
12
- from typing import TYPE_CHECKING, Any, Callable, Literal, cast
12
+ from typing import TYPE_CHECKING, Any, Literal, cast
13
13
 
14
14
  from bs4 import BeautifulSoup, Comment, Doctype, Tag
15
15
  from bs4.element import NavigableString, PageElement
@@ -176,7 +176,7 @@ def _process_tag(
176
176
  tag_name: SupportedTag | None = (
177
177
  cast("SupportedTag", tag.name.lower()) if tag.name.lower() in converters_map else None
178
178
  )
179
- text = ""
179
+ text_parts: list[str] = []
180
180
 
181
181
  is_heading = html_heading_re.match(tag.name) is not None
182
182
  is_cell = tag_name in {"td", "th"}
@@ -193,27 +193,56 @@ def _process_tag(
193
193
  if can_extract and isinstance(el, NavigableString) and not el.strip():
194
194
  el.extract()
195
195
 
196
- for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children):
196
+ children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
197
+
198
+ # List of tags that return empty string when they have no content
199
+ empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
200
+
201
+ for i, el in enumerate(children):
197
202
  if isinstance(el, NavigableString):
198
- text += _process_text(
199
- el=el,
200
- escape_misc=escape_misc,
201
- escape_asterisks=escape_asterisks,
202
- escape_underscores=escape_underscores,
203
+ # Check if this is whitespace between empty elements
204
+ if el.strip() == "" and i > 0 and i < len(children) - 1:
205
+ prev_el = children[i - 1]
206
+ next_el = children[i + 1]
207
+
208
+ # If previous element was a tag that produced empty output
209
+ # and next element is also a tag that could be empty, skip this whitespace
210
+ if (
211
+ isinstance(prev_el, Tag)
212
+ and isinstance(next_el, Tag)
213
+ and prev_el.name.lower() in empty_when_no_content_tags
214
+ and next_el.name.lower() in empty_when_no_content_tags
215
+ and not prev_el.get_text().strip()
216
+ ):
217
+ # Previous tag is empty and next could be empty too, skip this whitespace
218
+ continue
219
+
220
+ text_parts.append(
221
+ _process_text(
222
+ el=el,
223
+ escape_misc=escape_misc,
224
+ escape_asterisks=escape_asterisks,
225
+ escape_underscores=escape_underscores,
226
+ )
203
227
  )
204
228
  elif isinstance(el, Tag):
205
- text += _process_tag(
206
- el,
207
- converters_map,
208
- convert_as_inline=convert_children_as_inline,
209
- convert=convert,
210
- escape_asterisks=escape_asterisks,
211
- escape_misc=escape_misc,
212
- escape_underscores=escape_underscores,
213
- strip=strip,
214
- context_before=(context_before + text)[-2:],
229
+ current_text = "".join(text_parts)
230
+ text_parts.append(
231
+ _process_tag(
232
+ el,
233
+ converters_map,
234
+ convert_as_inline=convert_children_as_inline,
235
+ convert=convert,
236
+ escape_asterisks=escape_asterisks,
237
+ escape_misc=escape_misc,
238
+ escape_underscores=escape_underscores,
239
+ strip=strip,
240
+ context_before=(context_before + current_text)[-2:],
241
+ )
215
242
  )
216
243
 
244
+ text = "".join(text_parts)
245
+
217
246
  if tag_name and should_convert_tag:
218
247
  rendered = converters_map[tag_name]( # type: ignore[call-arg]
219
248
  tag=tag, text=text, convert_as_inline=convert_as_inline
@@ -252,22 +281,75 @@ def _process_text(
252
281
  break
253
282
 
254
283
  if "pre" not in ancestor_names:
255
- has_leading_space = text.startswith((" ", "\t"))
256
-
257
- has_trailing_space = text.endswith((" ", "\t"))
258
-
259
- middle_content = (
260
- text[1:-1]
261
- if has_leading_space and has_trailing_space
262
- else text[1:]
263
- if has_leading_space
264
- else text[:-1]
265
- if has_trailing_space
266
- else text
267
- )
284
+ # Special case: if the text is only whitespace
285
+ if text.strip() == "":
286
+ # If it contains newlines, it's probably indentation whitespace, return empty
287
+ if "\n" in text:
288
+ text = ""
289
+ else:
290
+ # Check if this whitespace is between block elements
291
+ # Define block elements that should not have whitespace between them
292
+ block_elements = {
293
+ "p",
294
+ "ul",
295
+ "ol",
296
+ "div",
297
+ "blockquote",
298
+ "pre",
299
+ "h1",
300
+ "h2",
301
+ "h3",
302
+ "h4",
303
+ "h5",
304
+ "h6",
305
+ "table",
306
+ "dl",
307
+ "hr",
308
+ "figure",
309
+ "article",
310
+ "section",
311
+ "nav",
312
+ "aside",
313
+ "header",
314
+ "footer",
315
+ "main",
316
+ "form",
317
+ "fieldset",
318
+ }
319
+
320
+ prev_sibling = el.previous_sibling
321
+ next_sibling = el.next_sibling
322
+
323
+ # Check if whitespace is between block elements
324
+ if (
325
+ prev_sibling
326
+ and hasattr(prev_sibling, "name")
327
+ and prev_sibling.name in block_elements
328
+ and next_sibling
329
+ and hasattr(next_sibling, "name")
330
+ and next_sibling.name in block_elements
331
+ ):
332
+ # Remove whitespace between block elements
333
+ text = ""
334
+ else:
335
+ # Otherwise it's inline whitespace, normalize to single space
336
+ text = " " if text else ""
337
+ else:
338
+ has_leading_space = text.startswith((" ", "\t"))
339
+ has_trailing_space = text.endswith((" ", "\t"))
340
+
341
+ middle_content = (
342
+ text[1:-1]
343
+ if has_leading_space and has_trailing_space
344
+ else text[1:]
345
+ if has_leading_space
346
+ else text[:-1]
347
+ if has_trailing_space
348
+ else text
349
+ )
268
350
 
269
- middle_content = whitespace_re.sub(" ", middle_content.strip())
270
- text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
351
+ middle_content = whitespace_re.sub(" ", middle_content.strip())
352
+ text = (" " if has_leading_space else "") + middle_content + (" " if has_trailing_space else "")
271
353
 
272
354
  if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
273
355
  text = escape(
@@ -388,7 +470,9 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
388
470
  if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
389
471
  metadata["canonical"] = canonical["href"]
390
472
 
391
- for rel_type in ["author", "license", "alternate"]:
473
+ # Extract link relations
474
+ link_relations = {"author", "license", "alternate"}
475
+ for rel_type in link_relations:
392
476
  link = soup.find("link", rel=rel_type, href=True)
393
477
  if link and isinstance(link, Tag) and isinstance(link["href"], str):
394
478
  metadata[f"link-{rel_type}"] = link["href"]
@@ -653,7 +737,9 @@ def convert_to_markdown(
653
737
  if leading_whitespace_match:
654
738
  leading_whitespace = leading_whitespace_match.group(0)
655
739
 
656
- if any(tag in original_input for tag in ["<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"]):
740
+ # Check if input contains list or heading tags
741
+ list_heading_tags = {"<ol", "<ul", "<li", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6"}
742
+ if any(tag in original_input for tag in list_heading_tags):
657
743
  leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
658
744
  leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
659
745
 
@@ -665,13 +751,23 @@ def convert_to_markdown(
665
751
  def normalize_spaces_outside_code(text: str) -> str:
666
752
  parts = text.split("```")
667
753
  for i in range(0, len(parts), 2):
668
- # Preserve definition list formatting (: followed by 3 spaces)
669
- # Split by definition list patterns to preserve them
670
- def_parts = re.split(r"(:\s{3})", parts[i])
671
- for j in range(0, len(def_parts), 2):
672
- # Only normalize non-definition-list parts
673
- def_parts[j] = re.sub(r" {3,}", " ", def_parts[j])
674
- parts[i] = "".join(def_parts)
754
+ # Process each line separately to preserve leading spaces
755
+ lines = parts[i].split("\n")
756
+ processed_lines = []
757
+ for line in lines:
758
+ # Preserve definition list formatting (: followed by 3 spaces)
759
+ def_parts = re.split(r"(:\s{3})", line)
760
+ for j in range(0, len(def_parts), 2):
761
+ # Only normalize non-definition-list parts
762
+ # Also preserve leading spaces (for list indentation)
763
+ match = re.match(r"^(\s*)(.*)", def_parts[j])
764
+ if match:
765
+ leading_spaces, rest = match.groups()
766
+ # Only normalize multiple spaces that are not at the beginning
767
+ rest = re.sub(r" {3,}", " ", rest)
768
+ def_parts[j] = leading_spaces + rest
769
+ processed_lines.append("".join(def_parts))
770
+ parts[i] = "\n".join(processed_lines)
675
771
  return "```".join(parts)
676
772
 
677
773
  result = normalize_spaces_outside_code(result)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.8.0
3
+ Version: 1.9.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -15,7 +15,6 @@ Classifier: Intended Audience :: Developers
15
15
  Classifier: License :: OSI Approved :: MIT License
16
16
  Classifier: Operating System :: OS Independent
17
17
  Classifier: Programming Language :: Python :: 3 :: Only
18
- Classifier: Programming Language :: Python :: 3.9
19
18
  Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3.11
21
20
  Classifier: Programming Language :: Python :: 3.12
@@ -28,13 +27,13 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
28
27
  Classifier: Topic :: Text Processing :: Markup :: Markdown
29
28
  Classifier: Topic :: Utilities
30
29
  Classifier: Typing :: Typed
31
- Requires-Python: >=3.9
30
+ Requires-Python: >=3.10
32
31
  Description-Content-Type: text/markdown
33
32
  License-File: LICENSE
34
33
  Requires-Dist: beautifulsoup4>=4.13.4
35
- Requires-Dist: nh3>=0.2.21
34
+ Requires-Dist: nh3>=0.3
36
35
  Provides-Extra: lxml
37
- Requires-Dist: lxml>=5; extra == "lxml"
36
+ Requires-Dist: lxml>=6; extra == "lxml"
38
37
  Dynamic: license-file
39
38
 
40
39
  # html-to-markdown
@@ -46,6 +45,7 @@ Python 3.9+.
46
45
  ## Features
47
46
 
48
47
  - **Full HTML5 Support**: Comprehensive support for all modern HTML5 elements including semantic, form, table, ruby, interactive, structural, SVG, and math elements
48
+ - **Enhanced Table Support**: Advanced handling of merged cells with rowspan/colspan support for better table representation
49
49
  - **Type Safety**: Strict MyPy adherence with comprehensive type hints
50
50
  - **Metadata Extraction**: Automatic extraction of document metadata (title, meta tags) as comment headers
51
51
  - **Streaming Support**: Memory-efficient processing for large documents with progress callbacks
@@ -55,7 +55,7 @@ Python 3.9+.
55
55
  - **CLI Tool**: Full-featured command-line interface with all API options exposed
56
56
  - **Custom Converters**: Extensible converter system for custom HTML tag handling
57
57
  - **BeautifulSoup Integration**: Support for pre-configured BeautifulSoup instances
58
- - **Extensive Test Coverage**: 100% test coverage requirement with comprehensive test suite
58
+ - **Comprehensive Test Coverage**: 91%+ test coverage with 623+ comprehensive tests
59
59
 
60
60
  ## Installation
61
61
 
@@ -203,6 +203,51 @@ print(markdown)
203
203
 
204
204
  Custom converters take precedence over the built-in converters and can be used alongside other configuration options.
205
205
 
206
+ ### Enhanced Table Support
207
+
208
+ The library now provides better handling of complex tables with merged cells:
209
+
210
+ ```python
211
+ from html_to_markdown import convert_to_markdown
212
+
213
+ # HTML table with merged cells
214
+ html = """
215
+ <table>
216
+ <tr>
217
+ <th rowspan="2">Category</th>
218
+ <th colspan="2">Sales Data</th>
219
+ </tr>
220
+ <tr>
221
+ <th>Q1</th>
222
+ <th>Q2</th>
223
+ </tr>
224
+ <tr>
225
+ <td>Product A</td>
226
+ <td>$100K</td>
227
+ <td>$150K</td>
228
+ </tr>
229
+ </table>
230
+ """
231
+
232
+ markdown = convert_to_markdown(html)
233
+ print(markdown)
234
+ ```
235
+
236
+ Output:
237
+
238
+ ```markdown
239
+ | Category | Sales Data | |
240
+ | --- | --- | --- |
241
+ | | Q1 | Q2 |
242
+ | Product A | $100K | $150K |
243
+ ```
244
+
245
+ The library handles:
246
+
247
+ - **Rowspan**: Inserts empty cells in subsequent rows
248
+ - **Colspan**: Properly manages column spanning
249
+ - **Clean output**: Removes `<colgroup>` and `<col>` elements that have no Markdown equivalent
250
+
206
251
  ### Key Configuration Options
207
252
 
208
253
  | Option | Type | Default | Description |
@@ -438,7 +483,9 @@ This library provides comprehensive support for all modern HTML5 elements:
438
483
 
439
484
  ### Table Elements
440
485
 
441
- - `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`, `<col>`, `<colgroup>`
486
+ - `<table>`, `<thead>`, `<tbody>`, `<tfoot>`, `<tr>`, `<th>`, `<td>`, `<caption>`
487
+ - **Merged cell support**: Handles `rowspan` and `colspan` attributes for complex table layouts
488
+ - **Smart cleanup**: Automatically handles table styling elements for clean Markdown output
442
489
 
443
490
  ### Interactive Elements
444
491
 
@@ -457,16 +504,41 @@ This library provides comprehensive support for all modern HTML5 elements:
457
504
 
458
505
  - `<math>` (MathML support)
459
506
 
460
- ## Breaking Changes (Major Version)
507
+ ## Advanced Table Support
508
+
509
+ The library provides sophisticated handling of complex HTML tables, including merged cells and proper structure conversion:
510
+
511
+ ```python
512
+ from html_to_markdown import convert_to_markdown
513
+
514
+ # Complex table with merged cells
515
+ html = """
516
+ <table>
517
+ <caption>Sales Report</caption>
518
+ <tr>
519
+ <th rowspan="2">Product</th>
520
+ <th colspan="2">Quarterly Sales</th>
521
+ </tr>
522
+ <tr>
523
+ <th>Q1</th>
524
+ <th>Q2</th>
525
+ </tr>
526
+ <tr>
527
+ <td>Widget A</td>
528
+ <td>$50K</td>
529
+ <td>$75K</td>
530
+ </tr>
531
+ </table>
532
+ """
533
+
534
+ result = convert_to_markdown(html)
535
+ ```
461
536
 
462
- This version introduces several breaking changes for improved consistency and functionality:
537
+ **Features:**
463
538
 
464
- 1. **Enhanced Metadata Extraction**: Now enabled by default with comprehensive extraction of title, meta tags, and link relations
465
- 1. **Improved Newline Handling**: Better normalization of excessive newlines (max 2 consecutive)
466
- 1. **Extended HTML5 Support**: Added support for 40+ new HTML5 elements
467
- 1. **Streaming API**: New streaming parameters for large document processing
468
- 1. **Task List Support**: Automatic conversion of HTML checkboxes to GitHub-compatible task lists
469
- 1. **Highlight Styles**: New `highlight_style` parameter with multiple options for `<mark>` elements
539
+ - **Merged cell support**: Handles `rowspan` and `colspan` attributes intelligently
540
+ - **Clean output**: Automatically removes table styling elements that don't translate to Markdown
541
+ - **Structure preservation**: Maintains table hierarchy and relationships
470
542
 
471
543
  ## Acknowledgments
472
544
 
@@ -2,15 +2,15 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
2
2
  html_to_markdown/__main__.py,sha256=DJyJX7NIK0BVPNS2r3BYJ0Ci_lKHhgVOpw7ZEqACH3c,323
3
3
  html_to_markdown/cli.py,sha256=8xlgSEcnqsSM_dr1TCSgPDAo09YvUtO78PvDFivFFdg,6973
4
4
  html_to_markdown/constants.py,sha256=8vqANd-7wYvDzBm1VXZvdIxS4Xom4Ov_Yghg6jvmyio,584
5
- html_to_markdown/converters.py,sha256=COC2KqPelJlMCY5eXUS5gdiPOG8Yzx0U719FeXPw3GA,55514
5
+ html_to_markdown/converters.py,sha256=ESOZQSW8qGAG1S9f_iDpPUirKIc9MGz_G0_rqbTCJ30,50018
6
6
  html_to_markdown/exceptions.py,sha256=s1DaG6A23rOurF91e4jryuUzplWcC_JIAuK9_bw_4jQ,1558
7
7
  html_to_markdown/preprocessor.py,sha256=S4S1ZfLC_hkJVgmA5atImTyWQDOxfHctPbaep2QtyrQ,11248
8
- html_to_markdown/processing.py,sha256=wkbhLg42U3aeVQSZFuzGt5irtN037XzRKpCE71QYZXI,36520
8
+ html_to_markdown/processing.py,sha256=iUVZfDG_QmFsY32O3mJZEuyxS2m8cjZaNnsstx2RkQo,40544
9
9
  html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  html_to_markdown/utils.py,sha256=QgWPzmpZKFd6wDTe8IY3gbVT3xNzoGV3PBgd17J0O-w,2066
11
- html_to_markdown-1.8.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
12
- html_to_markdown-1.8.0.dist-info/METADATA,sha256=6pgiK4p0A77axLfD8MH1EGgzifP06koVV8KWS_5-iYk,17175
13
- html_to_markdown-1.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- html_to_markdown-1.8.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
15
- html_to_markdown-1.8.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
16
- html_to_markdown-1.8.0.dist-info/RECORD,,
11
+ html_to_markdown-1.9.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
12
+ html_to_markdown-1.9.0.dist-info/METADATA,sha256=Rptd2quL9YEGi7Bmh-pgbdPGx-8Ud8EZeZZLQNIMEik,18450
13
+ html_to_markdown-1.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ html_to_markdown-1.9.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
15
+ html_to_markdown-1.9.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
16
+ html_to_markdown-1.9.0.dist-info/RECORD,,